From ac768216132a4f557faf7eecb30d0a6a7688a793 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Thu, 11 Jun 2026 16:24:16 -0700 Subject: [PATCH 1/8] feat(statesync): canonical-syncer ConfigMap + fail-closed StateSyncReady gate (PLT-452) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the label-derived ResolvedRPCWitnesses witness feed with a controller- level canonical-syncer ConfigMap (keyed by chainID) as the state-sync trust root. configureStateSyncTask now sources rpc_servers from Status.ResolvedStateSyncers. Add a fail-closed StateSyncReady condition + gate (between reconcilePeers and ResolvePlan): when state-sync is enabled and <2 canonical syncers are configured (missing ConfigMap / unset ref / no chain entry all count), the controller sets StateSyncReady=False/NoSyncersConfigured, emits a Warning Event, and builds no state-sync plan — never a witness-less or untrusted-fallback sync. An active plan is left to finish (no yank on a transient ConfigMap blip). Disabled nodes report False/NotApplicable (always-present per condition discipline). The condition + ResolvedStateSyncers flush through the existing single optimistic- lock status patch. The controller's ≥2 is a CONFIGURED-COUNT check; the ≥2 byte-for-byte witness agreement (kill duplicate-to-fill) is the sidecar's job, landing with the seictl trust-hardening bump. Status.ResolvedRPCWitnesses is left present-but-unwritten (deprecated; CRD-field removal is a one-way door, deferred to the version bump). Read-only RBAC on the trust-root ConfigMap is not yet enforced (pre-existing cluster-wide configmaps grant) — tracked as a PLT-452 launch-gate blocker in PLT-471. Co-Authored-By: Claude Opus 4.8 --- api/v1alpha1/seinode_types.go | 41 ++- api/v1alpha1/zz_generated.deepcopy.go | 5 + cmd/main.go | 4 + config/crd/sei.io_seinodes.yaml | 23 +- internal/controller/node/controller.go | 8 + internal/controller/node/peers.go | 12 +- internal/controller/node/peers_test.go | 64 +---- internal/controller/node/statesync.go | 166 +++++++++++ internal/controller/node/statesync_test.go | 313 +++++++++++++++++++++ internal/peering/resolver.go | 52 ++-- internal/peering/resolver_test.go | 10 +- internal/planner/planner.go | 7 +- internal/planner/statesync_witness_test.go | 21 +- internal/platform/platform.go | 28 +- manifests/sei.io_seinodes.yaml | 23 +- 15 files changed, 638 insertions(+), 139 deletions(-) create mode 100644 internal/controller/node/statesync.go create mode 100644 internal/controller/node/statesync_test.go diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index 0679a9c2..a22d50b2 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -289,6 +289,26 @@ const ( // ConditionSeiNodePaused mirrors spec.paused: True when paused. ConditionSeiNodePaused = "Paused" + + // ConditionStateSyncReady gates the state-sync-bearing plan. Always-present + // once reconciled. True means canonical syncers are configured and the plan + // may proceed; False fails closed (no state-sync plan built). Its semantics + // are "canonical syncers configured + plan may proceed", NOT "trust + // hardened" — the >=2 byte-for-byte agreement check is the sidecar's job and + // lands with the seictl bump; this condition is a configured-count gate only. + ConditionStateSyncReady = "StateSyncReady" +) + +// Reasons for the StateSyncReady condition. +const ( + // ReasonStateSyncReady: state-sync enabled and >=2 canonical syncers are + // configured for the chain; the state-sync-bearing plan may proceed. + ReasonStateSyncReady = "Ready" + // ReasonStateSyncNoSyncersConfigured: state-sync enabled but the canonical- + // syncer ConfigMap yields <2 entries for the chain (fail closed). + ReasonStateSyncNoSyncersConfigured = "NoSyncersConfigured" + // ReasonStateSyncNotApplicable: the node does not enable state-sync. + ReasonStateSyncNotApplicable = "NotApplicable" ) // Reasons for the ImportPVCReady condition. @@ -360,15 +380,24 @@ type SeiNodeStatus struct { // +optional ResolvedPeers []string `json:"resolvedPeers,omitempty"` - // ResolvedRPCWitnesses carries the in-cluster RPC endpoints - // (`-0...svc.cluster.local:26657`) of the label-resolved - // peers, used as CometBFT state-sync light-client witnesses. Unlike - // ResolvedPeers these never carry an external P2P address — RPC is - // internal-only. When empty the sidecar derives witnesses from - // persistent_peers instead. + // ResolvedRPCWitnesses is DEPRECATED and no longer written. State-sync + // witnesses now come from the controller-level canonical-syncer ConfigMap + // (see ResolvedStateSyncers), not label-derived fleet peers. The field is + // retained present-but-unwritten this release (CRD field removal is a + // one-way door); remove it at the version bump. + // + // Deprecated: use ResolvedStateSyncers. // +optional ResolvedRPCWitnesses []string `json:"resolvedRPCWitnesses,omitempty"` + // ResolvedStateSyncers carries the canonical state-sync RPC endpoints + // (`host:port`) read from the canonical-syncer ConfigMap for this node's + // chain, fed verbatim into ConfigureStateSyncTask.RpcServers. Written by the + // StateSyncReady gate only when state-sync is enabled and >=2 syncers are + // configured; otherwise left empty (fail closed). + // +optional + ResolvedStateSyncers []string `json:"resolvedStateSyncers,omitempty"` + // StatefulSet references the StatefulSet the controller created for // this SeiNode. UID is the identity check: an STS with the expected // name but a different UID is not the one this controller created diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 06f84c58..98d37457 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1132,6 +1132,11 @@ func (in *SeiNodeStatus) DeepCopyInto(out *SeiNodeStatus) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.ResolvedStateSyncers != nil { + in, out := &in.ResolvedStateSyncers, &out.ResolvedStateSyncers + *out = make([]string, len(*in)) + copy(*out, *in) + } if in.StatefulSet != nil { in, out := &in.StatefulSet, &out.StatefulSet *out = new(StatefulSetRef) diff --git a/cmd/main.go b/cmd/main.go index bedb287f..9a4e720d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -163,6 +163,10 @@ func main() { KubeRBACProxyImage: os.Getenv("SEI_KUBE_RBAC_PROXY_IMAGE"), SidecarImage: os.Getenv("SEI_SIDECAR_IMAGE"), CosmosExporterImage: os.Getenv("SEI_COSMOS_EXPORTER_IMAGE"), + + // State-sync canonical syncers are opt-in; these may be empty. + StateSyncSyncersConfigMap: os.Getenv("SEI_STATESYNC_SYNCERS_CONFIGMAP"), + StateSyncSyncersNamespace: os.Getenv("SEI_STATESYNC_SYNCERS_NAMESPACE"), } if err := platformCfg.Validate(); err != nil { diff --git a/config/crd/sei.io_seinodes.yaml b/config/crd/sei.io_seinodes.yaml index a56e9c59..eb47bf05 100644 --- a/config/crd/sei.io_seinodes.yaml +++ b/config/crd/sei.io_seinodes.yaml @@ -972,12 +972,23 @@ spec: type: array resolvedRPCWitnesses: description: |- - ResolvedRPCWitnesses carries the in-cluster RPC endpoints - (`-0...svc.cluster.local:26657`) of the label-resolved - peers, used as CometBFT state-sync light-client witnesses. Unlike - ResolvedPeers these never carry an external P2P address — RPC is - internal-only. When empty the sidecar derives witnesses from - persistent_peers instead. + ResolvedRPCWitnesses is DEPRECATED and no longer written. State-sync + witnesses now come from the controller-level canonical-syncer ConfigMap + (see ResolvedStateSyncers), not label-derived fleet peers. The field is + retained present-but-unwritten this release (CRD field removal is a + one-way door); remove it at the version bump. + + Deprecated: use ResolvedStateSyncers. + items: + type: string + type: array + resolvedStateSyncers: + description: |- + ResolvedStateSyncers carries the canonical state-sync RPC endpoints + (`host:port`) read from the canonical-syncer ConfigMap for this node's + chain, fed verbatim into ConfigureStateSyncTask.RpcServers. Written by the + StateSyncReady gate only when state-sync is enabled and >=2 syncers are + configured; otherwise left empty (fail closed). items: type: string type: array diff --git a/internal/controller/node/controller.go b/internal/controller/node/controller.go index dcc4b884..d900178d 100644 --- a/internal/controller/node/controller.go +++ b/internal/controller/node/controller.go @@ -132,6 +132,14 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, fmt.Errorf("reconciling peers: %w", err) } + // State-sync gate sits before ResolvePlan so a fail-closed node never + // builds the state-sync-bearing plan. stop=true short-circuits the + // reconcile; the gate flushes via the existing flushStatus closure (single + // optimistic-lock patch, no separate write). + if res, stop, err := r.gateStateSync(ctx, node, flushStatus); stop { + return res, err + } + planAlreadyActive := node.Status.Plan != nil && node.Status.Plan.Phase == seiv1alpha1.TaskPlanActive if err := r.Planner.ResolvePlan(ctx, node); err != nil { return ctrl.Result{}, fmt.Errorf("resolving plan: %w", err) diff --git a/internal/controller/node/peers.go b/internal/controller/node/peers.go index 5cc853d0..484e6984 100644 --- a/internal/controller/node/peers.go +++ b/internal/controller/node/peers.go @@ -10,9 +10,12 @@ import ( ) // reconcilePeers resolves spec.peers into status.resolvedPeers (the composed -// persistent_peers set) and status.resolvedRPCWitnesses (state-sync witnesses). -// The plan plumbs the resolved set into config via the config-apply override -// (init path) or the config-patch (running path). +// persistent_peers set). The plan plumbs the resolved set into config via the +// config-apply override (init path) or the config-patch (running path). +// +// State-sync witnesses are no longer derived here from label-matched peers; +// they come from the controller-level canonical-syncer ConfigMap via the +// StateSyncReady gate (see statesync.go). func (r *SeiNodeReconciler) reconcilePeers(ctx context.Context, node *seiv1alpha1.SeiNode) error { resolver := peering.Resolver{ Reader: r.Client, @@ -27,8 +30,5 @@ func (r *SeiNodeReconciler) reconcilePeers(ctx context.Context, node *seiv1alpha if !slices.Equal(node.Status.ResolvedPeers, result.Peers) { node.Status.ResolvedPeers = result.Peers } - if !slices.Equal(node.Status.ResolvedRPCWitnesses, result.Witnesses) { - node.Status.ResolvedRPCWitnesses = result.Witnesses - } return nil } diff --git a/internal/controller/node/peers_test.go b/internal/controller/node/peers_test.go index 3cbe6b89..c24775c2 100644 --- a/internal/controller/node/peers_test.go +++ b/internal/controller/node/peers_test.go @@ -14,8 +14,6 @@ const ( testRoleValue = "validator" testConsumerName = "consumer" testPeer1ResolvedID = "mock-node-id@peer-1-0.peer-1.default.svc.cluster.local:26656" - testWitnessNS = "arctic-1" - testWitnessRole = "syncer" ) type errStub string @@ -111,54 +109,12 @@ func TestReconcilePeers_PrefersExternalAddress(t *testing.T) { t.Errorf("resolvedPeers[0] = %q, want %q", node.Status.ResolvedPeers[0], want) } - // The witness must be the internal RPC DNS, NOT the external P2P address: - // the NLB exposes P2P only. Writing the external address as a witness is - // the regression this fix prevents. - wantWitness := "pub-peer-0.pub-peer.default.svc.cluster.local:26657" - if len(node.Status.ResolvedRPCWitnesses) != 1 || node.Status.ResolvedRPCWitnesses[0] != wantWitness { - t.Errorf("resolvedRPCWitnesses = %v, want [%q]", node.Status.ResolvedRPCWitnesses, wantWitness) - } -} - -func TestReconcilePeers_WitnessesExcludeSelfAndUseRPCPort(t *testing.T) { - const peerName = "syncer-0-1" - node := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{ - Name: "syncer-0-0", Namespace: testWitnessNS, - Labels: map[string]string{testRoleLabel: testWitnessRole}, - }, - Spec: seiv1alpha1.SeiNodeSpec{ - ChainID: testWitnessNS, - Image: "sei:latest", - Peers: []seiv1alpha1.PeerSource{ - {Label: &seiv1alpha1.LabelPeerSource{ - Selector: map[string]string{testRoleLabel: testWitnessRole}, - }}, - }, - FullNode: &seiv1alpha1.FullNodeSpec{}, - }, - } - peer := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{ - Name: peerName, Namespace: testWitnessNS, - Labels: map[string]string{testRoleLabel: testWitnessRole}, - }, - Spec: seiv1alpha1.SeiNodeSpec{ - ChainID: testWitnessNS, - Image: "sei:latest", - FullNode: &seiv1alpha1.FullNodeSpec{}, - }, - } - - r, _ := newNodeReconciler(t, node, peer) - if err := r.reconcilePeers(context.Background(), node); err != nil { - t.Fatalf("reconcilePeers: %v", err) - } - - want := peerName + "-0." + peerName + "." + testWitnessNS + ".svc.cluster.local:26657" - if len(node.Status.ResolvedRPCWitnesses) != 1 || node.Status.ResolvedRPCWitnesses[0] != want { - t.Errorf("resolvedRPCWitnesses = %v, want [%q] (self excluded, RPC port)", - node.Status.ResolvedRPCWitnesses, want) + // State-sync witnesses are no longer derived from peers — they come from + // the canonical-syncer ConfigMap via the StateSyncReady gate. reconcilePeers + // must not write the deprecated ResolvedRPCWitnesses field. + //nolint:staticcheck // deliberately asserting the deprecated field stays unwritten + if node.Status.ResolvedRPCWitnesses != nil { + t.Errorf("resolvedRPCWitnesses should be unwritten, got %v", node.Status.ResolvedRPCWitnesses) //nolint:staticcheck // see above } } @@ -402,14 +358,6 @@ func TestReconcilePeers_NilSidecarFactorySkipsNewPeer(t *testing.T) { if len(node.Status.ResolvedPeers) != 0 { t.Fatalf("expected unresolvable peer to be skipped, got %d: %v", len(node.Status.ResolvedPeers), node.Status.ResolvedPeers) } - // Intentional asymmetry: the witness needs no node_id, so it is emitted - // even though the peer was skipped from persistent_peers. seid can dial a - // state-sync RPC witness it has no P2P peering with; do not "symmetrize" - // this with ResolvedPeers. - wantWitness := "peer-1-0.peer-1.default.svc.cluster.local:26657" - if len(node.Status.ResolvedRPCWitnesses) != 1 || node.Status.ResolvedRPCWitnesses[0] != wantWitness { - t.Errorf("expected witness despite skipped peer, got %v", node.Status.ResolvedRPCWitnesses) - } } // Nil factory + prior entry: preserve-prior branch fires. diff --git a/internal/controller/node/statesync.go b/internal/controller/node/statesync.go new file mode 100644 index 00000000..68a40928 --- /dev/null +++ b/internal/controller/node/statesync.go @@ -0,0 +1,166 @@ +package node + +import ( + "context" + "fmt" + "slices" + "strings" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" +) + +// minCanonicalSyncers is the controller-side fail-closed floor: state-sync +// requires at least two configured canonical syncers before the +// state-sync-bearing plan may proceed. This is a CONFIGURED-COUNT check; the +// >=2 byte-for-byte agreement check is the sidecar's job and lands with the +// seictl trust-hardening bump. +const minCanonicalSyncers = 2 + +// The controller only READS the canonical-syncer ConfigMap. The read-only +// intent is NOT yet enforced: the reconciler's pre-existing cluster-wide +// configmaps grant (controller.go) confers write access that controller-gen +// unions in, so this marker is documentary only. Scoping the controller to +// read-only on the trust root (so it cannot rewrite its own trust source) is a +// PLT-452 launch-gate blocker tracked in PLT-471, landing with the namespace +// lockdown that fixes the trust-root namespace. +// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch + +// gateStateSync runs the state-sync gate and, when the node fails closed, +// flushes the StateSyncReady condition via the caller's flushStatus closure +// (preserving the single-patch / optimistic-lock model), emits a Warning Event, +// and signals the reconciler to stop before ResolvePlan. stop=true means the +// plan loop must short-circuit; an already-active plan is left to run to +// completion (we don't yank a state-sync plan mid-execution on a transient +// ConfigMap blip — the gate re-asserts once the plan goes terminal). +func (r *SeiNodeReconciler) gateStateSync( + ctx context.Context, + node *seiv1alpha1.SeiNode, + flushStatus func() error, +) (_ ctrl.Result, stop bool, _ error) { + ready, err := r.reconcileStateSyncGate(ctx, node) + if err != nil { + return ctrl.Result{}, true, fmt.Errorf("reconciling state-sync gate: %w", err) + } + planActive := node.Status.Plan != nil && node.Status.Plan.Phase == seiv1alpha1.TaskPlanActive + if ready || planActive { + return ctrl.Result{}, false, nil + } + if err := flushStatus(); err != nil { + return ctrl.Result{}, true, fmt.Errorf("flushing state-sync gate status: %w", err) + } + r.Recorder.Eventf(node, corev1.EventTypeWarning, "StateSyncBlocked", + "state sync enabled but <%d canonical syncers configured for chain %q; not building plan", + minCanonicalSyncers, node.Spec.ChainID) + return ctrl.Result{RequeueAfter: statusPollInterval}, true, nil +} + +// reconcileStateSyncGate resolves the canonical-syncer set for a state-sync +// node and sets ConditionStateSyncReady accordingly. It mutates node.Status +// in-memory only — the condition and ResolvedStateSyncers are flushed by the +// caller's single optimistic-lock status patch, never a separate write. +// +// It returns true ("ready to plan") when the state-sync-bearing plan may +// proceed: either state-sync is disabled (NotApplicable — the plan has no +// state-sync task to gate) or >=2 canonical syncers are configured (Ready). +// It returns false to fail closed: state-sync is enabled but <2 syncers are +// configured (NoSyncersConfigured), in which case the caller must skip +// ResolvePlan, emit a Warning Event, and requeue. +func (r *SeiNodeReconciler) reconcileStateSyncGate(ctx context.Context, node *seiv1alpha1.SeiNode) (bool, error) { + snap := node.Spec.SnapshotSource() + if snap == nil || snap.StateSync == nil { + // State-sync disabled: no state-sync task in the plan to gate. + node.Status.ResolvedStateSyncers = nil + setStateSyncReady(node, metav1.ConditionFalse, seiv1alpha1.ReasonStateSyncNotApplicable, + "node does not enable state sync") + return true, nil + } + + syncers, err := r.canonicalSyncers(ctx, node.Spec.ChainID) + if err != nil { + return false, fmt.Errorf("reading canonical-syncer ConfigMap: %w", err) + } + + if len(syncers) < minCanonicalSyncers { + // Fail closed: do not feed a witness-less (or single-witness) set into + // the plan. Leave ResolvedStateSyncers empty so a stale set can't leak + // into ConfigureStateSyncTask on a later reconcile. + node.Status.ResolvedStateSyncers = nil + setStateSyncReady(node, metav1.ConditionFalse, seiv1alpha1.ReasonStateSyncNoSyncersConfigured, + fmt.Sprintf("state sync requires >=%d canonical syncers configured for chain %q; found %d", + minCanonicalSyncers, node.Spec.ChainID, len(syncers))) + return false, nil + } + + if !slices.Equal(node.Status.ResolvedStateSyncers, syncers) { + node.Status.ResolvedStateSyncers = syncers + } + setStateSyncReady(node, metav1.ConditionTrue, seiv1alpha1.ReasonStateSyncReady, + fmt.Sprintf("%d canonical syncers configured for chain %q", len(syncers), node.Spec.ChainID)) + return true, nil +} + +// canonicalSyncers reads the canonical-syncer ConfigMap and returns the parsed +// syncer RPC endpoints for the given chain. A missing ConfigMap, an unset +// ConfigMap reference, or a chain with no entry all yield an empty slice (no +// error) so the caller fails closed via the StateSyncReady gate rather than +// crashing — state-sync is opt-in and the ConfigMap may legitimately be absent +// until GitOps provisions it. +// +// Data shape: data[chainID] is a list of `host:port` RPC endpoints separated by +// newlines and/or commas. Blank entries are dropped; the result is sorted and +// de-duplicated for a stable witness set. +func (r *SeiNodeReconciler) canonicalSyncers(ctx context.Context, chainID string) ([]string, error) { + name := r.Platform.StateSyncSyncersConfigMap + if strings.TrimSpace(name) == "" { + return nil, nil + } + + cm := &corev1.ConfigMap{} + key := types.NamespacedName{Name: name, Namespace: r.Platform.StateSyncSyncersNamespace} + if err := r.Get(ctx, key, cm); err != nil { + if apierrors.IsNotFound(err) { + return nil, nil + } + return nil, err + } + + return parseSyncerList(cm.Data[chainID]), nil +} + +// parseSyncerList splits a ConfigMap value on newlines and commas, trims +// whitespace, drops blanks, then sorts and de-duplicates. +func parseSyncerList(raw string) []string { + fields := strings.FieldsFunc(raw, func(r rune) bool { + return r == '\n' || r == '\r' || r == ',' || r == ' ' || r == '\t' + }) + if len(fields) == 0 { + return nil + } + out := make([]string, 0, len(fields)) + for _, f := range fields { + if f != "" { + out = append(out, f) + } + } + slices.Sort(out) + return slices.Compact(out) +} + +// setStateSyncReady sets ConditionStateSyncReady with ObservedGeneration +// stamped, following the always-present condition discipline. +func setStateSyncReady(node *seiv1alpha1.SeiNode, status metav1.ConditionStatus, reason, message string) { + apimeta.SetStatusCondition(&node.Status.Conditions, metav1.Condition{ + Type: seiv1alpha1.ConditionStateSyncReady, + Status: status, + Reason: reason, + Message: message, + ObservedGeneration: node.Generation, + }) +} diff --git a/internal/controller/node/statesync_test.go b/internal/controller/node/statesync_test.go new file mode 100644 index 00000000..2588cbef --- /dev/null +++ b/internal/controller/node/statesync_test.go @@ -0,0 +1,313 @@ +package node + +import ( + "context" + "slices" + "strings" + "testing" + + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/record" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/planner" +) + +const ( + testSyncerCMName = "canonical-syncers" + testSyncerCMNS = "sei-platform" + testChainID = "arctic-1" +) + +func syncerConfigMap(data map[string]string) *corev1.ConfigMap { + return &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{Name: testSyncerCMName, Namespace: testSyncerCMNS}, + Data: data, + } +} + +// stateSyncNode returns a FullNode with state sync enabled on the given chain. +func stateSyncNode(name, chainID string) *seiv1alpha1.SeiNode { + return &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "default"}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: chainID, + Image: "sei:latest", + FullNode: &seiv1alpha1.FullNodeSpec{ + Snapshot: &seiv1alpha1.SnapshotSource{StateSync: &seiv1alpha1.StateSyncSource{}}, + }, + }, + } +} + +func withSyncerConfigMap(r *SeiNodeReconciler) { + r.Platform.StateSyncSyncersConfigMap = testSyncerCMName + r.Platform.StateSyncSyncersNamespace = testSyncerCMNS +} + +func stateSyncCondition(node *seiv1alpha1.SeiNode) *metav1.Condition { + return apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionStateSyncReady) +} + +func TestParseSyncerList(t *testing.T) { + g := NewWithT(t) + cases := []struct { + name, in string + want []string + }{ + {"empty", "", nil}, + {"whitespace only", " \n\t ", nil}, + {"newline separated", "b:26657\na:26657", []string{"a:26657", "b:26657"}}, + {"comma separated", "b:26657,a:26657", []string{"a:26657", "b:26657"}}, + {"mixed with blanks", "a:26657,\n b:26657 ,,\n", []string{"a:26657", "b:26657"}}, + {"dedup", "a:26657\na:26657\nb:26657", []string{"a:26657", "b:26657"}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + g.Expect(parseSyncerList(tc.in)).To(Equal(tc.want)) + }) + } +} + +func TestStateSyncGate_EnabledWithTwoSyncers_Ready(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("n", testChainID) + cm := syncerConfigMap(map[string]string{ + testChainID: "syncer-1.arctic-1.example.com:26657\nsyncer-0.arctic-1.example.com:26657", + }) + r, _ := newNodeReconciler(t, node, cm) + withSyncerConfigMap(r) + + ready, err := r.reconcileStateSyncGate(ctx, node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(ready).To(BeTrue()) + + cond := stateSyncCondition(node) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncReady)) + g.Expect(cond.ObservedGeneration).To(Equal(node.Generation)) + // Sorted + fed verbatim into the task source. + g.Expect(node.Status.ResolvedStateSyncers).To(Equal([]string{ + "syncer-0.arctic-1.example.com:26657", + "syncer-1.arctic-1.example.com:26657", + })) +} + +func TestStateSyncGate_EnabledWithOneSyncer_FailsClosed(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("n", testChainID) + cm := syncerConfigMap(map[string]string{testChainID: "only-one.arctic-1.example.com:26657"}) + r, _ := newNodeReconciler(t, node, cm) + withSyncerConfigMap(r) + + ready, err := r.reconcileStateSyncGate(ctx, node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(ready).To(BeFalse()) + + cond := stateSyncCondition(node) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) + g.Expect(node.Status.ResolvedStateSyncers).To(BeNil()) +} + +func TestStateSyncGate_EnabledMissingConfigMap_FailsClosed(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("n", testChainID) + r, _ := newNodeReconciler(t, node) // no ConfigMap object + withSyncerConfigMap(r) + + ready, err := r.reconcileStateSyncGate(ctx, node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(ready).To(BeFalse()) + + cond := stateSyncCondition(node) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) +} + +func TestStateSyncGate_EnabledNoChainEntry_FailsClosed(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("n", testChainID) + cm := syncerConfigMap(map[string]string{"other-chain": "a:26657\nb:26657"}) + r, _ := newNodeReconciler(t, node, cm) + withSyncerConfigMap(r) + + ready, err := r.reconcileStateSyncGate(ctx, node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(ready).To(BeFalse()) + g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) +} + +func TestStateSyncGate_UnconfiguredConfigMapRef_FailsClosed(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("n", testChainID) + r, _ := newNodeReconciler(t, node) // Platform leaves the ConfigMap name empty. + + ready, err := r.reconcileStateSyncGate(ctx, node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(ready).To(BeFalse()) + g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) +} + +func TestStateSyncGate_Disabled_NotApplicable(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + // S3 snapshot node: state sync not enabled. + node := newSnapshotNode("n", "default") + r, _ := newNodeReconciler(t, node) + withSyncerConfigMap(r) + + ready, err := r.reconcileStateSyncGate(ctx, node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(ready).To(BeTrue()) + + cond := stateSyncCondition(node) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncNotApplicable)) + g.Expect(node.Status.ResolvedStateSyncers).To(BeNil()) +} + +// A node with no snapshot source at all (e.g. genesis validator) is treated as +// state-sync-disabled: NotApplicable, never blocks the plan. +func TestStateSyncGate_NoSnapshotSource_NotApplicable(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: "n", Namespace: "default"}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: "sei-test", + Image: "sei:latest", + FullNode: &seiv1alpha1.FullNodeSpec{}, + }, + } + r, _ := newNodeReconciler(t, node) + withSyncerConfigMap(r) + + ready, err := r.reconcileStateSyncGate(ctx, node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(ready).To(BeTrue()) + g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNotApplicable)) +} + +// Stale ResolvedStateSyncers must be cleared when the gate later fails closed, +// so a previously-good set can't leak into ConfigureStateSyncTask. +func TestStateSyncGate_FailClosedClearsStaleSyncers(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("n", testChainID) + node.Status.ResolvedStateSyncers = []string{"old-a:26657", "old-b:26657"} + r, _ := newNodeReconciler(t, node) // unconfigured ref → fail closed + withSyncerConfigMap(r) + r.Platform.StateSyncSyncersConfigMap = "" // force unconfigured + + ready, err := r.reconcileStateSyncGate(ctx, node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(ready).To(BeFalse()) + g.Expect(node.Status.ResolvedStateSyncers).To(BeNil()) +} + +// The gate must not block (or even set ResolvedStateSyncers on) a non-state-sync +// node — regression guard for the full reconcile path. +func TestStateSyncGate_NonStateSyncNodeUnaffected(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := newSnapshotNode("n", "default") + r, _ := newNodeReconciler(t, node) + + ready, err := r.reconcileStateSyncGate(ctx, node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(ready).To(BeTrue()) + g.Expect(node.Status.ResolvedStateSyncers).To(BeEmpty()) + g.Expect(slices.Contains([]string{ + seiv1alpha1.ReasonStateSyncReady, seiv1alpha1.ReasonStateSyncNoSyncersConfigured, + }, stateSyncCondition(node).Reason)).To(BeFalse()) +} + +// Full reconcile path: a state-sync node with <2 syncers fails closed — no plan +// is built, the condition is persisted via the single status patch, and a +// Warning Event is emitted. +func TestReconcile_StateSyncFailClosed_NoPlanBuilt(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("ss-0", "default") + node.Spec.ChainID = testChainID + cm := syncerConfigMap(map[string]string{testChainID: "only-one:26657"}) + r, c := newNodeReconciler(t, node, cm) + rec := record.NewFakeRecorder(10) + r.Recorder = rec + withSyncerConfigMap(r) + + _, err := r.Reconcile(ctx, nodeReqFor("ss-0", "default")) + g.Expect(err).NotTo(HaveOccurred()) + + fetched := getSeiNode(t, ctx, c, "ss-0", "default") + g.Expect(fetched.Status.Plan).To(BeNil(), "no state-sync plan must be built when fail-closed") + cond := stateSyncCondition(fetched) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) + + var sawWarning bool + for { + select { + case e := <-rec.Events: + if strings.Contains(e, "StateSyncBlocked") { + sawWarning = true + } + continue + default: + } + break + } + g.Expect(sawWarning).To(BeTrue(), "expected a StateSyncBlocked Warning Event") +} + +// Full reconcile path: a state-sync node with >=2 syncers proceeds — a plan is +// built carrying the canonical syncers, and StateSyncReady is True. +func TestReconcile_StateSyncReady_BuildsPlanWithSyncers(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("ss-1", "default") + node.Spec.ChainID = testChainID + cm := syncerConfigMap(map[string]string{ + testChainID: "a.arctic-1.example.com:26657\nb.arctic-1.example.com:26657", + }) + r, c := newNodeReconciler(t, node, cm) + withSyncerConfigMap(r) + + _, err := r.Reconcile(ctx, nodeReqFor("ss-1", "default")) + g.Expect(err).NotTo(HaveOccurred()) + + fetched := getSeiNode(t, ctx, c, "ss-1", "default") + g.Expect(fetched.Status.Plan).NotTo(BeNil(), "a plan must be built when state-sync is ready") + g.Expect(findPlannedTask(fetched.Status.Plan, planner.TaskConfigureStateSync)). + NotTo(BeNil(), "plan must carry the configure-state-sync task") + g.Expect(fetched.Status.ResolvedStateSyncers).To(Equal([]string{ + "a.arctic-1.example.com:26657", + "b.arctic-1.example.com:26657", + })) + cond := stateSyncCondition(fetched) + g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncReady)) +} diff --git a/internal/peering/resolver.go b/internal/peering/resolver.go index 161f8f28..983370f0 100644 --- a/internal/peering/resolver.go +++ b/internal/peering/resolver.go @@ -1,6 +1,8 @@ // Package peering resolves a SeiNode's spec.peers into the fully-composed -// `@:` persistent_peers set and the in-cluster RPC witness -// endpoints for state sync. The controller is the sole owner of peer resolution. +// `@:` persistent_peers set. The controller is the sole +// owner of peer resolution. State-sync witnesses are NOT resolved here — they +// come from the controller-level canonical-syncer ConfigMap (see the node +// controller's StateSyncReady gate), not label-matched fleet peers. // // Resolve handles all three source kinds in-controller: // - Label — list matching SeiNodes, fetch each peer's node_id via its @@ -36,8 +38,8 @@ import ( // degraded or test environment without a sidecar factory still reconciles. var errNoSidecarFactory = errors.New("sidecar client factory is nil") -// Resolver resolves spec.peers into persistent_peers + RPC witnesses. All -// dependencies are injected so it is unit-testable without a cluster or AWS. +// Resolver resolves spec.peers into the persistent_peers set. All dependencies +// are injected so it is unit-testable without a cluster or AWS. type Resolver struct { // Reader lists SeiNodes for Label sources. Reader client.Reader @@ -57,9 +59,6 @@ type Result struct { // Peers are the fully-composed `@:` persistent_peers, // sorted and de-duplicated. Peers []string - // Witnesses are the in-cluster RPC endpoints of label-resolved peers, - // sorted and de-duplicated, for state-sync light-client use. - Witnesses []string } // Resolve resolves all of node.Spec.Peers. prior is the node's last resolved @@ -70,18 +69,17 @@ type Result struct { func (r *Resolver) Resolve(ctx context.Context, node *seiv1alpha1.SeiNode, prior []string) (Result, error) { priorByHost := indexResolvedPeersByHost(prior) - var peers, witnesses []string + var peers []string var preserveEC2Prior bool for i := range node.Spec.Peers { src := &node.Spec.Peers[i] switch { case src.Label != nil: - endpoints, w, err := r.resolveLabel(ctx, node, src.Label, priorByHost) + endpoints, err := r.resolveLabel(ctx, node, src.Label, priorByHost) if err != nil { return Result{}, err } peers = append(peers, endpoints...) - witnesses = append(witnesses, w...) case src.Static != nil: peers = append(peers, src.Static.Addresses...) case src.EC2Tags != nil: @@ -102,24 +100,19 @@ func (r *Resolver) Resolve(ctx context.Context, node *seiv1alpha1.SeiNode, prior slices.Sort(peers) peers = slices.Compact(peers) - slices.Sort(witnesses) - witnesses = slices.Compact(witnesses) - return Result{Peers: peers, Witnesses: witnesses}, nil + return Result{Peers: peers}, nil } -// resolveLabel lists SeiNodes matching the selector, composes each peer's -// `@:` entry, and yields the in-cluster RPC witness for -// each matched peer. A per-peer node_id fetch failure preserves the prior -// composed entry (from priorByHost) or skips the peer until it is resolvable — -// transients never wedge the fleet. Witnesses are deterministic from peer -// identity (no node_id needed) so every matched peer yields one regardless of -// sidecar reachability. +// resolveLabel lists SeiNodes matching the selector and composes each peer's +// `@:` entry. A per-peer node_id fetch failure preserves +// the prior composed entry (from priorByHost) or skips the peer until it is +// resolvable — transients never wedge the fleet. func (r *Resolver) resolveLabel( ctx context.Context, node *seiv1alpha1.SeiNode, src *seiv1alpha1.LabelPeerSource, priorByHost map[string]string, -) ([]string, []string, error) { +) ([]string, error) { logger := log.FromContext(ctx) ns := node.Namespace if src.Namespace != "" { @@ -131,18 +124,16 @@ func (r *Resolver) resolveLabel( client.InNamespace(ns), client.MatchingLabels(src.Selector), ); err != nil { - return nil, nil, fmt.Errorf("listing peers by label: %w", err) + return nil, fmt.Errorf("listing peers by label: %w", err) } - var endpoints, witnesses []string + var endpoints []string for i := range nodeList.Items { peer := &nodeList.Items[i] if peer.Name == node.Name && peer.Namespace == node.Namespace { continue } - witnesses = append(witnesses, peerRPCAddress(peer)) - address := peerAddress(peer) var sc task.SidecarClient err := errNoSidecarFactory @@ -164,7 +155,7 @@ func (r *Resolver) resolveLabel( } logger.Info("skipping peer until node_id is resolvable", "peer", peer.Name, "err", err) } - return endpoints, witnesses, nil + return endpoints, nil } // indexResolvedPeersByHost maps `host:port` → `@host:port` for O(1) @@ -190,12 +181,3 @@ func peerAddress(peer *seiv1alpha1.SeiNode) string { return fmt.Sprintf("%s-0.%s.%s.svc.cluster.local:%d", peer.Name, peer.Name, peer.Namespace, seiconfig.PortP2P) } - -// peerRPCAddress returns the in-cluster headless Service DNS for a peer's RPC -// port. Unlike peerAddress it never consults Spec.ExternalAddress: the external -// NLB exposes P2P only, so a state-sync light-client witness must target the -// cluster-internal RPC endpoint or seid exits on "no witnesses connected". -func peerRPCAddress(peer *seiv1alpha1.SeiNode) string { - return fmt.Sprintf("%s-0.%s.%s.svc.cluster.local:%d", - peer.Name, peer.Name, peer.Namespace, seiconfig.PortRPC) -} diff --git a/internal/peering/resolver_test.go b/internal/peering/resolver_test.go index 81f739e8..c9622526 100644 --- a/internal/peering/resolver_test.go +++ b/internal/peering/resolver_test.go @@ -101,12 +101,9 @@ func TestResolve_Static_Verbatim(t *testing.T) { } want := []string{"abc@1.2.3.4:26656", "def@5.6.7.8:26656"} assertEqualPeers(t, got.Peers, want) - if len(got.Witnesses) != 0 { - t.Errorf("static source should yield no witnesses, got %v", got.Witnesses) - } } -func TestResolve_Label_ComposesNodeIDAndWitness(t *testing.T) { +func TestResolve_Label_ComposesNodeID(t *testing.T) { consumer := fullNode("consumer", nil) consumer.Spec.Peers = []seiv1alpha1.PeerSource{ {Label: &seiv1alpha1.LabelPeerSource{Selector: map[string]string{roleLabel: rolePeer}}}, @@ -123,7 +120,6 @@ func TestResolve_Label_ComposesNodeIDAndWitness(t *testing.T) { t.Fatalf("Resolve: %v", err) } assertEqualPeers(t, got.Peers, []string{"node-xyz@peer-1-0.peer-1.default.svc.cluster.local:26656"}) - assertEqualPeers(t, got.Witnesses, []string{"peer-1-0.peer-1.default.svc.cluster.local:26657"}) } func TestResolve_Label_PrefersExternalAddress(t *testing.T) { @@ -143,9 +139,7 @@ func TestResolve_Label_PrefersExternalAddress(t *testing.T) { if err != nil { t.Fatalf("Resolve: %v", err) } - // Peer uses external P2P address; witness stays internal RPC DNS. assertEqualPeers(t, got.Peers, []string{"nid@pub.example.com:26656"}) - assertEqualPeers(t, got.Witnesses, []string{"pub-0.pub.default.svc.cluster.local:26657"}) } func TestResolve_Label_EmptySetYieldsNoPeers(t *testing.T) { @@ -187,8 +181,6 @@ func TestResolve_Label_TransientFailurePreservesPriorEntry(t *testing.T) { } // node_id fetch failed but a prior entry for this host exists → preserved. assertEqualPeers(t, got.Peers, prior) - // Witness is deterministic from identity → still emitted. - assertEqualPeers(t, got.Witnesses, []string{"peer-1-0.peer-1.default.svc.cluster.local:26657"}) } func TestResolve_Label_TransientFailureNoPriorSkips(t *testing.T) { diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 81a45135..d5073b57 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -656,9 +656,14 @@ func snapshotRestoreTask(snap *seiv1alpha1.SnapshotSource) sidecar.SnapshotResto func configureStateSyncTask(node *seiv1alpha1.SeiNode) sidecar.ConfigureStateSyncTask { snap := node.Spec.SnapshotSource() + // RpcServers come from the controller-level canonical-syncer ConfigMap, + // resolved by the StateSyncReady gate into Status.ResolvedStateSyncers + // (replacing the label-derived ResolvedRPCWitnesses path). The gate + // guarantees >=2 entries here; the >=2 byte-for-byte agreement check is the + // sidecar's job and lands with the seictl trust-hardening bump. t := sidecar.ConfigureStateSyncTask{ UseLocalSnapshot: hasS3Snapshot(snap), - RpcServers: node.Status.ResolvedRPCWitnesses, + RpcServers: node.Status.ResolvedStateSyncers, } if snap != nil { if snap.TrustPeriod != "" { diff --git a/internal/planner/statesync_witness_test.go b/internal/planner/statesync_witness_test.go index 79234dd9..47c86bd9 100644 --- a/internal/planner/statesync_witness_test.go +++ b/internal/planner/statesync_witness_test.go @@ -7,10 +7,10 @@ import ( seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" ) -func TestConfigureStateSyncTask_PassesResolvedWitnesses(t *testing.T) { - witnesses := []string{ - "syncer-0-0-0.syncer-0-0.arctic-1.svc.cluster.local:26657", - "syncer-0-1-0.syncer-0-1.arctic-1.svc.cluster.local:26657", +func TestConfigureStateSyncTask_PassesCanonicalSyncers(t *testing.T) { + syncers := []string{ + "syncer-0.arctic-1.example.com:26657", + "syncer-1.arctic-1.example.com:26657", } node := &seiv1alpha1.SeiNode{ Spec: seiv1alpha1.SeiNodeSpec{ @@ -18,13 +18,13 @@ func TestConfigureStateSyncTask_PassesResolvedWitnesses(t *testing.T) { Snapshot: &seiv1alpha1.SnapshotSource{TrustPeriod: "168h0m0s", BackfillBlocks: 6000}, }, }, - Status: seiv1alpha1.SeiNodeStatus{ResolvedRPCWitnesses: witnesses}, + Status: seiv1alpha1.SeiNodeStatus{ResolvedStateSyncers: syncers}, } task := configureStateSyncTask(node) - if !slices.Equal(task.RpcServers, witnesses) { - t.Errorf("RpcServers = %v, want %v", task.RpcServers, witnesses) + if !slices.Equal(task.RpcServers, syncers) { + t.Errorf("RpcServers = %v, want %v", task.RpcServers, syncers) } if task.TrustPeriod != "168h0m0s" { t.Errorf("TrustPeriod = %q, want 168h0m0s", task.TrustPeriod) @@ -34,9 +34,10 @@ func TestConfigureStateSyncTask_PassesResolvedWitnesses(t *testing.T) { } } -// No resolved witnesses (e.g. EC2/static peers) leaves RpcServers empty so the -// sidecar falls back to deriving witnesses from persistent_peers. -func TestConfigureStateSyncTask_NoWitnessesLeavesEmpty(t *testing.T) { +// No resolved syncers leaves RpcServers empty. In production the StateSyncReady +// gate fails closed before this task is reached when <2 syncers are configured; +// this guards the builder's nil-safety regardless. +func TestConfigureStateSyncTask_NoSyncersLeavesEmpty(t *testing.T) { node := &seiv1alpha1.SeiNode{} task := configureStateSyncTask(node) if len(task.RpcServers) != 0 { diff --git a/internal/platform/platform.go b/internal/platform/platform.go index d620b655..4f88d7f6 100644 --- a/internal/platform/platform.go +++ b/internal/platform/platform.go @@ -21,8 +21,9 @@ const ( ) // Config holds infrastructure-level settings that vary per deployment -// environment. All fields are required and read from environment variables -// in main.go. See platformtest.Config() for test fixtures. +// environment. Fields are read from environment variables in main.go and are +// required unless documented otherwise — the StateSyncSyncers* pair is optional +// (state-sync is opt-in). See platformtest.Config() for test fixtures. type Config struct { NodepoolName string NodepoolArchive string @@ -58,6 +59,23 @@ type Config struct { // CosmosExporterImage is the sei-cosmos-exporter sidecar image. // The cosmos-exporter container is attached to every SeiNode pod. CosmosExporterImage string + + // StateSyncSyncersConfigMap is the name of the canonical-syncer ConfigMap + // the controller reads to populate state-sync rpc_servers. It is the trust + // root for state-sync: read-only to the controller, GitOps-written, RBAC- + // locked. Keyed by chain ID (data[chainID] = syncer RPC endpoints, as bare + // host:port — no http:// scheme prefix; the sidecar adds the scheme). + // + // State-sync is opt-in, so this and StateSyncSyncersNamespace may be empty + // when no node uses state-sync. When a node DOES enable state-sync and this + // is unset (or the ConfigMap yields <2 entries for its chain), the + // controller fails closed via StateSyncReady=False/NoSyncersConfigured + // rather than building a witness-less plan. + StateSyncSyncersConfigMap string + + // StateSyncSyncersNamespace is the namespace of StateSyncSyncersConfigMap. + // Required when StateSyncSyncersConfigMap is set (Validate enforces the pair). + StateSyncSyncersNamespace string } // NodepoolForMode returns the Karpenter NodePool name for the given @@ -104,5 +122,11 @@ func (c Config) Validate() error { return fmt.Errorf("%s is required", name) } } + // The state-sync syncer ConfigMap is optional, but a name without a + // namespace would issue a cluster-scoped Get that silently fails closed — + // catch that misconfiguration explicitly. + if c.StateSyncSyncersConfigMap != "" && c.StateSyncSyncersNamespace == "" { + return fmt.Errorf("SEI_STATESYNC_SYNCERS_NAMESPACE is required when SEI_STATESYNC_SYNCERS_CONFIGMAP is set") + } return nil } diff --git a/manifests/sei.io_seinodes.yaml b/manifests/sei.io_seinodes.yaml index a56e9c59..eb47bf05 100644 --- a/manifests/sei.io_seinodes.yaml +++ b/manifests/sei.io_seinodes.yaml @@ -972,12 +972,23 @@ spec: type: array resolvedRPCWitnesses: description: |- - ResolvedRPCWitnesses carries the in-cluster RPC endpoints - (`-0...svc.cluster.local:26657`) of the label-resolved - peers, used as CometBFT state-sync light-client witnesses. Unlike - ResolvedPeers these never carry an external P2P address — RPC is - internal-only. When empty the sidecar derives witnesses from - persistent_peers instead. + ResolvedRPCWitnesses is DEPRECATED and no longer written. State-sync + witnesses now come from the controller-level canonical-syncer ConfigMap + (see ResolvedStateSyncers), not label-derived fleet peers. The field is + retained present-but-unwritten this release (CRD field removal is a + one-way door); remove it at the version bump. + + Deprecated: use ResolvedStateSyncers. + items: + type: string + type: array + resolvedStateSyncers: + description: |- + ResolvedStateSyncers carries the canonical state-sync RPC endpoints + (`host:port`) read from the canonical-syncer ConfigMap for this node's + chain, fed verbatim into ConfigureStateSyncTask.RpcServers. Written by the + StateSyncReady gate only when state-sync is enabled and >=2 syncers are + configured; otherwise left empty (fail closed). items: type: string type: array From a49d93b0d646d6447f2fb4588a924eb56c172234 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Thu, 11 Jun 2026 16:38:06 -0700 Subject: [PATCH 2/8] fix(statesync): seed StateSyncReady before the paused/failed early-returns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit StateSyncReady is documented always-present, but Reconcile returned early on spec.Paused and PhaseFailed before the gate seeded it, so a paused- or failed-from-creation node never got the condition (Cursor Bugbot). Split the gate: reconcileStateSyncGate (resolution — sets the always-present condition) now runs once before the early-returns, so the condition rides the existing flush on every path; enforceStateSyncGate (fail-closed stop/event) runs only on the active path. Single optimistic-lock patch preserved; gocyclo held under 30 by extracting emitPhaseTransition. Adds a paused-node always-present test. Co-Authored-By: Claude Opus 4.8 --- internal/controller/node/controller.go | 83 +++++++++++++--------- internal/controller/node/statesync.go | 33 +++++---- internal/controller/node/statesync_test.go | 50 +++++++++++++ 3 files changed, 119 insertions(+), 47 deletions(-) diff --git a/internal/controller/node/controller.go b/internal/controller/node/controller.go index d900178d..9e890b44 100644 --- a/internal/controller/node/controller.go +++ b/internal/controller/node/controller.go @@ -107,6 +107,17 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return r.Status().Patch(ctx, node, statusBase) } + // Resolve the always-present StateSyncReady condition before the Failed and + // Paused early-returns so it rides the existing flush on every path (Failed + // flush, Paused flush, and the normal end-of-reconcile patch) — no separate + // status write. The fail-closed enforcement happens later, on the active + // path only, via enforceStateSyncGate using the `ready` resolved here (one + // ConfigMap read per reconcile). + stateSyncReady, err := r.reconcileStateSyncGate(ctx, node) + if err != nil { + return ctrl.Result{}, fmt.Errorf("resolving state-sync gate: %w", err) + } + // Failed is terminal — flush any condition updates and exit. if node.Status.Phase == seiv1alpha1.PhaseFailed { if err := flushStatus(); err != nil { @@ -132,11 +143,12 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, fmt.Errorf("reconciling peers: %w", err) } - // State-sync gate sits before ResolvePlan so a fail-closed node never - // builds the state-sync-bearing plan. stop=true short-circuits the - // reconcile; the gate flushes via the existing flushStatus closure (single - // optimistic-lock patch, no separate write). - if res, stop, err := r.gateStateSync(ctx, node, flushStatus); stop { + // Enforce the state-sync gate before ResolvePlan so a fail-closed node never + // builds the state-sync-bearing plan. Uses the `ready` resolved above (no + // re-read of the ConfigMap). stop=true short-circuits the reconcile; the + // gate flushes via the existing flushStatus closure (single optimistic-lock + // patch, no separate write). + if res, stop, err := r.enforceStateSyncGate(node, stateSyncReady, flushStatus); stop { return res, err } @@ -171,33 +183,7 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return result, execErr } - // Emit metrics/events if the phase changed. - if node.Status.Phase != observedPhase { - ns, name := node.Namespace, node.Name - nodePhaseTransitions.Add(ctx, 1, - metric.WithAttributes( - observability.AttrController.String(seiNodeControllerName), - observability.AttrNamespace.String(ns), - observability.AttrFromPhase.String(string(observedPhase)), - observability.AttrToPhase.String(string(node.Status.Phase)), - ), - ) - emitNodePhase(ns, name, node.Status.Phase) - r.Recorder.Eventf(node, corev1.EventTypeNormal, "PhaseTransition", - "Phase changed from %s to %s", observedPhase, node.Status.Phase) - - // Record time spent in the previous phase. - if node.Status.PhaseTransitionTime != nil && observedPhase != "" { - dur := time.Since(node.Status.PhaseTransitionTime.Time).Seconds() - nodePhaseDuration.Record(ctx, dur, - metric.WithAttributes( - observability.AttrNamespace.String(ns), - observability.AttrChainID.String(node.Spec.ChainID), - observability.AttrPhase.String(string(observedPhase)), - ), - ) - } - } + r.emitPhaseTransition(ctx, node, observedPhase) // Running nodes with no active plan requeue on a steady-state interval. // Spec changes trigger immediate reconciles via GenerationChangedPredicate. @@ -208,6 +194,39 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return result, nil } +// emitPhaseTransition records phase-transition metrics and a PhaseTransition +// Event when the node's phase changed during this reconcile. A no-op when the +// phase is unchanged. +func (r *SeiNodeReconciler) emitPhaseTransition(ctx context.Context, node *seiv1alpha1.SeiNode, observedPhase seiv1alpha1.SeiNodePhase) { + if node.Status.Phase == observedPhase { + return + } + ns, name := node.Namespace, node.Name + nodePhaseTransitions.Add(ctx, 1, + metric.WithAttributes( + observability.AttrController.String(seiNodeControllerName), + observability.AttrNamespace.String(ns), + observability.AttrFromPhase.String(string(observedPhase)), + observability.AttrToPhase.String(string(node.Status.Phase)), + ), + ) + emitNodePhase(ns, name, node.Status.Phase) + r.Recorder.Eventf(node, corev1.EventTypeNormal, "PhaseTransition", + "Phase changed from %s to %s", observedPhase, node.Status.Phase) + + // Record time spent in the previous phase. + if node.Status.PhaseTransitionTime != nil && observedPhase != "" { + dur := time.Since(node.Status.PhaseTransitionTime.Time).Seconds() + nodePhaseDuration.Record(ctx, dur, + metric.WithAttributes( + observability.AttrNamespace.String(ns), + observability.AttrChainID.String(node.Spec.ChainID), + observability.AttrPhase.String(string(observedPhase)), + ), + ) + } +} + // SetupWithManager sets up the controller with the Manager. func (r *SeiNodeReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). diff --git a/internal/controller/node/statesync.go b/internal/controller/node/statesync.go index 68a40928..4764fee9 100644 --- a/internal/controller/node/statesync.go +++ b/internal/controller/node/statesync.go @@ -32,22 +32,22 @@ const minCanonicalSyncers = 2 // lockdown that fixes the trust-root namespace. // +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch -// gateStateSync runs the state-sync gate and, when the node fails closed, -// flushes the StateSyncReady condition via the caller's flushStatus closure +// enforceStateSyncGate is the fail-closed half of the state-sync gate. It runs +// AFTER reconcileStateSyncGate has resolved the always-present StateSyncReady +// condition and returned `ready`. When the node is not ready and has no active +// plan, it flushes the resolved condition via the caller's flushStatus closure // (preserving the single-patch / optimistic-lock model), emits a Warning Event, -// and signals the reconciler to stop before ResolvePlan. stop=true means the -// plan loop must short-circuit; an already-active plan is left to run to -// completion (we don't yank a state-sync plan mid-execution on a transient -// ConfigMap blip — the gate re-asserts once the plan goes terminal). -func (r *SeiNodeReconciler) gateStateSync( - ctx context.Context, +// and signals the reconciler to stop before ResolvePlan so a fail-closed node +// never builds the state-sync-bearing plan. stop=true means the plan loop must +// short-circuit; an already-active plan is left to run to completion (we don't +// yank a state-sync plan mid-execution on a transient ConfigMap blip — the gate +// re-asserts once the plan goes terminal). It makes no API calls itself, so it +// takes no context — resolution (the ConfigMap read) already happened. +func (r *SeiNodeReconciler) enforceStateSyncGate( node *seiv1alpha1.SeiNode, + ready bool, flushStatus func() error, ) (_ ctrl.Result, stop bool, _ error) { - ready, err := r.reconcileStateSyncGate(ctx, node) - if err != nil { - return ctrl.Result{}, true, fmt.Errorf("reconciling state-sync gate: %w", err) - } planActive := node.Status.Plan != nil && node.Status.Plan.Phase == seiv1alpha1.TaskPlanActive if ready || planActive { return ctrl.Result{}, false, nil @@ -62,9 +62,12 @@ func (r *SeiNodeReconciler) gateStateSync( } // reconcileStateSyncGate resolves the canonical-syncer set for a state-sync -// node and sets ConditionStateSyncReady accordingly. It mutates node.Status -// in-memory only — the condition and ResolvedStateSyncers are flushed by the -// caller's single optimistic-lock status patch, never a separate write. +// node and sets the always-present ConditionStateSyncReady accordingly. It +// mutates node.Status in-memory only — the condition and ResolvedStateSyncers +// are flushed by the caller's single optimistic-lock status patch, never a +// separate write. It is the resolution half of the gate: the caller runs it +// before the Failed/Paused early-returns so StateSyncReady is seeded on every +// path, then feeds `ready` into enforceStateSyncGate for fail-closed handling. // // It returns true ("ready to plan") when the state-sync-bearing plan may // proceed: either state-sync is disabled (NotApplicable — the plan has no diff --git a/internal/controller/node/statesync_test.go b/internal/controller/node/statesync_test.go index 2588cbef..66f217c2 100644 --- a/internal/controller/node/statesync_test.go +++ b/internal/controller/node/statesync_test.go @@ -282,6 +282,56 @@ func TestReconcile_StateSyncFailClosed_NoPlanBuilt(t *testing.T) { g.Expect(sawWarning).To(BeTrue(), "expected a StateSyncBlocked Warning Event") } +// Full reconcile path: a paused node still gets the always-present +// StateSyncReady condition seeded, even though reconcile returns early on +// spec.paused before the gate enforcement. State-sync enabled with no syncers +// resolves to False/NoSyncersConfigured; state-sync disabled resolves to +// False/NotApplicable. Either way the condition must be present — and a paused +// node must build NO plan (pause semantics preserved). +func TestReconcile_PausedNode_StateSyncReadyStillSeeded(t *testing.T) { + cases := []struct { + name string + node *seiv1alpha1.SeiNode + withCM bool + wantReason string + }{ + { + name: "state-sync enabled, no syncers", + node: stateSyncNode("paused-ss", "default"), + withCM: true, + wantReason: seiv1alpha1.ReasonStateSyncNoSyncersConfigured, + }, + { + name: "state-sync disabled", + node: newSnapshotNode("paused-s3", "default"), + withCM: false, + wantReason: seiv1alpha1.ReasonStateSyncNotApplicable, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + tc.node.Spec.Paused = true + r, c := newNodeReconciler(t, tc.node) + if tc.withCM { + withSyncerConfigMap(r) + } + + _, err := r.Reconcile(ctx, nodeReqFor(tc.node.Name, "default")) + g.Expect(err).NotTo(HaveOccurred()) + + fetched := getSeiNode(t, ctx, c, tc.node.Name, "default") + cond := stateSyncCondition(fetched) + g.Expect(cond).NotTo(BeNil(), "StateSyncReady must be present even on a paused node") + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(tc.wantReason)) + g.Expect(fetched.Status.Plan).To(BeNil(), "a paused node must build no plan") + }) + } +} + // Full reconcile path: a state-sync node with >=2 syncers proceeds — a plan is // built carrying the canonical syncers, and StateSyncReady is True. func TestReconcile_StateSyncReady_BuildsPlanWithSyncers(t *testing.T) { From 66b7712c1b534f97d1bebecdd08ad0646bbe7f7b Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 12 Jun 2026 07:49:38 -0700 Subject: [PATCH 3/8] =?UTF-8?q?docs(statesync):=20descope=20sidecar=20trus?= =?UTF-8?q?t-hardening=20=E2=80=94=20reliability=20via=20curation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per owner direction, drop the planned seictl ≥2-agreement/equivocation hardening as over-engineering. The controller already solves the real problem: state-sync witnesses come from a curated canonical-syncer ConfigMap (seeded per cluster) instead of label-matched peers, decoupling peers from state-sync and removing the sharp edge of using fine-as-peers-but-invalid-as-witnesses nodes. The sidecar keeps its existing trust-pinning behavior; reliability comes from curating the syncer set, not cross-witness checking. Comment-only. Co-Authored-By: Claude Opus 4.8 --- api/v1alpha1/seinode_types.go | 8 ++++---- internal/controller/node/statesync.go | 7 ++++--- internal/planner/planner.go | 6 +++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index a22d50b2..ac18312e 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -292,10 +292,10 @@ const ( // ConditionStateSyncReady gates the state-sync-bearing plan. Always-present // once reconciled. True means canonical syncers are configured and the plan - // may proceed; False fails closed (no state-sync plan built). Its semantics - // are "canonical syncers configured + plan may proceed", NOT "trust - // hardened" — the >=2 byte-for-byte agreement check is the sidecar's job and - // lands with the seictl bump; this condition is a configured-count gate only. + // may proceed; False fails closed (no state-sync plan built, and peers are + // never used as witnesses). It is a configured-count gate: witness + // reliability comes from curating the canonical-syncer set, and the sidecar + // establishes the trust point from them as it does today. ConditionStateSyncReady = "StateSyncReady" ) diff --git a/internal/controller/node/statesync.go b/internal/controller/node/statesync.go index 4764fee9..1ad9b4e0 100644 --- a/internal/controller/node/statesync.go +++ b/internal/controller/node/statesync.go @@ -18,9 +18,10 @@ import ( // minCanonicalSyncers is the controller-side fail-closed floor: state-sync // requires at least two configured canonical syncers before the -// state-sync-bearing plan may proceed. This is a CONFIGURED-COUNT check; the -// >=2 byte-for-byte agreement check is the sidecar's job and lands with the -// seictl trust-hardening bump. +// state-sync-bearing plan may proceed (CometBFT needs >=2 rpc-servers, and we +// never fall back to peers as witnesses). Reliability comes from curating the +// canonical-syncer set, not from cross-witness checking — the sidecar keeps its +// existing trust-pinning behavior. const minCanonicalSyncers = 2 // The controller only READS the canonical-syncer ConfigMap. The read-only diff --git a/internal/planner/planner.go b/internal/planner/planner.go index d5073b57..c8e102d3 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -658,9 +658,9 @@ func configureStateSyncTask(node *seiv1alpha1.SeiNode) sidecar.ConfigureStateSyn snap := node.Spec.SnapshotSource() // RpcServers come from the controller-level canonical-syncer ConfigMap, // resolved by the StateSyncReady gate into Status.ResolvedStateSyncers - // (replacing the label-derived ResolvedRPCWitnesses path). The gate - // guarantees >=2 entries here; the >=2 byte-for-byte agreement check is the - // sidecar's job and lands with the seictl trust-hardening bump. + // (replacing the label-derived ResolvedRPCWitnesses path — peers are no + // longer used as witnesses). The gate guarantees >=2 curated entries here; + // the sidecar establishes the trust point from them as it does today. t := sidecar.ConfigureStateSyncTask{ UseLocalSnapshot: hasS3Snapshot(snap), RpcServers: node.Status.ResolvedStateSyncers, From 4a9a17ceca6bd3fa8ae2403422cefcdff01de824 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 12 Jun 2026 10:47:23 -0700 Subject: [PATCH 4/8] test(statesync): hoist repeated literals to consts (goconst) The new statesync_test.go tripped goconst on the main-targeted lint (default, sei:latest, a:26657/b:26657, sei-test). Hoist them to test consts; the rest of the package keeps its existing literals (unchanged, grandfathered). Co-Authored-By: Claude Opus 4.8 --- internal/controller/node/statesync_test.go | 47 ++++++++++++---------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/internal/controller/node/statesync_test.go b/internal/controller/node/statesync_test.go index 66f217c2..1e689416 100644 --- a/internal/controller/node/statesync_test.go +++ b/internal/controller/node/statesync_test.go @@ -20,6 +20,11 @@ const ( testSyncerCMName = "canonical-syncers" testSyncerCMNS = "sei-platform" testChainID = "arctic-1" + testNamespace = "default" + testImage = "sei:latest" + testNodeName = "sei-test" + syncerA = "a:26657" + syncerB = "b:26657" ) func syncerConfigMap(data map[string]string) *corev1.ConfigMap { @@ -32,10 +37,10 @@ func syncerConfigMap(data map[string]string) *corev1.ConfigMap { // stateSyncNode returns a FullNode with state sync enabled on the given chain. func stateSyncNode(name, chainID string) *seiv1alpha1.SeiNode { return &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "default"}, + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: testNamespace}, Spec: seiv1alpha1.SeiNodeSpec{ ChainID: chainID, - Image: "sei:latest", + Image: testImage, FullNode: &seiv1alpha1.FullNodeSpec{ Snapshot: &seiv1alpha1.SnapshotSource{StateSync: &seiv1alpha1.StateSyncSource{}}, }, @@ -60,10 +65,10 @@ func TestParseSyncerList(t *testing.T) { }{ {"empty", "", nil}, {"whitespace only", " \n\t ", nil}, - {"newline separated", "b:26657\na:26657", []string{"a:26657", "b:26657"}}, - {"comma separated", "b:26657,a:26657", []string{"a:26657", "b:26657"}}, - {"mixed with blanks", "a:26657,\n b:26657 ,,\n", []string{"a:26657", "b:26657"}}, - {"dedup", "a:26657\na:26657\nb:26657", []string{"a:26657", "b:26657"}}, + {"newline separated", "b:26657\na:26657", []string{syncerA, syncerB}}, + {"comma separated", "b:26657,a:26657", []string{syncerA, syncerB}}, + {"mixed with blanks", "a:26657,\n b:26657 ,,\n", []string{syncerA, syncerB}}, + {"dedup", "a:26657\na:26657\nb:26657", []string{syncerA, syncerB}}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { @@ -169,7 +174,7 @@ func TestStateSyncGate_Disabled_NotApplicable(t *testing.T) { ctx := context.Background() // S3 snapshot node: state sync not enabled. - node := newSnapshotNode("n", "default") + node := newSnapshotNode("n", testNamespace) r, _ := newNodeReconciler(t, node) withSyncerConfigMap(r) @@ -190,10 +195,10 @@ func TestStateSyncGate_NoSnapshotSource_NotApplicable(t *testing.T) { ctx := context.Background() node := &seiv1alpha1.SeiNode{ - ObjectMeta: metav1.ObjectMeta{Name: "n", Namespace: "default"}, + ObjectMeta: metav1.ObjectMeta{Name: "n", Namespace: testNamespace}, Spec: seiv1alpha1.SeiNodeSpec{ - ChainID: "sei-test", - Image: "sei:latest", + ChainID: testNodeName, + Image: testImage, FullNode: &seiv1alpha1.FullNodeSpec{}, }, } @@ -230,7 +235,7 @@ func TestStateSyncGate_NonStateSyncNodeUnaffected(t *testing.T) { g := NewWithT(t) ctx := context.Background() - node := newSnapshotNode("n", "default") + node := newSnapshotNode("n", testNamespace) r, _ := newNodeReconciler(t, node) ready, err := r.reconcileStateSyncGate(ctx, node) @@ -249,7 +254,7 @@ func TestReconcile_StateSyncFailClosed_NoPlanBuilt(t *testing.T) { g := NewWithT(t) ctx := context.Background() - node := stateSyncNode("ss-0", "default") + node := stateSyncNode("ss-0", testNamespace) node.Spec.ChainID = testChainID cm := syncerConfigMap(map[string]string{testChainID: "only-one:26657"}) r, c := newNodeReconciler(t, node, cm) @@ -257,10 +262,10 @@ func TestReconcile_StateSyncFailClosed_NoPlanBuilt(t *testing.T) { r.Recorder = rec withSyncerConfigMap(r) - _, err := r.Reconcile(ctx, nodeReqFor("ss-0", "default")) + _, err := r.Reconcile(ctx, nodeReqFor("ss-0", testNamespace)) g.Expect(err).NotTo(HaveOccurred()) - fetched := getSeiNode(t, ctx, c, "ss-0", "default") + fetched := getSeiNode(t, ctx, c, "ss-0", testNamespace) g.Expect(fetched.Status.Plan).To(BeNil(), "no state-sync plan must be built when fail-closed") cond := stateSyncCondition(fetched) g.Expect(cond).NotTo(BeNil()) @@ -297,13 +302,13 @@ func TestReconcile_PausedNode_StateSyncReadyStillSeeded(t *testing.T) { }{ { name: "state-sync enabled, no syncers", - node: stateSyncNode("paused-ss", "default"), + node: stateSyncNode("paused-ss", testNamespace), withCM: true, wantReason: seiv1alpha1.ReasonStateSyncNoSyncersConfigured, }, { name: "state-sync disabled", - node: newSnapshotNode("paused-s3", "default"), + node: newSnapshotNode("paused-s3", testNamespace), withCM: false, wantReason: seiv1alpha1.ReasonStateSyncNotApplicable, }, @@ -319,10 +324,10 @@ func TestReconcile_PausedNode_StateSyncReadyStillSeeded(t *testing.T) { withSyncerConfigMap(r) } - _, err := r.Reconcile(ctx, nodeReqFor(tc.node.Name, "default")) + _, err := r.Reconcile(ctx, nodeReqFor(tc.node.Name, testNamespace)) g.Expect(err).NotTo(HaveOccurred()) - fetched := getSeiNode(t, ctx, c, tc.node.Name, "default") + fetched := getSeiNode(t, ctx, c, tc.node.Name, testNamespace) cond := stateSyncCondition(fetched) g.Expect(cond).NotTo(BeNil(), "StateSyncReady must be present even on a paused node") g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) @@ -338,7 +343,7 @@ func TestReconcile_StateSyncReady_BuildsPlanWithSyncers(t *testing.T) { g := NewWithT(t) ctx := context.Background() - node := stateSyncNode("ss-1", "default") + node := stateSyncNode("ss-1", testNamespace) node.Spec.ChainID = testChainID cm := syncerConfigMap(map[string]string{ testChainID: "a.arctic-1.example.com:26657\nb.arctic-1.example.com:26657", @@ -346,10 +351,10 @@ func TestReconcile_StateSyncReady_BuildsPlanWithSyncers(t *testing.T) { r, c := newNodeReconciler(t, node, cm) withSyncerConfigMap(r) - _, err := r.Reconcile(ctx, nodeReqFor("ss-1", "default")) + _, err := r.Reconcile(ctx, nodeReqFor("ss-1", testNamespace)) g.Expect(err).NotTo(HaveOccurred()) - fetched := getSeiNode(t, ctx, c, "ss-1", "default") + fetched := getSeiNode(t, ctx, c, "ss-1", testNamespace) g.Expect(fetched.Status.Plan).NotTo(BeNil(), "a plan must be built when state-sync is ready") g.Expect(findPlannedTask(fetched.Status.Plan, planner.TaskConfigureStateSync)). NotTo(BeNil(), "plan must carry the configure-state-sync task") From 47308651761ee0a5aadf31fa40652952804a8f2a Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 12 Jun 2026 11:11:25 -0700 Subject: [PATCH 5/8] fix(statesync): gate plan construction, not the reconcile (Cursor review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the fail-closed state-sync gate from a reconcile short-circuit to a plan-construction check, fixing three Cursor findings: - HIGH: terminal plans now always clear — handleTerminalPlan runs before the block check, so a fail-closed node no longer strands Complete/Failed plans. - MED: a transient canonical-syncer ConfigMap read error is non-fatal — it sets StateSyncReady=Unknown/ConfigMapReadError, clears ResolvedStateSyncers, and requeues, instead of aborting StatefulSet/Paused/Failed/flush handling. - MED: StateSyncBlocked fires once per transition, not every requeue. enforceStateSyncGate is deleted; stateSyncBlocksPlan declines a state-sync plan when StateSyncReady != True (init path only — running update plans carry no state-sync task). Fail-closed correctness is preserved: never builds with <2 or stale/unresolved syncers. Co-Authored-By: Claude Opus 4.8 --- api/v1alpha1/seinode_types.go | 5 + internal/controller/node/controller.go | 55 +++++-- internal/controller/node/reconciler_test.go | 46 ++++++ internal/controller/node/statesync.go | 66 +++----- internal/controller/node/statesync_test.go | 168 ++++++++++++++++---- internal/planner/planner.go | 40 ++++- 6 files changed, 287 insertions(+), 93 deletions(-) diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index ac18312e..c4623c83 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -309,6 +309,11 @@ const ( ReasonStateSyncNoSyncersConfigured = "NoSyncersConfigured" // ReasonStateSyncNotApplicable: the node does not enable state-sync. ReasonStateSyncNotApplicable = "NotApplicable" + // ReasonStateSyncConfigMapReadError: the canonical-syncer ConfigMap read + // failed for a non-NotFound reason (transient API error). Fails closed and + // requeues; the rest of the reconcile (StatefulSet, Failed/Paused handling, + // status flush) still runs. + ReasonStateSyncConfigMapReadError = "ConfigMapReadError" ) // Reasons for the ImportPVCReady condition. diff --git a/internal/controller/node/controller.go b/internal/controller/node/controller.go index 9e890b44..2b5b7585 100644 --- a/internal/controller/node/controller.go +++ b/internal/controller/node/controller.go @@ -97,6 +97,7 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct statusBase := client.MergeFromWithOptions(before, client.MergeFromWithOptimisticLock{}) observedPhase := node.Status.Phase prevSidecar := apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionSidecarReady) + prevStateSync := apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionStateSyncReady) setNodePausedCondition(node) @@ -110,13 +111,11 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // Resolve the always-present StateSyncReady condition before the Failed and // Paused early-returns so it rides the existing flush on every path (Failed // flush, Paused flush, and the normal end-of-reconcile patch) — no separate - // status write. The fail-closed enforcement happens later, on the active - // path only, via enforceStateSyncGate using the `ready` resolved here (one - // ConfigMap read per reconcile). - stateSyncReady, err := r.reconcileStateSyncGate(ctx, node) - if err != nil { - return ctrl.Result{}, fmt.Errorf("resolving state-sync gate: %w", err) - } + // status write. Fail-closed enforcement lives in ResolvePlan, which declines + // to build a state-sync plan when this condition isn't True; that keeps + // terminal-plan cleanup and non-state-sync work running. A transient + // ConfigMap read error requeues without aborting the steps below. + stateSyncTransient := r.reconcileStateSyncGate(ctx, node) // Failed is terminal — flush any condition updates and exit. if node.Status.Phase == seiv1alpha1.PhaseFailed { @@ -143,21 +142,16 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, fmt.Errorf("reconciling peers: %w", err) } - // Enforce the state-sync gate before ResolvePlan so a fail-closed node never - // builds the state-sync-bearing plan. Uses the `ready` resolved above (no - // re-read of the ConfigMap). stop=true short-circuits the reconcile; the - // gate flushes via the existing flushStatus closure (single optimistic-lock - // patch, no separate write). - if res, stop, err := r.enforceStateSyncGate(node, stateSyncReady, flushStatus); stop { - return res, err - } - planAlreadyActive := node.Status.Plan != nil && node.Status.Plan.Phase == seiv1alpha1.TaskPlanActive + // ResolvePlan runs unconditionally: it clears terminal plans and drives + // non-state-sync work. Its internal fail-closed gate declines to build a + // state-sync plan when StateSyncReady isn't True. if err := r.Planner.ResolvePlan(ctx, node); err != nil { return ctrl.Result{}, fmt.Errorf("resolving plan: %w", err) } r.emitSidecarReadinessEvent(node, prevSidecar) + r.emitStateSyncBlockedEvent(node, prevStateSync) var result ctrl.Result var execErr error @@ -191,6 +185,12 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{RequeueAfter: statusPollInterval}, nil } + // A transient canonical-syncer ConfigMap read failed closed without a plan + // to drive a requeue; poll so the gate re-resolves once the API recovers. + if stateSyncTransient && result.IsZero() { + return ctrl.Result{RequeueAfter: statusPollInterval}, nil + } + return result, nil } @@ -327,3 +327,26 @@ func (r *SeiNodeReconciler) emitSidecarReadinessEvent(node *seiv1alpha1.SeiNode, "sidecar Healthz returned 200; mark-ready gate is open") } } + +// emitStateSyncBlockedEvent fires a StateSyncBlocked Warning once, on the +// transition into fail-closed (StateSyncReady leaving True/absent for a +// fail-closed reason) — not on every requeue. NotApplicable (state-sync +// disabled) never trips it. +func (r *SeiNodeReconciler) emitStateSyncBlockedEvent(node *seiv1alpha1.SeiNode, prev *metav1.Condition) { + cur := apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionStateSyncReady) + if cur == nil || cur.Status == metav1.ConditionTrue { + return + } + blockedReason := cur.Reason == seiv1alpha1.ReasonStateSyncNoSyncersConfigured || + cur.Reason == seiv1alpha1.ReasonStateSyncConfigMapReadError + if !blockedReason { + return + } + // Transition = previously True, absent, or a different (non-blocked) reason. + if prev != nil && prev.Status == cur.Status && prev.Reason == cur.Reason { + return + } + r.Recorder.Eventf(node, corev1.EventTypeWarning, "StateSyncBlocked", + "state sync enabled but not ready for chain %q (%s); not building plan", + node.Spec.ChainID, cur.Reason) +} diff --git a/internal/controller/node/reconciler_test.go b/internal/controller/node/reconciler_test.go index 6f547bea..e40f81c6 100644 --- a/internal/controller/node/reconciler_test.go +++ b/internal/controller/node/reconciler_test.go @@ -15,6 +15,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" "github.com/sei-protocol/sei-k8s-controller/internal/noderesource" @@ -73,6 +74,51 @@ func newNodeReconcilerWithSidecar(t *testing.T, mock *mockSidecarClient, objs .. return r, c } +// newNodeReconcilerWithGetError builds a reconciler whose client returns getErr +// (when non-nil) for the matching key on Get, used to exercise transient +// API-error handling. Other operations pass through to the fake client. +func newNodeReconcilerWithGetError(t *testing.T, node *seiv1alpha1.SeiNode, getErr func(client.ObjectKey) error) (*SeiNodeReconciler, client.Client) { + t.Helper() + s := newNodeTestScheme(t) + c := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(node). + WithStatusSubresource(&seiv1alpha1.SeiNode{}). + WithInterceptorFuncs(interceptor.Funcs{ + Get: func(ctx context.Context, cl client.WithWatch, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { + if err := getErr(key); err != nil { + return err + } + return cl.Get(ctx, key, obj, opts...) + }, + }). + Build() + mock := &mockSidecarClient{nodeID: "mock-node-id"} + r := &SeiNodeReconciler{ + Client: c, + Scheme: s, + Recorder: record.NewFakeRecorder(100), + Platform: platformtest.Config(), + Planner: &planner.NodeResolver{ + BuildSidecarClient: func(_ *seiv1alpha1.SeiNode) (task.SidecarClient, error) { return mock, nil }, + Platform: platformtest.Config(), + }, + PlanExecutor: &planner.Executor[*seiv1alpha1.SeiNode]{ + ConfigFor: func(_ context.Context, n *seiv1alpha1.SeiNode) task.ExecutionConfig { + return task.ExecutionConfig{ + BuildSidecarClient: func() (task.SidecarClient, error) { return mock, nil }, + KubeClient: c, + APIReader: c, + Scheme: s, + Resource: n, + Platform: platformtest.Config(), + } + }, + }, + } + return r, c +} + func nodeReqFor(name, namespace string) ctrl.Request { //nolint:unparam // test helper designed for reuse return ctrl.Request{NamespacedName: types.NamespacedName{Name: name, Namespace: namespace}} } diff --git a/internal/controller/node/statesync.go b/internal/controller/node/statesync.go index 1ad9b4e0..1603849e 100644 --- a/internal/controller/node/statesync.go +++ b/internal/controller/node/statesync.go @@ -11,7 +11,6 @@ import ( apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - ctrl "sigs.k8s.io/controller-runtime" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" ) @@ -33,62 +32,41 @@ const minCanonicalSyncers = 2 // lockdown that fixes the trust-root namespace. // +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch -// enforceStateSyncGate is the fail-closed half of the state-sync gate. It runs -// AFTER reconcileStateSyncGate has resolved the always-present StateSyncReady -// condition and returned `ready`. When the node is not ready and has no active -// plan, it flushes the resolved condition via the caller's flushStatus closure -// (preserving the single-patch / optimistic-lock model), emits a Warning Event, -// and signals the reconciler to stop before ResolvePlan so a fail-closed node -// never builds the state-sync-bearing plan. stop=true means the plan loop must -// short-circuit; an already-active plan is left to run to completion (we don't -// yank a state-sync plan mid-execution on a transient ConfigMap blip — the gate -// re-asserts once the plan goes terminal). It makes no API calls itself, so it -// takes no context — resolution (the ConfigMap read) already happened. -func (r *SeiNodeReconciler) enforceStateSyncGate( - node *seiv1alpha1.SeiNode, - ready bool, - flushStatus func() error, -) (_ ctrl.Result, stop bool, _ error) { - planActive := node.Status.Plan != nil && node.Status.Plan.Phase == seiv1alpha1.TaskPlanActive - if ready || planActive { - return ctrl.Result{}, false, nil - } - if err := flushStatus(); err != nil { - return ctrl.Result{}, true, fmt.Errorf("flushing state-sync gate status: %w", err) - } - r.Recorder.Eventf(node, corev1.EventTypeWarning, "StateSyncBlocked", - "state sync enabled but <%d canonical syncers configured for chain %q; not building plan", - minCanonicalSyncers, node.Spec.ChainID) - return ctrl.Result{RequeueAfter: statusPollInterval}, true, nil -} - // reconcileStateSyncGate resolves the canonical-syncer set for a state-sync // node and sets the always-present ConditionStateSyncReady accordingly. It // mutates node.Status in-memory only — the condition and ResolvedStateSyncers // are flushed by the caller's single optimistic-lock status patch, never a -// separate write. It is the resolution half of the gate: the caller runs it -// before the Failed/Paused early-returns so StateSyncReady is seeded on every -// path, then feeds `ready` into enforceStateSyncGate for fail-closed handling. +// separate write. The caller runs it before the Failed/Paused early-returns so +// StateSyncReady is seeded on every path. +// +// Fail-closed is enforced downstream, not here: the planner declines to build a +// state-sync plan whenever StateSyncReady is not True (see ResolvePlan). That +// keeps ResolvePlan running on every reconcile, so handleTerminalPlan still +// clears terminal plans and non-state-sync work proceeds — this method only +// resolves the condition. // -// It returns true ("ready to plan") when the state-sync-bearing plan may -// proceed: either state-sync is disabled (NotApplicable — the plan has no -// state-sync task to gate) or >=2 canonical syncers are configured (Ready). -// It returns false to fail closed: state-sync is enabled but <2 syncers are -// configured (NoSyncersConfigured), in which case the caller must skip -// ResolvePlan, emit a Warning Event, and requeue. -func (r *SeiNodeReconciler) reconcileStateSyncGate(ctx context.Context, node *seiv1alpha1.SeiNode) (bool, error) { +// A transient (non-NotFound) ConfigMap read error is NOT fatal: it sets +// StateSyncReady=Unknown/ConfigMapReadError and returns transient=true so the +// caller requeues without aborting the StatefulSet/Failed/Paused/flush path. A +// missing ConfigMap or <2 entries fails closed via False/NoSyncersConfigured. +func (r *SeiNodeReconciler) reconcileStateSyncGate(ctx context.Context, node *seiv1alpha1.SeiNode) (transient bool) { snap := node.Spec.SnapshotSource() if snap == nil || snap.StateSync == nil { // State-sync disabled: no state-sync task in the plan to gate. node.Status.ResolvedStateSyncers = nil setStateSyncReady(node, metav1.ConditionFalse, seiv1alpha1.ReasonStateSyncNotApplicable, "node does not enable state sync") - return true, nil + return false } syncers, err := r.canonicalSyncers(ctx, node.Spec.ChainID) if err != nil { - return false, fmt.Errorf("reading canonical-syncer ConfigMap: %w", err) + // Transient API error: fail closed (clear any stale set) but don't abort + // the reconcile. Requeue and re-read next tick. + node.Status.ResolvedStateSyncers = nil + setStateSyncReady(node, metav1.ConditionUnknown, seiv1alpha1.ReasonStateSyncConfigMapReadError, + fmt.Sprintf("reading canonical-syncer ConfigMap for chain %q: %v", node.Spec.ChainID, err)) + return true } if len(syncers) < minCanonicalSyncers { @@ -99,7 +77,7 @@ func (r *SeiNodeReconciler) reconcileStateSyncGate(ctx context.Context, node *se setStateSyncReady(node, metav1.ConditionFalse, seiv1alpha1.ReasonStateSyncNoSyncersConfigured, fmt.Sprintf("state sync requires >=%d canonical syncers configured for chain %q; found %d", minCanonicalSyncers, node.Spec.ChainID, len(syncers))) - return false, nil + return false } if !slices.Equal(node.Status.ResolvedStateSyncers, syncers) { @@ -107,7 +85,7 @@ func (r *SeiNodeReconciler) reconcileStateSyncGate(ctx context.Context, node *se } setStateSyncReady(node, metav1.ConditionTrue, seiv1alpha1.ReasonStateSyncReady, fmt.Sprintf("%d canonical syncers configured for chain %q", len(syncers), node.Spec.ChainID)) - return true, nil + return false } // canonicalSyncers reads the canonical-syncer ConfigMap and returns the parsed diff --git a/internal/controller/node/statesync_test.go b/internal/controller/node/statesync_test.go index 1e689416..57d22c88 100644 --- a/internal/controller/node/statesync_test.go +++ b/internal/controller/node/statesync_test.go @@ -7,10 +7,14 @@ import ( "testing" . "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" "github.com/sei-protocol/sei-k8s-controller/internal/planner" @@ -25,6 +29,7 @@ const ( testNodeName = "sei-test" syncerA = "a:26657" syncerB = "b:26657" + syncerSingle = "only-one:26657" ) func syncerConfigMap(data map[string]string) *corev1.ConfigMap { @@ -88,9 +93,8 @@ func TestStateSyncGate_EnabledWithTwoSyncers_Ready(t *testing.T) { r, _ := newNodeReconciler(t, node, cm) withSyncerConfigMap(r) - ready, err := r.reconcileStateSyncGate(ctx, node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(ready).To(BeTrue()) + transient := r.reconcileStateSyncGate(ctx, node) + g.Expect(transient).To(BeFalse()) cond := stateSyncCondition(node) g.Expect(cond).NotTo(BeNil()) @@ -113,9 +117,8 @@ func TestStateSyncGate_EnabledWithOneSyncer_FailsClosed(t *testing.T) { r, _ := newNodeReconciler(t, node, cm) withSyncerConfigMap(r) - ready, err := r.reconcileStateSyncGate(ctx, node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(ready).To(BeFalse()) + transient := r.reconcileStateSyncGate(ctx, node) + g.Expect(transient).To(BeFalse()) cond := stateSyncCondition(node) g.Expect(cond).NotTo(BeNil()) @@ -132,9 +135,8 @@ func TestStateSyncGate_EnabledMissingConfigMap_FailsClosed(t *testing.T) { r, _ := newNodeReconciler(t, node) // no ConfigMap object withSyncerConfigMap(r) - ready, err := r.reconcileStateSyncGate(ctx, node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(ready).To(BeFalse()) + transient := r.reconcileStateSyncGate(ctx, node) + g.Expect(transient).To(BeFalse()) cond := stateSyncCondition(node) g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) @@ -150,9 +152,8 @@ func TestStateSyncGate_EnabledNoChainEntry_FailsClosed(t *testing.T) { r, _ := newNodeReconciler(t, node, cm) withSyncerConfigMap(r) - ready, err := r.reconcileStateSyncGate(ctx, node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(ready).To(BeFalse()) + transient := r.reconcileStateSyncGate(ctx, node) + g.Expect(transient).To(BeFalse()) g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) } @@ -163,9 +164,8 @@ func TestStateSyncGate_UnconfiguredConfigMapRef_FailsClosed(t *testing.T) { node := stateSyncNode("n", testChainID) r, _ := newNodeReconciler(t, node) // Platform leaves the ConfigMap name empty. - ready, err := r.reconcileStateSyncGate(ctx, node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(ready).To(BeFalse()) + transient := r.reconcileStateSyncGate(ctx, node) + g.Expect(transient).To(BeFalse()) g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) } @@ -178,9 +178,8 @@ func TestStateSyncGate_Disabled_NotApplicable(t *testing.T) { r, _ := newNodeReconciler(t, node) withSyncerConfigMap(r) - ready, err := r.reconcileStateSyncGate(ctx, node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(ready).To(BeTrue()) + transient := r.reconcileStateSyncGate(ctx, node) + g.Expect(transient).To(BeFalse()) cond := stateSyncCondition(node) g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) @@ -205,9 +204,8 @@ func TestStateSyncGate_NoSnapshotSource_NotApplicable(t *testing.T) { r, _ := newNodeReconciler(t, node) withSyncerConfigMap(r) - ready, err := r.reconcileStateSyncGate(ctx, node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(ready).To(BeTrue()) + transient := r.reconcileStateSyncGate(ctx, node) + g.Expect(transient).To(BeFalse()) g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNotApplicable)) } @@ -223,9 +221,8 @@ func TestStateSyncGate_FailClosedClearsStaleSyncers(t *testing.T) { withSyncerConfigMap(r) r.Platform.StateSyncSyncersConfigMap = "" // force unconfigured - ready, err := r.reconcileStateSyncGate(ctx, node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(ready).To(BeFalse()) + transient := r.reconcileStateSyncGate(ctx, node) + g.Expect(transient).To(BeFalse()) g.Expect(node.Status.ResolvedStateSyncers).To(BeNil()) } @@ -238,9 +235,8 @@ func TestStateSyncGate_NonStateSyncNodeUnaffected(t *testing.T) { node := newSnapshotNode("n", testNamespace) r, _ := newNodeReconciler(t, node) - ready, err := r.reconcileStateSyncGate(ctx, node) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(ready).To(BeTrue()) + transient := r.reconcileStateSyncGate(ctx, node) + g.Expect(transient).To(BeFalse()) g.Expect(node.Status.ResolvedStateSyncers).To(BeEmpty()) g.Expect(slices.Contains([]string{ seiv1alpha1.ReasonStateSyncReady, seiv1alpha1.ReasonStateSyncNoSyncersConfigured, @@ -256,7 +252,7 @@ func TestReconcile_StateSyncFailClosed_NoPlanBuilt(t *testing.T) { node := stateSyncNode("ss-0", testNamespace) node.Spec.ChainID = testChainID - cm := syncerConfigMap(map[string]string{testChainID: "only-one:26657"}) + cm := syncerConfigMap(map[string]string{testChainID: syncerSingle}) r, c := newNodeReconciler(t, node, cm) rec := record.NewFakeRecorder(10) r.Recorder = rec @@ -337,6 +333,122 @@ func TestReconcile_PausedNode_StateSyncReadyStillSeeded(t *testing.T) { } } +// Fail-closed must NOT block terminal-plan cleanup. A state-sync node with <2 +// syncers and a terminal plan on status must have that plan cleared +// (handleTerminalPlan runs inside ResolvePlan, which now runs unconditionally) +// while still building NO new state-sync plan. +func TestReconcile_StateSyncFailClosed_ClearsTerminalPlan(t *testing.T) { + cases := []struct { + name string + phase seiv1alpha1.TaskPlanPhase + }{ + {"complete", seiv1alpha1.TaskPlanComplete}, + {"failed", seiv1alpha1.TaskPlanFailed}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("ss-term", testNamespace) + node.Spec.ChainID = testChainID + node.Status.Phase = seiv1alpha1.PhaseInitializing + node.Status.Plan = &seiv1alpha1.TaskPlan{ + ID: "stale-plan", + Phase: tc.phase, + Tasks: []seiv1alpha1.PlannedTask{{Type: planner.TaskConfigApply, Status: seiv1alpha1.TaskComplete}}, + } + cm := syncerConfigMap(map[string]string{testChainID: syncerSingle}) + r, c := newNodeReconciler(t, node, cm) + withSyncerConfigMap(r) + + _, err := r.Reconcile(ctx, nodeReqFor("ss-term", testNamespace)) + g.Expect(err).NotTo(HaveOccurred()) + + fetched := getSeiNode(t, ctx, c, "ss-term", testNamespace) + g.Expect(fetched.Status.Plan).To(BeNil(), + "terminal plan must be cleared even when fail-closed") + cond := stateSyncCondition(fetched) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) + }) + } +} + +// A non-NotFound ConfigMap read error must be transient: reconcileStatefulSet, +// Paused/Failed handling, and the status flush still run, the reconcile +// requeues instead of hard-aborting, and StateSyncReady reflects it via +// Unknown/ConfigMapReadError. +func TestReconcile_StateSyncConfigMapReadError_Transient(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("ss-err", testNamespace) + node.Spec.ChainID = testChainID + r, c := newNodeReconcilerWithGetError(t, node, func(key client.ObjectKey) error { + if key.Name == testSyncerCMName { + return apierrors.NewServiceUnavailable("etcd unavailable") + } + return nil + }) + withSyncerConfigMap(r) + + res, err := r.Reconcile(ctx, nodeReqFor("ss-err", testNamespace)) + g.Expect(err).NotTo(HaveOccurred(), "transient ConfigMap error must not hard-abort") + g.Expect(res.RequeueAfter).To(BeNumerically(">", 0), "transient error must requeue") + + fetched := getSeiNode(t, ctx, c, "ss-err", testNamespace) + cond := stateSyncCondition(fetched) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionUnknown)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncConfigMapReadError)) + g.Expect(fetched.Status.Plan).To(BeNil(), "no state-sync plan while the gate is unresolved") + // StatefulSet sync still ran despite the ConfigMap error. + sts := &appsv1.StatefulSet{} + g.Expect(c.Get(ctx, types.NamespacedName{Name: "ss-err", Namespace: testNamespace}, sts)).To(Succeed(), + "reconcileStatefulSet must run even when the ConfigMap read fails") +} + +// The StateSyncBlocked Warning fires once on the transition into fail-closed, +// not on every requeue. +func TestReconcile_StateSyncBlocked_EventFiresOncePerTransition(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("ss-evt", testNamespace) + node.Spec.ChainID = testChainID + cm := syncerConfigMap(map[string]string{testChainID: syncerSingle}) + r, _ := newNodeReconciler(t, node, cm) + rec := record.NewFakeRecorder(10) + r.Recorder = rec + withSyncerConfigMap(r) + + countBlocked := func() int { + n := 0 + for { + select { + case e := <-rec.Events: + if strings.Contains(e, "StateSyncBlocked") { + n++ + } + continue + default: + } + return n + } + } + + _, err := r.Reconcile(ctx, nodeReqFor("ss-evt", testNamespace)) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(countBlocked()).To(Equal(1), "first fail-closed reconcile emits StateSyncBlocked") + + // Plan stays nil and the condition is already False/NoSyncersConfigured, so a + // second reconcile is a no-op transition — no repeat event. + _, err = r.Reconcile(ctx, nodeReqFor("ss-evt", testNamespace)) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(countBlocked()).To(Equal(0), "steady-state requeue must not re-emit StateSyncBlocked") +} + // Full reconcile path: a state-sync node with >=2 syncers proceeds — a plan is // built carrying the canonical syncers, and StateSyncReady is True. func TestReconcile_StateSyncReady_BuildsPlanWithSyncers(t *testing.T) { diff --git a/internal/planner/planner.go b/internal/planner/planner.go index c8e102d3..e4b35343 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -155,6 +155,16 @@ func (p *NodeResolver) ResolvePlan(ctx context.Context, node *seiv1alpha1.SeiNod handleTerminalPlan(ctx, node) + // Fail-closed state-sync gate. Runs after handleTerminalPlan (so terminal + // plans still clear) but before building: a state-sync node whose + // StateSyncReady condition isn't True must never get a state-sync-bearing + // plan (CometBFT needs >=2 rpc-servers; we never fall back to peers). The + // condition is resolved upstream by the controller's reconcileStateSyncGate. + // Non-state-sync work is unaffected — only init-plan construction is gated. + if stateSyncBlocksPlan(node) { + return nil + } + mode, err := p.plannerForMode(node) if err != nil { return err @@ -180,6 +190,25 @@ func (p *NodeResolver) ResolvePlan(ctx context.Context, node *seiv1alpha1.SeiNod return nil } +// stateSyncBlocksPlan reports whether the fail-closed state-sync gate must +// suppress plan construction this reconcile. It fires only on the init path +// (pre-Running), which is the only path that builds a state-sync-bearing plan +// via buildSidecarProgression — a Running node's update plans carry no +// state-sync task, so an image roll must not be blocked by a syncer-ConfigMap +// blip. The gate trips when state-sync is enabled and the controller-resolved +// StateSyncReady condition is not True (NoSyncersConfigured or the transient +// ConfigMapReadError). A missing condition (not yet resolved) does not gate. +func stateSyncBlocksPlan(node *seiv1alpha1.SeiNode) bool { + if node.Status.Phase == seiv1alpha1.PhaseRunning { + return false + } + if !hasStateSync(node.Spec.SnapshotSource()) { + return false + } + cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionStateSyncReady) + return cond != nil && cond.Status != metav1.ConditionTrue +} + // handleTerminalPlan handles completed or failed plans: clears conditions // and nils the plan so the planner can build the next one if needed. func handleTerminalPlan(ctx context.Context, node *seiv1alpha1.SeiNode) { @@ -656,11 +685,12 @@ func snapshotRestoreTask(snap *seiv1alpha1.SnapshotSource) sidecar.SnapshotResto func configureStateSyncTask(node *seiv1alpha1.SeiNode) sidecar.ConfigureStateSyncTask { snap := node.Spec.SnapshotSource() - // RpcServers come from the controller-level canonical-syncer ConfigMap, - // resolved by the StateSyncReady gate into Status.ResolvedStateSyncers - // (replacing the label-derived ResolvedRPCWitnesses path — peers are no - // longer used as witnesses). The gate guarantees >=2 curated entries here; - // the sidecar establishes the trust point from them as it does today. + // RpcServers come from the controller-level canonical-syncer ConfigMap via + // Status.ResolvedStateSyncers (peers are no longer used as witnesses). This + // runs only when stateSyncBlocksPlan passed, i.e. StateSyncReady=True, which + // guarantees >=2 curated entries. The set is snapshotted into the task params + // here and never re-read at execution — so a transient ConfigMap blip on an + // already-active plan can't empty an in-flight witness list. t := sidecar.ConfigureStateSyncTask{ UseLocalSnapshot: hasS3Snapshot(snap), RpcServers: node.Status.ResolvedStateSyncers, From f4cd247cab33e08f3f4fadce7812f92fe1e27a4b Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 12 Jun 2026 11:47:39 -0700 Subject: [PATCH 6/8] refactor(statesync): source canonical syncers from a mounted file, not the API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per owner review: replace the live-API-read canonical-syncer ConfigMap with a read-only mounted file (YAML chainID -> [host:port]). Removes the live K8s API dependency and the configmaps read-RBAC from the state-sync path, and closes the controller-binary in-process write path to the trust root (the read-only mount physically enforces what the RBAC marker only documented). Source swap only — the fail-closed StateSyncReady gate, plan-construction gating, terminal-cleanup ordering, and event-on-transition are unchanged. Mapping preserved: missing/empty file -> fail closed (NoSyncersConfigured); read/parse error -> transient (Unknown/SyncerSourceError + requeue, reconcile not aborted). Mounted as a directory volume (never subPath) from an optional ConfigMap so the controller starts and fails closed when syncers aren't provisioned. First slice of PLT-475 (env -> mounted app-config). The residual cluster-wide configmaps write grant remains PLT-471's fix. Co-Authored-By: Claude Opus 4.8 --- api/v1alpha1/seinode_types.go | 12 +- cmd/main.go | 6 +- config/manager/manager.yaml | 21 +- internal/controller/node/controller.go | 6 +- internal/controller/node/reconciler_test.go | 46 ----- internal/controller/node/statesync.go | 81 ++++---- internal/controller/node/statesync_test.go | 216 +++++++++++--------- internal/planner/planner.go | 4 +- internal/platform/platform.go | 35 ++-- 9 files changed, 203 insertions(+), 224 deletions(-) diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index c4623c83..c502ff8f 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -305,15 +305,15 @@ const ( // configured for the chain; the state-sync-bearing plan may proceed. ReasonStateSyncReady = "Ready" // ReasonStateSyncNoSyncersConfigured: state-sync enabled but the canonical- - // syncer ConfigMap yields <2 entries for the chain (fail closed). + // syncer source yields <2 entries for the chain (fail closed). ReasonStateSyncNoSyncersConfigured = "NoSyncersConfigured" // ReasonStateSyncNotApplicable: the node does not enable state-sync. ReasonStateSyncNotApplicable = "NotApplicable" - // ReasonStateSyncConfigMapReadError: the canonical-syncer ConfigMap read - // failed for a non-NotFound reason (transient API error). Fails closed and - // requeues; the rest of the reconcile (StatefulSet, Failed/Paused handling, - // status flush) still runs. - ReasonStateSyncConfigMapReadError = "ConfigMapReadError" + // ReasonStateSyncSyncerSourceError: reading or parsing the canonical-syncer + // source file failed for a reason other than absence (transient). Fails + // closed and requeues; the rest of the reconcile (StatefulSet, Failed/Paused + // handling, status flush) still runs. + ReasonStateSyncSyncerSourceError = "SyncerSourceError" ) // Reasons for the ImportPVCReady condition. diff --git a/cmd/main.go b/cmd/main.go index 9a4e720d..7e4c8737 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -164,9 +164,9 @@ func main() { SidecarImage: os.Getenv("SEI_SIDECAR_IMAGE"), CosmosExporterImage: os.Getenv("SEI_COSMOS_EXPORTER_IMAGE"), - // State-sync canonical syncers are opt-in; these may be empty. - StateSyncSyncersConfigMap: os.Getenv("SEI_STATESYNC_SYNCERS_CONFIGMAP"), - StateSyncSyncersNamespace: os.Getenv("SEI_STATESYNC_SYNCERS_NAMESPACE"), + // State-sync canonical syncers are opt-in; this may be empty. Points at a + // read-only mounted file (a GitOps-written ConfigMap volume). + StateSyncSyncersFile: os.Getenv("SEI_STATESYNC_SYNCERS_FILE"), } if err := platformCfg.Validate(); err != nil { diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index d0e3a1e9..ff978676 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -81,6 +81,11 @@ spec: value: gateway - name: SEI_GATEWAY_DOMAIN value: prod.platform.sei.io + # Read-only canonical-syncer source. Points at the mounted ConfigMap + # (directory mount, not subPath) so GitOps swaps propagate without a + # pod restart. Optional: the controller fails closed when absent. + - name: SEI_STATESYNC_SYNCERS_FILE + value: /etc/sei/statesync-syncers/syncers.yaml ports: - containerPort: 8080 name: metrics @@ -110,7 +115,19 @@ spec: requests: cpu: 50m memory: 128Mi - volumeMounts: [] - volumes: [] + volumeMounts: + # Directory mount (not subPath): subPath snapshots the ConfigMap at + # mount time and never updates, defeating the fresh-read-per-reconcile + # contract. readOnly so the controller can't rewrite its trust root. + - name: statesync-syncers + mountPath: /etc/sei/statesync-syncers + readOnly: true + volumes: + # GitOps provisions the ConfigMap content out of this repo. optional so + # the controller starts (and fails closed) when it isn't present yet. + - name: statesync-syncers + configMap: + name: canonical-syncers + optional: true serviceAccountName: controller-manager terminationGracePeriodSeconds: 10 diff --git a/internal/controller/node/controller.go b/internal/controller/node/controller.go index 2b5b7585..cb12cd63 100644 --- a/internal/controller/node/controller.go +++ b/internal/controller/node/controller.go @@ -114,8 +114,8 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // status write. Fail-closed enforcement lives in ResolvePlan, which declines // to build a state-sync plan when this condition isn't True; that keeps // terminal-plan cleanup and non-state-sync work running. A transient - // ConfigMap read error requeues without aborting the steps below. - stateSyncTransient := r.reconcileStateSyncGate(ctx, node) + // syncer-source read error requeues without aborting the steps below. + stateSyncTransient := r.reconcileStateSyncGate(node) // Failed is terminal — flush any condition updates and exit. if node.Status.Phase == seiv1alpha1.PhaseFailed { @@ -338,7 +338,7 @@ func (r *SeiNodeReconciler) emitStateSyncBlockedEvent(node *seiv1alpha1.SeiNode, return } blockedReason := cur.Reason == seiv1alpha1.ReasonStateSyncNoSyncersConfigured || - cur.Reason == seiv1alpha1.ReasonStateSyncConfigMapReadError + cur.Reason == seiv1alpha1.ReasonStateSyncSyncerSourceError if !blockedReason { return } diff --git a/internal/controller/node/reconciler_test.go b/internal/controller/node/reconciler_test.go index e40f81c6..6f547bea 100644 --- a/internal/controller/node/reconciler_test.go +++ b/internal/controller/node/reconciler_test.go @@ -15,7 +15,6 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - "sigs.k8s.io/controller-runtime/pkg/client/interceptor" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" "github.com/sei-protocol/sei-k8s-controller/internal/noderesource" @@ -74,51 +73,6 @@ func newNodeReconcilerWithSidecar(t *testing.T, mock *mockSidecarClient, objs .. return r, c } -// newNodeReconcilerWithGetError builds a reconciler whose client returns getErr -// (when non-nil) for the matching key on Get, used to exercise transient -// API-error handling. Other operations pass through to the fake client. -func newNodeReconcilerWithGetError(t *testing.T, node *seiv1alpha1.SeiNode, getErr func(client.ObjectKey) error) (*SeiNodeReconciler, client.Client) { - t.Helper() - s := newNodeTestScheme(t) - c := fake.NewClientBuilder(). - WithScheme(s). - WithObjects(node). - WithStatusSubresource(&seiv1alpha1.SeiNode{}). - WithInterceptorFuncs(interceptor.Funcs{ - Get: func(ctx context.Context, cl client.WithWatch, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { - if err := getErr(key); err != nil { - return err - } - return cl.Get(ctx, key, obj, opts...) - }, - }). - Build() - mock := &mockSidecarClient{nodeID: "mock-node-id"} - r := &SeiNodeReconciler{ - Client: c, - Scheme: s, - Recorder: record.NewFakeRecorder(100), - Platform: platformtest.Config(), - Planner: &planner.NodeResolver{ - BuildSidecarClient: func(_ *seiv1alpha1.SeiNode) (task.SidecarClient, error) { return mock, nil }, - Platform: platformtest.Config(), - }, - PlanExecutor: &planner.Executor[*seiv1alpha1.SeiNode]{ - ConfigFor: func(_ context.Context, n *seiv1alpha1.SeiNode) task.ExecutionConfig { - return task.ExecutionConfig{ - BuildSidecarClient: func() (task.SidecarClient, error) { return mock, nil }, - KubeClient: c, - APIReader: c, - Scheme: s, - Resource: n, - Platform: platformtest.Config(), - } - }, - }, - } - return r, c -} - func nodeReqFor(name, namespace string) ctrl.Request { //nolint:unparam // test helper designed for reuse return ctrl.Request{NamespacedName: types.NamespacedName{Name: name, Namespace: namespace}} } diff --git a/internal/controller/node/statesync.go b/internal/controller/node/statesync.go index 1603849e..0c855962 100644 --- a/internal/controller/node/statesync.go +++ b/internal/controller/node/statesync.go @@ -1,16 +1,14 @@ package node import ( - "context" "fmt" + "os" "slices" "strings" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/yaml" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" ) @@ -23,15 +21,6 @@ import ( // existing trust-pinning behavior. const minCanonicalSyncers = 2 -// The controller only READS the canonical-syncer ConfigMap. The read-only -// intent is NOT yet enforced: the reconciler's pre-existing cluster-wide -// configmaps grant (controller.go) confers write access that controller-gen -// unions in, so this marker is documentary only. Scoping the controller to -// read-only on the trust root (so it cannot rewrite its own trust source) is a -// PLT-452 launch-gate blocker tracked in PLT-471, landing with the namespace -// lockdown that fixes the trust-root namespace. -// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch - // reconcileStateSyncGate resolves the canonical-syncer set for a state-sync // node and sets the always-present ConditionStateSyncReady accordingly. It // mutates node.Status in-memory only — the condition and ResolvedStateSyncers @@ -45,11 +34,11 @@ const minCanonicalSyncers = 2 // clears terminal plans and non-state-sync work proceeds — this method only // resolves the condition. // -// A transient (non-NotFound) ConfigMap read error is NOT fatal: it sets -// StateSyncReady=Unknown/ConfigMapReadError and returns transient=true so the +// A transient (non-absence) read or parse error is NOT fatal: it sets +// StateSyncReady=Unknown/SyncerSourceError and returns transient=true so the // caller requeues without aborting the StatefulSet/Failed/Paused/flush path. A -// missing ConfigMap or <2 entries fails closed via False/NoSyncersConfigured. -func (r *SeiNodeReconciler) reconcileStateSyncGate(ctx context.Context, node *seiv1alpha1.SeiNode) (transient bool) { +// missing source file or <2 entries fails closed via False/NoSyncersConfigured. +func (r *SeiNodeReconciler) reconcileStateSyncGate(node *seiv1alpha1.SeiNode) (transient bool) { snap := node.Spec.SnapshotSource() if snap == nil || snap.StateSync == nil { // State-sync disabled: no state-sync task in the plan to gate. @@ -59,13 +48,13 @@ func (r *SeiNodeReconciler) reconcileStateSyncGate(ctx context.Context, node *se return false } - syncers, err := r.canonicalSyncers(ctx, node.Spec.ChainID) + syncers, err := r.canonicalSyncers(node.Spec.ChainID) if err != nil { - // Transient API error: fail closed (clear any stale set) but don't abort - // the reconcile. Requeue and re-read next tick. + // Transient read/parse error: fail closed (clear any stale set) but don't + // abort the reconcile. Requeue and re-read next tick. node.Status.ResolvedStateSyncers = nil - setStateSyncReady(node, metav1.ConditionUnknown, seiv1alpha1.ReasonStateSyncConfigMapReadError, - fmt.Sprintf("reading canonical-syncer ConfigMap for chain %q: %v", node.Spec.ChainID, err)) + setStateSyncReady(node, metav1.ConditionUnknown, seiv1alpha1.ReasonStateSyncSyncerSourceError, + fmt.Sprintf("reading canonical-syncer source for chain %q: %v", node.Spec.ChainID, err)) return true } @@ -88,35 +77,47 @@ func (r *SeiNodeReconciler) reconcileStateSyncGate(ctx context.Context, node *se return false } -// canonicalSyncers reads the canonical-syncer ConfigMap and returns the parsed -// syncer RPC endpoints for the given chain. A missing ConfigMap, an unset -// ConfigMap reference, or a chain with no entry all yield an empty slice (no -// error) so the caller fails closed via the StateSyncReady gate rather than -// crashing — state-sync is opt-in and the ConfigMap may legitimately be absent -// until GitOps provisions it. +// canonicalSyncers reads the read-only canonical-syncer file fresh and returns +// the parsed syncer RPC endpoints for the given chain. An unset path, a missing +// file, or a chain with no entry all yield an empty slice (no error) so the +// caller fails closed via the StateSyncReady gate rather than crashing — +// state-sync is opt-in and the file may legitimately be absent until GitOps +// provisions the backing ConfigMap. Any other read or parse error is returned +// so the gate can treat it as transient. +// +// File shape: a YAML map of chainID -> list of bare `host:port` RPC endpoints +// (no scheme; the sidecar adds it). Each chain's entries are trimmed, blanks +// dropped, sorted, and de-duplicated for a stable witness set. // -// Data shape: data[chainID] is a list of `host:port` RPC endpoints separated by -// newlines and/or commas. Blank entries are dropped; the result is sorted and -// de-duplicated for a stable witness set. -func (r *SeiNodeReconciler) canonicalSyncers(ctx context.Context, chainID string) ([]string, error) { - name := r.Platform.StateSyncSyncersConfigMap - if strings.TrimSpace(name) == "" { +// Read fresh on every call: a mounted ConfigMap swaps atomically (a symlink +// flip on the directory mount), so re-reading picks up GitOps updates without a +// pod restart. Never cache an open handle. +func (r *SeiNodeReconciler) canonicalSyncers(chainID string) ([]string, error) { + path := strings.TrimSpace(r.Platform.StateSyncSyncersFile) + if path == "" { return nil, nil } - cm := &corev1.ConfigMap{} - key := types.NamespacedName{Name: name, Namespace: r.Platform.StateSyncSyncersNamespace} - if err := r.Get(ctx, key, cm); err != nil { - if apierrors.IsNotFound(err) { + raw, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { return nil, nil } return nil, err } - return parseSyncerList(cm.Data[chainID]), nil + syncers := map[string][]string{} + if err := yaml.Unmarshal(raw, &syncers); err != nil { + return nil, fmt.Errorf("parsing canonical-syncer file %q: %w", path, err) + } + + // A YAML list entry may itself carry comma/whitespace-joined endpoints + // (forward-compatible with PLT-475's stateSync.syncers), so route the joined + // value through the same splitter the ConfigMap source used. + return parseSyncerList(strings.Join(syncers[chainID], "\n")), nil } -// parseSyncerList splits a ConfigMap value on newlines and commas, trims +// parseSyncerList splits a syncer value on newlines and commas, trims // whitespace, drops blanks, then sorts and de-duplicates. func parseSyncerList(raw string) []string { fields := strings.FieldsFunc(raw, func(r rune) bool { diff --git a/internal/controller/node/statesync_test.go b/internal/controller/node/statesync_test.go index 57d22c88..b1fe0391 100644 --- a/internal/controller/node/statesync_test.go +++ b/internal/controller/node/statesync_test.go @@ -2,41 +2,64 @@ package node import ( "context" + "os" + "path/filepath" "slices" "strings" "testing" . "github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/client" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" "github.com/sei-protocol/sei-k8s-controller/internal/planner" ) const ( - testSyncerCMName = "canonical-syncers" - testSyncerCMNS = "sei-platform" - testChainID = "arctic-1" - testNamespace = "default" - testImage = "sei:latest" - testNodeName = "sei-test" - syncerA = "a:26657" - syncerB = "b:26657" - syncerSingle = "only-one:26657" + testChainID = "arctic-1" + testNamespace = "default" + testImage = "sei:latest" + testNodeName = "sei-test" + syncerA = "a:26657" + syncerB = "b:26657" + syncerSingle = "only-one:26657" ) -func syncerConfigMap(data map[string]string) *corev1.ConfigMap { - return &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{Name: testSyncerCMName, Namespace: testSyncerCMNS}, - Data: data, +// syncerFileYAML renders a chainID -> [host:port] map as the read-only syncer +// file content (matching canonicalSyncers' expected YAML shape). +func syncerFileYAML(byChain map[string][]string) string { + var b strings.Builder + for chain, syncers := range byChain { + b.WriteString(chain) + b.WriteString(":\n") + for _, s := range syncers { + b.WriteString(" - ") + b.WriteString(s) + b.WriteString("\n") + } } + return b.String() +} + +// writeSyncerFile writes content to a temp file and points the reconciler's +// platform at it. The file lives under t.TempDir(), auto-cleaned by the test. +func writeSyncerFile(t *testing.T, r *SeiNodeReconciler, content string) { + t.Helper() + path := filepath.Join(t.TempDir(), "syncers.yaml") + if err := os.WriteFile(path, []byte(content), 0o600); err != nil { + t.Fatalf("writing syncer file: %v", err) + } + r.Platform.StateSyncSyncersFile = path +} + +// withSyncers writes a chainID -> syncers file and wires the reconciler to it. +func withSyncers(t *testing.T, r *SeiNodeReconciler, byChain map[string][]string) { + t.Helper() + writeSyncerFile(t, r, syncerFileYAML(byChain)) } // stateSyncNode returns a FullNode with state sync enabled on the given chain. @@ -53,11 +76,6 @@ func stateSyncNode(name, chainID string) *seiv1alpha1.SeiNode { } } -func withSyncerConfigMap(r *SeiNodeReconciler) { - r.Platform.StateSyncSyncersConfigMap = testSyncerCMName - r.Platform.StateSyncSyncersNamespace = testSyncerCMNS -} - func stateSyncCondition(node *seiv1alpha1.SeiNode) *metav1.Condition { return apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionStateSyncReady) } @@ -84,16 +102,14 @@ func TestParseSyncerList(t *testing.T) { func TestStateSyncGate_EnabledWithTwoSyncers_Ready(t *testing.T) { g := NewWithT(t) - ctx := context.Background() node := stateSyncNode("n", testChainID) - cm := syncerConfigMap(map[string]string{ - testChainID: "syncer-1.arctic-1.example.com:26657\nsyncer-0.arctic-1.example.com:26657", + r, _ := newNodeReconciler(t, node) + withSyncers(t, r, map[string][]string{ + testChainID: {"syncer-1.arctic-1.example.com:26657", "syncer-0.arctic-1.example.com:26657"}, }) - r, _ := newNodeReconciler(t, node, cm) - withSyncerConfigMap(r) - transient := r.reconcileStateSyncGate(ctx, node) + transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) cond := stateSyncCondition(node) @@ -110,14 +126,12 @@ func TestStateSyncGate_EnabledWithTwoSyncers_Ready(t *testing.T) { func TestStateSyncGate_EnabledWithOneSyncer_FailsClosed(t *testing.T) { g := NewWithT(t) - ctx := context.Background() node := stateSyncNode("n", testChainID) - cm := syncerConfigMap(map[string]string{testChainID: "only-one.arctic-1.example.com:26657"}) - r, _ := newNodeReconciler(t, node, cm) - withSyncerConfigMap(r) + r, _ := newNodeReconciler(t, node) + withSyncers(t, r, map[string][]string{testChainID: {"only-one.arctic-1.example.com:26657"}}) - transient := r.reconcileStateSyncGate(ctx, node) + transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) cond := stateSyncCondition(node) @@ -127,15 +141,16 @@ func TestStateSyncGate_EnabledWithOneSyncer_FailsClosed(t *testing.T) { g.Expect(node.Status.ResolvedStateSyncers).To(BeNil()) } -func TestStateSyncGate_EnabledMissingConfigMap_FailsClosed(t *testing.T) { +// A configured path pointing at a file that doesn't exist (the backing +// ConfigMap isn't provisioned yet) fails closed, not transient. +func TestStateSyncGate_EnabledMissingFile_FailsClosed(t *testing.T) { g := NewWithT(t) - ctx := context.Background() node := stateSyncNode("n", testChainID) - r, _ := newNodeReconciler(t, node) // no ConfigMap object - withSyncerConfigMap(r) + r, _ := newNodeReconciler(t, node) + r.Platform.StateSyncSyncersFile = filepath.Join(t.TempDir(), "absent.yaml") // never created - transient := r.reconcileStateSyncGate(ctx, node) + transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) cond := stateSyncCondition(node) @@ -145,40 +160,54 @@ func TestStateSyncGate_EnabledMissingConfigMap_FailsClosed(t *testing.T) { func TestStateSyncGate_EnabledNoChainEntry_FailsClosed(t *testing.T) { g := NewWithT(t) - ctx := context.Background() node := stateSyncNode("n", testChainID) - cm := syncerConfigMap(map[string]string{"other-chain": "a:26657\nb:26657"}) - r, _ := newNodeReconciler(t, node, cm) - withSyncerConfigMap(r) + r, _ := newNodeReconciler(t, node) + withSyncers(t, r, map[string][]string{"other-chain": {syncerA, syncerB}}) - transient := r.reconcileStateSyncGate(ctx, node) + transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) } -func TestStateSyncGate_UnconfiguredConfigMapRef_FailsClosed(t *testing.T) { +// An unreadable/unparseable file is transient (Unknown + requeue), distinct +// from absence which fails closed. +func TestStateSyncGate_ParseError_Transient(t *testing.T) { g := NewWithT(t) - ctx := context.Background() node := stateSyncNode("n", testChainID) - r, _ := newNodeReconciler(t, node) // Platform leaves the ConfigMap name empty. + r, _ := newNodeReconciler(t, node) + writeSyncerFile(t, r, "this: [is: not: valid: yaml") // malformed - transient := r.reconcileStateSyncGate(ctx, node) + transient := r.reconcileStateSyncGate(node) + g.Expect(transient).To(BeTrue()) + + cond := stateSyncCondition(node) + g.Expect(cond.Status).To(Equal(metav1.ConditionUnknown)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncSyncerSourceError)) + g.Expect(node.Status.ResolvedStateSyncers).To(BeNil()) +} + +func TestStateSyncGate_UnconfiguredSource_FailsClosed(t *testing.T) { + g := NewWithT(t) + + node := stateSyncNode("n", testChainID) + r, _ := newNodeReconciler(t, node) // Platform leaves StateSyncSyncersFile empty. + + transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) } func TestStateSyncGate_Disabled_NotApplicable(t *testing.T) { g := NewWithT(t) - ctx := context.Background() // S3 snapshot node: state sync not enabled. node := newSnapshotNode("n", testNamespace) r, _ := newNodeReconciler(t, node) - withSyncerConfigMap(r) + withSyncers(t, r, map[string][]string{testChainID: {syncerA, syncerB}}) - transient := r.reconcileStateSyncGate(ctx, node) + transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) cond := stateSyncCondition(node) @@ -191,7 +220,6 @@ func TestStateSyncGate_Disabled_NotApplicable(t *testing.T) { // state-sync-disabled: NotApplicable, never blocks the plan. func TestStateSyncGate_NoSnapshotSource_NotApplicable(t *testing.T) { g := NewWithT(t) - ctx := context.Background() node := &seiv1alpha1.SeiNode{ ObjectMeta: metav1.ObjectMeta{Name: "n", Namespace: testNamespace}, @@ -202,9 +230,9 @@ func TestStateSyncGate_NoSnapshotSource_NotApplicable(t *testing.T) { }, } r, _ := newNodeReconciler(t, node) - withSyncerConfigMap(r) + withSyncers(t, r, map[string][]string{testChainID: {syncerA, syncerB}}) - transient := r.reconcileStateSyncGate(ctx, node) + transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNotApplicable)) } @@ -213,15 +241,12 @@ func TestStateSyncGate_NoSnapshotSource_NotApplicable(t *testing.T) { // so a previously-good set can't leak into ConfigureStateSyncTask. func TestStateSyncGate_FailClosedClearsStaleSyncers(t *testing.T) { g := NewWithT(t) - ctx := context.Background() node := stateSyncNode("n", testChainID) node.Status.ResolvedStateSyncers = []string{"old-a:26657", "old-b:26657"} - r, _ := newNodeReconciler(t, node) // unconfigured ref → fail closed - withSyncerConfigMap(r) - r.Platform.StateSyncSyncersConfigMap = "" // force unconfigured + r, _ := newNodeReconciler(t, node) // unconfigured source → fail closed - transient := r.reconcileStateSyncGate(ctx, node) + transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) g.Expect(node.Status.ResolvedStateSyncers).To(BeNil()) } @@ -230,12 +255,11 @@ func TestStateSyncGate_FailClosedClearsStaleSyncers(t *testing.T) { // node — regression guard for the full reconcile path. func TestStateSyncGate_NonStateSyncNodeUnaffected(t *testing.T) { g := NewWithT(t) - ctx := context.Background() node := newSnapshotNode("n", testNamespace) r, _ := newNodeReconciler(t, node) - transient := r.reconcileStateSyncGate(ctx, node) + transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) g.Expect(node.Status.ResolvedStateSyncers).To(BeEmpty()) g.Expect(slices.Contains([]string{ @@ -252,11 +276,10 @@ func TestReconcile_StateSyncFailClosed_NoPlanBuilt(t *testing.T) { node := stateSyncNode("ss-0", testNamespace) node.Spec.ChainID = testChainID - cm := syncerConfigMap(map[string]string{testChainID: syncerSingle}) - r, c := newNodeReconciler(t, node, cm) + r, c := newNodeReconciler(t, node) rec := record.NewFakeRecorder(10) r.Recorder = rec - withSyncerConfigMap(r) + withSyncers(t, r, map[string][]string{testChainID: {syncerSingle}}) _, err := r.Reconcile(ctx, nodeReqFor("ss-0", testNamespace)) g.Expect(err).NotTo(HaveOccurred()) @@ -291,22 +314,22 @@ func TestReconcile_StateSyncFailClosed_NoPlanBuilt(t *testing.T) { // node must build NO plan (pause semantics preserved). func TestReconcile_PausedNode_StateSyncReadyStillSeeded(t *testing.T) { cases := []struct { - name string - node *seiv1alpha1.SeiNode - withCM bool - wantReason string + name string + node *seiv1alpha1.SeiNode + withSyncers bool + wantReason string }{ { - name: "state-sync enabled, no syncers", - node: stateSyncNode("paused-ss", testNamespace), - withCM: true, - wantReason: seiv1alpha1.ReasonStateSyncNoSyncersConfigured, + name: "state-sync enabled, no syncers", + node: stateSyncNode("paused-ss", testNamespace), + withSyncers: true, + wantReason: seiv1alpha1.ReasonStateSyncNoSyncersConfigured, }, { - name: "state-sync disabled", - node: newSnapshotNode("paused-s3", testNamespace), - withCM: false, - wantReason: seiv1alpha1.ReasonStateSyncNotApplicable, + name: "state-sync disabled", + node: newSnapshotNode("paused-s3", testNamespace), + withSyncers: false, + wantReason: seiv1alpha1.ReasonStateSyncNotApplicable, }, } for _, tc := range cases { @@ -316,8 +339,9 @@ func TestReconcile_PausedNode_StateSyncReadyStillSeeded(t *testing.T) { tc.node.Spec.Paused = true r, c := newNodeReconciler(t, tc.node) - if tc.withCM { - withSyncerConfigMap(r) + if tc.withSyncers { + // Empty map for the chain → fail closed, but source is wired. + withSyncers(t, r, map[string][]string{"other-chain": {syncerA, syncerB}}) } _, err := r.Reconcile(ctx, nodeReqFor(tc.node.Name, testNamespace)) @@ -358,9 +382,8 @@ func TestReconcile_StateSyncFailClosed_ClearsTerminalPlan(t *testing.T) { Phase: tc.phase, Tasks: []seiv1alpha1.PlannedTask{{Type: planner.TaskConfigApply, Status: seiv1alpha1.TaskComplete}}, } - cm := syncerConfigMap(map[string]string{testChainID: syncerSingle}) - r, c := newNodeReconciler(t, node, cm) - withSyncerConfigMap(r) + r, c := newNodeReconciler(t, node) + withSyncers(t, r, map[string][]string{testChainID: {syncerSingle}}) _, err := r.Reconcile(ctx, nodeReqFor("ss-term", testNamespace)) g.Expect(err).NotTo(HaveOccurred()) @@ -375,38 +398,33 @@ func TestReconcile_StateSyncFailClosed_ClearsTerminalPlan(t *testing.T) { } } -// A non-NotFound ConfigMap read error must be transient: reconcileStatefulSet, +// An unparseable syncer file must be transient: reconcileStatefulSet, // Paused/Failed handling, and the status flush still run, the reconcile // requeues instead of hard-aborting, and StateSyncReady reflects it via -// Unknown/ConfigMapReadError. -func TestReconcile_StateSyncConfigMapReadError_Transient(t *testing.T) { +// Unknown/SyncerSourceError. +func TestReconcile_StateSyncSyncerSourceError_Transient(t *testing.T) { g := NewWithT(t) ctx := context.Background() node := stateSyncNode("ss-err", testNamespace) node.Spec.ChainID = testChainID - r, c := newNodeReconcilerWithGetError(t, node, func(key client.ObjectKey) error { - if key.Name == testSyncerCMName { - return apierrors.NewServiceUnavailable("etcd unavailable") - } - return nil - }) - withSyncerConfigMap(r) + r, c := newNodeReconciler(t, node) + writeSyncerFile(t, r, "{ this is not: valid yaml: at all") // malformed res, err := r.Reconcile(ctx, nodeReqFor("ss-err", testNamespace)) - g.Expect(err).NotTo(HaveOccurred(), "transient ConfigMap error must not hard-abort") + g.Expect(err).NotTo(HaveOccurred(), "transient syncer-source error must not hard-abort") g.Expect(res.RequeueAfter).To(BeNumerically(">", 0), "transient error must requeue") fetched := getSeiNode(t, ctx, c, "ss-err", testNamespace) cond := stateSyncCondition(fetched) g.Expect(cond).NotTo(BeNil()) g.Expect(cond.Status).To(Equal(metav1.ConditionUnknown)) - g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncConfigMapReadError)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncSyncerSourceError)) g.Expect(fetched.Status.Plan).To(BeNil(), "no state-sync plan while the gate is unresolved") - // StatefulSet sync still ran despite the ConfigMap error. + // StatefulSet sync still ran despite the syncer-source error. sts := &appsv1.StatefulSet{} g.Expect(c.Get(ctx, types.NamespacedName{Name: "ss-err", Namespace: testNamespace}, sts)).To(Succeed(), - "reconcileStatefulSet must run even when the ConfigMap read fails") + "reconcileStatefulSet must run even when the syncer source read fails") } // The StateSyncBlocked Warning fires once on the transition into fail-closed, @@ -417,11 +435,10 @@ func TestReconcile_StateSyncBlocked_EventFiresOncePerTransition(t *testing.T) { node := stateSyncNode("ss-evt", testNamespace) node.Spec.ChainID = testChainID - cm := syncerConfigMap(map[string]string{testChainID: syncerSingle}) - r, _ := newNodeReconciler(t, node, cm) + r, _ := newNodeReconciler(t, node) rec := record.NewFakeRecorder(10) r.Recorder = rec - withSyncerConfigMap(r) + withSyncers(t, r, map[string][]string{testChainID: {syncerSingle}}) countBlocked := func() int { n := 0 @@ -457,11 +474,10 @@ func TestReconcile_StateSyncReady_BuildsPlanWithSyncers(t *testing.T) { node := stateSyncNode("ss-1", testNamespace) node.Spec.ChainID = testChainID - cm := syncerConfigMap(map[string]string{ - testChainID: "a.arctic-1.example.com:26657\nb.arctic-1.example.com:26657", + r, c := newNodeReconciler(t, node) + withSyncers(t, r, map[string][]string{ + testChainID: {"a.arctic-1.example.com:26657", "b.arctic-1.example.com:26657"}, }) - r, c := newNodeReconciler(t, node, cm) - withSyncerConfigMap(r) _, err := r.Reconcile(ctx, nodeReqFor("ss-1", testNamespace)) g.Expect(err).NotTo(HaveOccurred()) diff --git a/internal/planner/planner.go b/internal/planner/planner.go index e4b35343..8f93f121 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -194,10 +194,10 @@ func (p *NodeResolver) ResolvePlan(ctx context.Context, node *seiv1alpha1.SeiNod // suppress plan construction this reconcile. It fires only on the init path // (pre-Running), which is the only path that builds a state-sync-bearing plan // via buildSidecarProgression — a Running node's update plans carry no -// state-sync task, so an image roll must not be blocked by a syncer-ConfigMap +// state-sync task, so an image roll must not be blocked by a syncer-source // blip. The gate trips when state-sync is enabled and the controller-resolved // StateSyncReady condition is not True (NoSyncersConfigured or the transient -// ConfigMapReadError). A missing condition (not yet resolved) does not gate. +// SyncerSourceError). A missing condition (not yet resolved) does not gate. func stateSyncBlocksPlan(node *seiv1alpha1.SeiNode) bool { if node.Status.Phase == seiv1alpha1.PhaseRunning { return false diff --git a/internal/platform/platform.go b/internal/platform/platform.go index 4f88d7f6..d4f09e05 100644 --- a/internal/platform/platform.go +++ b/internal/platform/platform.go @@ -22,7 +22,7 @@ const ( // Config holds infrastructure-level settings that vary per deployment // environment. Fields are read from environment variables in main.go and are -// required unless documented otherwise — the StateSyncSyncers* pair is optional +// required unless documented otherwise — StateSyncSyncersFile is optional // (state-sync is opt-in). See platformtest.Config() for test fixtures. type Config struct { NodepoolName string @@ -60,22 +60,19 @@ type Config struct { // The cosmos-exporter container is attached to every SeiNode pod. CosmosExporterImage string - // StateSyncSyncersConfigMap is the name of the canonical-syncer ConfigMap - // the controller reads to populate state-sync rpc_servers. It is the trust - // root for state-sync: read-only to the controller, GitOps-written, RBAC- - // locked. Keyed by chain ID (data[chainID] = syncer RPC endpoints, as bare - // host:port — no http:// scheme prefix; the sidecar adds the scheme). + // StateSyncSyncersFile is the path to the read-only canonical-syncer file the + // controller reads to populate state-sync rpc_servers. It is the trust root + // for state-sync: a GitOps-written ConfigMap mounted read-only (directory + // mount, not subPath, so atomic ConfigMap swaps propagate without a pod + // restart). Content is a YAML map keyed by chain ID (chainID -> list of bare + // host:port endpoints — no http:// scheme prefix; the sidecar adds it). // - // State-sync is opt-in, so this and StateSyncSyncersNamespace may be empty - // when no node uses state-sync. When a node DOES enable state-sync and this - // is unset (or the ConfigMap yields <2 entries for its chain), the - // controller fails closed via StateSyncReady=False/NoSyncersConfigured - // rather than building a witness-less plan. - StateSyncSyncersConfigMap string - - // StateSyncSyncersNamespace is the namespace of StateSyncSyncersConfigMap. - // Required when StateSyncSyncersConfigMap is set (Validate enforces the pair). - StateSyncSyncersNamespace string + // State-sync is opt-in, so this may be empty when no node uses state-sync. + // When a node DOES enable state-sync and this is unset (or the file is + // missing, or yields <2 entries for its chain), the controller fails closed + // via StateSyncReady=False/NoSyncersConfigured rather than building a + // witness-less plan. + StateSyncSyncersFile string } // NodepoolForMode returns the Karpenter NodePool name for the given @@ -122,11 +119,5 @@ func (c Config) Validate() error { return fmt.Errorf("%s is required", name) } } - // The state-sync syncer ConfigMap is optional, but a name without a - // namespace would issue a cluster-scoped Get that silently fails closed — - // catch that misconfiguration explicitly. - if c.StateSyncSyncersConfigMap != "" && c.StateSyncSyncersNamespace == "" { - return fmt.Errorf("SEI_STATESYNC_SYNCERS_NAMESPACE is required when SEI_STATESYNC_SYNCERS_CONFIGMAP is set") - } return nil } From ab26a2b80143fb310843d220502453f4e85f385f Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 12 Jun 2026 12:13:32 -0700 Subject: [PATCH 7/8] refactor(statesync): source syncers from the controller app-config file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per owner review: make #397's mounted file the controller's generic application-config file (SEI_CONTROLLER_CONFIG -> /etc/sei-controller/config.yaml) with state-sync as one section, rather than a bespoke syncers file — so it's the foundation PLT-475 extends, not a throwaway shape. - platform.FileConfig{ StateSync: { Syncers map[chainID][]host:port } } typed, sibling to the env-sourced platform.Config (file-sourced vs env-sourced kept distinct; the config path is still an env var, so the env-driven convention holds). - platform.Config.StateSyncSyncersFile -> ControllerConfigFile; canonicalSyncers reads cfg.StateSync.Syncers[chainID]. Conventional os.ReadFile + sigs.k8s.io/yaml into the typed struct, fresh per reconcile for the dynamic syncer section. - Mount renamed: ConfigMap sei-controller-config, directory mount at /etc/sei-controller (not subPath), readOnly, optional. Fail-closed/transient semantics unchanged. First slice of PLT-475. Co-Authored-By: Claude Opus 4.8 --- cmd/main.go | 4 +-- config/manager/manager.yaml | 14 +++++----- internal/controller/node/statesync.go | 31 +++++++++++----------- internal/controller/node/statesync_test.go | 20 ++++++++------ internal/platform/platform.go | 31 ++++++++++++++++------ 5 files changed, 60 insertions(+), 40 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 7e4c8737..84f89905 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -164,9 +164,9 @@ func main() { SidecarImage: os.Getenv("SEI_SIDECAR_IMAGE"), CosmosExporterImage: os.Getenv("SEI_COSMOS_EXPORTER_IMAGE"), - // State-sync canonical syncers are opt-in; this may be empty. Points at a + // The application-config file is opt-in; this may be empty. Points at a // read-only mounted file (a GitOps-written ConfigMap volume). - StateSyncSyncersFile: os.Getenv("SEI_STATESYNC_SYNCERS_FILE"), + ControllerConfigFile: os.Getenv("SEI_CONTROLLER_CONFIG"), } if err := platformCfg.Validate(); err != nil { diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index ff978676..d88d7a1c 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -81,11 +81,11 @@ spec: value: gateway - name: SEI_GATEWAY_DOMAIN value: prod.platform.sei.io - # Read-only canonical-syncer source. Points at the mounted ConfigMap + # Read-only application-config source. Points at the mounted ConfigMap # (directory mount, not subPath) so GitOps swaps propagate without a # pod restart. Optional: the controller fails closed when absent. - - name: SEI_STATESYNC_SYNCERS_FILE - value: /etc/sei/statesync-syncers/syncers.yaml + - name: SEI_CONTROLLER_CONFIG + value: /etc/sei-controller/config.yaml ports: - containerPort: 8080 name: metrics @@ -119,15 +119,15 @@ spec: # Directory mount (not subPath): subPath snapshots the ConfigMap at # mount time and never updates, defeating the fresh-read-per-reconcile # contract. readOnly so the controller can't rewrite its trust root. - - name: statesync-syncers - mountPath: /etc/sei/statesync-syncers + - name: sei-controller-config + mountPath: /etc/sei-controller readOnly: true volumes: # GitOps provisions the ConfigMap content out of this repo. optional so # the controller starts (and fails closed) when it isn't present yet. - - name: statesync-syncers + - name: sei-controller-config configMap: - name: canonical-syncers + name: sei-controller-config optional: true serviceAccountName: controller-manager terminationGracePeriodSeconds: 10 diff --git a/internal/controller/node/statesync.go b/internal/controller/node/statesync.go index 0c855962..d6d5bc97 100644 --- a/internal/controller/node/statesync.go +++ b/internal/controller/node/statesync.go @@ -11,6 +11,7 @@ import ( "sigs.k8s.io/yaml" seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/platform" ) // minCanonicalSyncers is the controller-side fail-closed floor: state-sync @@ -77,23 +78,24 @@ func (r *SeiNodeReconciler) reconcileStateSyncGate(node *seiv1alpha1.SeiNode) (t return false } -// canonicalSyncers reads the read-only canonical-syncer file fresh and returns -// the parsed syncer RPC endpoints for the given chain. An unset path, a missing -// file, or a chain with no entry all yield an empty slice (no error) so the -// caller fails closed via the StateSyncReady gate rather than crashing — +// canonicalSyncers reads the read-only application-config file fresh and +// returns the parsed syncer RPC endpoints for the given chain. An unset path, a +// missing file, or a chain with no entry all yield an empty slice (no error) so +// the caller fails closed via the StateSyncReady gate rather than crashing — // state-sync is opt-in and the file may legitimately be absent until GitOps // provisions the backing ConfigMap. Any other read or parse error is returned // so the gate can treat it as transient. // -// File shape: a YAML map of chainID -> list of bare `host:port` RPC endpoints -// (no scheme; the sidecar adds it). Each chain's entries are trimmed, blanks -// dropped, sorted, and de-duplicated for a stable witness set. +// File shape: see platform.FileConfig — the stateSync.syncers section is a YAML +// map of chainID -> list of bare `host:port` RPC endpoints (no scheme; the +// sidecar adds it). Each chain's entries are trimmed, blanks dropped, sorted, +// and de-duplicated for a stable witness set. // // Read fresh on every call: a mounted ConfigMap swaps atomically (a symlink // flip on the directory mount), so re-reading picks up GitOps updates without a // pod restart. Never cache an open handle. func (r *SeiNodeReconciler) canonicalSyncers(chainID string) ([]string, error) { - path := strings.TrimSpace(r.Platform.StateSyncSyncersFile) + path := strings.TrimSpace(r.Platform.ControllerConfigFile) if path == "" { return nil, nil } @@ -106,15 +108,14 @@ func (r *SeiNodeReconciler) canonicalSyncers(chainID string) ([]string, error) { return nil, err } - syncers := map[string][]string{} - if err := yaml.Unmarshal(raw, &syncers); err != nil { - return nil, fmt.Errorf("parsing canonical-syncer file %q: %w", path, err) + var cfg platform.FileConfig + if err := yaml.Unmarshal(raw, &cfg); err != nil { + return nil, fmt.Errorf("parsing controller config file %q: %w", path, err) } - // A YAML list entry may itself carry comma/whitespace-joined endpoints - // (forward-compatible with PLT-475's stateSync.syncers), so route the joined - // value through the same splitter the ConfigMap source used. - return parseSyncerList(strings.Join(syncers[chainID], "\n")), nil + // A YAML list entry may itself carry comma/whitespace-joined endpoints, so + // route the joined value through the same splitter the ConfigMap source used. + return parseSyncerList(strings.Join(cfg.StateSync.Syncers[chainID], "\n")), nil } // parseSyncerList splits a syncer value on newlines and commas, trims diff --git a/internal/controller/node/statesync_test.go b/internal/controller/node/statesync_test.go index b1fe0391..20d23fc8 100644 --- a/internal/controller/node/statesync_test.go +++ b/internal/controller/node/statesync_test.go @@ -29,15 +29,19 @@ const ( syncerSingle = "only-one:26657" ) -// syncerFileYAML renders a chainID -> [host:port] map as the read-only syncer -// file content (matching canonicalSyncers' expected YAML shape). +// syncerFileYAML renders a chainID -> [host:port] map as the read-only +// controller-config file content, wrapped under the stateSync.syncers section +// (matching canonicalSyncers' expected platform.FileConfig shape). func syncerFileYAML(byChain map[string][]string) string { var b strings.Builder + b.WriteString("stateSync:\n") + b.WriteString(" syncers:\n") for chain, syncers := range byChain { + b.WriteString(" ") b.WriteString(chain) b.WriteString(":\n") for _, s := range syncers { - b.WriteString(" - ") + b.WriteString(" - ") b.WriteString(s) b.WriteString("\n") } @@ -49,11 +53,11 @@ func syncerFileYAML(byChain map[string][]string) string { // platform at it. The file lives under t.TempDir(), auto-cleaned by the test. func writeSyncerFile(t *testing.T, r *SeiNodeReconciler, content string) { t.Helper() - path := filepath.Join(t.TempDir(), "syncers.yaml") + path := filepath.Join(t.TempDir(), "config.yaml") if err := os.WriteFile(path, []byte(content), 0o600); err != nil { - t.Fatalf("writing syncer file: %v", err) + t.Fatalf("writing controller config file: %v", err) } - r.Platform.StateSyncSyncersFile = path + r.Platform.ControllerConfigFile = path } // withSyncers writes a chainID -> syncers file and wires the reconciler to it. @@ -148,7 +152,7 @@ func TestStateSyncGate_EnabledMissingFile_FailsClosed(t *testing.T) { node := stateSyncNode("n", testChainID) r, _ := newNodeReconciler(t, node) - r.Platform.StateSyncSyncersFile = filepath.Join(t.TempDir(), "absent.yaml") // never created + r.Platform.ControllerConfigFile = filepath.Join(t.TempDir(), "absent.yaml") // never created transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) @@ -192,7 +196,7 @@ func TestStateSyncGate_UnconfiguredSource_FailsClosed(t *testing.T) { g := NewWithT(t) node := stateSyncNode("n", testChainID) - r, _ := newNodeReconciler(t, node) // Platform leaves StateSyncSyncersFile empty. + r, _ := newNodeReconciler(t, node) // Platform leaves ControllerConfigFile empty. transient := r.reconcileStateSyncGate(node) g.Expect(transient).To(BeFalse()) diff --git a/internal/platform/platform.go b/internal/platform/platform.go index d4f09e05..e3d549f5 100644 --- a/internal/platform/platform.go +++ b/internal/platform/platform.go @@ -22,8 +22,12 @@ const ( // Config holds infrastructure-level settings that vary per deployment // environment. Fields are read from environment variables in main.go and are -// required unless documented otherwise — StateSyncSyncersFile is optional +// required unless documented otherwise — ControllerConfigFile is optional // (state-sync is opt-in). See platformtest.Config() for test fixtures. +// +// Config is env-sourced infra; FileConfig (below) is the file-sourced +// application config. They are deliberately distinct: ControllerConfigFile is +// the path to the latter, not its contents. type Config struct { NodepoolName string NodepoolArchive string @@ -60,19 +64,30 @@ type Config struct { // The cosmos-exporter container is attached to every SeiNode pod. CosmosExporterImage string - // StateSyncSyncersFile is the path to the read-only canonical-syncer file the - // controller reads to populate state-sync rpc_servers. It is the trust root - // for state-sync: a GitOps-written ConfigMap mounted read-only (directory + // ControllerConfigFile is the path to the read-only application-config file + // the controller reads (SEI_CONTROLLER_CONFIG). It is the trust root for + // state-sync today: a GitOps-written ConfigMap mounted read-only (directory // mount, not subPath, so atomic ConfigMap swaps propagate without a pod - // restart). Content is a YAML map keyed by chain ID (chainID -> list of bare - // host:port endpoints — no http:// scheme prefix; the sidecar adds it). + // restart). Content is YAML decoded into FileConfig. // - // State-sync is opt-in, so this may be empty when no node uses state-sync. + // The file is opt-in, so this may be empty when no node uses state-sync. // When a node DOES enable state-sync and this is unset (or the file is // missing, or yields <2 entries for its chain), the controller fails closed // via StateSyncReady=False/NoSyncersConfigured rather than building a // witness-less plan. - StateSyncSyncersFile string + ControllerConfigFile string +} + +// FileConfig is the controller's file-sourced application config (SEI_CONTROLLER_CONFIG). +// One section today (StateSync); PLT-475 folds the remaining infra knobs in. +type FileConfig struct { + StateSync StateSyncConfig `json:"stateSync"` +} + +// StateSyncConfig is the state-sync section of the application config. +type StateSyncConfig struct { + // Syncers maps chainID -> bare host:port RPC endpoints (no scheme; sidecar adds it). + Syncers map[string][]string `json:"syncers"` } // NodepoolForMode returns the Karpenter NodePool name for the given From 011ff5337c2f43c0e9a3fad307a526643133a866 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 12 Jun 2026 12:28:41 -0700 Subject: [PATCH 8/8] fix(statesync): requeue fail-closed nodes so they unblock when syncers appear MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A pre-Running state-sync node that's fail-closed (False/NoSyncersConfigured) built no plan and returned with no RequeueAfter — only transient errors and Running nodes requeued. The canonical-syncer file is a no-watch mount, so the node never re-polled and stayed blocked forever after GitOps added syncers. reconcileStateSyncGate now returns `blocked` (state-sync enabled AND StateSyncReady != True — covers both NoSyncersConfigured and the transient SyncerSourceError); the end-of-reconcile requeue keys on it. A blocked node polls statusPollInterval, re-reads the mounted file fresh each reconcile, and unblocks once >=2 syncers appear. result.IsZero() still defers to the running-node poll and plan-execution requeues. No ConfigMap watch (would re-introduce the API dependency the file mount removed). Plus a comment sweep of the PR's changed files to the crisp/human register. Co-Authored-By: Claude Opus 4.8 --- internal/controller/node/controller.go | 15 +++-- internal/controller/node/statesync.go | 43 ++++++------- internal/controller/node/statesync_test.go | 75 ++++++++++++++++------ internal/planner/planner.go | 4 +- internal/platform/platform.go | 1 - 5 files changed, 83 insertions(+), 55 deletions(-) diff --git a/internal/controller/node/controller.go b/internal/controller/node/controller.go index cb12cd63..fd1435fe 100644 --- a/internal/controller/node/controller.go +++ b/internal/controller/node/controller.go @@ -113,9 +113,9 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct // flush, Paused flush, and the normal end-of-reconcile patch) — no separate // status write. Fail-closed enforcement lives in ResolvePlan, which declines // to build a state-sync plan when this condition isn't True; that keeps - // terminal-plan cleanup and non-state-sync work running. A transient - // syncer-source read error requeues without aborting the steps below. - stateSyncTransient := r.reconcileStateSyncGate(node) + // terminal-plan cleanup and non-state-sync work running. A blocked gate + // requeues (see end of reconcile) without aborting the steps below. + stateSyncBlocked := r.reconcileStateSyncGate(node) // Failed is terminal — flush any condition updates and exit. if node.Status.Phase == seiv1alpha1.PhaseFailed { @@ -185,9 +185,12 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{RequeueAfter: statusPollInterval}, nil } - // A transient canonical-syncer ConfigMap read failed closed without a plan - // to drive a requeue; poll so the gate re-resolves once the API recovers. - if stateSyncTransient && result.IsZero() { + // A blocked state-sync node (fail-closed or transient) builds no plan to + // drive a requeue, and the syncer file is a mounted volume with no watch. + // Poll so the gate re-resolves and unblocks once GitOps provisions or fixes + // the syncers. IsZero defers to any stronger requeue above (running-node + // poll, plan execution), so an active-plan node is unaffected. + if stateSyncBlocked && result.IsZero() { return ctrl.Result{RequeueAfter: statusPollInterval}, nil } diff --git a/internal/controller/node/statesync.go b/internal/controller/node/statesync.go index d6d5bc97..b87d3c73 100644 --- a/internal/controller/node/statesync.go +++ b/internal/controller/node/statesync.go @@ -22,24 +22,19 @@ import ( // existing trust-pinning behavior. const minCanonicalSyncers = 2 -// reconcileStateSyncGate resolves the canonical-syncer set for a state-sync -// node and sets the always-present ConditionStateSyncReady accordingly. It -// mutates node.Status in-memory only — the condition and ResolvedStateSyncers -// are flushed by the caller's single optimistic-lock status patch, never a -// separate write. The caller runs it before the Failed/Paused early-returns so -// StateSyncReady is seeded on every path. +// reconcileStateSyncGate resolves the canonical-syncer set and sets the +// always-present ConditionStateSyncReady. It mutates node.Status in-memory only; +// the caller's single status patch flushes it. Run before the Failed/Paused +// early-returns so StateSyncReady is seeded on every path. // -// Fail-closed is enforced downstream, not here: the planner declines to build a -// state-sync plan whenever StateSyncReady is not True (see ResolvePlan). That -// keeps ResolvePlan running on every reconcile, so handleTerminalPlan still -// clears terminal plans and non-state-sync work proceeds — this method only -// resolves the condition. -// -// A transient (non-absence) read or parse error is NOT fatal: it sets -// StateSyncReady=Unknown/SyncerSourceError and returns transient=true so the -// caller requeues without aborting the StatefulSet/Failed/Paused/flush path. A -// missing source file or <2 entries fails closed via False/NoSyncersConfigured. -func (r *SeiNodeReconciler) reconcileStateSyncGate(node *seiv1alpha1.SeiNode) (transient bool) { +// Fail-closed is enforced downstream: the planner declines to build a state-sync +// plan whenever StateSyncReady is not True (see ResolvePlan), so this method only +// resolves the condition. It returns blocked=true whenever state-sync is enabled +// but the condition didn't resolve to True — both NoSyncersConfigured and the +// transient SyncerSourceError. The caller requeues on blocked: the syncer file is +// a mounted volume with no watch, so polling is the only way the gate re-resolves +// and unblocks once GitOps provisions or fixes the syncers. +func (r *SeiNodeReconciler) reconcileStateSyncGate(node *seiv1alpha1.SeiNode) (blocked bool) { snap := node.Spec.SnapshotSource() if snap == nil || snap.StateSync == nil { // State-sync disabled: no state-sync task in the plan to gate. @@ -67,7 +62,7 @@ func (r *SeiNodeReconciler) reconcileStateSyncGate(node *seiv1alpha1.SeiNode) (t setStateSyncReady(node, metav1.ConditionFalse, seiv1alpha1.ReasonStateSyncNoSyncersConfigured, fmt.Sprintf("state sync requires >=%d canonical syncers configured for chain %q; found %d", minCanonicalSyncers, node.Spec.ChainID, len(syncers))) - return false + return true } if !slices.Equal(node.Status.ResolvedStateSyncers, syncers) { @@ -86,14 +81,12 @@ func (r *SeiNodeReconciler) reconcileStateSyncGate(node *seiv1alpha1.SeiNode) (t // provisions the backing ConfigMap. Any other read or parse error is returned // so the gate can treat it as transient. // -// File shape: see platform.FileConfig — the stateSync.syncers section is a YAML -// map of chainID -> list of bare `host:port` RPC endpoints (no scheme; the -// sidecar adds it). Each chain's entries are trimmed, blanks dropped, sorted, -// and de-duplicated for a stable witness set. +// Endpoints are bare host:port (no scheme; the sidecar adds it); see +// platform.FileConfig for the file shape. // -// Read fresh on every call: a mounted ConfigMap swaps atomically (a symlink -// flip on the directory mount), so re-reading picks up GitOps updates without a -// pod restart. Never cache an open handle. +// Read fresh on every call: a mounted ConfigMap swaps atomically (a symlink flip +// on the directory mount), so re-reading picks up GitOps updates without a pod +// restart. Never cache an open handle. func (r *SeiNodeReconciler) canonicalSyncers(chainID string) ([]string, error) { path := strings.TrimSpace(r.Platform.ControllerConfigFile) if path == "" { diff --git a/internal/controller/node/statesync_test.go b/internal/controller/node/statesync_test.go index 20d23fc8..72c1de9c 100644 --- a/internal/controller/node/statesync_test.go +++ b/internal/controller/node/statesync_test.go @@ -113,8 +113,8 @@ func TestStateSyncGate_EnabledWithTwoSyncers_Ready(t *testing.T) { testChainID: {"syncer-1.arctic-1.example.com:26657", "syncer-0.arctic-1.example.com:26657"}, }) - transient := r.reconcileStateSyncGate(node) - g.Expect(transient).To(BeFalse()) + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeFalse()) cond := stateSyncCondition(node) g.Expect(cond).NotTo(BeNil()) @@ -135,8 +135,8 @@ func TestStateSyncGate_EnabledWithOneSyncer_FailsClosed(t *testing.T) { r, _ := newNodeReconciler(t, node) withSyncers(t, r, map[string][]string{testChainID: {"only-one.arctic-1.example.com:26657"}}) - transient := r.reconcileStateSyncGate(node) - g.Expect(transient).To(BeFalse()) + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeTrue()) cond := stateSyncCondition(node) g.Expect(cond).NotTo(BeNil()) @@ -154,8 +154,8 @@ func TestStateSyncGate_EnabledMissingFile_FailsClosed(t *testing.T) { r, _ := newNodeReconciler(t, node) r.Platform.ControllerConfigFile = filepath.Join(t.TempDir(), "absent.yaml") // never created - transient := r.reconcileStateSyncGate(node) - g.Expect(transient).To(BeFalse()) + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeTrue()) cond := stateSyncCondition(node) g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) @@ -169,8 +169,8 @@ func TestStateSyncGate_EnabledNoChainEntry_FailsClosed(t *testing.T) { r, _ := newNodeReconciler(t, node) withSyncers(t, r, map[string][]string{"other-chain": {syncerA, syncerB}}) - transient := r.reconcileStateSyncGate(node) - g.Expect(transient).To(BeFalse()) + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeTrue()) g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) } @@ -183,8 +183,8 @@ func TestStateSyncGate_ParseError_Transient(t *testing.T) { r, _ := newNodeReconciler(t, node) writeSyncerFile(t, r, "this: [is: not: valid: yaml") // malformed - transient := r.reconcileStateSyncGate(node) - g.Expect(transient).To(BeTrue()) + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeTrue()) cond := stateSyncCondition(node) g.Expect(cond.Status).To(Equal(metav1.ConditionUnknown)) @@ -198,8 +198,8 @@ func TestStateSyncGate_UnconfiguredSource_FailsClosed(t *testing.T) { node := stateSyncNode("n", testChainID) r, _ := newNodeReconciler(t, node) // Platform leaves ControllerConfigFile empty. - transient := r.reconcileStateSyncGate(node) - g.Expect(transient).To(BeFalse()) + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeTrue()) g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) } @@ -211,8 +211,8 @@ func TestStateSyncGate_Disabled_NotApplicable(t *testing.T) { r, _ := newNodeReconciler(t, node) withSyncers(t, r, map[string][]string{testChainID: {syncerA, syncerB}}) - transient := r.reconcileStateSyncGate(node) - g.Expect(transient).To(BeFalse()) + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeFalse()) cond := stateSyncCondition(node) g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) @@ -236,8 +236,8 @@ func TestStateSyncGate_NoSnapshotSource_NotApplicable(t *testing.T) { r, _ := newNodeReconciler(t, node) withSyncers(t, r, map[string][]string{testChainID: {syncerA, syncerB}}) - transient := r.reconcileStateSyncGate(node) - g.Expect(transient).To(BeFalse()) + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeFalse()) g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNotApplicable)) } @@ -250,8 +250,8 @@ func TestStateSyncGate_FailClosedClearsStaleSyncers(t *testing.T) { node.Status.ResolvedStateSyncers = []string{"old-a:26657", "old-b:26657"} r, _ := newNodeReconciler(t, node) // unconfigured source → fail closed - transient := r.reconcileStateSyncGate(node) - g.Expect(transient).To(BeFalse()) + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeTrue()) g.Expect(node.Status.ResolvedStateSyncers).To(BeNil()) } @@ -263,8 +263,8 @@ func TestStateSyncGate_NonStateSyncNodeUnaffected(t *testing.T) { node := newSnapshotNode("n", testNamespace) r, _ := newNodeReconciler(t, node) - transient := r.reconcileStateSyncGate(node) - g.Expect(transient).To(BeFalse()) + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeFalse()) g.Expect(node.Status.ResolvedStateSyncers).To(BeEmpty()) g.Expect(slices.Contains([]string{ seiv1alpha1.ReasonStateSyncReady, seiv1alpha1.ReasonStateSyncNoSyncersConfigured, @@ -310,6 +310,41 @@ func TestReconcile_StateSyncFailClosed_NoPlanBuilt(t *testing.T) { g.Expect(sawWarning).To(BeTrue(), "expected a StateSyncBlocked Warning Event") } +// A pre-Running, fail-closed state-sync node builds no plan, so nothing else +// drives a requeue and the mounted syncer file has no watch. The gate must +// requeue on the poll interval; once the file gains >=2 syncers the next +// reconcile re-resolves the gate and builds the plan (unblocks). +func TestReconcile_StateSyncFailClosed_RequeuesAndUnblocks(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + + node := stateSyncNode("ss-poll", testNamespace) + node.Spec.ChainID = testChainID + node.Status.Phase = seiv1alpha1.PhasePending + r, c := newNodeReconciler(t, node) + withSyncers(t, r, map[string][]string{testChainID: {syncerSingle}}) + + res, err := r.Reconcile(ctx, nodeReqFor("ss-poll", testNamespace)) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res.RequeueAfter).To(Equal(statusPollInterval), + "fail-closed pre-Running node must poll the mounted syncer file") + + fetched := getSeiNode(t, ctx, c, "ss-poll", testNamespace) + g.Expect(fetched.Status.Plan).To(BeNil()) + g.Expect(stateSyncCondition(fetched).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) + + // GitOps provisions the syncers; the mounted file now satisfies the floor. + withSyncers(t, r, map[string][]string{testChainID: {syncerA, syncerB}}) + + _, err = r.Reconcile(ctx, nodeReqFor("ss-poll", testNamespace)) + g.Expect(err).NotTo(HaveOccurred()) + + fetched = getSeiNode(t, ctx, c, "ss-poll", testNamespace) + g.Expect(fetched.Status.Plan).NotTo(BeNil(), "gate unblocks once the file has >=2 syncers") + g.Expect(findPlannedTask(fetched.Status.Plan, planner.TaskConfigureStateSync)).NotTo(BeNil()) + g.Expect(stateSyncCondition(fetched).Status).To(Equal(metav1.ConditionTrue)) +} + // Full reconcile path: a paused node still gets the always-present // StateSyncReady condition seeded, even though reconcile returns early on // spec.paused before the gate enforcement. State-sync enabled with no syncers diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 8f93f121..4427d39b 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -158,9 +158,7 @@ func (p *NodeResolver) ResolvePlan(ctx context.Context, node *seiv1alpha1.SeiNod // Fail-closed state-sync gate. Runs after handleTerminalPlan (so terminal // plans still clear) but before building: a state-sync node whose // StateSyncReady condition isn't True must never get a state-sync-bearing - // plan (CometBFT needs >=2 rpc-servers; we never fall back to peers). The - // condition is resolved upstream by the controller's reconcileStateSyncGate. - // Non-state-sync work is unaffected — only init-plan construction is gated. + // plan (CometBFT needs >=2 rpc-servers; we never fall back to peers). if stateSyncBlocksPlan(node) { return nil } diff --git a/internal/platform/platform.go b/internal/platform/platform.go index e3d549f5..b97ac43f 100644 --- a/internal/platform/platform.go +++ b/internal/platform/platform.go @@ -79,7 +79,6 @@ type Config struct { } // FileConfig is the controller's file-sourced application config (SEI_CONTROLLER_CONFIG). -// One section today (StateSync); PLT-475 folds the remaining infra knobs in. type FileConfig struct { StateSync StateSyncConfig `json:"stateSync"` }