Skip to content
Merged
46 changes: 40 additions & 6 deletions api/v1alpha1/seinode_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,31 @@ const (

// ConditionSeiNodePaused mirrors spec.paused: True when paused.
ConditionSeiNodePaused = "Paused"

// ConditionStateSyncReady gates the state-sync-bearing plan. Always-present
// once reconciled. True means canonical syncers are configured and the plan
// may proceed; False fails closed (no state-sync plan built, and peers are
// never used as witnesses). It is a configured-count gate: witness
// reliability comes from curating the canonical-syncer set, and the sidecar
// establishes the trust point from them as it does today.
ConditionStateSyncReady = "StateSyncReady"
)

// Reason values reported on the StateSyncReady condition.
const (
	// ReasonStateSyncReady is reported when state-sync is enabled and at
	// least two canonical syncers are configured for the chain, so the
	// state-sync-bearing plan may proceed.
	ReasonStateSyncReady = "Ready"
	// ReasonStateSyncNoSyncersConfigured is reported when state-sync is
	// enabled but the canonical-syncer source yields fewer than two entries
	// for the chain. The gate fails closed.
	ReasonStateSyncNoSyncersConfigured = "NoSyncersConfigured"
	// ReasonStateSyncNotApplicable is reported when the node does not enable
	// state-sync.
	ReasonStateSyncNotApplicable = "NotApplicable"
	// ReasonStateSyncSyncerSourceError is reported when reading or parsing the
	// canonical-syncer source file failed for a reason other than absence
	// (treated as transient). The gate fails closed and requeues, while the
	// rest of the reconcile (StatefulSet, Failed/Paused handling, status
	// flush) still runs.
	ReasonStateSyncSyncerSourceError = "SyncerSourceError"
)

// Reasons for the ImportPVCReady condition.
Expand Down Expand Up @@ -360,15 +385,24 @@ type SeiNodeStatus struct {
// +optional
ResolvedPeers []string `json:"resolvedPeers,omitempty"`

// ResolvedRPCWitnesses carries the in-cluster RPC endpoints
// (`<peer>-0.<peer>.<ns>.svc.cluster.local:26657`) of the label-resolved
// peers, used as CometBFT state-sync light-client witnesses. Unlike
// ResolvedPeers these never carry an external P2P address — RPC is
// internal-only. When empty the sidecar derives witnesses from
// persistent_peers instead.
// ResolvedRPCWitnesses is DEPRECATED and no longer written. State-sync
// witnesses now come from the controller-level canonical-syncer ConfigMap
// (see ResolvedStateSyncers), not label-derived fleet peers. The field is
// retained present-but-unwritten this release (CRD field removal is a
// one-way door); remove it at the version bump.
//
// Deprecated: use ResolvedStateSyncers.
// +optional
ResolvedRPCWitnesses []string `json:"resolvedRPCWitnesses,omitempty"`

// ResolvedStateSyncers carries the canonical state-sync RPC endpoints
// (`host:port`) read from the canonical-syncer ConfigMap for this node's
// chain, fed verbatim into ConfigureStateSyncTask.RpcServers. Written by the
// StateSyncReady gate only when state-sync is enabled and >=2 syncers are
// configured; otherwise left empty (fail closed).
// +optional
ResolvedStateSyncers []string `json:"resolvedStateSyncers,omitempty"`

// StatefulSet references the StatefulSet the controller created for
// this SeiNode. UID is the identity check: an STS with the expected
// name but a different UID is not the one this controller created
Expand Down
5 changes: 5 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ func main() {
KubeRBACProxyImage: os.Getenv("SEI_KUBE_RBAC_PROXY_IMAGE"),
SidecarImage: os.Getenv("SEI_SIDECAR_IMAGE"),
CosmosExporterImage: os.Getenv("SEI_COSMOS_EXPORTER_IMAGE"),

// The application-config file is opt-in; this may be empty. Points at a
// read-only mounted file (a GitOps-written ConfigMap volume).
ControllerConfigFile: os.Getenv("SEI_CONTROLLER_CONFIG"),
}

if err := platformCfg.Validate(); err != nil {
Expand Down
23 changes: 17 additions & 6 deletions config/crd/sei.io_seinodes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -972,12 +972,23 @@ spec:
type: array
resolvedRPCWitnesses:
description: |-
ResolvedRPCWitnesses carries the in-cluster RPC endpoints
(`<peer>-0.<peer>.<ns>.svc.cluster.local:26657`) of the label-resolved
peers, used as CometBFT state-sync light-client witnesses. Unlike
ResolvedPeers these never carry an external P2P address — RPC is
internal-only. When empty the sidecar derives witnesses from
persistent_peers instead.
ResolvedRPCWitnesses is DEPRECATED and no longer written. State-sync
witnesses now come from the controller-level canonical-syncer ConfigMap
(see ResolvedStateSyncers), not label-derived fleet peers. The field is
retained present-but-unwritten this release (CRD field removal is a
one-way door); remove it at the version bump.

Deprecated: use ResolvedStateSyncers.
items:
type: string
type: array
resolvedStateSyncers:
description: |-
ResolvedStateSyncers carries the canonical state-sync RPC endpoints
(`host:port`) read from the canonical-syncer ConfigMap for this node's
chain, fed verbatim into ConfigureStateSyncTask.RpcServers. Written by the
StateSyncReady gate only when state-sync is enabled and >=2 syncers are
configured; otherwise left empty (fail closed).
items:
type: string
type: array
Expand Down
21 changes: 19 additions & 2 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ spec:
value: gateway
- name: SEI_GATEWAY_DOMAIN
value: prod.platform.sei.io
# Read-only application-config source. Points at the mounted ConfigMap
# (directory mount, not subPath) so GitOps swaps propagate without a
# pod restart. Optional: the controller fails closed when absent.
- name: SEI_CONTROLLER_CONFIG
value: /etc/sei-controller/config.yaml
ports:
- containerPort: 8080
name: metrics
Expand Down Expand Up @@ -110,7 +115,19 @@ spec:
requests:
cpu: 50m
memory: 128Mi
volumeMounts: []
volumes: []
volumeMounts:
# Directory mount (not subPath): subPath snapshots the ConfigMap at
# mount time and never updates, defeating the fresh-read-per-reconcile
# contract. readOnly so the controller can't rewrite its trust root.
- name: sei-controller-config
mountPath: /etc/sei-controller
readOnly: true
volumes:
# GitOps provisions the ConfigMap content out of this repo. optional so
# the controller starts (and fails closed) when it isn't present yet.
- name: sei-controller-config
configMap:
name: sei-controller-config
optional: true
serviceAccountName: controller-manager
terminationGracePeriodSeconds: 10
107 changes: 80 additions & 27 deletions internal/controller/node/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
statusBase := client.MergeFromWithOptions(before, client.MergeFromWithOptimisticLock{})
observedPhase := node.Status.Phase
prevSidecar := apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionSidecarReady)
prevStateSync := apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionStateSyncReady)

setNodePausedCondition(node)

Expand All @@ -107,6 +108,15 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
return r.Status().Patch(ctx, node, statusBase)
}

// Resolve the always-present StateSyncReady condition before the Failed and
// Paused early-returns so it rides the existing flush on every path (Failed
// flush, Paused flush, and the normal end-of-reconcile patch) — no separate
// status write. Fail-closed enforcement lives in ResolvePlan, which declines
// to build a state-sync plan when this condition isn't True; that keeps
// terminal-plan cleanup and non-state-sync work running. A blocked gate
// requeues (see end of reconcile) without aborting the steps below.
stateSyncBlocked := r.reconcileStateSyncGate(node)

// Failed is terminal — flush any condition updates and exit.
if node.Status.Phase == seiv1alpha1.PhaseFailed {
if err := flushStatus(); err != nil {
Expand All @@ -133,11 +143,15 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
}

planAlreadyActive := node.Status.Plan != nil && node.Status.Plan.Phase == seiv1alpha1.TaskPlanActive
// ResolvePlan runs unconditionally: it clears terminal plans and drives
// non-state-sync work. Its internal fail-closed gate declines to build a
// state-sync plan when StateSyncReady isn't True.
if err := r.Planner.ResolvePlan(ctx, node); err != nil {
return ctrl.Result{}, fmt.Errorf("resolving plan: %w", err)
}

r.emitSidecarReadinessEvent(node, prevSidecar)
r.emitStateSyncBlockedEvent(node, prevStateSync)

var result ctrl.Result
var execErr error
Expand All @@ -163,43 +177,59 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
return result, execErr
}

// Emit metrics/events if the phase changed.
if node.Status.Phase != observedPhase {
ns, name := node.Namespace, node.Name
nodePhaseTransitions.Add(ctx, 1,
metric.WithAttributes(
observability.AttrController.String(seiNodeControllerName),
observability.AttrNamespace.String(ns),
observability.AttrFromPhase.String(string(observedPhase)),
observability.AttrToPhase.String(string(node.Status.Phase)),
),
)
emitNodePhase(ns, name, node.Status.Phase)
r.Recorder.Eventf(node, corev1.EventTypeNormal, "PhaseTransition",
"Phase changed from %s to %s", observedPhase, node.Status.Phase)

// Record time spent in the previous phase.
if node.Status.PhaseTransitionTime != nil && observedPhase != "" {
dur := time.Since(node.Status.PhaseTransitionTime.Time).Seconds()
nodePhaseDuration.Record(ctx, dur,
metric.WithAttributes(
observability.AttrNamespace.String(ns),
observability.AttrChainID.String(node.Spec.ChainID),
observability.AttrPhase.String(string(observedPhase)),
),
)
}
}
r.emitPhaseTransition(ctx, node, observedPhase)

// Running nodes with no active plan requeue on a steady-state interval.
// Spec changes trigger immediate reconciles via GenerationChangedPredicate.
if node.Status.Phase == seiv1alpha1.PhaseRunning && (node.Status.Plan == nil || node.Status.Plan.Phase != seiv1alpha1.TaskPlanActive) {
return ctrl.Result{RequeueAfter: statusPollInterval}, nil
}

// A blocked state-sync node (fail-closed or transient) builds no plan to
// drive a requeue, and the syncer file is a mounted volume with no watch.
// Poll so the gate re-resolves and unblocks once GitOps provisions or fixes
// the syncers. IsZero defers to any stronger requeue above (running-node
// poll, plan execution), so an active-plan node is unaffected.
if stateSyncBlocked && result.IsZero() {
return ctrl.Result{RequeueAfter: statusPollInterval}, nil
}
Comment thread
cursor[bot] marked this conversation as resolved.

return result, nil
}

// emitPhaseTransition reports a phase change observed during this reconcile.
// When node.Status.Phase differs from observedPhase it increments the
// phase-transition counter, publishes the per-node phase via emitNodePhase,
// and records a Normal "PhaseTransition" event. When a transition timestamp is
// set and observedPhase is non-empty it also records how long the node spent
// in observedPhase. It does nothing when the phase is unchanged.
func (r *SeiNodeReconciler) emitPhaseTransition(ctx context.Context, node *seiv1alpha1.SeiNode, observedPhase seiv1alpha1.SeiNodePhase) {
	current := node.Status.Phase
	if current == observedPhase {
		return
	}

	namespace := node.Namespace
	transitionAttrs := metric.WithAttributes(
		observability.AttrController.String(seiNodeControllerName),
		observability.AttrNamespace.String(namespace),
		observability.AttrFromPhase.String(string(observedPhase)),
		observability.AttrToPhase.String(string(current)),
	)
	nodePhaseTransitions.Add(ctx, 1, transitionAttrs)
	emitNodePhase(namespace, node.Name, current)
	r.Recorder.Eventf(node, corev1.EventTypeNormal, "PhaseTransition",
		"Phase changed from %s to %s", observedPhase, current)

	// The duration metric needs both a timestamp and a named previous phase;
	// a node leaving the empty (initial) phase has nothing to attribute.
	if node.Status.PhaseTransitionTime == nil || observedPhase == "" {
		return
	}

	// NOTE(review): this reads PhaseTransitionTime after the phase has already
	// changed; presumably it still holds the start of observedPhase — confirm
	// it is not reset before this call.
	elapsed := time.Since(node.Status.PhaseTransitionTime.Time).Seconds()
	durationAttrs := metric.WithAttributes(
		observability.AttrNamespace.String(namespace),
		observability.AttrChainID.String(node.Spec.ChainID),
		observability.AttrPhase.String(string(observedPhase)),
	)
	nodePhaseDuration.Record(ctx, elapsed, durationAttrs)
}

// SetupWithManager sets up the controller with the Manager.
func (r *SeiNodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
Expand Down Expand Up @@ -300,3 +330,26 @@ func (r *SeiNodeReconciler) emitSidecarReadinessEvent(node *seiv1alpha1.SeiNode,
"sidecar Healthz returned 200; mark-ready gate is open")
}
}

// emitStateSyncBlockedEvent records a StateSyncBlocked Warning event when the
// StateSyncReady condition has just entered, or switched between, the blocked
// reasons NoSyncersConfigured and SyncerSourceError. It stays silent when the
// condition is missing, is True, carries any other reason (e.g. NotApplicable),
// or matches prev in both Status and Reason, so repeated requeues in the same
// blocked state do not re-emit.
//
// prev is the condition as it was before this reconcile updated it; nil means
// it was absent.
//
// NOTE(review): apimeta.FindStatusCondition returns a pointer into the
// conditions slice. If the caller obtained prev that way and the condition is
// later updated in place, prev and cur alias the same element, the comparison
// below always matches, and the event is suppressed — confirm the caller
// passes a copy.
func (r *SeiNodeReconciler) emitStateSyncBlockedEvent(node *seiv1alpha1.SeiNode, prev *metav1.Condition) {
	cur := apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionStateSyncReady)
	if cur == nil {
		return
	}
	if cur.Status == metav1.ConditionTrue {
		return
	}

	// Only the two fail-closed reasons are worth a Warning.
	switch cur.Reason {
	case seiv1alpha1.ReasonStateSyncNoSyncersConfigured,
		seiv1alpha1.ReasonStateSyncSyncerSourceError:
	default:
		return
	}

	// Same Status and Reason as before means no transition happened.
	unchanged := prev != nil && prev.Status == cur.Status && prev.Reason == cur.Reason
	if unchanged {
		return
	}

	r.Recorder.Eventf(node, corev1.EventTypeWarning, "StateSyncBlocked",
		"state sync enabled but not ready for chain %q (%s); not building plan",
		node.Spec.ChainID, cur.Reason)
}
12 changes: 6 additions & 6 deletions internal/controller/node/peers.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@ import (
)

// reconcilePeers resolves spec.peers into status.resolvedPeers (the composed
// persistent_peers set) and status.resolvedRPCWitnesses (state-sync witnesses).
// The plan plumbs the resolved set into config via the config-apply override
// (init path) or the config-patch (running path).
// persistent_peers set). The plan plumbs the resolved set into config via the
// config-apply override (init path) or the config-patch (running path).
//
// State-sync witnesses are no longer derived here from label-matched peers;
// they come from the controller-level canonical-syncer ConfigMap via the
// StateSyncReady gate (see statesync.go).
func (r *SeiNodeReconciler) reconcilePeers(ctx context.Context, node *seiv1alpha1.SeiNode) error {
resolver := peering.Resolver{
Reader: r.Client,
Expand All @@ -27,8 +30,5 @@ func (r *SeiNodeReconciler) reconcilePeers(ctx context.Context, node *seiv1alpha
if !slices.Equal(node.Status.ResolvedPeers, result.Peers) {
node.Status.ResolvedPeers = result.Peers
}
if !slices.Equal(node.Status.ResolvedRPCWitnesses, result.Witnesses) {
node.Status.ResolvedRPCWitnesses = result.Witnesses
}
return nil
}
Loading
Loading