From 87d62bbbded86dc6a1be15b4e6b2d9f6eaaa7f64 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 12 Jun 2026 16:10:20 -0700 Subject: [PATCH 1/3] feat(platform): make the app-config file authoritative (PLT-475 PR-2) Drops the transitional env fallback established in PR-1 (#402): infra config now comes solely from the mounted app-config file. - platform.Load reads infra fields straight from FileConfig (no fileOrEnv); gateway fields stay env-sourced (pending PLT-451). The migrated env-name constants are removed. - Config.Validate reports the file key for infra fields (no "or SEI_*"), and now also requires images.cosmosExporter (previously validated lazily at pod-build). - config/manager/manager.yaml drops the migrated infra env vars; gateway + SEI_CONTROLLER_CONFIG remain. - CLAUDE.md + docs/controller-app-config.md updated to "file-authoritative". - Tests rewritten for file-authoritative behavior (file sourced, infra env ignored, missing field fails Validate, gateway from env). Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 2 +- config/manager/manager.yaml | 45 +------- docs/controller-app-config.md | 9 +- internal/platform/load.go | 110 +++++++------------- internal/platform/load_test.go | 181 ++++++++++++++++----------------- internal/platform/platform.go | 86 +++++++--------- 6 files changed, 173 insertions(+), 260 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 722674f..f82a7f4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -119,6 +119,6 @@ make docker-push IMG= # Push container image - **Condition ownership** — The planner owns all condition management on the owning resource. It sets conditions when creating plans (e.g., `NodeUpdateInProgress=True`) and when observing terminal plans (e.g., `NodeUpdateInProgress=False`). The executor does not set conditions — it only mutates plan/task state and phase transitions. - **Single-patch model** — All status mutations (plan state, conditions, phase, currentImage) accumulate in-memory during a reconcile and are flushed in a single `Status().Patch()` at the end. Tasks mutate owned resources (StatefulSets, Services, PVCs); the executor mutates plan state in-memory; the reconciler flushes once. - **Resource generators** live in `internal/noderesource/` — pure functions that produce StatefulSets, Services, and PVCs from a SeiNode spec. Used by both the controller and plan tasks. -- **Platform config** is resolved by `platform.Load` (`internal/platform/load.go`). Infra fields (scheduling, storage, resources, snapshot/genesis/result-export buckets, images) are read from the mounted app-config file (`SEI_CONTROLLER_CONFIG` → `platform.FileConfig`) when present, falling back to their historical env vars — PLT-475, transitional: the env fallback is removed in a follow-up once the ConfigMap is verified populated. Networking/gateway fields (`SEI_GATEWAY_*`, `SEI_P2P_ENDPOINT_DOMAIN`, `SEI_NLB_TARGET_TYPE`) stay env-sourced pending their removal from the controller in PLT-451. The file is read once at startup for infra fields (an infra change needs a restart); the `stateSync` section is re-read per reconcile (it hot-reloads). See `internal/platform/platform.go` for the field list and `docs/controller-app-config.md` for the file schema. +- **Platform config** is resolved by `platform.Load` (`internal/platform/load.go`). Infra fields (scheduling, storage, resources, snapshot/genesis/result-export buckets, images) come from the mounted app-config file (`SEI_CONTROLLER_CONFIG` → `platform.FileConfig`), which is authoritative — a required field unset in the file fails `Config.Validate` at startup. Networking/gateway fields (`SEI_GATEWAY_*`, `SEI_P2P_ENDPOINT_DOMAIN`, `SEI_NLB_TARGET_TYPE`) stay env-sourced pending their removal from the controller in PLT-451. The file is read once at startup for infra fields (an infra change needs a restart); the `stateSync` section is re-read per reconcile (it hot-reloads). See `internal/platform/platform.go` for the field list and `docs/controller-app-config.md` for the file schema. - **Genesis resolution** is handled by the sidecar autonomously: embedded sei-config for well-known chains, S3 fallback at `{SEI_GENESIS_BUCKET}/{chainID}/genesis.json` for custom chains. - Config keys in seid's `config.toml` use **hyphens** (e.g., `persistent-peers`, `trust-height`), not underscores. diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index d88d7a1..472f7ef 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -35,46 +35,11 @@ spec: image: 189176372795.dkr.ecr.us-east-2.amazonaws.com/sei/sei-k8s-controller:8b3f1749067bd07140fccd6d05da7894754688ca name: manager env: - - name: SEI_NODEPOOL_NAME - value: sei-node - - name: SEI_NODEPOOL_ARCHIVE - value: sei-archive - - name: SEI_TOLERATION_KEY - value: sei.io/workload - - name: SEI_SERVICE_ACCOUNT - value: seid-node - - name: SEI_STORAGE_CLASS_PERF - value: gp3-10k-750 - - name: SEI_STORAGE_CLASS_DEFAULT - value: gp3 - - name: SEI_STORAGE_CLASS_ARCHIVE - value: gp3-archive - - name: SEI_STORAGE_SIZE_DEFAULT - value: 2000Gi - - name: SEI_STORAGE_SIZE_ARCHIVE - value: 40Ti - - name: SEI_RESOURCE_CPU_ARCHIVE - value: "48" - - name: SEI_RESOURCE_MEM_ARCHIVE - value: 448Gi - - name: SEI_RESOURCE_CPU_DEFAULT - value: "4" - - name: SEI_RESOURCE_MEM_DEFAULT - value: 32Gi - - name: SEI_SNAPSHOT_BUCKET - value: dev-sei-snapshots - - name: SEI_SNAPSHOT_REGION - value: us-east-2 - - name: SEI_RESULT_EXPORT_BUCKET - value: dev-sei-shadow-results - - name: SEI_RESULT_EXPORT_REGION - value: us-east-2 - - name: SEI_RESULT_EXPORT_PREFIX - value: shadow-results/ - - name: SEI_GENESIS_BUCKET - value: dev-sei-k8s-genesis-artifacts - - name: SEI_GENESIS_REGION - value: us-east-2 + # Infra config (node pools, storage, resources, snapshot/genesis/ + # result-export buckets, images) is sourced from the mounted + # app-config ConfigMap (SEI_CONTROLLER_CONFIG below); see + # docs/controller-app-config.md. Gateway config stays env-sourced + # pending its removal from the controller in PLT-451. - name: SEI_GATEWAY_NAME value: sei-gateway - name: SEI_GATEWAY_NAMESPACE diff --git a/docs/controller-app-config.md b/docs/controller-app-config.md index 871ef52..efc7ae8 100644 --- a/docs/controller-app-config.md +++ b/docs/controller-app-config.md @@ -15,12 +15,11 @@ Two read paths, by design: - **`stateSync`** is re-read **per reconcile** so syncer changes hot-reload without a restart (the directory mount swaps atomically). -## Transitional env fallback (PLT-475) +## Source of truth -For each infra field, a non-empty file value wins; an absent one falls back to -its historical `SEI_*` env var. So an unset `SEI_CONTROLLER_CONFIG` reproduces -the original all-env behavior. The fallback is removed in a follow-up once the -ConfigMap is verified populated, after which the file is authoritative. +The file is **authoritative** for infra config: a required field unset in the +file fails `Config.Validate` at startup (the controller does not boot). There is +no env-var fallback for these fields. Networking/gateway config (`SEI_GATEWAY_*`, `SEI_P2P_ENDPOINT_DOMAIN`, `SEI_NLB_TARGET_TYPE`) is **not** in the file — it stays env-sourced pending its diff --git a/internal/platform/load.go b/internal/platform/load.go index 06c2ad9..e057d17 100644 --- a/internal/platform/load.go +++ b/internal/platform/load.go @@ -9,58 +9,27 @@ import ( ) // Environment-variable names. SEI_CONTROLLER_CONFIG points at the read-only -// app-config file (a GitOps-written ConfigMap mounted as a directory); the rest -// are the historical infra knobs Load falls back to when a field is absent from -// that file. Single source of truth — referenced by Load and Config.Validate. +// app-config file (a GitOps-written ConfigMap mounted as a directory). The +// gateway vars stay env-sourced pending their removal from the controller in +// the GitOps networking move (PLT-451); all other infra config is file-sourced. const ( envControllerConfig = "SEI_CONTROLLER_CONFIG" - envNodepoolName = "SEI_NODEPOOL_NAME" - envNodepoolArchive = "SEI_NODEPOOL_ARCHIVE" - envTolerationKey = "SEI_TOLERATION_KEY" - envServiceAccount = "SEI_SERVICE_ACCOUNT" - - envStorageClassPerf = "SEI_STORAGE_CLASS_PERF" - envStorageClassDefault = "SEI_STORAGE_CLASS_DEFAULT" - envStorageClassArchive = "SEI_STORAGE_CLASS_ARCHIVE" - envStorageSizeDefault = "SEI_STORAGE_SIZE_DEFAULT" - envStorageSizeArchive = "SEI_STORAGE_SIZE_ARCHIVE" - - envResourceCPUArchive = "SEI_RESOURCE_CPU_ARCHIVE" - envResourceMemArchive = "SEI_RESOURCE_MEM_ARCHIVE" - envResourceCPUDefault = "SEI_RESOURCE_CPU_DEFAULT" - envResourceMemDefault = "SEI_RESOURCE_MEM_DEFAULT" - - envSnapshotBucket = "SEI_SNAPSHOT_BUCKET" - envSnapshotRegion = "SEI_SNAPSHOT_REGION" - - envResultExportBucket = "SEI_RESULT_EXPORT_BUCKET" - envResultExportRegion = "SEI_RESULT_EXPORT_REGION" - envResultExportPrefix = "SEI_RESULT_EXPORT_PREFIX" - - envGenesisBucket = "SEI_GENESIS_BUCKET" - envGenesisRegion = "SEI_GENESIS_REGION" - - envSidecarImage = "SEI_SIDECAR_IMAGE" - envKubeRBACProxyImage = "SEI_KUBE_RBAC_PROXY_IMAGE" - envCosmosExporterImage = "SEI_COSMOS_EXPORTER_IMAGE" - envGatewayName = "SEI_GATEWAY_NAME" envGatewayNamespace = "SEI_GATEWAY_NAMESPACE" envGatewayDomain = "SEI_GATEWAY_DOMAIN" envGatewayPublicDomain = "SEI_GATEWAY_PUBLIC_DOMAIN" ) -// Load resolves the platform Config at startup. A non-empty value in the -// app-config file wins; an absent infra field falls back to its historical env -// var, so an unset SEI_CONTROLLER_CONFIG yields the original all-env behavior. -// That env fallback is transitional — removed once the ConfigMap is populated -// everywhere (PLT-475). Networking/gateway fields and the config-file path -// itself are env-sourced. +// Load resolves the platform Config at startup. Infra config is read from the +// app-config file (SEI_CONTROLLER_CONFIG → FileConfig), which is authoritative. +// The networking/gateway fields and the config-file path itself stay +// env-sourced, pending the gateway fields' removal in the GitOps networking +// move (PLT-451). // -// The file is read once here; infra changes therefore require a controller -// restart. The stateSync section is read per-reconcile elsewhere (it hot-reloads). -// Caller is expected to run Config.Validate after Load. +// The file is read once here; infra changes require a controller restart. The +// stateSync section is read per-reconcile elsewhere (it hot-reloads). Caller is +// expected to run Config.Validate after Load. func Load() (Config, error) { path := strings.TrimSpace(os.Getenv(envControllerConfig)) file, err := ReadFileConfig(path) @@ -69,38 +38,38 @@ func Load() (Config, error) { } return Config{ - NodepoolName: fileOrEnv(file.Scheduling.NodepoolName, envNodepoolName), - NodepoolArchive: fileOrEnv(file.Scheduling.NodepoolArchive, envNodepoolArchive), - TolerationKey: fileOrEnv(file.Scheduling.TolerationKey, envTolerationKey), - ServiceAccount: fileOrEnv(file.Scheduling.ServiceAccount, envServiceAccount), + NodepoolName: file.Scheduling.NodepoolName, + NodepoolArchive: file.Scheduling.NodepoolArchive, + TolerationKey: file.Scheduling.TolerationKey, + ServiceAccount: file.Scheduling.ServiceAccount, - StorageClassPerf: fileOrEnv(file.Storage.ClassPerf, envStorageClassPerf), - StorageClassDefault: fileOrEnv(file.Storage.ClassDefault, envStorageClassDefault), - StorageClassArchive: fileOrEnv(file.Storage.ClassArchive, envStorageClassArchive), - StorageSizeDefault: fileOrEnv(file.Storage.SizeDefault, envStorageSizeDefault), - StorageSizeArchive: fileOrEnv(file.Storage.SizeArchive, envStorageSizeArchive), + StorageClassPerf: file.Storage.ClassPerf, + StorageClassDefault: file.Storage.ClassDefault, + StorageClassArchive: file.Storage.ClassArchive, + StorageSizeDefault: file.Storage.SizeDefault, + StorageSizeArchive: file.Storage.SizeArchive, - ResourceCPUArchive: fileOrEnv(file.Resources.CPUArchive, envResourceCPUArchive), - ResourceMemArchive: fileOrEnv(file.Resources.MemArchive, envResourceMemArchive), - ResourceCPUDefault: fileOrEnv(file.Resources.CPUDefault, envResourceCPUDefault), - ResourceMemDefault: fileOrEnv(file.Resources.MemDefault, envResourceMemDefault), + ResourceCPUArchive: file.Resources.CPUArchive, + ResourceMemArchive: file.Resources.MemArchive, + ResourceCPUDefault: file.Resources.CPUDefault, + ResourceMemDefault: file.Resources.MemDefault, - SnapshotBucket: fileOrEnv(file.Snapshot.Bucket, envSnapshotBucket), - SnapshotRegion: fileOrEnv(file.Snapshot.Region, envSnapshotRegion), + SnapshotBucket: file.Snapshot.Bucket, + SnapshotRegion: file.Snapshot.Region, - ResultExportBucket: fileOrEnv(file.ResultExport.Bucket, envResultExportBucket), - ResultExportRegion: fileOrEnv(file.ResultExport.Region, envResultExportRegion), - ResultExportPrefix: fileOrEnv(file.ResultExport.Prefix, envResultExportPrefix), + ResultExportBucket: file.ResultExport.Bucket, + ResultExportRegion: file.ResultExport.Region, + ResultExportPrefix: file.ResultExport.Prefix, - GenesisBucket: fileOrEnv(file.Genesis.Bucket, envGenesisBucket), - GenesisRegion: fileOrEnv(file.Genesis.Region, envGenesisRegion), + GenesisBucket: file.Genesis.Bucket, + GenesisRegion: file.Genesis.Region, - SidecarImage: fileOrEnv(file.Images.Sidecar, envSidecarImage), - KubeRBACProxyImage: fileOrEnv(file.Images.KubeRBACProxy, envKubeRBACProxyImage), - CosmosExporterImage: fileOrEnv(file.Images.CosmosExporter, envCosmosExporterImage), + SidecarImage: file.Images.Sidecar, + KubeRBACProxyImage: file.Images.KubeRBACProxy, + CosmosExporterImage: file.Images.CosmosExporter, // Networking/gateway: env-only, pending removal in the GitOps networking - // move (PLT-451). Not migrated to the file to avoid migrate-then-delete. + // move (PLT-451). GatewayName: os.Getenv(envGatewayName), GatewayNamespace: os.Getenv(envGatewayNamespace), GatewayDomain: os.Getenv(envGatewayDomain), @@ -132,12 +101,3 @@ func ReadFileConfig(path string) (FileConfig, error) { } return cfg, nil } - -// fileOrEnv returns the file value when non-empty, otherwise the named env var -// (the transitional fallback). -func fileOrEnv(fileVal, envVar string) string { - if strings.TrimSpace(fileVal) != "" { - return fileVal - } - return os.Getenv(envVar) -} diff --git a/internal/platform/load_test.go b/internal/platform/load_test.go index 8b209e8..9ebba4f 100644 --- a/internal/platform/load_test.go +++ b/internal/platform/load_test.go @@ -3,47 +3,49 @@ package platform import ( "os" "path/filepath" + "strings" "testing" ) -// envNodepool is asserted in multiple fallback cases, so it's named (goconst). -const envNodepool = "env-nodepool" +// fullConfig is a complete, valid app-config file — every required infra field set. +const fullConfig = ` +scheduling: + nodepoolName: file-nodepool + nodepoolArchive: file-nodepool-archive + tolerationKey: file-toleration + serviceAccount: file-sa +storage: + classPerf: file-perf + classDefault: file-default + classArchive: file-archive + sizeDefault: file-size-default + sizeArchive: file-size-archive +resources: + cpuArchive: file-cpu-archive + memArchive: file-mem-archive + cpuDefault: file-cpu-default + memDefault: file-mem-default +snapshot: + bucket: file-snap-bucket + region: file-snap-region +resultExport: + bucket: file-export-bucket + region: file-export-region + prefix: file-export-prefix +genesis: + bucket: file-genesis-bucket + region: file-genesis-region +images: + sidecar: file-sidecar + kubeRBACProxy: file-rbac-proxy + cosmosExporter: file-cosmos-exporter +` -// setMigratedEnv sets every migrated infra env var to a recognizable "env-" -// prefixed value so a test can assert which source a resolved field came from. -func setMigratedEnv(t *testing.T) { +func setGatewayEnv(t *testing.T) { t.Helper() - for _, kv := range [][2]string{ - {envNodepoolName, envNodepool}, - {envNodepoolArchive, "env-nodepool-archive"}, - {envTolerationKey, "env-toleration"}, - {envServiceAccount, "env-sa"}, - {envStorageClassPerf, "env-perf"}, - {envStorageClassDefault, "env-default"}, - {envStorageClassArchive, "env-archive"}, - {envStorageSizeDefault, "env-size-default"}, - {envStorageSizeArchive, "env-size-archive"}, - {envResourceCPUArchive, "env-cpu-archive"}, - {envResourceMemArchive, "env-mem-archive"}, - {envResourceCPUDefault, "env-cpu-default"}, - {envResourceMemDefault, "env-mem-default"}, - {envSnapshotBucket, "env-snap-bucket"}, - {envSnapshotRegion, "env-snap-region"}, - {envResultExportBucket, "env-export-bucket"}, - {envResultExportRegion, "env-export-region"}, - {envResultExportPrefix, "env-export-prefix"}, - {envGenesisBucket, "env-genesis-bucket"}, - {envGenesisRegion, "env-genesis-region"}, - {envSidecarImage, "env-sidecar"}, - {envKubeRBACProxyImage, "env-rbac-proxy"}, - {envCosmosExporterImage, "env-cosmos-exporter"}, - {envGatewayName, "env-gw-name"}, - {envGatewayNamespace, "env-gw-ns"}, - {envGatewayDomain, "env-gw-domain"}, - {envGatewayPublicDomain, "env-gw-public"}, - } { - t.Setenv(kv[0], kv[1]) - } + t.Setenv(envGatewayName, "env-gw-name") + t.Setenv(envGatewayNamespace, "env-gw-ns") + t.Setenv(envGatewayDomain, "env-gw-domain") } func writeConfig(t *testing.T, body string) string { @@ -55,10 +57,13 @@ func writeConfig(t *testing.T, body string) string { return path } -// No file configured: every infra field resolves from the environment. -func TestLoad_NoFile_AllEnv(t *testing.T) { - setMigratedEnv(t) - t.Setenv(envControllerConfig, "") +// Infra config is sourced from the file (authoritative); gateway from env. An +// infra env var, even when set, is ignored — there is no fallback. +func TestLoad_InfraFromFile_GatewayFromEnv(t *testing.T) { + setGatewayEnv(t) + t.Setenv("SEI_NODEPOOL_NAME", "env-ignored") // no longer consulted + path := writeConfig(t, fullConfig) + t.Setenv(envControllerConfig, path) cfg, err := Load() if err != nil { @@ -67,83 +72,75 @@ func TestLoad_NoFile_AllEnv(t *testing.T) { if err := cfg.Validate(); err != nil { t.Fatalf("Validate: %v", err) } - if cfg.NodepoolName != envNodepool || cfg.SnapshotBucket != "env-snap-bucket" || cfg.SidecarImage != "env-sidecar" { - t.Errorf("expected env-sourced values, got nodepool=%q snapshot=%q sidecar=%q", - cfg.NodepoolName, cfg.SnapshotBucket, cfg.SidecarImage) + if cfg.NodepoolName != "file-nodepool" { + t.Errorf("NodepoolName = %q, want file-nodepool (infra env must be ignored)", cfg.NodepoolName) + } + if cfg.CosmosExporterImage != "file-cosmos-exporter" { + t.Errorf("CosmosExporterImage = %q, want file-cosmos-exporter", cfg.CosmosExporterImage) + } + if cfg.GatewayName != "env-gw-name" { + t.Errorf("GatewayName = %q, want env-gw-name", cfg.GatewayName) } - if cfg.ControllerConfigFile != "" { - t.Errorf("ControllerConfigFile = %q, want empty", cfg.ControllerConfigFile) + if cfg.ControllerConfigFile != path { + t.Errorf("ControllerConfigFile = %q, want %q", cfg.ControllerConfigFile, path) } } -// A field present in the file wins; a field absent from the file falls back to -// its env var. Networking/gateway fields are always env-sourced. -func TestLoad_FileWinsEnvFallback(t *testing.T) { - setMigratedEnv(t) - path := writeConfig(t, ` -scheduling: - nodepoolName: file-nodepool - serviceAccount: file-sa -storage: - classPerf: file-perf -images: - sidecar: file-sidecar -`) - t.Setenv(envControllerConfig, path) +// No file means no infra config — Validate fails at startup (no env fallback), +// naming the file key. +func TestLoad_NoFile_FailsValidate(t *testing.T) { + setGatewayEnv(t) + t.Setenv("SEI_NODEPOOL_NAME", "env-ignored") + t.Setenv(envControllerConfig, "") cfg, err := Load() if err != nil { t.Fatalf("Load: %v", err) } - - // File-sourced. - if cfg.NodepoolName != "file-nodepool" { - t.Errorf("NodepoolName = %q, want file-nodepool", cfg.NodepoolName) - } - if cfg.ServiceAccount != "file-sa" { - t.Errorf("ServiceAccount = %q, want file-sa", cfg.ServiceAccount) + err = cfg.Validate() + if err == nil { + t.Fatal("expected Validate to fail with no infra config, got nil") } - if cfg.StorageClassPerf != "file-perf" { - t.Errorf("StorageClassPerf = %q, want file-perf", cfg.StorageClassPerf) - } - if cfg.SidecarImage != "file-sidecar" { - t.Errorf("SidecarImage = %q, want file-sidecar", cfg.SidecarImage) + if !strings.Contains(err.Error(), "scheduling.nodepoolName") { + t.Errorf("error should name the file key, got %q", err.Error()) } +} - // Env fallback (absent from file). - if cfg.NodepoolArchive != "env-nodepool-archive" { - t.Errorf("NodepoolArchive = %q, want env fallback", cfg.NodepoolArchive) - } - if cfg.TolerationKey != "env-toleration" { - t.Errorf("TolerationKey = %q, want env fallback", cfg.TolerationKey) - } +// A required field absent from the file fails Validate, naming the file key. +// Covers the CosmosExporterImage fold-in (now required at startup). +func TestLoad_MissingField_FailsValidate(t *testing.T) { + setGatewayEnv(t) + body := strings.Replace(fullConfig, " cosmosExporter: file-cosmos-exporter\n", "", 1) + path := writeConfig(t, body) + t.Setenv(envControllerConfig, path) - // Networking/gateway: always env, never file. - if cfg.GatewayName != "env-gw-name" || cfg.GatewayDomain != "env-gw-domain" { - t.Errorf("gateway fields should be env-sourced, got name=%q domain=%q", cfg.GatewayName, cfg.GatewayDomain) + cfg, err := Load() + if err != nil { + t.Fatalf("Load: %v", err) } - if cfg.ControllerConfigFile != path { - t.Errorf("ControllerConfigFile = %q, want %q", cfg.ControllerConfigFile, path) + if err := cfg.Validate(); err == nil || !strings.Contains(err.Error(), "images.cosmosExporter") { + t.Fatalf("want Validate error naming images.cosmosExporter, got %v", err) } } -// A configured-but-missing file is not an error (the file is opt-in); resolution -// falls back to the environment. -func TestLoad_MissingFileFallsBackToEnv(t *testing.T) { - setMigratedEnv(t) - t.Setenv(envControllerConfig, filepath.Join(t.TempDir(), "absent.yaml")) +// A missing gateway env var fails Validate, naming the env var (gateway is still +// env-sourced pending PLT-451). +func TestLoad_MissingGateway_FailsValidate(t *testing.T) { + path := writeConfig(t, fullConfig) + t.Setenv(envControllerConfig, path) + // gateway env deliberately unset cfg, err := Load() if err != nil { t.Fatalf("Load: %v", err) } - if cfg.NodepoolName != envNodepool { - t.Errorf("NodepoolName = %q, want env fallback", cfg.NodepoolName) + if err := cfg.Validate(); err == nil || !strings.Contains(err.Error(), envGatewayName) { + t.Fatalf("want Validate error naming %s, got %v", envGatewayName, err) } } -// Malformed YAML is a hard error — a present-but-broken file must not silently -// fall back to env (that would mask an operator mistake). +// Malformed YAML is a hard error — a present-but-broken file must not resolve to +// an empty (and silently invalid) Config. func TestLoad_MalformedFile_Errors(t *testing.T) { path := writeConfig(t, "scheduling: [not-a-map") t.Setenv(envControllerConfig, path) diff --git a/internal/platform/platform.go b/internal/platform/platform.go index 089383d..d838dd9 100644 --- a/internal/platform/platform.go +++ b/internal/platform/platform.go @@ -21,9 +21,8 @@ const ( ) // Config holds infrastructure-level settings that vary per deployment -// environment. It is resolved by Load: the infra fields are read from the -// app-config file (FileConfig) when present, falling back to their historical -// env vars (PLT-475, transitional); the networking/gateway fields and +// environment. It is resolved by Load: infra fields come from the app-config +// file (FileConfig), which is authoritative; the networking/gateway fields and // ControllerConfigFile are env-sourced. Fields are required unless documented // otherwise — ControllerConfigFile is optional (state-sync is opt-in). See // platformtest.Config() for test fixtures. @@ -80,10 +79,9 @@ type Config struct { // FileConfig is the controller's file-sourced application config (SEI_CONTROLLER_CONFIG). // // The infra sections (scheduling, storage, resources, snapshot, resultExport, -// genesis, images) carry the infra config that was historically env-sourced. -// They are resolved once at startup by Load, the file value winning over the -// env fallback. The stateSync section is read per-reconcile (it hot-reloads); -// the infra sections are not (an infra change warrants a restart). +// genesis, images) are the authoritative source for that config, resolved once +// at startup by Load. The stateSync section is read per-reconcile (it +// hot-reloads); the infra sections are not (an infra change warrants a restart). // // Networking/gateway config is deliberately absent — it stays env-sourced // pending its removal from the controller in the GitOps networking move (PLT-451). @@ -159,52 +157,46 @@ func (c Config) NodepoolForMode(mode string) string { return c.NodepoolName } -// Validate returns an error if a required field is missing from both the -// app-config file and the environment. The source label names the file key and -// the env var so the error points at either fix; networking/gateway fields name -// only their env var. +// Validate returns an error if a required field is unset. name is the field's +// app-config file key, except the networking/gateway fields (still env-sourced +// pending PLT-451) which name their env var. Slice order is the report order +// for the first missing field. func (c Config) Validate() error { - // fileKey is empty for env-only fields (networking/gateway); they report - // just the env var. Slice order is the report order for the first missing. required := []struct { - fileKey string - envVar string - val string + name string + val string }{ - {"scheduling.nodepoolName", envNodepoolName, c.NodepoolName}, - {"scheduling.nodepoolArchive", envNodepoolArchive, c.NodepoolArchive}, - {"scheduling.tolerationKey", envTolerationKey, c.TolerationKey}, - {"scheduling.serviceAccount", envServiceAccount, c.ServiceAccount}, - {"storage.classPerf", envStorageClassPerf, c.StorageClassPerf}, - {"storage.classDefault", envStorageClassDefault, c.StorageClassDefault}, - {"storage.classArchive", envStorageClassArchive, c.StorageClassArchive}, - {"storage.sizeDefault", envStorageSizeDefault, c.StorageSizeDefault}, - {"storage.sizeArchive", envStorageSizeArchive, c.StorageSizeArchive}, - {"resources.cpuArchive", envResourceCPUArchive, c.ResourceCPUArchive}, - {"resources.memArchive", envResourceMemArchive, c.ResourceMemArchive}, - {"resources.cpuDefault", envResourceCPUDefault, c.ResourceCPUDefault}, - {"resources.memDefault", envResourceMemDefault, c.ResourceMemDefault}, - {"snapshot.bucket", envSnapshotBucket, c.SnapshotBucket}, - {"snapshot.region", envSnapshotRegion, c.SnapshotRegion}, - {"resultExport.bucket", envResultExportBucket, c.ResultExportBucket}, - {"resultExport.region", envResultExportRegion, c.ResultExportRegion}, - {"resultExport.prefix", envResultExportPrefix, c.ResultExportPrefix}, - {"genesis.bucket", envGenesisBucket, c.GenesisBucket}, - {"genesis.region", envGenesisRegion, c.GenesisRegion}, - {"images.sidecar", envSidecarImage, c.SidecarImage}, - {"images.kubeRBACProxy", envKubeRBACProxyImage, c.KubeRBACProxyImage}, - {"", envGatewayName, c.GatewayName}, - {"", envGatewayNamespace, c.GatewayNamespace}, - {"", envGatewayDomain, c.GatewayDomain}, + {"scheduling.nodepoolName", c.NodepoolName}, + {"scheduling.nodepoolArchive", c.NodepoolArchive}, + {"scheduling.tolerationKey", c.TolerationKey}, + {"scheduling.serviceAccount", c.ServiceAccount}, + {"storage.classPerf", c.StorageClassPerf}, + {"storage.classDefault", c.StorageClassDefault}, + {"storage.classArchive", c.StorageClassArchive}, + {"storage.sizeDefault", c.StorageSizeDefault}, + {"storage.sizeArchive", c.StorageSizeArchive}, + {"resources.cpuArchive", c.ResourceCPUArchive}, + {"resources.memArchive", c.ResourceMemArchive}, + {"resources.cpuDefault", c.ResourceCPUDefault}, + {"resources.memDefault", c.ResourceMemDefault}, + {"snapshot.bucket", c.SnapshotBucket}, + {"snapshot.region", c.SnapshotRegion}, + {"resultExport.bucket", c.ResultExportBucket}, + {"resultExport.region", c.ResultExportRegion}, + {"resultExport.prefix", c.ResultExportPrefix}, + {"genesis.bucket", c.GenesisBucket}, + {"genesis.region", c.GenesisRegion}, + {"images.sidecar", c.SidecarImage}, + {"images.kubeRBACProxy", c.KubeRBACProxyImage}, + {"images.cosmosExporter", c.CosmosExporterImage}, + {envGatewayName, c.GatewayName}, + {envGatewayNamespace, c.GatewayNamespace}, + {envGatewayDomain, c.GatewayDomain}, } for _, f := range required { - if strings.TrimSpace(f.val) != "" { - continue + if strings.TrimSpace(f.val) == "" { + return fmt.Errorf("%s is required", f.name) } - if f.fileKey == "" { - return fmt.Errorf("%s is required", f.envVar) - } - return fmt.Errorf("%s (or %s) is required", f.fileKey, f.envVar) } return nil } From afc9988ea1df8896de24a743e020665fc04a0857 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 12 Jun 2026 16:16:45 -0700 Subject: [PATCH 2/3] docs: scrub dead SEI_* infra env references (PR-2 cross-review) Platform + kubernetes lenses flagged stale env references now that infra config is file-authoritative: - docs/controller-app-config.md: drop the per-key # SEI_* annotations and the "env-var fallback" framing; the file is the source. - README.md: Platform Configuration section now points at the app-config file (was "reads all settings from environment variables"). - noderesource.go: kubeRBACProxy/cosmosExporter "not configured" errors name the file key, not the removed env var. Co-Authored-By: Claude Opus 4.8 --- README.md | 26 ++---------- docs/controller-app-config.md | 57 +++++++++++++-------------- internal/noderesource/noderesource.go | 4 +- 3 files changed, 33 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index a078101..289d2f8 100644 --- a/README.md +++ b/README.md @@ -79,29 +79,9 @@ spec: ## Platform Configuration -The controller reads all infrastructure-level settings from environment variables. Every field is required — the controller fails fast at startup if any are missing. - -| Env var | Description | -|---------|-------------| -| `SEI_NODEPOOL_NAME` | Karpenter NodePool for pod scheduling | -| `SEI_TOLERATION_KEY` | Taint key to tolerate | -| `SEI_TOLERATION_VALUE` | Taint value to tolerate | -| `SEI_SERVICE_ACCOUNT` | ServiceAccount for node pods | -| `SEI_STORAGE_CLASS_PERF` | StorageClass for full/validator/archive nodes | -| `SEI_STORAGE_CLASS_DEFAULT` | StorageClass for other modes | -| `SEI_STORAGE_SIZE_DEFAULT` | PVC size for full/validator nodes | -| `SEI_STORAGE_SIZE_ARCHIVE` | PVC size for archive nodes | -| `SEI_RESOURCE_CPU_ARCHIVE` | CPU request for archive nodes | -| `SEI_RESOURCE_MEM_ARCHIVE` | Memory request for archive nodes | -| `SEI_RESOURCE_CPU_DEFAULT` | CPU request for full/validator nodes | -| `SEI_RESOURCE_MEM_DEFAULT` | Memory request for full/validator nodes | -| `SEI_SNAPSHOT_BUCKET` | S3 bucket for snapshot storage | -| `SEI_SNAPSHOT_REGION` | AWS region for snapshot S3 operations | -| `SEI_RESULT_EXPORT_BUCKET` | S3 bucket for shadow result exports | -| `SEI_RESULT_EXPORT_REGION` | AWS region for result export bucket | -| `SEI_RESULT_EXPORT_PREFIX` | S3 key prefix for result exports | -| `SEI_GENESIS_BUCKET` | S3 bucket for genesis artifacts | -| `SEI_GENESIS_REGION` | AWS region for genesis artifacts bucket | +Infrastructure-level settings (node pools, storage, resources, snapshot/genesis/result-export buckets, sidecar images) are read from the mounted app-config file (`SEI_CONTROLLER_CONFIG` → `platform.FileConfig`), which is authoritative — the controller fails fast at startup if a required field is unset. See [`docs/controller-app-config.md`](docs/controller-app-config.md) for the schema. + +Gateway config (`SEI_GATEWAY_NAME`, `SEI_GATEWAY_NAMESPACE`, `SEI_GATEWAY_DOMAIN`) and the config-file path (`SEI_CONTROLLER_CONFIG`) remain environment variables. ## Development diff --git a/docs/controller-app-config.md b/docs/controller-app-config.md index efc7ae8..2addcd5 100644 --- a/docs/controller-app-config.md +++ b/docs/controller-app-config.md @@ -36,46 +36,45 @@ stateSync: - rpc-1.example.net:26657 - rpc-2.example.net:26657 -# --- infra (read once at startup; env-var fallback during PLT-475) --- +# --- infra (authoritative; read once at startup) --- scheduling: - nodepoolName: sei-node # SEI_NODEPOOL_NAME - nodepoolArchive: sei-archive # SEI_NODEPOOL_ARCHIVE - tolerationKey: sei.io/workload # SEI_TOLERATION_KEY - serviceAccount: seid-node # SEI_SERVICE_ACCOUNT - -storage: # note: no sizePerf — matches the historical env layout - classPerf: gp3-10k-750 # SEI_STORAGE_CLASS_PERF - classDefault: gp3 # SEI_STORAGE_CLASS_DEFAULT - classArchive: gp3-archive # SEI_STORAGE_CLASS_ARCHIVE - sizeDefault: 2000Gi # SEI_STORAGE_SIZE_DEFAULT - sizeArchive: 40Ti # SEI_STORAGE_SIZE_ARCHIVE + nodepoolName: sei-node + nodepoolArchive: sei-archive + tolerationKey: sei.io/workload + serviceAccount: seid-node + +storage: # no sizePerf — perf is a storage-class tier only + classPerf: gp3-10k-750 + classDefault: gp3 + classArchive: gp3-archive + sizeDefault: 2000Gi + sizeArchive: 40Ti resources: - cpuArchive: "48" # SEI_RESOURCE_CPU_ARCHIVE - memArchive: 448Gi # SEI_RESOURCE_MEM_ARCHIVE - cpuDefault: "4" # SEI_RESOURCE_CPU_DEFAULT - memDefault: 32Gi # SEI_RESOURCE_MEM_DEFAULT + cpuArchive: "48" + memArchive: 448Gi + cpuDefault: "4" + memDefault: 32Gi snapshot: - bucket: sei-snapshots # SEI_SNAPSHOT_BUCKET - region: us-east-2 # SEI_SNAPSHOT_REGION + bucket: sei-snapshots + region: us-east-2 resultExport: - bucket: sei-shadow-results # SEI_RESULT_EXPORT_BUCKET - region: us-east-2 # SEI_RESULT_EXPORT_REGION - prefix: shadow-results/ # SEI_RESULT_EXPORT_PREFIX + bucket: sei-shadow-results + region: us-east-2 + prefix: shadow-results/ genesis: - bucket: sei-k8s-genesis # SEI_GENESIS_BUCKET - region: us-east-2 # SEI_GENESIS_REGION + bucket: sei-k8s-genesis + region: us-east-2 images: - sidecar: ghcr.io/sei-protocol/seictl@sha256:... # SEI_SIDECAR_IMAGE - kubeRBACProxy: quay.io/brancz/kube-rbac-proxy:v0.19.1 # SEI_KUBE_RBAC_PROXY_IMAGE - cosmosExporter: ghcr.io/sei-protocol/sei-cosmos-exporter@sha256:... # SEI_COSMOS_EXPORTER_IMAGE + sidecar: ghcr.io/sei-protocol/seictl@sha256:... + kubeRBACProxy: quay.io/brancz/kube-rbac-proxy:v0.19.1 + cosmosExporter: ghcr.io/sei-protocol/sei-cosmos-exporter@sha256:... ``` -A present-but-unparseable file is a hard startup error — it never silently falls -back to env. Required fields missing from both sources fail `Config.Validate` -with a message naming the file key and the env var. +A present-but-unparseable file is a hard startup error. A required infra field +unset in the file fails `Config.Validate` at startup, naming the file key. diff --git a/internal/noderesource/noderesource.go b/internal/noderesource/noderesource.go index 2ffb4ed..1b25822 100644 --- a/internal/noderesource/noderesource.go +++ b/internal/noderesource/noderesource.go @@ -224,7 +224,7 @@ func DefaultResourcesForMode(mode string, p PlatformConfig) corev1.ResourceRequi // never seid main or any non-sidecar init container. func GenerateStatefulSet(node *seiv1alpha1.SeiNode, p PlatformConfig) (*appsv1.StatefulSet, error) { if p.KubeRBACProxyImage == "" { - return nil, fmt.Errorf("SEI_KUBE_RBAC_PROXY_IMAGE is not configured on the controller") + return nil, fmt.Errorf("images.kubeRBACProxy is not configured in the app-config file") } one := int32(1) labels := ResourceLabels(node) @@ -644,7 +644,7 @@ func cosmosExporterWaitCommand() (command []string, args []string) { // buildCosmosExporterContainer renders the cosmos-exporter sidecar. func buildCosmosExporterContainer(p PlatformConfig) (corev1.Container, error) { if p.CosmosExporterImage == "" { - return corev1.Container{}, fmt.Errorf("SEI_COSMOS_EXPORTER_IMAGE is required on the operator Deployment") + return corev1.Container{}, fmt.Errorf("images.cosmosExporter is required in the app-config file") } command, args := cosmosExporterWaitCommand() return corev1.Container{ From 056102a1c59a77531e216e63f7473289b3a6e4b7 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 12 Jun 2026 16:17:50 -0700 Subject: [PATCH 3/3] test: update noderesource error-substring assertions to file keys Co-Authored-By: Claude Opus 4.8 --- internal/noderesource/noderesource_test.go | 2 +- internal/noderesource/sidecar_proxy_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/noderesource/noderesource_test.go b/internal/noderesource/noderesource_test.go index 579654e..cb60fa1 100644 --- a/internal/noderesource/noderesource_test.go +++ b/internal/noderesource/noderesource_test.go @@ -1349,7 +1349,7 @@ func TestCosmosExporter_ErrorWhenImageUnset(t *testing.T) { _, err := GenerateStatefulSet(node, cfg) g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("SEI_COSMOS_EXPORTER_IMAGE is required")) + g.Expect(err.Error()).To(ContainSubstring("images.cosmosExporter is required")) } func TestCosmosExporter_ReadinessProbe_TargetsExporterListener(t *testing.T) { diff --git a/internal/noderesource/sidecar_proxy_test.go b/internal/noderesource/sidecar_proxy_test.go index 0470f38..472bb1c 100644 --- a/internal/noderesource/sidecar_proxy_test.go +++ b/internal/noderesource/sidecar_proxy_test.go @@ -101,5 +101,5 @@ func TestGenerateStatefulSet_ProxyImageMissing_Errors(t *testing.T) { p.KubeRBACProxyImage = "" _, err := GenerateStatefulSet(newGenesisNode("a", "default"), p) g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("KUBE_RBAC_PROXY_IMAGE")) + g.Expect(err.Error()).To(ContainSubstring("images.kubeRBACProxy")) }