diff --git a/cmd/ateapi/main.go b/cmd/ateapi/main.go index 34101bb12..87d5a7433 100644 --- a/cmd/ateapi/main.go +++ b/cmd/ateapi/main.go @@ -25,10 +25,10 @@ import ( "time" "github.com/agent-substrate/substrate/cmd/ateapi/internal/controlapi" - "github.com/agent-substrate/substrate/cmd/ateapi/internal/credbundle" "github.com/agent-substrate/substrate/cmd/ateapi/internal/sessionidentity" "github.com/agent-substrate/substrate/cmd/ateapi/internal/store/ateredis" "github.com/agent-substrate/substrate/internal/ateinterceptors" + "github.com/agent-substrate/substrate/internal/credbundle" "github.com/agent-substrate/substrate/internal/serverboot" "github.com/agent-substrate/substrate/internal/version" "github.com/agent-substrate/substrate/pkg/client/clientset/versioned" diff --git a/cmd/atenet/internal/root.go b/cmd/atenet/internal/root.go index 7c7e66806..53ae85d13 100644 --- a/cmd/atenet/internal/root.go +++ b/cmd/atenet/internal/root.go @@ -18,6 +18,7 @@ import ( "fmt" "os" + "github.com/agent-substrate/substrate/cmd/atenet/internal/router" "github.com/agent-substrate/substrate/internal/version" "github.com/spf13/cobra" ) @@ -37,6 +38,6 @@ func Execute() { } func init() { - rootCmd.AddCommand(NewRouterCmd()) + rootCmd.AddCommand(router.NewRouterCmd()) rootCmd.AddCommand(NewDnsCmd()) } diff --git a/cmd/atenet/internal/router.go b/cmd/atenet/internal/router/config.go similarity index 72% rename from cmd/atenet/internal/router.go rename to cmd/atenet/internal/router/config.go index 5cff97233..bd7de3437 100644 --- a/cmd/atenet/internal/router.go +++ b/cmd/atenet/internal/router/config.go @@ -12,25 +12,49 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-package internal +package router import ( "fmt" "time" "github.com/spf13/cobra" - - "github.com/agent-substrate/substrate/cmd/atenet/internal/router" ) +type authConfig struct { + AteapiClientCertPath string + AteapiCACertsPath string +} + +// routerConfig holds deployment setup and endpoint options for the router node instance. +type routerConfig struct { + Standalone bool + Namespace string + Kubeconfig string + AteapiAddr string + HttpPort int + XdsPort int + ExtprocPort int + ExtprocAddr string + EnvoyImage string + TemplatesFile string + StatusPort int + HealthInterval time.Duration + HttpsPort int + EnvoyCertPath string + LogLevel string + MetricsAddr string + Auth authConfig +} + func NewRouterCmd() *cobra.Command { - var cfg router.RouterConfig + var cfg routerConfig cmd := &cobra.Command{ Use: "router", Short: "Router components including xDS server and Envoy ExtProc gateway processing server", RunE: func(cmd *cobra.Command, args []string) error { - srv, err := router.NewRouterServer(cfg) + srv, err := NewRouterServer(cfg) if err != nil { return fmt.Errorf("failed to create router server: %w", err) } @@ -46,6 +70,8 @@ func NewRouterCmd() *cobra.Command { cmd.Flags().StringVar(&cfg.Namespace, "namespace", "default", "Target operations namespace") cmd.Flags().StringVar(&cfg.Kubeconfig, "kubeconfig", "", "Absolute path to the kubeconfig configuration file") cmd.Flags().StringVar(&cfg.AteapiAddr, "ateapi-address", "api.ate-system.svc:443", "gRPC host address of the cluster ateapi Control instance") + cmd.Flags().StringVar(&cfg.Auth.AteapiClientCertPath, "ateapi-client-cert", "/run/podidentity.podcert.ate.dev/credential-bundle.pem", "Path to the podidentity credential bundle the router presents as its client cert to ateapi.") + cmd.Flags().StringVar(&cfg.Auth.AteapiCACertsPath, "ateapi-ca-certs", "/run/servicedns-ca.podcert.ate.dev/trust-bundle.pem", "Path to the servicedns trust bundle used to verify ateapi's serving cert.") cmd.Flags().IntVar(&cfg.HttpPort, 
"port-http", 8080, "TCP port for workload traffic entering through the Envoy Router") cmd.Flags().IntVar(&cfg.XdsPort, "port-xds", 18000, "TCP port listening for the xDS dynamic Envoy connections") cmd.Flags().IntVar(&cfg.ExtprocPort, "port-extproc", 50051, "Listen port for the Envoy dynamic External Processing (ext_proc) server") @@ -55,7 +81,7 @@ func NewRouterCmd() *cobra.Command { cmd.Flags().IntVar(&cfg.StatusPort, "status-port", 4040, "Port to serve /statusz on (set <= 0 to disable serving status)") cmd.Flags().DurationVar(&cfg.HealthInterval, "health-interval", 1*time.Second, "Interval for checking health of dependent services") cmd.Flags().IntVar(&cfg.HttpsPort, "port-https", 8443, "TCP port for HTTPS workload traffic entering through the Envoy Router") - cmd.Flags().StringVar(&cfg.EnvoyCertPath, "envoy-cert-path", "", "Path to the Envoy certificate file (if empty, a self-signed cert will be generated for testing)") + cmd.Flags().StringVar(&cfg.EnvoyCertPath, "envoy-cert-path", "", "Path to the Envoy certificate file.") return cmd } diff --git a/cmd/atenet/internal/router/controller.go b/cmd/atenet/internal/router/controller.go index 1b0a1ed18..e66243d5b 100644 --- a/cmd/atenet/internal/router/controller.go +++ b/cmd/atenet/internal/router/controller.go @@ -28,7 +28,7 @@ import ( type Controller struct { k8sClient client.Client clientset kubernetes.Interface - cfg RouterConfig + cfg routerConfig xdsSrv *XdsServer extprocSrv *ExtProcServer @@ -39,7 +39,7 @@ type Controller struct { func NewController( k8sClient client.Client, clientset kubernetes.Interface, - cfg RouterConfig, + cfg routerConfig, xdsSrv *XdsServer, extprocSrv *ExtProcServer, ) *Controller { diff --git a/cmd/atenet/internal/router/envoyrunner.go b/cmd/atenet/internal/router/envoyrunner.go index df8765c62..4eee29303 100644 --- a/cmd/atenet/internal/router/envoyrunner.go +++ b/cmd/atenet/internal/router/envoyrunner.go @@ -37,10 +37,10 @@ const ( // Envoy proxy instance running inside 
Kubernetes. type envoyrunner struct { k8sClient client.Client - cfg RouterConfig + cfg routerConfig } -func newEnvoyRunner(k8sClient client.Client, cfg RouterConfig) *envoyrunner { +func newEnvoyRunner(k8sClient client.Client, cfg routerConfig) *envoyrunner { return &envoyrunner{ k8sClient: k8sClient, cfg: cfg, diff --git a/cmd/atenet/internal/router/health.go b/cmd/atenet/internal/router/health.go index 62ae1ae02..b38d4a2f6 100644 --- a/cmd/atenet/internal/router/health.go +++ b/cmd/atenet/internal/router/health.go @@ -53,10 +53,10 @@ type routerHealth struct { interval time.Duration clientset kubernetes.Interface apiClient ateapipb.ControlClient - cfg RouterConfig + cfg routerConfig } -func newRouterHealth(interval time.Duration, clientset kubernetes.Interface, apiClient ateapipb.ControlClient, cfg RouterConfig) *routerHealth { +func newRouterHealth(interval time.Duration, clientset kubernetes.Interface, apiClient ateapipb.ControlClient, cfg routerConfig) *routerHealth { if interval <= 0 { interval = time.Second } diff --git a/cmd/atenet/internal/router/router.go b/cmd/atenet/internal/router/router.go index 797ce36c0..9140ea089 100644 --- a/cmd/atenet/internal/router/router.go +++ b/cmd/atenet/internal/router/router.go @@ -16,23 +16,17 @@ package router import ( "context" - "crypto/rand" - "crypto/rsa" "crypto/tls" "crypto/x509" - "crypto/x509/pkix" - "encoding/pem" "errors" "fmt" "log/slog" - "math/big" "net" "net/http" "os" "os/signal" "strings" "syscall" - "time" "github.com/spf13/cobra" "golang.org/x/sync/errgroup" @@ -46,6 +40,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" + "github.com/agent-substrate/substrate/internal/credbundle" "github.com/agent-substrate/substrate/internal/serverboot" v1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" "github.com/agent-substrate/substrate/pkg/proto/ateapipb" @@ -60,29 +55,9 @@ func init() { utilruntime.Must(v1alpha1.AddToScheme(scheme)) } -// 
RouterConfig holds deployment setup and endpoint options for the router node instance. -type RouterConfig struct { - Standalone bool - Namespace string - Kubeconfig string - AteapiAddr string - HttpPort int - XdsPort int - ExtprocPort int - ExtprocAddr string - EnvoyImage string - TemplatesFile string - StatusPort int - HealthInterval time.Duration - HttpsPort int - EnvoyCertPath string - LogLevel string - MetricsAddr string -} - // RouterServer instantiates and coordinates runtime threads executing system modules. type RouterServer struct { - cfg RouterConfig + cfg routerConfig Cmd *cobra.Command k8sClient client.Client @@ -93,7 +68,7 @@ type RouterServer struct { atStore atStore } -func NewRouterServer(cfg RouterConfig) (*RouterServer, error) { +func NewRouterServer(cfg routerConfig) (*RouterServer, error) { var k8sClient client.Client var clientset kubernetes.Interface var err error @@ -125,7 +100,12 @@ func NewRouterServer(cfg RouterConfig) (*RouterServer, error) { } } - conn, err := grpc.NewClient(cfg.AteapiAddr, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true}))) + creds, err := cfg.apiTransportCredentials() + if err != nil { + return nil, fmt.Errorf("failed to build ateapi transport credentials: %w", err) + } + + conn, err := grpc.NewClient(cfg.AteapiAddr, grpc.WithTransportCredentials(creds)) if err != nil { return nil, fmt.Errorf("failed to establish grpc channel to ateapi client: %w", err) } @@ -149,6 +129,46 @@ func NewRouterServer(cfg RouterConfig) (*RouterServer, error) { }, nil } +// apiTransportCredentials builds the TLS credentials the router uses to dial +// ateapi. When both the servicedns trust bundle and the podidentity client +// credential bundle are present (the in-cluster case, mounted via projected +// pod-certificate volumes), it performs mTLS: it verifies ateapi's serving cert +// against the servicedns trust bundle and presents its own podidentity SPIFFE +// client cert. 
When that material is absent, it returns an error rather than +// falling back to an insecure connection. +func (cfg routerConfig) apiTransportCredentials() (credentials.TransportCredentials, error) { + tlsCfg, err := apiTLSConfig(cfg) + if err != nil { + return nil, err + } + return credentials.NewTLS(tlsCfg), nil +} + +func apiTLSConfig(cfg routerConfig) (*tls.Config, error) { + if _, err := os.Stat(cfg.Auth.AteapiCACertsPath); err != nil { + return nil, fmt.Errorf("error reading ate apiserver CA path from %q, error=%w", + cfg.Auth.AteapiCACertsPath, err) + } + if _, err := os.Stat(cfg.Auth.AteapiClientCertPath); err != nil { + return nil, fmt.Errorf("error reading ate apiserver client cert path from %q, error=%w", + cfg.Auth.AteapiClientCertPath, err) + } + + caBytes, err := os.ReadFile(cfg.Auth.AteapiCACertsPath) + if err != nil { + return nil, fmt.Errorf("read ateapi CA certs: %w", err) + } + rootCAs := x509.NewCertPool() + if !rootCAs.AppendCertsFromPEM(caBytes) { + return nil, fmt.Errorf("parse ateapi CA certs from %s", cfg.Auth.AteapiCACertsPath) + } + + return &tls.Config{ + RootCAs: rootCAs, + GetClientCertificate: credbundle.ClientLoader(cfg.Auth.AteapiClientCertPath), + }, nil +} + func (s *RouterServer) Run(ctx context.Context) error { ctx, cancel := context.WithCancel(ctx) defer cancel() @@ -188,17 +208,7 @@ func (s *RouterServer) Run(ctx context.Context) error { xdsSrv := NewXdsServer(s.cfg.XdsPort) xdsSrv.SetConfig(s.cfg.HttpPort, s.cfg.ExtprocPort, s.cfg.ExtprocAddr) - var certContent, keyContent string - if s.cfg.EnvoyCertPath == "" { - slog.InfoContext(ctx, "No Envoy certificate path provided, generating self-signed certificate for testing") - var err error - certContent, keyContent, err = generateSelfSignedCert() - if err != nil { - return fmt.Errorf("failed to generate self-signed cert: %w", err) - } - } - - xdsSrv.SetTlsConfig(s.cfg.HttpsPort, s.cfg.EnvoyCertPath, certContent, keyContent) + xdsSrv.SetTlsConfig(s.cfg.HttpsPort, 
s.cfg.EnvoyCertPath) if s.extprocSrv == nil { routeDuration, err := newRouteDurationHistogram() if err != nil { @@ -274,39 +284,3 @@ func (s *RouterServer) Run(ctx context.Context) error { return g.Wait() } - -func generateSelfSignedCert() (string, string, error) { - priv, err := rsa.GenerateKey(rand.Reader, 2048) - if err != nil { - return "", "", err - } - - template := x509.Certificate{ - SerialNumber: big.NewInt(1), - Subject: pkix.Name{ - Organization: []string{"Substrate Local Test"}, - }, - NotBefore: time.Now(), - NotAfter: time.Now().Add(time.Hour * 24 * 365), - - KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - BasicConstraintsValid: true, - DNSNames: []string{"localhost"}, - } - - derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) - if err != nil { - return "", "", err - } - - certPem := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes}) - - privBytes, err := x509.MarshalPKCS8PrivateKey(priv) - if err != nil { - return "", "", err - } - keyPem := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes}) - - return string(certPem), string(keyPem), nil -} diff --git a/cmd/atenet/internal/router/status_test.go b/cmd/atenet/internal/router/status_test.go index 5ce878d72..2fe2af070 100644 --- a/cmd/atenet/internal/router/status_test.go +++ b/cmd/atenet/internal/router/status_test.go @@ -16,12 +16,20 @@ package router import ( "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" "encoding/json" + "encoding/pem" "fmt" "io" + "math/big" "net" "net/http" "os" + "path/filepath" "strings" "testing" "time" @@ -47,7 +55,10 @@ func TestStatuszEndpoint(t *testing.T) { defer os.Remove(tmpFile.Name()) tmpFile.Close() - cfg := RouterConfig{ + // NewRouterServer requires ateapi mTLS material; generate it for the test. 
+ caPath, clientCertPath := writeTestTLSMaterial(t) + + cfg := routerConfig{ Standalone: true, Namespace: "default", StatusPort: httpPort, @@ -55,6 +66,10 @@ func TestStatuszEndpoint(t *testing.T) { XdsPort: 18000, ExtprocPort: 50051, TemplatesFile: tmpFile.Name(), + Auth: authConfig{ + AteapiCACertsPath: caPath, + AteapiClientCertPath: clientCertPath, + }, } srv, err := NewRouterServer(cfg) @@ -140,3 +155,44 @@ func TestStatuszEndpoint(t *testing.T) { t.Errorf("Target parameters unassigned inside context payload context properties: found %s", dashboard.Queries[0].Target) } } + +// writeTestTLSMaterial generates a self-signed certificate and writes a CA trust +// bundle and a client credential bundle to temp files, returning their paths. +// NewRouterServer requires both to build its ateapi mTLS credentials. +func writeTestTLSMaterial(t *testing.T) (caPath, clientCertPath string) { + t.Helper() + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + t.Fatalf("generating key: %v", err) + } + template := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "test-ca"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(time.Hour), + IsCA: true, + BasicConstraintsValid: true, + KeyUsage: x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature, + } + der, err := x509.CreateCertificate(rand.Reader, template, template, &key.PublicKey, key) + if err != nil { + t.Fatalf("creating certificate: %v", err) + } + certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der}) + keyDER, err := x509.MarshalPKCS8PrivateKey(key) + if err != nil { + t.Fatalf("marshaling key: %v", err) + } + keyPEM := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: keyDER}) + + dir := t.TempDir() + caPath = filepath.Join(dir, "ca.pem") + if err := os.WriteFile(caPath, certPEM, 0o600); err != nil { + t.Fatalf("writing CA file: %v", err) + } + clientCertPath = filepath.Join(dir, "client.pem") + if err := 
os.WriteFile(clientCertPath, append(certPEM, keyPEM...), 0o600); err != nil { + t.Fatalf("writing client cert file: %v", err) + } + return caPath, clientCertPath +} diff --git a/cmd/atenet/internal/router/xds.go b/cmd/atenet/internal/router/xds.go index 964fc5e92..a6af45d23 100644 --- a/cmd/atenet/internal/router/xds.go +++ b/cmd/atenet/internal/router/xds.go @@ -76,10 +76,8 @@ type XdsServer struct { mu sync.Mutex - httpsPort int - certPath string - certContent string - keyContent string + httpsPort int + certPath string } func NewXdsServer(xdsPort int) *XdsServer { @@ -104,13 +102,11 @@ func (x *XdsServer) SetConfig(ingressPort int, extprocPort int, extprocAddr stri x.extprocAddr = extprocAddr } -func (x *XdsServer) SetTlsConfig(httpsPort int, certPath string, certContent string, keyContent string) { +func (x *XdsServer) SetTlsConfig(httpsPort int, certPath string) { x.mu.Lock() defer x.mu.Unlock() x.httpsPort = httpsPort x.certPath = certPath - x.certContent = certContent - x.keyContent = keyContent } func (x *XdsServer) UpdateSnapshot() error { @@ -453,30 +449,15 @@ func (x *XdsServer) buildHttpsListener() *listenerv3.Listener { } func (x *XdsServer) buildTlsCertificate() *tlsv3.TlsCertificate { - if x.certPath != "" { - return &tlsv3.TlsCertificate{ - CertificateChain: &corev3.DataSource{ - Specifier: &corev3.DataSource_Filename{ - Filename: x.certPath, - }, - }, - PrivateKey: &corev3.DataSource{ - Specifier: &corev3.DataSource_Filename{ - Filename: x.certPath, // Assuming combined file - }, - }, - } - } - return &tlsv3.TlsCertificate{ CertificateChain: &corev3.DataSource{ - Specifier: &corev3.DataSource_InlineString{ - InlineString: x.certContent, + Specifier: &corev3.DataSource_Filename{ + Filename: x.certPath, }, }, PrivateKey: &corev3.DataSource{ - Specifier: &corev3.DataSource_InlineString{ - InlineString: x.keyContent, + Specifier: &corev3.DataSource_Filename{ + Filename: x.certPath, // Assuming combined file }, }, } diff --git 
a/cmd/atenet/internal/router/xds_test.go b/cmd/atenet/internal/router/xds_test.go index 92e347648..9a2c2460f 100644 --- a/cmd/atenet/internal/router/xds_test.go +++ b/cmd/atenet/internal/router/xds_test.go @@ -141,7 +141,7 @@ func TestXdsServer_UpdateSnapshot(t *testing.T) { func TestXdsServer_UpdateSnapshot_WithHttps(t *testing.T) { server := NewXdsServer(18000) server.SetConfig(8085, 50053, "127.0.0.1") - server.SetTlsConfig(8443, "", "dummy-cert", "dummy-key") + server.SetTlsConfig(8443, "") err := server.UpdateSnapshot() if err != nil { diff --git a/cmd/podcertcontroller/internal/podidentitysigner/podidentitysigner.go b/cmd/podcertcontroller/internal/podidentitysigner/podidentitysigner.go index 5b63a15e4..04bbc5bd9 100644 --- a/cmd/podcertcontroller/internal/podidentitysigner/podidentitysigner.go +++ b/cmd/podcertcontroller/internal/podidentitysigner/podidentitysigner.go @@ -121,6 +121,7 @@ func (h *Impl) MakeCert(ctx context.Context, pcr *certsv1beta1.PodCertificateReq Path: path.Join("ns", pcr.ObjectMeta.Namespace, "sa", pcr.Spec.ServiceAccountName), } + parent := h.caPool.CAs[0].RootCertificate template := &x509.Certificate{ BasicConstraintsValid: true, NotBefore: notBefore, @@ -128,9 +129,14 @@ func (h *Impl) MakeCert(ctx context.Context, pcr *certsv1beta1.PodCertificateReq URIs: []*url.URL{spiffeURI}, KeyUsage: x509.KeyUsageDigitalSignature, ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth}, + // Link the leaf to its issuing CA by key id so verifiers can disambiguate + // a multi-CA trust bundle (e.g. valkey trusts both the servicedns and + // podidentity CAs). Mandated by RFC 5280 4.2.1.1 for the multiple-issuer + // case. 
+ AuthorityKeyId: parent.SubjectKeyId, } - subjectCertDER, err := x509.CreateCertificate(rand.Reader, template, h.caPool.CAs[0].RootCertificate, subjectPublicKey, h.caPool.CAs[0].SigningKey) + subjectCertDER, err := x509.CreateCertificate(rand.Reader, template, parent, subjectPublicKey, h.caPool.CAs[0].SigningKey) if err != nil { return fmt.Errorf("while signing subject cert: %w", err) } diff --git a/cmd/podcertcontroller/internal/servicednssigner/servicednssigner.go b/cmd/podcertcontroller/internal/servicednssigner/servicednssigner.go index 4258f1921..1a995b20d 100644 --- a/cmd/podcertcontroller/internal/servicednssigner/servicednssigner.go +++ b/cmd/podcertcontroller/internal/servicednssigner/servicednssigner.go @@ -135,6 +135,13 @@ func (h *Impl) MakeCert(ctx context.Context, pcr *certsv1beta1.PodCertificateReq } } + // This is returned as a transient error to allow retries. + // Without this, ate-apiserver can have a servicedns cert without DNS name + // while the covering Service is being created, and cache it for 24 hours. + if len(dnsNames) == 0 { + return fmt.Errorf("pod %s/%s is not (yet) selected by any Service; refusing to issue a serving cert with no DNS SANs", pcr.ObjectMeta.Namespace, pcr.Spec.PodName) + } + // TODO: Encode the OIDC issuer of the cluster into the certificate. subjectPublicKey, err := podcertificate.PublicKey(pcr) @@ -156,6 +163,7 @@ func (h *Impl) MakeCert(ctx context.Context, pcr *certsv1beta1.PodCertificateReq notAfter := notBefore.Add(lifetime) beginRefreshAt := notAfter.Add(-30 * time.Minute) + parent := h.caPool.CAs[0].RootCertificate template := &x509.Certificate{ BasicConstraintsValid: true, NotBefore: notBefore, @@ -163,9 +171,12 @@ func (h *Impl) MakeCert(ctx context.Context, pcr *certsv1beta1.PodCertificateReq DNSNames: dnsNames, KeyUsage: x509.KeyUsageDigitalSignature, ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth, x509.ExtKeyUsageServerAuth}, + // Link the leaf to its issuing CA by key id. 
Valkey needs this + // to determine which CA to use when validating a client cert. + AuthorityKeyId: parent.SubjectKeyId, } - subjectCertDER, err := x509.CreateCertificate(rand.Reader, template, h.caPool.CAs[0].RootCertificate, subjectPublicKey, h.caPool.CAs[0].SigningKey) + subjectCertDER, err := x509.CreateCertificate(rand.Reader, template, parent, subjectPublicKey, h.caPool.CAs[0].SigningKey) if err != nil { return fmt.Errorf("while signing subject cert: %w", err) } diff --git a/hack/install-ate.sh b/hack/install-ate.sh index 6c7f46056..980823c15 100755 --- a/hack/install-ate.sh +++ b/hack/install-ate.sh @@ -116,17 +116,27 @@ run_ko() { esac } -create_valkey_ca_certs_secret() { - log_step "create_valkey_ca_certs_secret" - local ca_certs="" - # Extract from in-cluster service-dns-ca-pool secret (base64 json) +# Extract a CA pool secret's RootCertificateDER and emit it as a PEM certificate. +ca_pool_root_pem() { + local secret="$1" local pool_json="" - pool_json=$(run_kubectl get secret -n podcertificate-controller-system service-dns-ca-pool -o jsonpath='{.data.pool}' | base64 --decode) - # Extract RootCertificateDER base64 string + pool_json=$(run_kubectl get secret -n podcertificate-controller-system "${secret}" -o jsonpath='{.data.pool}' | base64 --decode) local der_base64="" der_base64=$(echo "${pool_json}" | grep -o '"RootCertificateDER":"[^"]*' | sed 's/"RootCertificateDER":"//') - # Convert DER to PEM certificate - ca_certs=$(echo "${der_base64}" | base64 --decode | openssl x509 -inform der -outform pem) + echo "${der_base64}" | base64 --decode | openssl x509 -inform der -outform pem +} + +create_valkey_ca_certs_secret() { + log_step "create_valkey_ca_certs_secret" + # valkey requires a single tls-ca-cert-file to verify client and server certs it sees, + # so it needs both CAs: + # - servicedns CA: verifies valkey peers' server certs. 
+ # - podidentity CA: verifies the client certs that connect to valkey + # (apiserver, the init job, and peers acting as clients). + local ca_certs="" + ca_certs=$(printf '%s\n%s\n' \ + "$(ca_pool_root_pem service-dns-ca-pool)" \ + "$(ca_pool_root_pem pod-identity-ca-pool)") run_kubectl create secret generic valkey-ca-certs \ --from-literal=ca.crt="${ca_certs}" \ @@ -176,7 +186,9 @@ create_api_server_env_vars() { redis_address="valkey-cluster.ate-system.svc:6379" use_iam_auth="false" tls_server_name="valkey-cluster.ate-system.svc" - client_cert="/run/servicedns.podcert.ate.dev/credential-bundle.pem" + # The apiserver dials valkey as a client, so it presents a podidentity + # (SPIFFE) client cert rather than a servicedns serving cert. + client_cert="/run/podidentity.podcert.ate.dev/credential-bundle.pem" echo "REDIS_ADDRESS: ${redis_address}" diff --git a/cmd/ateapi/internal/credbundle/credbundle.go b/internal/credbundle/credbundle.go similarity index 85% rename from cmd/ateapi/internal/credbundle/credbundle.go rename to internal/credbundle/credbundle.go index 58d86df76..659a53899 100644 --- a/cmd/ateapi/internal/credbundle/credbundle.go +++ b/internal/credbundle/credbundle.go @@ -38,6 +38,17 @@ func Loader(path string) func(*tls.ClientHelloInfo) (*tls.Certificate, error) { } } +// ClientLoader is the client-side counterpart to Loader. It returns a function +// suitable for use as GetClientCertificate in a tls.Config, re-reading the +// bundle on each handshake so that in-place pod-certificate rotations are +// picked up. +func ClientLoader(path string) func(*tls.CertificateRequestInfo) (*tls.Certificate, error) { + // TODO: Introduce caching. + return func(_ *tls.CertificateRequestInfo) (*tls.Certificate, error) { + return Parse(path) + } +} + // Parse reads a private key and certificate chain from a credential bundle file as written by the // Kubernetes Pod Certificates mechanism. 
func Parse(bundlePath string) (*tls.Certificate, error) { diff --git a/manifests/ate-install/ate-api-server.yaml b/manifests/ate-install/ate-api-server.yaml index 8d3c17086..9c5e338da 100644 --- a/manifests/ate-install/ate-api-server.yaml +++ b/manifests/ate-install/ate-api-server.yaml @@ -114,8 +114,12 @@ spec: name: ate-api-server-envvars optional: true volumeMounts: + # servicedns: the apiserver's own gRPC serving cert (DNS SAN api.ate-system.svc). - name: "servicedns" mountPath: "/run/servicedns.podcert.ate.dev" + # podidentity: the apiserver's client identity (SPIFFE) when it dials valkey. + - name: "podidentity" + mountPath: "/run/podidentity.podcert.ate.dev" - name: "session-id-jwt-pool" mountPath: "/run/session-id-jwt-pool" # Note: See README.md for how to generate this secret. @@ -146,6 +150,13 @@ spec: signerName: servicedns.podcert.ate.dev/identity keyType: ECDSAP256 credentialBundlePath: credential-bundle.pem + - name: "podidentity" + projected: + sources: + - podCertificate: + signerName: podidentity.podcert.ate.dev/identity + keyType: ECDSAP256 + credentialBundlePath: credential-bundle.pem - name: "session-id-jwt-pool" projected: sources: diff --git a/manifests/ate-install/atenet-router.yaml b/manifests/ate-install/atenet-router.yaml index 43f69ab5e..0fcf94da0 100644 --- a/manifests/ate-install/atenet-router.yaml +++ b/manifests/ate-install/atenet-router.yaml @@ -132,6 +132,8 @@ spec: - "--port-extproc=50051" - "--extproc-address=127.0.0.1" - "--ateapi-address=api.ate-system.svc:443" + - "--ateapi-client-cert=/run/podidentity.podcert.ate.dev/credential-bundle.pem" + - "--ateapi-ca-certs=/run/servicedns-ca.podcert.ate.dev/trust-bundle.pem" - "--status-port=4040" - "--port-https=8443" - "--envoy-cert-path=/run/servicedns.podcert.ate.dev/credential-bundle.pem" @@ -161,6 +163,13 @@ spec: containerPort: 4040 - name: metrics containerPort: 9090 + volumeMounts: + # Router's own client identity presented to ateapi. 
+ - name: "podidentity" + mountPath: "/run/podidentity.podcert.ate.dev" + # Trust bundle used to verify ateapi's servicedns serving cert. + - name: "servicedns-ca" + mountPath: "/run/servicedns-ca.podcert.ate.dev" - name: envoy image: envoyproxy/envoy:v1.30-latest command: @@ -192,6 +201,22 @@ spec: signerName: servicedns.podcert.ate.dev/identity keyType: ECDSAP256 credentialBundlePath: credential-bundle.pem + - name: "podidentity" + projected: + sources: + - podCertificate: + signerName: podidentity.podcert.ate.dev/identity + keyType: ECDSAP256 + credentialBundlePath: credential-bundle.pem + - name: "servicedns-ca" + projected: + sources: + - clusterTrustBundle: + signerName: servicedns.podcert.ate.dev/identity + labelSelector: + matchLabels: + podcert.ate.dev/canarying: live + path: trust-bundle.pem --- apiVersion: v1 kind: Service diff --git a/manifests/ate-install/valkey.yaml b/manifests/ate-install/valkey.yaml index ac649a555..0b69cf699 100644 --- a/manifests/ate-install/valkey.yaml +++ b/manifests/ate-install/valkey.yaml @@ -28,6 +28,8 @@ data: # Load certificates from projected volume tls-cert-file /run/servicedns.podcert.ate.dev/credential-bundle.pem tls-key-file /run/servicedns.podcert.ate.dev/credential-bundle.pem + tls-client-cert-file /run/podidentity.podcert.ate.dev/credential-bundle.pem + tls-client-key-file /run/podidentity.podcert.ate.dev/credential-bundle.pem tls-ca-cert-file /etc/valkey-ca/ca.crt tls-auth-clients yes @@ -99,6 +101,8 @@ spec: mountPath: /etc/valkey - name: servicedns mountPath: /run/servicedns.podcert.ate.dev + - name: podidentity + mountPath: /run/podidentity.podcert.ate.dev - name: valkey-ca-certs mountPath: /etc/valkey-ca readOnly: true @@ -115,6 +119,13 @@ spec: signerName: servicedns.podcert.ate.dev/identity keyType: ECDSAP256 credentialBundlePath: credential-bundle.pem + - name: podidentity + projected: + sources: + - podCertificate: + signerName: podidentity.podcert.ate.dev/identity + keyType: ECDSAP256 + 
credentialBundlePath: credential-bundle.pem - name: valkey-ca-certs projected: sources: @@ -148,8 +159,11 @@ spec: - name: init image: valkey/valkey:8.0 volumeMounts: - - name: servicedns - mountPath: /run/servicedns.podcert.ate.dev + # The init job is a client of the cluster, not a Service-backed server, + # so it uses a podidentity client cert rather than a servicedns + # serving cert (which requires DNS SANs it would not have). + - name: podidentity + mountPath: /run/podidentity.podcert.ate.dev - name: valkey-ca-certs mountPath: /etc/valkey-ca readOnly: true @@ -174,19 +188,19 @@ spec: done echo "Checking if Valkey cluster is already initialized..." - until valkey-cli --tls --cacert /etc/valkey-ca/ca.crt --cert /run/servicedns.podcert.ate.dev/credential-bundle.pem --key /run/servicedns.podcert.ate.dev/credential-bundle.pem -h valkey-cluster-0.valkey-cluster-service.ate-system.svc ping >/dev/null 2>&1; do + until valkey-cli --tls --cacert /etc/valkey-ca/ca.crt --cert /run/podidentity.podcert.ate.dev/credential-bundle.pem --key /run/podidentity.podcert.ate.dev/credential-bundle.pem -h valkey-cluster-0.valkey-cluster-service.ate-system.svc ping >/dev/null 2>&1; do echo "Waiting for valkey-cluster-0 to respond to ping..." sleep 2 done - INIT_STATUS=$(valkey-cli --tls --cacert /etc/valkey-ca/ca.crt --cert /run/servicedns.podcert.ate.dev/credential-bundle.pem --key /run/servicedns.podcert.ate.dev/credential-bundle.pem -h valkey-cluster-0.valkey-cluster-service.ate-system.svc cluster info 2>/dev/null | grep cluster_state || true) + INIT_STATUS=$(valkey-cli --tls --cacert /etc/valkey-ca/ca.crt --cert /run/podidentity.podcert.ate.dev/credential-bundle.pem --key /run/podidentity.podcert.ate.dev/credential-bundle.pem -h valkey-cluster-0.valkey-cluster-service.ate-system.svc cluster info 2>/dev/null | grep cluster_state || true) if [ -z "${INIT_STATUS}" ] || ! echo "${INIT_STATUS}" | grep -q "cluster_state:ok"; then echo "Initializing Valkey cluster..." 
valkey-cli --tls \ --cacert /etc/valkey-ca/ca.crt \ - --cert /run/servicedns.podcert.ate.dev/credential-bundle.pem \ - --key /run/servicedns.podcert.ate.dev/credential-bundle.pem \ + --cert /run/podidentity.podcert.ate.dev/credential-bundle.pem \ + --key /run/podidentity.podcert.ate.dev/credential-bundle.pem \ --cluster create ${POD_IPS} \ --cluster-replicas 1 \ --cluster-yes @@ -195,11 +209,11 @@ spec: echo "Cluster already initialized." fi volumes: - - name: servicedns + - name: podidentity projected: sources: - podCertificate: - signerName: servicedns.podcert.ate.dev/identity + signerName: podidentity.podcert.ate.dev/identity keyType: ECDSAP256 credentialBundlePath: credential-bundle.pem - name: valkey-ca-certs