diff --git a/.github/workflows/rs_ci.yml b/.github/workflows/rs_ci.yml index 5e81261e5..4d94598e2 100644 --- a/.github/workflows/rs_ci.yml +++ b/.github/workflows/rs_ci.yml @@ -182,3 +182,17 @@ jobs: with: token: ${{ secrets.GITHUB_TOKEN }} working-directory: bottlecap + + # Guards cold-start H1 (PR #1276): the default build must keep reqwest on + # compiled-in webpki roots (no per-build native-cert filesystem scan), and the + # FIPS build must keep native roots. Reads the resolved feature graph only + # (no compilation), so it's fast and needs no protoc. + tls-root-features: + name: TLS Root Features + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + cache: false + - run: ./scripts/verify_tls_root_features.sh diff --git a/bottlecap/Cargo.lock b/bottlecap/Cargo.lock index b353820ce..fba6a2d38 100644 --- a/bottlecap/Cargo.lock +++ b/bottlecap/Cargo.lock @@ -545,6 +545,7 @@ dependencies = [ "tikv-jemallocator", "time", "tokio", + "tokio-rustls", "tokio-util", "tonic 0.14.5", "tonic-types", @@ -3323,6 +3324,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" dependencies = [ "aws-lc-rs", + "log", "once_cell", "ring", "rustls-pki-types", diff --git a/bottlecap/Cargo.toml b/bottlecap/Cargo.toml index 714afa77f..153b812b0 100644 --- a/bottlecap/Cargo.toml +++ b/bottlecap/Cargo.toml @@ -95,6 +95,10 @@ tower = { version = "0.5", features = ["util"] } mock_instant = "0.6" serial_test = "3.1" tempfile = "3.20" +# Local TLS server for the http.rs root-trust tests (H1: webpki default + +# tls_cert_file mitigation). Already present transitively via reqwest's +# hyper-rustls, so promoting it to a dev-dep adds nothing to the shipped binary. +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "tls12"] } # fake-intake test harness: decode msgpack+gzip stats payloads on arrival rmp-serde = { version = "1.3.1", default-features = false } flate2 = { version = "1.1", default-features = false, features = ["rust_backend"] } @@ -129,7 +133,20 @@ tikv-jemallocator = "0.5" [features] default = [ - "reqwest/rustls-tls-native-roots", + # Use webpki-roots (compiled-in Mozilla CA bundle) rather than + # rustls-tls-native-roots for the non-FIPS reqwest clients. native-roots + # calls rustls_native_certs::load_native_certs() — a filesystem cert-bundle + # scan/parse — on *every* reqwest::Client::build(), and bottlecap builds + # three reqwest clients during cold-start init (the register client in + # bin/bottlecap/main.rs, the shared flush client in src/http.rs, and the + # secrets-decrypt client in src/secrets/decrypt.rs for KMS/SM/SSM/STS). + # webpki-roots removes that per-build scan. Trade-off: webpki trusts only the + # public Mozilla roots, so private/internal OS-installed CAs are no longer + # picked up implicitly; users relying on those must point DD_PROXY_HTTPS at a + # trusted intercept or supply the CA via tls_cert_file (which still layers on + # top via add_root_certificate in src/http.rs). FIPS builds keep native-roots + # (see the `fips` feature) and are unaffected. + "reqwest/rustls-tls-webpki-roots", "datadog-fips/default", "datadog-agent-config/https", "libdd-common/https", diff --git a/bottlecap/src/http.rs b/bottlecap/src/http.rs index 05ba20d32..ec9b821a6 100644 --- a/bottlecap/src/http.rs +++ b/bottlecap/src/http.rs @@ -152,7 +152,9 @@ pub fn headers_to_map(headers: &HeaderMap) -> HashMap { #[cfg(test)] mod tests { use super::*; + use std::io::Write; use std::sync::atomic::{AtomicUsize, Ordering}; + use tempfile::NamedTempFile; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::net::TcpListener; @@ -245,4 +247,238 @@ mod tests { "expected 2 fresh TCP connections, got {opened} (pooling not disabled?)" ); } + + // --------------------------------------------------------------------- + // TLS root-trust tests for cold-start hypothesis H1 (PR #1276). The + // non-FIPS build trusts compiled-in webpki roots; `tls_cert_file` is the + // supported mitigation for environments behind a private CA (e.g. a + // TLS-intercepting DD_PROXY_HTTPS). These pin both halves of that contract + // so the trust model can't regress silently. + // --------------------------------------------------------------------- + + // Self-contained test PKI, generated offline with OpenSSL (ECDSA P-256, + // valid ~100 years). The CA is a *private* root deliberately NOT in the + // webpki/Mozilla bundle, so it exercises exactly the path this PR changes. + // Regenerate with: + // openssl ecparam -name prime256v1 -genkey -noout -out ca.key + // openssl req -x509 -new -key ca.key -sha256 -days 36500 \ + // -subj "/CN=Bottlecap Test CA" \ + // -addext "basicConstraints=critical,CA:TRUE" \ + // -addext "keyUsage=critical,keyCertSign,cRLSign" -out ca.crt + // openssl ecparam -name prime256v1 -genkey -noout -out leaf.key + // openssl req -new -key leaf.key -subj "/CN=localhost" -out leaf.csr + // printf 'basicConstraints=CA:FALSE\nkeyUsage=critical,digitalSignature\nextendedKeyUsage=serverAuth\nsubjectAltName=DNS:localhost\n' > leaf.ext + // openssl x509 -req -in leaf.csr -CA ca.crt -CAkey ca.key -CAcreateserial \ + // -sha256 -days 36500 -extfile leaf.ext -out leaf.crt + // openssl pkcs8 -topk8 -nocrypt -in leaf.key -out leaf.pk8.pem + const TEST_CA_CERT_PEM: &str = r" +-----BEGIN CERTIFICATE----- +MIIBnzCCAUWgAwIBAgIUf5l+dmNjL/xU10EM0qk4Iseh3nQwCgYIKoZIzj0EAwIw +HDEaMBgGA1UEAwwRQm90dGxlY2FwIFRlc3QgQ0EwIBcNMjYwNjI1MTgzNjMwWhgP +MjEyNjA2MDExODM2MzBaMBwxGjAYBgNVBAMMEUJvdHRsZWNhcCBUZXN0IENBMFkw +EwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAEK9O0WnLz1pRAD3RZzYzsCf8vEieLLwnV +D8RNYJBaEBZBJbE+4snr+0vhVm2mGtIojZTJ0bc5Z98JHpZ57LBpwqNjMGEwHQYD +VR0OBBYEFJtZHbWEv1FI0KacM/4ctJkUuMg2MB8GA1UdIwQYMBaAFJtZHbWEv1FI +0KacM/4ctJkUuMg2MA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMAoG +CCqGSM49BAMCA0gAMEUCIQDFLc+zjel9LGmytihROgErrPc6WDxmziypva+k2cSN +CAIgYb1nsDag2/bwlzf4OOcYvqU9xNRdXMarkRxn4o5rPR4= +-----END CERTIFICATE----- +"; + const TEST_SERVER_CERT_PEM: &str = r" +-----BEGIN CERTIFICATE----- +MIIBvTCCAWSgAwIBAgIUOkYng/RZxQcYlMY5I+RAXzR7dSkwCgYIKoZIzj0EAwIw +HDEaMBgGA1UEAwwRQm90dGxlY2FwIFRlc3QgQ0EwIBcNMjYwNjI1MTgzNjMwWhgP +MjEyNjA2MDExODM2MzBaMBQxEjAQBgNVBAMMCWxvY2FsaG9zdDBZMBMGByqGSM49 +AgEGCCqGSM49AwEHA0IABOuKun0roY5MAEOgA3NNebQa9l56HVsNFwGJ4a5chM2T +s+vToAoyflPMZfxuS6PUv2WHrahmUH5WZKr0XaT/RIijgYkwgYYwCQYDVR0TBAIw +ADAOBgNVHQ8BAf8EBAMCB4AwEwYDVR0lBAwwCgYIKwYBBQUHAwEwFAYDVR0RBA0w +C4IJbG9jYWxob3N0MB0GA1UdDgQWBBTcqKduDXHEOeUXAL6fqwAYl50VVTAfBgNV +HSMEGDAWgBSbWR21hL9RSNCmnDP+HLSZFLjINjAKBggqhkjOPQQDAgNHADBEAiAT +nwOMjOwnQoeLvZgHhNqrzlVGNqnT4FPudJTgTKrAAAIgVMw8wqk70JXLm4pEdN8/ +oMUo5j7nV6S8Hd02b5hW5pM= +-----END CERTIFICATE----- +"; + const TEST_SERVER_KEY_PEM: &str = r" +-----BEGIN PRIVATE KEY----- +MIGHAgEAMBMGByqGSM49AgEGCCqGSM49AwEHBG0wawIBAQQgkflx4lEcI8Fafj8E +EgBejgnIKCAkaQXo+p0h3a6IsQOhRANCAATrirp9K6GOTABDoANzTXm0GvZeeh1b +DRcBieGuXITNk7Pr06AKMn5TzGX8bkuj1L9lh62oZlB+VmSq9F2k/0SI +-----END PRIVATE KEY----- +"; + + /// Installs a process-default rustls `CryptoProvider` matching the build's + /// TLS backend (mirrors `traces/http_client.rs`); idempotent across tests. + fn ensure_crypto_provider() { + #[cfg(feature = "fips")] + let _ = rustls::crypto::aws_lc_rs::default_provider().install_default(); + #[cfg(not(feature = "fips"))] + let _ = rustls::crypto::ring::default_provider().install_default(); + } + + /// Spawns a local HTTPS (HTTP/1.1-over-TLS) server presenting the test leaf + /// certificate (signed by `TEST_CA_CERT_PEM`); returns the bound port. + async fn spawn_tls_server() -> u16 { + ensure_crypto_provider(); + let certs = rustls_pemfile::certs(&mut BufReader::new(TEST_SERVER_CERT_PEM.as_bytes())) + .collect::, _>>() + .expect("parse test server cert"); + let key = rustls_pemfile::private_key(&mut BufReader::new(TEST_SERVER_KEY_PEM.as_bytes())) + .expect("read test server key") + .expect("test server key present"); + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_single_cert(certs, key) + .expect("build server TLS config"); + let acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + + let listener = TcpListener::bind("127.0.0.1:0") + .await + .expect("bind tls listener"); + let port = listener.local_addr().expect("listener local_addr").port(); + tokio::spawn(async move { + loop { + let Ok((sock, _)) = listener.accept().await else { + return; + }; + let acceptor = acceptor.clone(); + tokio::spawn(async move { + let Ok(mut tls) = acceptor.accept(sock).await else { + return; + }; + let mut buf = vec![0u8; 8192]; + let mut accumulated = Vec::new(); + loop { + let n = match tls.read(&mut buf).await { + Ok(0) | Err(_) => return, + Ok(n) => n, + }; + accumulated.extend_from_slice(&buf[..n]); + while let Some(idx) = accumulated.windows(4).position(|w| w == b"\r\n\r\n") + { + accumulated.drain(..idx + 4); + if tls + .write_all( + b"HTTP/1.1 200 OK\r\nContent-Length: 2\r\nConnection: keep-alive\r\n\r\nok", + ) + .await + .is_err() + { + return; + } + } + } + }); + } + }); + port + } + + fn ca_temp_file() -> NamedTempFile { + let mut f = NamedTempFile::new().expect("create temp CA file"); + f.write_all(TEST_CA_CERT_PEM.as_bytes()) + .expect("write temp CA file"); + f + } + + /// `tls_cert_file` must add the private CA *on top of* the webpki roots, so a + /// server presenting a privately-signed cert is trusted. This is the + /// documented mitigation for private-CA / TLS-intercepting-proxy setups. + #[tokio::test] + async fn tls_cert_file_trusts_private_ca() { + let port = spawn_tls_server().await; + let ca = ca_temp_file(); + let cfg = config::Config { + http_protocol: Some("http1".to_string()), + flush_timeout: 5, + tls_cert_file: Some(ca.path().to_str().expect("ca path utf8").to_string()), + ..config::Config::default() + }; + let client = get_client(&Arc::new(cfg)); + let resp = client + .get(format!("https://localhost:{port}/")) + .send() + .await + .expect("request should succeed when tls_cert_file trusts the private CA"); + assert_eq!(resp.status(), 200); + } + + /// Without `tls_cert_file`, the default root set (webpki on non-FIPS builds) + /// must reject a privately-signed cert. Pins the post-PR trust behavior: + /// private-CA users must opt in via `tls_cert_file`. + #[tokio::test] + async fn private_ca_rejected_without_tls_cert_file() { + let port = spawn_tls_server().await; + let cfg = config::Config { + http_protocol: Some("http1".to_string()), + flush_timeout: 5, + ..config::Config::default() + }; + let client = get_client(&Arc::new(cfg)); + let result = client + .get(format!("https://localhost:{port}/")) + .send() + .await; + assert!( + result.is_err(), + "default roots must reject a privately-signed cert without tls_cert_file" + ); + } + + /// `skip_ssl_validation` remains a full escape hatch: an untrusted cert is + /// accepted regardless of the configured root set. + #[tokio::test] + async fn skip_ssl_validation_accepts_untrusted_cert() { + let port = spawn_tls_server().await; + let cfg = config::Config { + http_protocol: Some("http1".to_string()), + flush_timeout: 5, + skip_ssl_validation: true, + ..config::Config::default() + }; + let client = get_client(&Arc::new(cfg)); + let resp = client + .get(format!("https://localhost:{port}/")) + .send() + .await + .expect("request should succeed when skip_ssl_validation is set"); + assert_eq!(resp.status(), 200); + } + + #[test] + fn load_custom_cert_parses_single() { + let ca = ca_temp_file(); + let certs = load_custom_cert(ca.path().to_str().expect("path utf8")) + .expect("should parse one certificate"); + assert_eq!(certs.len(), 1); + } + + #[test] + fn load_custom_cert_parses_multiple() { + let mut f = NamedTempFile::new().expect("temp file"); + f.write_all(format!("{TEST_CA_CERT_PEM}{TEST_SERVER_CERT_PEM}").as_bytes()) + .expect("write certs"); + let certs = load_custom_cert(f.path().to_str().expect("path utf8")) + .expect("should parse two certificates"); + assert_eq!(certs.len(), 2); + } + + #[test] + fn load_custom_cert_empty_file_errors() { + let f = NamedTempFile::new().expect("temp file"); + assert!( + load_custom_cert(f.path().to_str().expect("path utf8")).is_err(), + "empty file must error" + ); + } + + #[test] + fn load_custom_cert_non_pem_errors() { + let mut f = NamedTempFile::new().expect("temp file"); + f.write_all(b"this is not a certificate") + .expect("write garbage"); + assert!( + load_custom_cert(f.path().to_str().expect("path utf8")).is_err(), + "non-PEM content must error" + ); + } } diff --git a/bottlecap/src/secrets/decrypt.rs b/bottlecap/src/secrets/decrypt.rs index 23e4be81c..37effc65f 100644 --- a/bottlecap/src/secrets/decrypt.rs +++ b/bottlecap/src/secrets/decrypt.rs @@ -30,6 +30,18 @@ pub async fn resolve_secrets( { let before_decrypt = Instant::now(); + // Dedicated client for *direct* AWS control-plane calls (KMS / Secrets + // Manager / SSM / STS, and the SnapStart container-credentials endpoint). + // It intentionally uses the default root set (compiled-in webpki roots on + // non-FIPS builds; see bottlecap/Cargo.toml) with no proxy and no + // tls_cert_file: AWS endpoints present certificates that chain to public + // AWS CAs, and DD_PROXY_HTTPS / tls_cert_file are for *Datadog* egress, not + // AWS API traffic (which goes direct, or through VPC endpoints that also + // present public certs). skip_ssl_validation is deliberately not applied + // here either — we do not disable certificate validation on the calls that + // fetch credentials and secrets. The delegated-auth path (dd_org_uuid) + // talks to a Datadog endpoint and so uses `shared_client` below, which does + // honor the proxy and tls_cert_file settings. let builder = match create_reqwest_client_builder() { Ok(builder) => builder, Err(err) => { diff --git a/scripts/verify_tls_root_features.sh b/scripts/verify_tls_root_features.sh new file mode 100755 index 000000000..49891f6c1 --- /dev/null +++ b/scripts/verify_tls_root_features.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# +# Guards the TLS trust configuration from cold-start hypothesis H1 (PR #1276). +# +# The non-FIPS (default) build must trust the compiled-in webpki root bundle so +# it does NOT run rustls_native_certs::load_native_certs() — a per-build +# filesystem cert scan — on the cold-start critical path. The FIPS build must +# keep native roots. Cargo unifies features additively across the dependency +# graph, so a dependency added later could silently flip reqwest's root source +# (re-introducing the scan, or cross-linking webpki into FIPS). This catches +# that before it ships. Errors are handled explicitly (no `set -e`) so a +# non-matching grep can't masquerade as a build failure. +set -uo pipefail + +cd "$(dirname "$0")/../bottlecap" || exit 2 + +fail() { + echo "FAIL: $1" >&2 + exit 1 +} + +# Activated feature list for reqwest under a given bottlecap feature set. +reqwest_feature_line() { # $1 = bottlecap feature set + cargo tree --no-default-features --features "$1" -i reqwest -f '{p}|{f}' 2>/dev/null \ + | grep -m1 '^reqwest ' | sed 's/^[^|]*|//' +} + +echo "[tls-roots] default build → reqwest must use webpki roots, not native" +def="$(reqwest_feature_line default)" +[ -n "$def" ] || fail "could not determine reqwest features for the default build" +case ",$def," in + *,rustls-tls-webpki-roots,*) : ;; + *) fail "default build is missing reqwest/rustls-tls-webpki-roots (got: $def)" ;; +esac +case "$def" in + *rustls-tls-native-roots*) + fail "default build pulls reqwest native roots — re-introduces the cold-start native-cert scan (got: $def)" ;; +esac + +echo "[tls-roots] fips build → reqwest must use native roots, not webpki" +fips="$(reqwest_feature_line fips)" +[ -n "$fips" ] || fail "could not determine reqwest features for the fips build" +case "$fips" in + *rustls-tls-native-roots*) : ;; + *) fail "fips build is missing reqwest native roots (got: $fips)" ;; +esac +case ",$fips," in + *,rustls-tls-webpki-roots,*) + fail "webpki roots cross-linked into the fips build (got: $fips)" ;; +esac + +echo "[tls-roots] OK — reqwest root sources are correct for both builds"