From d46a7ec647faeeff98d5b936686dc8d5f6c6f194 Mon Sep 17 00:00:00 2001 From: Jordan Gonzalez <30836115+duncanista@users.noreply.github.com> Date: Tue, 23 Jun 2026 18:27:35 -0400 Subject: [PATCH] feat(init): add cold-start init-phase timing instrumentation Add debug-gated checkpoints at the key cold-start init boundaries (crypto provider, TLS client build, config parse, shared client, register, dogstatsd, trace agent, telemetry subscribe, ready), plus a one-time available_parallelism() log. Each checkpoint logs delta (time since the previous checkpoint = that phase's own cost) and cumulative (time since process start), in milliseconds to 6 decimal places (nanosecond resolution) so sub-millisecond phases are visible. Init time is attributed per phase directly, with no manual subtraction. The per-phase bookkeeping is guarded behind a DEBUG-level check, so it stays effectively free at the default info level. This is the measurement prerequisite (H0) for the cold-start improvements. --- bottlecap/src/bin/bottlecap/main.rs | 47 +++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/bottlecap/src/bin/bottlecap/main.rs b/bottlecap/src/bin/bottlecap/main.rs index 39f352149..440fa2f7b 100644 --- a/bottlecap/src/bin/bottlecap/main.rs +++ b/bottlecap/src/bin/bottlecap/main.rs @@ -97,6 +97,7 @@ use dogstatsd::{ }; use libdd_trace_obfuscation::obfuscation_config; use reqwest::Client; +use std::sync::atomic::{AtomicU64, Ordering}; use std::{collections::hash_map, env, path::Path, str::FromStr, sync::Arc}; use tokio::time::Instant; use tokio::{sync::Mutex as TokioMutex, sync::mpsc::Sender}; @@ -114,6 +115,12 @@ async fn main() -> anyhow::Result<()> { log_fips_status(&aws_config.region); let version_without_next = EXTENSION_VERSION.split('-').next().unwrap_or("NA"); debug!("Starting Datadog Extension v{version_without_next}"); + // Tokio sizes its worker pool from available_parallelism(); logging it makes the + // value the runtime actually sees observable per memory tier (see cold-start H15). + debug!( + "INIT | available_parallelism={:?}", + std::thread::available_parallelism() + ); // Debug: Wait for debugger to attach if DD_DEBUG_WAIT_FOR_ATTACH is set if let Ok(wait_secs) = env::var("DD_DEBUG_WAIT_FOR_ATTACH") @@ -126,11 +133,15 @@ async fn main() -> anyhow::Result<()> { } prepare_client_provider()?; + // Separate checkpoint so the next difference isolates the TLS client build from + // crypto-provider setup (the latter is non-trivial in FIPS builds). + log_init_checkpoint(start_time, "crypto_provider_ready"); let client = create_reqwest_client_builder() .map_err(|e| anyhow::anyhow!("Failed to create client builder: {e:?}"))? .no_proxy() .build() .map_err(|e| anyhow::anyhow!("Failed to create client: {e:?}"))?; + log_init_checkpoint(start_time, "tls_client_build"); let cloned_client = client.clone(); let runtime_api = aws_config.runtime_api.clone(); @@ -148,6 +159,7 @@ async fn main() -> anyhow::Result<()> { let lambda_directory: String = env::var("LAMBDA_TASK_ROOT").unwrap_or_else(|_| "/var/task".to_string()); let config = Arc::new(config::get_config(Path::new(&lambda_directory))); + log_init_checkpoint(start_time, "config_parse"); let aws_config = Arc::new(aws_config); // Build one shared reqwest::Client for metrics, logs, trace proxy flushing, and calls to @@ -155,11 +167,13 @@ async fn main() -> anyhow::Result<()> { // just increments a refcount and shares the connection pool. let shared_client = bottlecap::http::get_client(&config); let api_key_factory = create_api_key_factory(&config, &aws_config, &shared_client); + log_init_checkpoint(start_time, "shared_client_ready"); let r = response .await .map_err(|e| anyhow::anyhow!("Failed to join task: {e:?}"))? .map_err(|e| anyhow::anyhow!("Failed to register extension: {e:?}"))?; + log_init_checkpoint(start_time, "register_ready"); match extension_loop_active( Arc::clone(&aws_config), @@ -191,6 +205,34 @@ fn init_ustr() { }); } +/// Cumulative nanoseconds (since process start) recorded at the previous init +/// checkpoint, so each checkpoint can report its own per-phase delta without the +/// reader having to subtract consecutive lines by hand. +static LAST_INIT_CHECKPOINT_NS: AtomicU64 = AtomicU64::new(0); + +/// Records a cold-start init checkpoint at `phase`, logging both `delta` (time +/// since the previous checkpoint, i.e. this phase's own cost) and `cumulative` +/// (time since process start), in milliseconds to 6 decimal places (nanosecond +/// resolution) so sub-millisecond phases are still visible. This attributes init +/// time per phase so the cold-start optimizations can be measured, not guessed. +/// +/// The bookkeeping is guarded behind a `DEBUG`-level check, so it stays +/// effectively free at the default `info` level (no clock read, no logging). +fn log_init_checkpoint(start_time: Instant, phase: &str) { + if !tracing::enabled!(tracing::Level::DEBUG) { + return; + } + let cumulative = start_time.elapsed(); + let cumulative_ns = u64::try_from(cumulative.as_nanos()).unwrap_or(u64::MAX); + let previous_ns = LAST_INIT_CHECKPOINT_NS.swap(cumulative_ns, Ordering::Relaxed); + let delta = std::time::Duration::from_nanos(cumulative_ns.saturating_sub(previous_ns)); + debug!( + "INIT | phase={phase} delta={:.6}ms cumulative={:.6}ms", + delta.as_secs_f64() * 1000.0, + cumulative.as_secs_f64() * 1000.0 + ); +} + fn enable_logging_subsystem() { let log_level = LogLevel::from_str( std::env::var("DD_LOG_LEVEL") @@ -328,6 +370,7 @@ async fn extension_loop_active( &shared_client, ) .await; + log_init_checkpoint(start_time, "dogstatsd_started"); let propagator = Arc::new(DatadogCompositePropagator::new(Arc::clone(config))); // Lifecycle Invocation Processor @@ -373,6 +416,7 @@ async fn extension_loop_active( appsec_processor.clone(), &shared_client, ); + log_init_checkpoint(start_time, "trace_agent_started"); let api_runtime_proxy_shutdown_signal = start_api_runtime_proxy( config, @@ -402,6 +446,7 @@ async fn extension_loop_active( aws_config.is_managed_instance_mode(), ) .await?; + log_init_checkpoint(start_time, "telemetry_subscribed"); let otlp_cancel_token = start_otlp_agent( config, @@ -421,6 +466,8 @@ async fn extension_loop_active( "Datadog Next-Gen Extension ready in {:}ms", start_time.elapsed().as_millis().to_string() ); + // Terminal checkpoint so the whole init timeline is one greppable "INIT |" series. + log_init_checkpoint(start_time, "ready"); if aws_config.is_managed_instance_mode() { // Clone Arc references for the background flusher task