diff --git a/Makefile.toml b/Makefile.toml index ad4ecba1..d226501a 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -302,6 +302,10 @@ args = [ # | baseline-live-docker-clean | command | | # | baseline-production-synthetic | command | | # | baseline-production-private | command | | +# | baseline-production-private-addendum | command | | +# | baseline-backfill-10k-docker | command | | +# | baseline-backfill-100k-docker | command | | +# | baseline-soak-docker | command | | [tasks.baseline-live-docker] workspace = false @@ -354,6 +358,38 @@ args = [ "set -euo pipefail; manifest=\"$(printenv ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST || true)\"; if [ -z \"$manifest\" ]; then echo \"ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is required for baseline-production-private\" >&2; exit 1; fi; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=production-private; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", ] +[tasks.baseline-production-private-addendum] +workspace = false +command = "bash" +args = [ + "-lc", + "set -euo pipefail; manifest=\"$(printenv ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST || true)\"; if [ -z \"$manifest\" ]; then echo \"ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST is required for baseline-production-private-addendum\" >&2; exit 1; fi; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; selected_projects=\"$(printenv ELF_BASELINE_PROJECTS || true)\"; if [ -z \"$selected_projects\" ]; then selected_projects=\"ELF\"; fi; addendum=\"$(printenv ELF_BASELINE_PRIVATE_ADDENDUM || true)\"; if [ -z \"$addendum\" ]; then addendum=\"tmp/live-baseline/private-production-addendum.md\"; fi; 
export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=\"$selected_projects\"; export ELF_BASELINE_PROFILE=production-private; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner; ELF_BASELINE_MARKDOWN_REPORT=\"$addendum\" cargo make baseline-live-report; echo \"Private production addendum: $addendum\"", +] + +[tasks.baseline-backfill-10k-docker] +workspace = false +command = "bash" +args = [ + "-lc", + "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; backfill_docs=\"$(printenv ELF_BASELINE_BACKFILL_DOCS || true)\"; if [ -z \"$backfill_docs\" ]; then backfill_docs=\"10000\"; fi; elf_timeout=\"$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)\"; if [ -z \"$elf_timeout\" ]; then elf_timeout=\"14400\"; fi; max_elf_seconds=\"$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)\"; if [ -z \"$max_elf_seconds\" ]; then max_elf_seconds=\"$elf_timeout\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=ELF; export ELF_BASELINE_PROFILE=backfill; export ELF_BASELINE_BACKFILL_DOCS=\"$backfill_docs\"; export ELF_BASELINE_ELF_TIMEOUT_SECONDS=\"$elf_timeout\"; export ELF_BASELINE_MAX_ELF_SECONDS=\"$max_elf_seconds\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", +] + +[tasks.baseline-backfill-100k-docker] +workspace = false +command = "bash" +args = [ + "-lc", + "set -euo pipefail; enabled=\"$(printenv ELF_BASELINE_ENABLE_EXPENSIVE || true)\"; if [ \"$enabled\" != \"1\" ]; then echo \"ELF_BASELINE_ENABLE_EXPENSIVE=1 is required for baseline-backfill-100k-docker\" >&2; exit 1; fi; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; backfill_docs=\"$(printenv ELF_BASELINE_BACKFILL_DOCS || true)\"; if [ -z \"$backfill_docs\" ]; then backfill_docs=\"100000\"; fi; elf_timeout=\"$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)\"; if [ -z \"$elf_timeout\" ]; 
then elf_timeout=\"86400\"; fi; max_elf_seconds=\"$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)\"; if [ -z \"$max_elf_seconds\" ]; then max_elf_seconds=\"$elf_timeout\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=ELF; export ELF_BASELINE_PROFILE=backfill; export ELF_BASELINE_BACKFILL_DOCS=\"$backfill_docs\"; export ELF_BASELINE_ELF_TIMEOUT_SECONDS=\"$elf_timeout\"; export ELF_BASELINE_MAX_ELF_SECONDS=\"$max_elf_seconds\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", +] + +[tasks.baseline-soak-docker] +workspace = false +command = "bash" +args = [ + "-lc", + "set -euo pipefail; head=\"$(git rev-parse HEAD)\"; if [ -n \"$(git status --porcelain)\" ]; then head=\"$head+dirty\"; fi; soak_seconds=\"$(printenv ELF_BASELINE_SOAK_SECONDS || true)\"; if [ -z \"$soak_seconds\" ]; then soak_seconds=\"3600\"; fi; elf_timeout=\"$(printenv ELF_BASELINE_ELF_TIMEOUT_SECONDS || true)\"; if [ -z \"$elf_timeout\" ]; then elf_timeout=\"$((soak_seconds + 1800))\"; fi; max_elf_seconds=\"$(printenv ELF_BASELINE_MAX_ELF_SECONDS || true)\"; if [ -z \"$max_elf_seconds\" ]; then max_elf_seconds=\"$elf_timeout\"; fi; export ELF_BASELINE_ELF_HEAD=\"$head\"; export ELF_BASELINE_PROJECTS=ELF; export ELF_BASELINE_PROFILE=stress; export ELF_BASELINE_SOAK_SECONDS=\"$soak_seconds\"; export ELF_BASELINE_ELF_TIMEOUT_SECONDS=\"$elf_timeout\"; export ELF_BASELINE_MAX_ELF_SECONDS=\"$max_elf_seconds\"; docker compose -f docker-compose.baseline.yml run --build --rm baseline-runner", +] + # Real-world job benchmark smoke # | task | type | cwd | diff --git a/README.md b/README.md index 185e750b..cae2d70b 100644 --- a/README.md +++ b/README.md @@ -145,7 +145,12 @@ with the production embedding provider path, `Qwen3-Embedding-8B`, and those states are reported as limitations, not hidden as proof. 
- The benchmark runner and report publisher are checked in and Docker-isolated: `cargo make baseline-live-docker`, `cargo make baseline-backfill-docker`, - `cargo make baseline-live-report`, and `cargo make baseline-live-docker-clean`. + `cargo make baseline-production-private-addendum`, + `cargo make baseline-backfill-10k-docker`, + `cargo make baseline-backfill-100k-docker`, + `cargo make baseline-soak-docker`, `cargo make baseline-live-report`, and + `cargo make baseline-live-docker-clean`. Expensive 100k and long-soak profiles are + opt-in and do not run in normal checks. Detailed evidence and interpretation: diff --git a/apps/elf-eval/src/bin/live_baseline_elf.rs b/apps/elf-eval/src/bin/live_baseline_elf.rs index 82703ad8..d20ea4dd 100644 --- a/apps/elf-eval/src/bin/live_baseline_elf.rs +++ b/apps/elf-eval/src/bin/live_baseline_elf.rs @@ -212,6 +212,24 @@ struct ResourceEnvelopeEvidence { max_elapsed_seconds: f64, rss_kb: Option<u64>, max_rss_kb: u64, + postgres_database_bytes: Option<i64>, + corpus_dir_bytes: u64, + report_dir_bytes: Option<u64>, + checkpoint_file_bytes: Option<u64>, +} + +#[derive(Debug, Serialize)] +struct CostProxyReport { + schema: &'static str, + scope: &'static str, + embedding_mode: EmbeddingMode, + estimated_input_chars: usize, + estimated_input_tokens: usize, + token_estimation: &'static str, + configured_usd_per_1k_tokens: Option<f64>, + estimated_usd: Option<f64>, + document_count: usize, + query_count: usize, } #[derive(Debug, Serialize)] @@ -240,12 +258,14 @@ struct ElfBaselineReport { reason: String, head: String, embedding: EmbeddingRuntimeReport, + cost_proxy: CostProxyReport, backfill: BackfillReport, indexing: IndexingReport, summary: QuerySummary, check_summary: CheckSummary, checks: Vec<CheckResult>, queries: Vec<QueryResult>, + ops_cases: Vec<OperationalCase>, } #[derive(Debug, Serialize)] @@ -264,6 +284,20 @@ struct QuerySummary { wrong_result_count: usize, latency_ms_total: f64, latency_ms_mean: f64, + latency_ms_p50: f64, + latency_ms_p95: f64, + latency_ms_p99: f64, + latency_ms_max: f64, 
+} + +#[derive(Debug, Serialize)] +struct OperationalCase { + name: &'static str, + default_status: &'static str, + operator_status: &'static str, + command: &'static str, + evidence: &'static str, + safety: &'static str, } #[derive(Debug, Serialize)] @@ -1024,36 +1058,6 @@ fn concurrency_probe_indexes(note_count: usize) -> Vec { indexes } -fn resource_envelope_check(elapsed_seconds: f64) -> CheckResult { - let max_elapsed_seconds = env::var("ELF_BASELINE_MAX_ELF_SECONDS") - .ok() - .and_then(|value| value.parse::().ok()) - .unwrap_or(600.0); - let max_rss_kb = env::var("ELF_BASELINE_MAX_ELF_RSS_KB") - .ok() - .and_then(|value| value.parse::().ok()) - .unwrap_or(1_500_000); - let rss_kb = current_rss_kb(); - let pass = elapsed_seconds <= max_elapsed_seconds && rss_kb.is_none_or(|rss| rss <= max_rss_kb); - - CheckResult { - name: "resource_envelope", - status: if pass { "pass" } else { "lifecycle_fail" }, - reason: if pass { - "ELF live-baseline runtime stayed within the configured local resource envelope." - .to_string() - } else { - "ELF live-baseline runtime exceeded the configured local resource envelope.".to_string() - }, - evidence: serde_json::json!(ResourceEnvelopeEvidence { - elapsed_seconds, - max_elapsed_seconds, - rss_kb, - max_rss_kb, - }), - } -} - fn current_rss_kb() -> Option { let status = fs::read_to_string("/proc/self/status").ok()?; @@ -1065,6 +1069,150 @@ fn current_rss_kb() -> Option { }) } +fn path_size_bytes(path: &Path) -> color_eyre::Result { + let metadata = fs::metadata(path)?; + + if metadata.is_file() { + return Ok(metadata.len()); + } + if !metadata.is_dir() { + return Ok(0); + } + + let mut bytes = 0_u64; + + for entry in fs::read_dir(path)? 
{ + let entry = entry?; + + bytes = bytes.saturating_add(path_size_bytes(&entry.path())?); + } + + Ok(bytes) +} + +fn cost_proxy_report( + notes: &[CorpusNote], + queries: &[QueryResult], + embedding: &EmbeddingRuntimeReport, +) -> CostProxyReport { + let note_chars = notes.iter().map(|note| note.text.len()).sum::<usize>(); + let query_chars = queries.iter().map(|query| query.query.len()).sum::<usize>(); + let estimated_input_chars = note_chars.saturating_add(query_chars); + let estimated_input_tokens = estimated_input_chars.saturating_add(3) / 4; + let configured_usd_per_1k_tokens = env::var("ELF_BASELINE_COST_PER_1K_TOKENS_USD") + .ok() + .and_then(|value| value.parse::<f64>().ok()); + let estimated_usd = + configured_usd_per_1k_tokens.map(|rate| estimated_input_tokens as f64 / 1_000.0 * rate); + + CostProxyReport { + schema: "elf.live_baseline.cost_proxy/v1", + scope: "primary corpus note text plus declared same-corpus query text", + embedding_mode: embedding.mode, + estimated_input_chars, + estimated_input_tokens, + token_estimation: "ceil(ascii_utf8_chars / 4)", + configured_usd_per_1k_tokens, + estimated_usd, + document_count: notes.len(), + query_count: queries.len(), + } +} + +fn latency_percentile(latencies: &[f64], percentile: f64) -> f64 { + if latencies.is_empty() { + return 0.0; + } + + let mut sorted = latencies.to_vec(); + + sorted.sort_by(f64::total_cmp); + + let rank = ((sorted.len().saturating_sub(1)) as f64 * percentile).ceil() as usize; + + sorted[rank.min(sorted.len().saturating_sub(1))] +} + +fn operational_case( + name: &'static str, + default_status: &'static str, + operator_status: &'static str, + command: &'static str, + evidence: &'static str, + safety: &'static str, +) -> OperationalCase { + OperationalCase { name, default_status, operator_status, command, evidence, safety } +} + +fn operational_cases() -> Vec<OperationalCase> { + vec![ + operational_case( + "private_corpus_addendum", + "fails_closed_without_manifest", + "opt_in", + 
"ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST=tmp/private-production-corpus/manifest.json cargo make baseline-production-private-addendum", + "tmp/live-baseline/private-production-addendum.md", + "Markdown addendum reports manifest id, evidence ids, tasks, checks, latency, resource, and cost proxy fields; private text remains in tmp JSON/logs only.", + ), + operational_case( + "backfill_10k_resume", + "not_run", + "opt_in", + "cargo make baseline-backfill-10k-docker", + "tmp/live-baseline/live-baseline-report.json", + "Runs Docker-owned dependencies and records checkpoint resume, duplicates, latency percentiles, resource usage, and cost proxy fields.", + ), + operational_case( + "backfill_100k_resume", + "guarded", + "expensive_opt_in", + "ELF_BASELINE_ENABLE_EXPENSIVE=1 cargo make baseline-backfill-100k-docker", + "tmp/live-baseline/live-baseline-report.json", + "Fails closed unless the expensive-run guard is explicitly enabled.", + ), + operational_case( + "provider_outage", + "not_run", + "documented_operator_probe", + "ELF_BASELINE_ELF_EMBEDDING_MODE=provider with an unavailable embedding endpoint and cargo make baseline-production-synthetic", + "ELF project status incomplete or blocked with provider failure in tmp/live-baseline/ELF.log", + "Use only synthetic or sanitized manifests; do not place provider keys in committed files.", + ), + operational_case( + "compose_start_stop_upgrade", + "documented", + "runbook", + "docs/guide/single_user_production.md Sections 2, 4, and 5", + "storage health, API health, migration check, and post-upgrade search smoke", + "Backup Postgres before binary/config upgrade; rollback restores the previous backup and rebuilds Qdrant.", + ), + operational_case( + "postgres_restore_qdrant_rebuild", + "documented", + "runbook_or_clean_volume_proof", + "docs/guide/single_user_production.md Sections 6 through 9", + "Postgres restored row count, admin qdrant rebuild counts, and search-after-restore response", + "Qdrant remains derived and 
rebuild uses Postgres-held vectors without embedding provider calls.", + ), + operational_case( + "migration_rollback", + "documented", + "runbook", + "docs/guide/single_user_production.md Section 5 rollback path", + "pre-upgrade backup path, restored source rows, qdrant rebuild, and health check", + "No reverse migration is claimed; rollback means previous binary/config plus restored Postgres backup.", + ), + operational_case( + "unattended_soak", + "bounded", + "opt_in", + "ELF_BASELINE_PROJECTS=ELF ELF_BASELINE_PROFILE=stress ELF_BASELINE_SOAK_SECONDS=3600 cargo make baseline-live-docker", + "soak_stability_e2e check and resource_envelope check in tmp/live-baseline/live-baseline-report.json", + "Long soak duration is env-controlled and not part of the default smoke profile.", + ), + ] +} + fn incomplete_check(name: &'static str, reason: &str) -> CheckResult { CheckResult { name, @@ -1269,6 +1417,58 @@ fn git_head() -> color_eyre::Result { Ok(String::from_utf8(output.stdout)?.trim().to_string()) } +async fn resource_envelope_check( + service: &ElfService, + corpus_dir: &Path, + report_path: &Path, + checkpoint_path: &Path, + elapsed_seconds: f64, +) -> CheckResult { + let max_elapsed_seconds = env::var("ELF_BASELINE_MAX_ELF_SECONDS") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(600.0); + let max_rss_kb = env::var("ELF_BASELINE_MAX_ELF_RSS_KB") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(1_500_000); + let rss_kb = current_rss_kb(); + let pass = elapsed_seconds <= max_elapsed_seconds && rss_kb.is_none_or(|rss| rss <= max_rss_kb); + let postgres_database_bytes = postgres_database_bytes(service).await.ok(); + let corpus_dir_bytes = path_size_bytes(corpus_dir).unwrap_or_default(); + let report_dir_bytes = report_path.parent().and_then(|path| path_size_bytes(path).ok()); + let checkpoint_file_bytes = checkpoint_path.metadata().ok().map(|metadata| metadata.len()); + + CheckResult { + name: "resource_envelope", + status: if pass { 
"pass" } else { "lifecycle_fail" }, + reason: if pass { + "ELF live-baseline runtime stayed within the configured local resource envelope." + .to_string() + } else { + "ELF live-baseline runtime exceeded the configured local resource envelope.".to_string() + }, + evidence: serde_json::json!(ResourceEnvelopeEvidence { + elapsed_seconds, + max_elapsed_seconds, + rss_kb, + max_rss_kb, + postgres_database_bytes, + corpus_dir_bytes, + report_dir_bytes, + checkpoint_file_bytes, + }), + } +} + +async fn postgres_database_bytes(service: &ElfService) -> color_eyre::Result { + let bytes = sqlx::query_scalar::<_, i64>("SELECT pg_database_size(current_database())::bigint") + .fetch_one(&service.db.pool) + .await?; + + Ok(bytes) +} + async fn load_existing_backfill_notes( service: &ElfService, ) -> color_eyre::Result> { @@ -1581,6 +1781,11 @@ async fn run(args: Args) -> color_eyre::Result { let fail_count = query_results.len().saturating_sub(pass_count); let latency_ms_total = query_results.iter().map(|result| result.latency_ms).sum::(); let latency_ms_mean = latency_ms_total / query_results.len().max(1) as f64; + let latency_values = query_results.iter().map(|result| result.latency_ms).collect::>(); + let latency_ms_p50 = latency_percentile(&latency_values, 0.50); + let latency_ms_p95 = latency_percentile(&latency_values, 0.95); + let latency_ms_p99 = latency_percentile(&latency_values, 0.99); + let latency_ms_max = latency_values.iter().copied().fold(0.0_f64, f64::max); let retrieval_status = if fail_count == 0 { "retrieval_pass" } else { "retrieval_wrong_result" }; let mut checks = vec![ @@ -1596,7 +1801,16 @@ async fn run(args: Args) -> color_eyre::Result { checks.push(soak_check); } - checks.push(resource_envelope_check(started_at.elapsed().as_secs_f64())); + checks.push( + resource_envelope_check( + &service, + &args.corpus, + &args.out, + &backfill_checkpoint_path, + started_at.elapsed().as_secs_f64(), + ) + .await, + ); let check_summary = summarize_checks(&checks); let 
status = project_status_from_summary(&check_summary); @@ -1613,13 +1827,16 @@ async fn run(args: Args) -> color_eyre::Result { check_summary.not_encoded ) }; + let embedding = embedding_runtime_report(&service.cfg); + let cost_proxy = cost_proxy_report(&notes, &query_results, &embedding); let report = ElfBaselineReport { schema: "elf.live_baseline.elf_result/v1", status, retrieval_status, reason, head: git_head().unwrap_or_else(|_| "unknown".to_string()), - embedding: embedding_runtime_report(&service.cfg), + embedding, + cost_proxy, backfill: backfill.report, indexing: IndexingReport { note_count: notes.len(), @@ -1634,10 +1851,15 @@ async fn run(args: Args) -> color_eyre::Result { wrong_result_count: fail_count, latency_ms_total, latency_ms_mean, + latency_ms_p50, + latency_ms_p95, + latency_ms_p99, + latency_ms_max, }, check_summary, checks, queries: query_results, + ops_cases: operational_cases(), }; drop(service); diff --git a/docker-compose.baseline.yml b/docker-compose.baseline.yml index efdf1fd5..1495166a 100644 --- a/docker-compose.baseline.yml +++ b/docker-compose.baseline.yml @@ -45,6 +45,7 @@ services: EMBEDDING_PROVIDER_ID: ${EMBEDDING_PROVIDER_ID:-} EMBEDDING_TIMEOUT_MS: ${EMBEDDING_TIMEOUT_MS:-} ELF_BASELINE_CONCURRENT_NOTES: ${ELF_BASELINE_CONCURRENT_NOTES:-} + ELF_BASELINE_COST_PER_1K_TOKENS_USD: ${ELF_BASELINE_COST_PER_1K_TOKENS_USD:-} ELF_BASELINE_ELF_EMBEDDING_API_BASE: ${ELF_BASELINE_ELF_EMBEDDING_API_BASE:-} ELF_BASELINE_ELF_EMBEDDING_API_KEY: ${ELF_BASELINE_ELF_EMBEDDING_API_KEY:-} ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS: ${ELF_BASELINE_ELF_EMBEDDING_DIMENSIONS:-} @@ -63,6 +64,7 @@ services: ELF_BASELINE_MAX_ELF_SECONDS: ${ELF_BASELINE_MAX_ELF_SECONDS:-600} ELF_BASELINE_PROFILE: ${ELF_BASELINE_PROFILE:-smoke} ELF_BASELINE_PROJECTS: ${ELF_BASELINE_PROJECTS:-all} + ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST: ${ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST:-} ELF_BASELINE_REPORT_DIR: /workspace/tmp/live-baseline ELF_BASELINE_SCALE_DOCS: 
${ELF_BASELINE_SCALE_DOCS:-120} ELF_BASELINE_SOAK_PROBE_INTERVAL_MS: ${ELF_BASELINE_SOAK_PROBE_INTERVAL_MS:-} @@ -90,7 +92,7 @@ services: - elf-live-baseline-cargo-git:/usr/local/cargo/git - elf-live-baseline-cargo-registry:/usr/local/cargo/registry - elf-live-baseline-target:/workspace/target - - ./tmp/live-baseline:/workspace/tmp/live-baseline + - ./tmp:/workspace/tmp volumes: elf-live-baseline-cargo-git: diff --git a/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md b/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md index d1491423..5dda8783 100644 --- a/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md +++ b/docs/guide/benchmarking/2026-06-09-production-adoption-gate-report.md @@ -263,6 +263,30 @@ Recommended non-blocking follow-ups: typed benchmark improvement opportunities only if external parity coverage remains a roadmap goal. +## Post-Gate Repeatability Extension + +XY-850 extends the live-baseline runner after this gate without changing the gate's +historical verdict. The private-corpus result remains bounded until an operator-owned +manifest is supplied. + +New repeatable paths: + +- `cargo make baseline-production-private-addendum` runs the private profile and writes + a safe Markdown addendum to `tmp/live-baseline/private-production-addendum.md` by + default. It still fails closed when `ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST` is + absent. +- `cargo make baseline-backfill-10k-docker` runs an ELF-only 10k generated backfill + resume profile. +- `ELF_BASELINE_ENABLE_EXPENSIVE=1 cargo make baseline-backfill-100k-docker` runs the + guarded 100k profile. Without the guard, the task exits before starting Docker work. +- `cargo make baseline-soak-docker` runs an explicit ELF-only soak profile, defaulting + to one hour unless `ELF_BASELINE_SOAK_SECONDS` is set. 
+ +New report fields include duplicate-source count, checkpoint resume state, latency +mean/P50/P95/P99/max, RSS and disk-size proxies, a planning-only cost proxy, and +operator-case commands for provider outage, migration rollback, Docker Compose +start/stop/upgrade, Postgres restore, Qdrant rebuild, and unattended soak. + ## Runner Repairs Made By This Gate Two small runner fixes were required to collect the fresh evidence: diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index 9717c2de..1d58857f 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -25,6 +25,7 @@ cleanup, use `docs/guide/single_user_production.md`. - `live_baseline_benchmark.md`: run, clean up, publish, and interpret the live Docker-only benchmark matrix, including generated public and production-corpus + profiles, private addendum publication, opt-in 10k/100k backfill, and soak profiles. - `2026-06-09-live-baseline-report.md`: checked-in evidence snapshot for the June 9, 2026 ELF production-provider stress run and all-project smoke comparison. diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index c29f6125..40a04c4b 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -66,6 +66,9 @@ query references an unknown evidence ID. It does not fall back to the checked-in synthetic fixture. Use `ELF_BASELINE_BACKFILL_DOCS` to set the generated corpus size for the backfill profile; values such as `10000` are supported for operator-controlled stress runs. +Use `cargo make baseline-backfill-10k-docker` for the checked-in 10k operator profile. +Use `cargo make baseline-backfill-100k-docker` only with +`ELF_BASELINE_ENABLE_EXPENSIVE=1`; the task fails closed without that explicit guard. 
Use `ELF_BASELINE_CONCURRENT_NOTES`, `ELF_BASELINE_MAX_ELF_SECONDS`, and `ELF_BASELINE_MAX_ELF_RSS_KB` to tune ELF's concurrent-write and resource-envelope checks. @@ -73,7 +76,9 @@ Use `ELF_BASELINE_SOAK_SECONDS`, `ELF_BASELINE_SOAK_ROUNDS`, and `ELF_BASELINE_SOAK_PROBE_INTERVAL_MS` to tune ELF's repeated write/search soak window. The smoke profile does not run soak by default; the scale/full profiles run a short 15-second soak by default, and the stress profile runs a 60-second soak by -default. +default. Use `cargo make baseline-soak-docker` for an explicit one-hour ELF-only soak, +or override `ELF_BASELINE_SOAK_SECONDS` for a shorter or longer operator-controlled +window. Use `ELF_BASELINE_ELF_EMBEDDING_MODE=provider` plus `ELF_BASELINE_ELF_EMBEDDING_API_BASE`, `ELF_BASELINE_ELF_EMBEDDING_API_KEY`, `ELF_BASELINE_ELF_EMBEDDING_MODEL`, and @@ -94,6 +99,20 @@ directory by default, intentionally interrupts the first pass unless `ELF_BASELINE_BACKFILL_BATCH_SIZE`, `ELF_BASELINE_BACKFILL_INTERRUPT_AFTER`, `ELF_BASELINE_BACKFILL_CHECKPOINT`, and `ELF_BASELINE_WORKER_CONCURRENCY` when measuring import and indexing throughput. +Set `ELF_BASELINE_COST_PER_1K_TOKENS_USD` to attach a planning-only cost proxy to +ELF reports. The proxy estimates input tokens from primary corpus note text plus +declared same-corpus query text; it is not a billing statement. + +The ELF report records: + +- duplicate source-note count and checkpoint resume state; +- query latency mean, P50, P95, P99, and max; +- local RSS, Postgres database bytes, corpus bytes, report-directory bytes, and + checkpoint-file bytes; +- the optional cost proxy described above; +- operator-case commands for private addendum, 10k/100k resume, provider outage, + Docker Compose start/stop/upgrade, migration rollback, Postgres restore, Qdrant + rebuild, and unattended soak. 
Current external same-corpus adapters: @@ -163,6 +182,9 @@ ELF_BASELINE_PROFILE=scale ELF_BASELINE_SCALE_DOCS=240 cargo make baseline-live- ELF_BASELINE_PROFILE=stress cargo make baseline-live-docker ELF_BASELINE_PROJECTS=ELF ELF_BASELINE_PROFILE=backfill cargo make baseline-live-docker cargo make baseline-backfill-docker +cargo make baseline-backfill-10k-docker +ELF_BASELINE_ENABLE_EXPENSIVE=1 cargo make baseline-backfill-100k-docker +ELF_BASELINE_SOAK_SECONDS=3600 cargo make baseline-soak-docker ``` To iterate on one or more project adapters without rerunning the full matrix: @@ -188,6 +210,27 @@ cargo make baseline-production-private The private manifest can contain sanitized inline `text` fields or `local_path` fields that point to local sanitized text/Markdown files. Keep private manifests and local evidence under `tmp/` or outside the repository. `tmp/` is ignored by git. +The manifest `manifest_id`, evidence IDs, and query IDs are report-visible labels; keep +them lower-case ASCII identifiers and do not encode private text in those fields. + +To run the same private profile and publish a safe Markdown addendum under `tmp/`: + +```sh +ELF_BASELINE_PRODUCTION_CORPUS_MANIFEST=tmp/private-production-corpus/manifest.json \ +cargo make baseline-production-private-addendum +``` + +The default addendum path is: + +```text +tmp/live-baseline/private-production-addendum.md +``` + +Override it with `ELF_BASELINE_PRIVATE_ADDENDUM`. The addendum intentionally reports +manifest id, evidence ids, task labels, checks, latency, backfill, resource, cost +proxy, and operator-case fields without embedding private evidence text or local +private file paths. Raw JSON and logs remain under `tmp/live-baseline/` and must be +reviewed before any manual copy into durable docs. The only host artifact is: @@ -219,6 +262,8 @@ generated public `smoke`, `scale`, or `stress` profiles is not enough for person production adoption. 
Cite a `production-synthetic` report for fixture coverage, and cite a `production-private` report when making a private-corpus production-readiness claim. +If no operator-owned private manifest is supplied, the private-corpus path is a +bounded failure, not a pass. ## Publish A Markdown Report diff --git a/docs/spec/production_corpus_manifest_v1.md b/docs/spec/production_corpus_manifest_v1.md index 4d582958..05bc417e 100644 --- a/docs/spec/production_corpus_manifest_v1.md +++ b/docs/spec/production_corpus_manifest_v1.md @@ -15,7 +15,8 @@ query tasks, evidence expectations, and private-content safety rules. A production corpus manifest is a JSON object with: - `schema`: exactly `elf.production_corpus_manifest/v1`. -- `manifest_id`: stable lower-risk identifier for the corpus snapshot. +- `manifest_id`: stable lower-risk identifier for the corpus snapshot. Allowed + shape: `[a-z0-9][a-z0-9_.-]{1,80}`. - `description`: optional English summary. - `evidence`: non-empty array of production-style memory evidence items. - `queries`: non-empty array of task-oriented retrieval checks. @@ -44,7 +45,8 @@ unsanitized private conversation content. Each `queries[]` item must include: -- `query_id`: stable query identifier. +- `query_id`: stable query identifier. Allowed shape: + `[a-z0-9][a-z0-9_.-]{1,80}`. - `task`: one of `resume_lane`, `recover_exact_command`, `explain_stale_blocker`, `find_prior_decision`, `compare_project_status`, or `detect_contradiction_update`. 
diff --git a/scripts/live-baseline-benchmark.sh b/scripts/live-baseline-benchmark.sh index a0991a65..63f62465 100755 --- a/scripts/live-baseline-benchmark.sh +++ b/scripts/live-baseline-benchmark.sh @@ -398,6 +398,8 @@ if manifest.get("schema") != "elf.production_corpus_manifest/v1": fail("schema must be elf.production_corpus_manifest/v1") manifest_id = require_string(manifest, "manifest_id", "$") +if not id_re.fullmatch(manifest_id): + fail("$.manifest_id must be lower-case ASCII and safe for reports") evidence_items = manifest.get("evidence") if not isinstance(evidence_items, list) or not evidence_items: fail("$.evidence must be a non-empty array") @@ -443,12 +445,18 @@ for index, item in enumerate(evidence_items): ) queries = [] +query_ids = set() task_counts = Counter() for index, item in enumerate(query_items): context = f"$.queries[{index}]" if not isinstance(item, dict): fail(f"{context} must be an object") query_id = require_string(item, "query_id", context) + if not id_re.fullmatch(query_id): + fail(f"{context}.query_id must be lower-case ASCII and safe for reports") + if query_id in query_ids: + fail(f"{context}.query_id duplicates an earlier item") + query_ids.add(query_id) task = require_string(item, "task", context) if task not in allowed_tasks: fail(f"{context}.task must be one of {sorted(allowed_tasks)}") @@ -599,9 +607,12 @@ json_record() { elapsed_seconds: $elapsed_seconds, adapter: $adapter[0], embedding: ($checks[0].embedding // null), + cost_proxy: ($checks[0].cost_proxy // null), query_summary: ($checks[0].query_summary // null), queries: ($checks[0].queries // null), backfill: ($checks[0].backfill // null), + resource_envelope: ([$checks[0].checks[]? 
| select(.name == "resource_envelope") | .evidence][0] // null), + ops_cases: ($checks[0].ops_cases // null), check_summary: $checks[0].check_summary, checks: $checks[0].checks }' >>"${RECORDS}" @@ -643,6 +654,9 @@ json_record() { query_summary: null, queries: null, backfill: null, + cost_proxy: null, + resource_envelope: null, + ops_cases: null, adapter: $adapter[0], check_summary: { total: 1, @@ -784,8 +798,30 @@ finish_report() { mean: ( [.[] | select(.query_summary != null) | .query_summary.latency_ms_mean // 0] as $means | if ($means | length) == 0 then 0 else (($means | add) / ($means | length)) end - ) + ), + p50: ( + [.[] | select(.query_summary != null) | .query_summary.latency_ms_p50 // 0] as $values + | if ($values | length) == 0 then 0 else (($values | add) / ($values | length)) end + ), + p95: ( + [.[] | select(.query_summary != null) | .query_summary.latency_ms_p95 // 0] as $values + | if ($values | length) == 0 then 0 else (($values | add) / ($values | length)) end + ), + p99: ( + [.[] | select(.query_summary != null) | .query_summary.latency_ms_p99 // 0] as $values + | if ($values | length) == 0 then 0 else (($values | add) / ($values | length)) end + ), + max: ([.[] | .query_summary.latency_ms_max // 0] | max // 0) }, + cost_proxy: { + projects: [.[] | select(.cost_proxy != null) | {project, cost_proxy}], + estimated_usd: ([.[] | .cost_proxy.estimated_usd? // empty] | add // null), + estimated_input_tokens: ([.[] | .cost_proxy.estimated_input_tokens // 0] | add // 0) + }, + resource_usage: { + projects: [.[] | select(.resource_envelope != null) | {project, resource_envelope}] + }, + ops_cases: [.[] | select(.ops_cases != null) | {project, cases: .ops_cases}], projects: . 
}' "${RECORDS}" >"${REPORT}" } @@ -852,6 +888,10 @@ project_elf() { "status": "real", "surface": "parallel add_note calls followed by worker indexing and search probes" }, + "scale_stress_profile": { + "status": "real", + "surface": "profile-selected generated or production corpus size plus soak and resource-envelope checks" + }, "soak_profile": { "status": "real", "surface": "profile-controlled repeated write/search stability window" @@ -871,7 +911,7 @@ JSON if run_cmd "${project}: same-corpus retrieval" "$(elf_timeout_seconds)" "${log_path}" \ "cd '${ROOT_DIR}' && cargo run -p elf-eval --bin live_baseline_elf -- --config config/local/elf.docker.toml --corpus '${CORPUS_DIR}' --queries '${REPORT_DIR}/queries.json' --out '${result_path}'"; then if [[ -s "${result_path}" ]] && jq -e '.checks and .check_summary' "${result_path}" >/dev/null 2>&1; then - jq '{embedding, query_summary: .summary, queries, backfill, check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" + jq '{embedding, cost_proxy, query_summary: .summary, queries, backfill, ops_cases, check_summary, checks}' "${result_path}" >"${REPORT_DIR}/${project}-checks.json" fi if [[ -s "${result_path}" ]] && jq -e --argjson document_count "${DOCUMENT_COUNT}" --argjson query_count "${QUERY_COUNT}" ' .schema == "elf.live_baseline.elf_result/v1" and @@ -895,7 +935,7 @@ JSON ' "${result_path}" >/dev/null; then json_record "${project}" "${repo}" "${head}" "pass" "retrieval_pass" \ "$(jq -r '.reason' "${result_path}")" \ - "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability" + "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability; latency/resource/cost proxies" return fi @@ -903,19 +943,19 @@ JSON json_record "${project}" "${repo}" "${head}" "$(jq -r '.status // "incomplete"' "${result_path}")" \ "$(jq -r 
'.retrieval_status // "retrieval_failed"' "${result_path}")" \ "$(jq -r '.reason // "ELF result did not satisfy live baseline pass criteria"' "${result_path}")" \ - "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability" + "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability; latency/resource/cost proxies" return fi json_record "${project}" "${repo}" "${head}" "incomplete" "runtime_failed" \ "ELF command completed but did not write a valid live-baseline result; inspect ELF.log for the runtime error" \ - "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability" + "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability; latency/resource/cost proxies" return fi json_record "${project}" "${repo}" "${head}" "incomplete" "runtime_failed" \ "ELF same-corpus retrieval command failed in Docker" \ - "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability" + "${project}.log" "checkpointed add_note backfill; bounded worker outbox indexing; rebuild_qdrant; search_raw; concurrent writes; soak stability; latency/resource/cost proxies" } project_agentmemory() { diff --git a/scripts/live-baseline-report-to-md.sh b/scripts/live-baseline-report-to-md.sh index 6b2605db..38ef83ff 100755 --- a/scripts/live-baseline-report-to-md.sh +++ b/scripts/live-baseline-report-to-md.sh @@ -53,6 +53,8 @@ render_report() { ("- Queries: `" + (.corpus.query_count | tostring) + "`"), ("- Wrong-result count: `" + ((.wrong_result_count // 0) | tostring) + "`"), ("- Query latency mean: `" + ((.latency_ms.mean // 0) | tostring) + " ms`"), + ("- Query latency P50/P95/P99: 
`" + ((.latency_ms.p50 // 0) | tostring) + " ms`, `" + ((.latency_ms.p95 // 0) | tostring) + " ms`, `" + ((.latency_ms.p99 // 0) | tostring) + " ms`"), + ("- Query latency max: `" + ((.latency_ms.max // 0) | tostring) + " ms`"), ("- Project summary: `" + (.summary.pass // 0 | tostring) + " pass`, `" + (.summary.wrong_result // 0 | tostring) + " wrong_result`, `" + (.summary.lifecycle_fail // 0 | tostring) + " lifecycle_fail`, `" + (.summary.blocked // 0 | tostring) + " blocked`, `" + (.summary.incomplete // 0 | tostring) + " incomplete`, `" + (.summary.not_encoded // 0 | tostring) + " not_encoded`"), ("- Same-corpus summary: `" + (.same_corpus_summary.pass // 0 | tostring) + " pass`, `" + (.same_corpus_summary.wrong_result // 0 | tostring) + " wrong_result`, `" + (.same_corpus_summary.blocked // 0 | tostring) + " blocked`, `" + (.same_corpus_summary.incomplete // 0 | tostring) + " incomplete`, `" + (.same_corpus_summary.not_encoded // 0 | tostring) + " not_encoded`"), ("- Full check summary: `" + (.full_check_summary.pass // 0 | tostring) + "/" + (.full_check_summary.total // 0 | tostring) + " pass`, `" + (.full_check_summary.wrong_result // 0 | tostring) + " wrong_result`, `" + (.full_check_summary.lifecycle_fail // 0 | tostring) + " lifecycle_fail`, `" + (.full_check_summary.blocked // 0 | tostring) + " blocked`, `" + (.full_check_summary.incomplete // 0 | tostring) + " incomplete`, `" + (.full_check_summary.not_encoded // 0 | tostring) + " not_encoded`"), @@ -86,7 +88,54 @@ render_report() { + " | `" + (.adapter.behaviors.update.status | md) + "`" + " | `" + (.adapter.behaviors.delete_or_expire.status | md) + "`" + " | `" + (.adapter.behaviors.cold_start_reload.status | md) + "`" - + " | `" + (.adapter.behaviors.scale_stress_profile.status | md) + "` |" + + " | `" + ( + .adapter.behaviors.scale_stress_profile.status + // .adapter.behaviors.soak_profile.status + // .adapter.behaviors.resource_envelope.status + | md + ) + "` |" + ), + "" + else empty end + ), + ( 
+ [.projects[] | select(.cost_proxy != null)] as $costed + | if ($costed | length) > 0 then + "## Cost Proxy", + "", + "This is an input-size proxy for planning provider-backed runs, not a billing claim.", + "", + "| Project | Scope | Mode | Estimated Input Tokens | Rate | Estimated Cost |", + "| --- | --- | --- | --- | --- | --- |", + ( + $costed[] + | "| " + (.project | md) + + " | " + (.cost_proxy.scope | md) + + " | `" + (.cost_proxy.embedding_mode | md) + "`" + + " | `" + (.cost_proxy.estimated_input_tokens | tostring) + "`" + + " | `" + ((.cost_proxy.configured_usd_per_1k_tokens // "-") | tostring) + "`" + + " | `" + ((.cost_proxy.estimated_usd // "-") | tostring) + "` |" + ), + "" + else empty end + ), + ( + [.projects[] | select(.resource_envelope != null)] as $resources + | if ($resources | length) > 0 then + "## Resource Usage", + "", + "| Project | Elapsed | RSS KB | Max RSS KB | Postgres Bytes | Corpus Bytes | Report Bytes | Checkpoint Bytes |", + "| --- | --- | --- | --- | --- | --- | --- | --- |", + ( + $resources[] + | "| " + (.project | md) + + " | `" + (.resource_envelope.elapsed_seconds | tostring) + "s`" + + " | `" + ((.resource_envelope.rss_kb // "-") | tostring) + "`" + + " | `" + (.resource_envelope.max_rss_kb | tostring) + "`" + + " | `" + ((.resource_envelope.postgres_database_bytes // "-") | tostring) + "`" + + " | `" + ((.resource_envelope.corpus_dir_bytes // "-") | tostring) + "`" + + " | `" + ((.resource_envelope.report_dir_bytes // "-") | tostring) + "`" + + " | `" + ((.resource_envelope.checkpoint_file_bytes // "-") | tostring) + "` |" ), "" else empty end @@ -141,8 +190,8 @@ render_report() { | if ($backfilled | length) > 0 then "## Backfill", "", - "| Project | Sources | Completed | Batch | Workers | Resume | Duplicates | Backfill Elapsed |", - "| --- | --- | --- | --- | --- | --- | --- | --- |", + "| Project | Sources | Completed | Batch | Workers | Resume | Attempts | Skipped | Duplicates | Backfill Elapsed |", + "| --- | --- | --- 
| --- | --- | --- | --- | --- | --- | --- |", ( $backfilled[] | "| " + (.project | md) @@ -158,12 +207,36 @@ render_report() { "disabled" end ) + "`" + + " | `" + ((.backfill.resume.resume_attempts // 0) | tostring) + "`" + + " | `" + ((.backfill.skipped_completed // 0) | tostring) + "`" + " | `" + ((.backfill.duplicate_source_notes | length) | tostring) + "`" + " | `" + (.backfill.elapsed_seconds | tostring) + "s` |" ), "" else empty end ), + ( + [.ops_cases[]?] as $groups + | if ($groups | length) > 0 then + "## Operational Cases", + "", + "| Project | Case | Default Status | Operator Status | Command | Evidence | Safety |", + "| --- | --- | --- | --- | --- | --- | --- |", + ( + $groups[] + | .project as $project + | .cases[] + | "| " + ($project | md) + + " | `" + (.name | md) + "`" + + " | `" + (.default_status | md) + "`" + + " | `" + (.operator_status | md) + "`" + + " | `" + (.command | md) + "`" + + " | " + (.evidence | md) + + " | " + (.safety | md) + " |" + ), + "" + else empty end + ), "## Result Semantics", "", "- `pass`: every encoded check for the selected project and profile passed.",