diff --git a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/dropped_evidence_filter.json b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/dropped_evidence_filter.json index 32daf4f..d950c52 100644 --- a/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/dropped_evidence_filter.json +++ b/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux/dropped_evidence_filter.json @@ -25,18 +25,49 @@ "adapter_response": { "adapter_id": "fixture_operator_ux", "answer": { - "content": "The auth retry policy note is the root cause; no expected deployment evidence was dropped.", + "content": "The expected evidence was dropped after recall by the read-profile filter; the auth retry policy note was only the selected decoy.", "claims": [ { - "claim_id": "wrong_root_cause", - "text": "No expected evidence was dropped.", - "evidence_ids": ["trace-dropped-decoy"], + "claim_id": "root_cause", + "text": "The expected evidence was dropped after recall by the read-profile filter.", + "evidence_ids": ["trace-dropped-expected"], "confidence": "high" } ], - "evidence_ids": ["trace-dropped-decoy"], + "evidence_ids": ["trace-dropped-expected"], "latency_ms": 2.4, - "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0} + "cost": {"currency": "USD", "amount": 0.0, "input_tokens": 0, "output_tokens": 0}, + "trace_explainability": { + "trace_id": "11111111-1111-4111-8111-111111111111", + "failure_stage": "filter.read_profile", + "failure_reason": "Expected evidence survived recall.candidates but was removed by the read-profile scope filter before final selection.", + "stages": [ + { + "stage_name": "recall.candidates", + "kept_evidence": ["trace-dropped-expected", "trace-dropped-decoy"], + "dropped_evidence": [], + "demoted_evidence": [], + "distractor_evidence": ["trace-dropped-decoy"], + "notes": "Candidate recall found both expected evidence and the decoy top note." 
+ }, + { + "stage_name": "filter.read_profile", + "kept_evidence": ["trace-dropped-decoy"], + "dropped_evidence": ["trace-dropped-expected"], + "demoted_evidence": [], + "distractor_evidence": ["trace-dropped-decoy"], + "notes": "The expected evidence failed the read-profile scope check." + }, + { + "stage_name": "selection.final", + "kept_evidence": ["trace-dropped-decoy"], + "dropped_evidence": ["trace-dropped-expected"], + "demoted_evidence": [], + "distractor_evidence": ["trace-dropped-decoy"], + "notes": "Final selection only saw the decoy after filtering." + } + ] + } } } }, diff --git a/apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json b/apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json index 56dd226..9a7971e 100644 --- a/apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json +++ b/apps/elf-eval/fixtures/real_world_memory/retrieval/stage_explainability_wrong_result.json @@ -32,9 +32,16 @@ "adapter_response": { "adapter_id": "fixture_retrieval", "answer": { - "content": "The trace shows the expected evidence was present in recall.candidates but demoted at rerank.score; however, the selected answer followed the stale top-k smoke-only evidence.", - "claims": [], - "evidence_ids": ["stage-decoy"], + "content": "Expected evidence was present in recall.candidates but demoted at rerank.score; the selected stale top-k smoke-only evidence was the decoy to repair against.", + "claims": [ + { + "claim_id": "stage_attribution", + "text": "Expected evidence was present in recall.candidates but demoted at rerank.score.", + "evidence_ids": ["stage-target"], + "confidence": "high" + } + ], + "evidence_ids": ["stage-target"], "latency_ms": 18.2, "cost": { "currency": "USD", @@ -202,5 +209,5 @@ "trace_evidence": ["stage-target", "stage-decoy"], "ux_gaps": [] }, - "tags": ["synthetic", "operator_debugging_ux", "trace_explainability", "wrong_result", 
"no_live_claim"] + "tags": ["synthetic", "operator_debugging_ux", "trace_explainability", "stage_attribution", "no_live_claim"] } diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index cc665cb..291dbbf 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -86,6 +86,10 @@ fn find_by_field<'a>(items: &'a [Value], field: &str, expected: &str) -> Result< .ok_or_else(|| eyre::eyre!("missing item with {field} = {expected}")) } +fn array_contains_str(value: &Value, pointer: &str, expected: &str) -> Result<bool> { + Ok(array_at(value, pointer)?.iter().any(|item| item.as_str() == Some(expected))) +} + fn set_json_pointer(value: &mut Value, pointer: &str, replacement: Value) -> Result<()> { let target = value.pointer_mut(pointer).ok_or_else(|| eyre::eyre!("missing JSON pointer {pointer}"))?; @@ -171,13 +175,18 @@ fn operator_debug_fixture_reports_trace_links_and_failure_details() -> Result<() assert_eq!(report.pointer("/summary/raw_sql_needed_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/trace_incomplete_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/operator_ux_gap_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); - assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(1) + ); let jobs = array_at(&report, "/jobs")?; let dropped = find_by_field(jobs, "/job_id", "operator-debug-dropped-evidence-001")?; - 
assert_eq!(dropped.pointer("/status").and_then(Value::as_str), Some("unsupported_claim")); + assert_eq!(dropped.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!( dropped.pointer("/operator_debug/raw_sql_needed").and_then(Value::as_bool), Some(false) @@ -190,6 +199,21 @@ fn operator_debug_fixture_reports_trace_links_and_failure_details() -> Result<() dropped.pointer("/operator_debug/viewer_url").and_then(Value::as_str), Some("/viewer?trace_id=11111111-1111-4111-8111-111111111111") ); + assert_eq!( + dropped.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("filter.read_profile") + ); + assert!(array_contains_str( + dropped, + "/trace_explainability/stages/1/dropped_evidence", + "trace-dropped-expected" + )?); + assert!(array_contains_str( + dropped, + "/trace_explainability/stages/1/distractor_evidence", + "trace-dropped-decoy" + )?); + assert!(array_contains_str(dropped, "/produced_evidence", "trace-dropped-expected")?); Ok(()) } @@ -422,20 +446,20 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { let report = run_json_report_from(real_world_memory_fixture_dir())?; assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(27)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(25)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(26)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); assert_eq!( 
report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(0.938) + Some(0.958) ); assert_eq!( report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), - Some(0.02) + Some(0.0) ); - assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), @@ -465,17 +489,17 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), Some(55) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(52)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.945)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.945)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.945)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(53)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.964)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.964)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.964)); assert_eq!( report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), Some(1) ); assert_eq!( report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), - Some(1) + Some(0) ); assert_eq!( report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64), @@ -504,6 +528,7 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { "personalization", 
"consolidation", "knowledge_compilation", + "operator_debugging_ux", ] { let suite = find_by_field(suites, "/suite_id", suite_id)?; @@ -516,7 +541,7 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { let debug_suite = find_by_field(suites, "/suite_id", "operator_debugging_ux")?; - assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("pass")); let jobs = array_at(&report, "/jobs")?; let rebuild = find_by_field(jobs, "/job_id", "trust-sot-rebuild-001")?; @@ -528,10 +553,12 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { assert_eq!(redaction.pointer("/redaction_leak_count").and_then(Value::as_u64), Some(0)); assert_eq!(personalization.pointer("/scope_check_count").and_then(Value::as_u64), Some(1)); assert_eq!(personalization.pointer("/scope_correct_count").and_then(Value::as_u64), Some(1)); + assert_eq!(stage_job.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!( stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), Some("rerank.score") ); + assert!(array_contains_str(stage_job, "/produced_evidence", "stage-target")?); Ok(()) } @@ -541,15 +568,15 @@ fn retrieval_fixtures_report_quality_and_trace_attribution() -> Result<()> { let report = run_json_report_from(retrieval_fixture_dir())?; assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(6)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(0.857) + Some(1.0) ); assert_eq!( 
report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), - Some(0.143) + Some(0.0) ); assert_eq!( report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), @@ -557,7 +584,7 @@ fn retrieval_fixtures_report_quality_and_trace_attribution() -> Result<()> { ); assert_eq!( report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), - Some(1) + Some(0) ); let suites = array_at(&report, "/suites")?; @@ -566,23 +593,76 @@ fn retrieval_fixtures_report_quality_and_trace_attribution() -> Result<()> { assert_eq!(retrieval_suite.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!(retrieval_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); - assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("pass")); let jobs = array_at(&report, "/jobs")?; let stage_job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; - assert_eq!(stage_job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(stage_job.pointer("/status").and_then(Value::as_str), Some("pass")); assert_eq!( stage_job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), Some("rerank.score") ); assert_eq!( stage_job.pointer("/retrieval_quality/expected_evidence_recall").and_then(Value::as_f64), - Some(0.0) + Some(1.0) ); assert_eq!( stage_job.pointer("/retrieval_quality/irrelevant_context_ratio").and_then(Value::as_f64), - Some(1.0) + Some(0.0) + ); + + Ok(()) +} + +#[test] +fn stage_attribution_fixture_still_fails_when_decoy_is_used() -> Result<()> { + let fixture_path = retrieval_fixture_dir().join("stage_explainability_wrong_result.json"); + let mut fixture = serde_json::from_str::<Value>(&fs::read_to_string(fixture_path)?)?; + + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/content", + Value::String( + "The trace shows the 
expected evidence was present in recall.candidates but demoted at rerank.score; however, the selected answer followed the stale top-k smoke-only evidence.".to_string(), + ), + )?; + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/claims", + serde_json::json!([]), + )?; + set_json_pointer( + &mut fixture, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!(["stage-decoy"]), + )?; + + let temp_dir = + env::temp_dir().join(format!("elf-real-world-stage-decoy-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("stage_decoy.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let report = run_json_report_from(temp_dir)?; + + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + assert_eq!( + report.pointer("/summary/wrong_result_stage_attribution_count").and_then(Value::as_u64), + Some(1) + ); + + let jobs = array_at(&report, "/jobs")?; + let job = find_by_field(jobs, "/job_id", "operator-debug-stage-attribution-001")?; + + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("rerank.score") + ); + assert_eq!( + job.pointer("/retrieval_quality/trap_context_count").and_then(Value::as_u64), + Some(1) ); Ok(()) diff --git a/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md b/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md index ac2415f..4b7944c 100644 --- a/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md +++ b/docs/guide/benchmarking/2026-06-09-operator-debugging-ux-report.md @@ -3,23 +3,38 @@ Goal: Publish a Markdown summary for one generated real_world_job benchmark report. Read this when: You need a durable smoke report for real-world agent memory job fixtures. Inputs: `tmp/real-world-job/real-world-job-operator-ux-report.json`. 
-Depends on: `apps/elf-eval/fixtures/real_world_job/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`. +Depends on: `apps/elf-eval/fixtures/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`. Verification: Compare this Markdown summary with the source JSON before committing. ## Summary - Run ID: `real-world-job-operator-ux` -- Generated at: `2026-06-09T14:52:05.906877Z` -- Runner version: `0.2.0-9b60dee3de54705a71a683d9a36b48d94ce8e752-aarch64-apple-darwin` +- Generated at: `2026-06-10T02:56:58.31558Z` +- Runner version: `0.2.0-5d527b9c5a0bd90b88b905d337f658b7d9eddd05-aarch64-apple-darwin` - Corpus profile: `synthetic` - Adapter: `fixture_operator_ux` (offline_fixture_response) - Jobs: `5` -- Encoded suites: `1` -- Not-encoded suites: `10` -- Status summary: `4` pass, `0` wrong_result, `0` lifecycle_fail, `0` incomplete, `0` blocked, `1` unsupported_claim -- Unsupported claim count: `1` -- Wrong-result count: `3` -- Mean score: `0.800` +- Suites with encoded jobs: `1` +- Suites with `not_encoded` status: `10` +- Status summary: `5` pass, `0` wrong_result, `0` lifecycle_fail, `0` incomplete, `0` blocked, `0` not_encoded, `0` unsupported_claim +- Unsupported claim count: `0` +- Wrong-result count: `0` +- Stale-answer count: `0` +- Conflict detections: `0` +- Update rationales available: `0` +- Temporal validity not encoded: `0` +- Evidence coverage: `6/6` (`1.000`) +- Source-ref coverage: `6/6` (`1.000`) +- Quote coverage: `6/6` (`1.000`) +- Stale retrieval count: `0` +- Scope correctness: `0/0` (`0.000`), violations `0` +- Redaction leak count: `0` +- Qdrant rebuild cases: `0` encoded, `0` pass +- Expected evidence recall: `1.000` (6/6) +- Irrelevant context ratio: `0.000` (0 irrelevant) +- Trace explainability: `1` job(s), `0` wrong-result stage attribution(s) +- Consolidation source mutation count: `0` +- Mean score: `1.000` - Mean latency: `3.100 ms` - Cost: `0.000 USD` - Operator-debug jobs: `5` @@ -28,31 +43,43 
@@ Verification: Compare this Markdown summary with the source JSON before committi - Operator UX gaps: `0` - Private corpus redaction: `no_private_corpus` +## Capture And Integration Coverage + +The real-world job runner is fixture-backed. This section separates encoded evidence from live adapter claims. + +| Class | Behaviors | +| --- | --- | +| real | - | +| fixture-backed | - | +| mocked | - | +| blocked | - | +| not encoded | No capture/integration behavior was declared by encoded fixtures. | + ## Suites -| Suite | Status | Jobs | Score | Unsupported Claims | Wrong Results | Reason | -| --- | --- | ---: | ---: | ---: | ---: | --- | -| trust_source_of_truth | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | -| work_resume | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | -| project_decisions | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | -| retrieval | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | -| memory_evolution | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | -| consolidation | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | -| knowledge_compilation | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | -| operator_debugging_ux | `unsupported_claim` | 5 | `0.800` | 1 | 3 | At least one encoded job produced an unsupported claim. | -| capture_integration | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | -| production_ops | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | -| personalization | `not_encoded` | 0 | `-` | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. 
| +| Suite | Status | Jobs | Score | Evidence Recall | Irrelevant Context | Trace Explain | Stale Answers | Conflicts | Update Rationales | Temporal Gaps | Unsupported Claims | Wrong Results | Reason | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | +| trust_source_of_truth | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| work_resume | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| project_decisions | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| retrieval | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| memory_evolution | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| consolidation | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| knowledge_compilation | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| operator_debugging_ux | `pass` | 5 | `1.000` | `1.000` | `0.000` | 1 | 0 | 0 | 0 | 0 | 0 | 0 | All 5 encoded job(s) passed. | +| capture_integration | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| production_ops | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. | +| personalization | `not_encoded` | 0 | `-` | `-` | `-` | 0 | 0 | 0 | 0 | 0 | 0 | 0 | No checked-in real_world_job fixture is encoded for this suite. 
| ## Jobs -| Suite | Job | Status | Score | Expected Evidence | Produced Evidence | Unsupported Claims | Wrong Results | Latency | Cost | -| --- | --- | --- | ---: | --- | --- | ---: | ---: | ---: | --- | -| operator_debugging_ux | operator-debug-dropped-evidence-001 | `unsupported_claim` | `0.000` | `trace-dropped-expected` | `trace-dropped-decoy` | 1 | 3 | `2.400 ms` | `0.000 USD` | -| operator_debugging_ux | operator-debug-provider-latency-001 | `pass` | `1.000` | `trace-provider-timeout` | `trace-provider-timeout` | 0 | 0 | `4.800 ms` | `0.000 USD` | -| operator_debugging_ux | operator-debug-rebuild-changed-results-001 | `pass` | `1.000` | `trace-before-rebuild, trace-after-rebuild` | `trace-after-rebuild, trace-before-rebuild` | 0 | 0 | `3.300 ms` | `0.000 USD` | -| operator_debugging_ux | operator-debug-relation-context-mislead-001 | `pass` | `1.000` | `trace-relation-context` | `trace-relation-context` | 0 | 0 | `2.900 ms` | `0.000 USD` | -| operator_debugging_ux | operator-debug-rerank-bad-candidate-001 | `pass` | `1.000` | `trace-rerank-promotion` | `trace-rerank-promotion` | 0 | 0 | `2.100 ms` | `0.000 USD` | +| Suite | Job | Status | Score | Evidence Recall | Irrelevant Context | Expected Evidence | Produced Evidence | Trace Failure Stage | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost | +| --- | --- | --- | ---: | ---: | ---: | --- | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- | +| operator_debugging_ux | operator-debug-dropped-evidence-001 | `pass` | `1.000` | `1.000` | `0.000` | `trace-dropped-expected` | `trace-dropped-expected` | `filter.read_profile` | 0 | 0 | `false` | `false` | 0 | 0 | `2.400 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-provider-latency-001 | `pass` | `1.000` | `1.000` | `0.000` | `trace-provider-timeout` | `trace-provider-timeout` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `4.800 ms` | `0.000 USD` | +| operator_debugging_ux | 
operator-debug-rebuild-changed-results-001 | `pass` | `1.000` | `1.000` | `0.000` | `trace-before-rebuild, trace-after-rebuild` | `trace-after-rebuild, trace-before-rebuild` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `3.300 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-relation-context-mislead-001 | `pass` | `1.000` | `1.000` | `0.000` | `trace-relation-context` | `trace-relation-context` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.900 ms` | `0.000 USD` | +| operator_debugging_ux | operator-debug-rerank-bad-candidate-001 | `pass` | `1.000` | `1.000` | `0.000` | `trace-rerank-promotion` | `trace-rerank-promotion` | `-` | 0 | 0 | `false` | `false` | 0 | 0 | `2.100 ms` | `0.000 USD` | ## Operator Debugging UX @@ -101,11 +128,29 @@ Verification: Compare this Markdown summary with the source JSON before committi - CLI steps: `open trace bundle -> compare retrieval rank with final rank -> inspect rerank score -> tighten scope or rerank inputs` - Trace evidence: `trace-rerank-promotion` +## Memory Evolution + +- Stale answers: `0` +- Conflict detections: `0` +- Update rationales available: `0` +- Temporal validity not encoded: `0` + +| Suite | Job | Current Evidence | Historical Evidence | Stale Traps Used | Conflict Count | Detected | Update Rationale | Temporal Validity | Follow-up | +| --- | --- | --- | --- | --- | ---: | ---: | --- | --- | --- | + +## Trace Explainability + +| Suite | Job | Trace | Failure Stage | Reason | Stage Evidence | +| --- | --- | --- | --- | --- | --- | +| operator_debugging_ux | operator-debug-dropped-evidence-001 | `11111111-1111-4111-8111-111111111111` | `filter.read_profile` | Expected evidence survived recall.candidates but was removed by the read-profile scope filter before final selection. 
| recall.candidates kept=trace-dropped-expected+trace-dropped-decoy demoted= dropped= distractors=trace-dropped-decoy; filter.read_profile kept=trace-dropped-decoy demoted= dropped=trace-dropped-expected distractors=trace-dropped-decoy; selection.final kept=trace-dropped-decoy demoted= dropped=trace-dropped-expected distractors=trace-dropped-decoy | + ## Unsupported Claims -| Suite | Job | Claim | Evidence | Reason | -| --- | --- | --- | --- | --- | -| operator_debugging_ux | operator-debug-dropped-evidence-001 | No expected evidence was dropped. | `trace-dropped-decoy` | claim_id is not present in expected_answer.evidence_links | +No unsupported claims were produced by encoded jobs. + +## Follow-Ups + +No benchmark follow-ups were declared by encoded jobs. ## Result Semantics @@ -113,12 +158,16 @@ This report uses `docs/spec/real_world_agent_memory_benchmark_v1.md` status term It is a real-world job fixture report, not a Docker live-baseline report. Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins. +The summary counters report required evidence coverage, source-ref coverage, quote coverage, expected evidence recall, irrelevant context ratio, trace explainability, stale retrievals, scope violations, redaction leaks, Qdrant rebuild case coverage, stale answers, conflict detections, update rationale availability, and temporal validity gaps across encoded jobs. + - `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule. - `wrong_result`: a job completed but missed required answer or evidence expectations. - `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links. -- `not_encoded`: a suite has no checked-in real_world_job fixture, so no pass/fail claim is allowed. +- `not_encoded`: a suite has no checked-in fixture, or an encoded fixture declares a capability gap so no pass/fail claim is allowed. 
+ +For `knowledge_compilation` jobs, generated pages are benchmark artifacts. Page sections must cite source evidence or timeline events, or be explicitly flagged as unsupported. Flagged unsupported summaries are counted separately from hidden unsupported claims. -## Not-Encoded Suites +## Suites With `not_encoded` Status - `trust_source_of_truth` - `work_resume` diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 305ec55..01c8b8f 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -150,8 +150,8 @@ including the retrieval-quality slice below. The suite currently encodes: current-versus-obsolete selection, and minimal sufficient context. - `memory_evolution`: TTL/delete suppression plus current-versus-historical preference, issue status, deployment method, benchmark conclusion, and temporal relation cases. -- `operator_debugging_ux`: deliberate wrong-result trace attribution that identifies - the retrieval stage that demoted expected evidence. +- `operator_debugging_ux`: trace-backed stage attribution that identifies where + expected evidence was filtered, demoted, or selected against. - `capture_integration`: write-policy audit behavior for redaction/private exclusion and fixture-backed capture/integration boundary classification. - `personalization`: scoped stable preference correction without temporary or @@ -195,11 +195,11 @@ This parses `apps/elf-eval/fixtures/real_world_memory/retrieval/`, writes `tmp/real-world-memory/retrieval-report.json`, and renders `tmp/real-world-memory/retrieval-report.md`. The fixture set covers alternate phrasing, distractor-heavy retrieval, multi-hop routing, current-versus-obsolete -selection, minimal sufficient context, and a deliberate wrong-result trace attribution -case. 
Reports include expected evidence recall, irrelevant context ratio, latency/cost, -and optional trace explainability metadata. The qmd and OpenViking references in these -fixtures are design references only; no parity claim is allowed unless an external -adapter run actually provides evidence. +selection, minimal sufficient context, and trace-backed stage attribution for +operator debugging. Reports include expected evidence recall, irrelevant context ratio, +latency/cost, and optional trace explainability metadata. The qmd and OpenViking +references in these fixtures are design references only; no parity claim is allowed +unless an external adapter run actually provides evidence. Operator debugging UX increment: