From 18b0f74f60964b149663cb1db22d20b5723e19d7 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 10 Jun 2026 11:16:21 +0800 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Encode project_decisions real-world memory suite","authority":"XY-861"} --- Makefile.toml | 52 ++++ .../accepted_typed_failure_reporting.json | 217 +++++++++++++++ .../current_validation_gate.json | 259 ++++++++++++++++++ .../private_manifest_caveat.json | 251 +++++++++++++++++ .../reversed_live_baseline_suite_win.json | 259 ++++++++++++++++++ .../tradeoff_fixture_backed_first.json | 256 +++++++++++++++++ .../src/bin/real_world_job_benchmark.rs | 18 +- .../tests/real_world_job_benchmark.rs | 150 ++++++++-- .../real_world_agent_memory_benchmark.md | 32 ++- .../real_world_agent_memory_benchmark_v1.md | 3 + 10 files changed, 1473 insertions(+), 24 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/project_decisions/accepted_typed_failure_reporting.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/project_decisions/private_manifest_caveat.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/project_decisions/reversed_live_baseline_suite_win.json create mode 100644 apps/elf-eval/fixtures/real_world_memory/project_decisions/tradeoff_fixture_backed_first.json diff --git a/Makefile.toml b/Makefile.toml index 03373f4..9291ad2 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -400,6 +400,9 @@ args = [ # | real-world-memory | composite | | # | real-world-memory-json | command | | # | real-world-memory-report | command | | +# | real-world-memory-project-decisions | composite | | +# | real-world-memory-project-decisions-json | command | | +# | real-world-memory-project-decisions-report | command | | # | real-world-memory-evolution | composite | | # | real-world-memory-evolution-json | command | | # | real-world-memory-evolution-report | command | | @@ -505,6 +508,55 @@ args = [ "tmp/real-world-memory/real-world-memory-report.md", ] +[tasks.real-world-memory-project-decisions] +workspace = false +dependencies = [ + "real-world-memory-project-decisions-report", +] + +[tasks.real-world-memory-project-decisions-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory/project_decisions", + "--out", + "tmp/real-world-memory/project-decisions/report.json", + "--run-id", + "real-world-memory-project-decisions", + "--adapter-id", + "fixture_project_decisions", + "--adapter-name", + "ELF project decision fixture", +] + +[tasks.real-world-memory-project-decisions-report] +workspace = false +dependencies = [ + "real-world-memory-project-decisions-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-memory/project-decisions/report.json", + "--out", + "tmp/real-world-memory/project-decisions/report.md", +] + [tasks.real-world-memory-evolution] workspace = false dependencies = [ diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/accepted_typed_failure_reporting.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/accepted_typed_failure_reporting.json new file mode 100644 index 0000000..48ede3b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/accepted_typed_failure_reporting.json @@ -0,0 +1,217 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "project-decision-accepted-typed-failures-001", + "suite": "project_decisions", + "title": "Recover an accepted benchmark reporting decision with its rationale", + "corpus": { + "corpus_id": "real-world-memory-project-decisions-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "typed-failure-decision-accepted", + "kind": "decision", + "text": "Accepted decision: real-world benchmark reports must preserve typed outcomes: pass, wrong_result, lifecycle_fail, incomplete, blocked, not_encoded, and unsupported_claim.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "accepted_typed_failure_reporting", + "evidence_id": "typed-failure-decision-accepted" + } + }, + "created_at": "2026-06-09T09:00:00Z" + }, + { + "evidence_id": "typed-failure-decision-rationale", + "kind": "decision", + "text": "Rationale: typed outcomes keep missing evidence, wrong answers, blocked adapter setup, and unencoded dimensions from being hidden inside one aggregate score.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "accepted_typed_failure_reporting", + "evidence_id": "typed-failure-decision-rationale" + } + }, + "created_at": "2026-06-09T09:05:00Z" + }, + { + "evidence_id": "typed-failure-missing-rationale-trap", + "kind": "decision", + "text": "Rejected shortcut: collapse all benchmark outcomes into a single pass/fail label and omit the reason for typed failures.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "accepted_typed_failure_reporting", + "evidence_id": "typed-failure-missing-rationale-trap" + } + }, + "created_at": "2026-06-09T09:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_project_decisions", + "answer": { + "content": "The accepted decision is to preserve typed benchmark outcomes instead of flattening them, because the typed states keep missing evidence, wrong answers, blocked setup, and unencoded dimensions visible.", + "claims": [ + { + "claim_id": "accepted_typed_failure_policy", + "text": "Real-world benchmark reports must preserve typed outcomes.", + "evidence_ids": ["typed-failure-decision-accepted"], + "confidence": "high" + }, + { + "claim_id": "typed_failure_rationale", + "text": "Typed outcomes keep missing evidence, wrong answers, blocked setup, and unencoded dimensions visible.", + "evidence_ids": ["typed-failure-decision-rationale"], + "confidence": "high" + } + ], + "evidence_ids": [ + "typed-failure-decision-accepted", + "typed-failure-decision-rationale" + ], + "latency_ms": 1.1, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "accepted-typed-failures", + "ts": "2026-06-09T09:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "typed-failure-decision-accepted", + "typed-failure-decision-rationale" + ], + "summary": "The benchmark report format was accepted with typed outcomes and rationale." + } + ], + "prompt": { + "role": "user", + "content": "Why did we choose typed benchmark outcomes instead of a single pass/fail label?", + "job_mode": "decide", + "constraints": [ + "cite_evidence", + "state_rationale", + "avoid_uncited_policy_claims" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "accepted_typed_failure_policy", + "text": "Real-world benchmark reports must preserve typed outcomes." + }, + { + "claim_id": "typed_failure_rationale", + "text": "Typed outcomes keep missing evidence, wrong answers, blocked setup, and unencoded dimensions visible." + } + ], + "must_not_include": [ + "Collapse all benchmark outcomes into a single pass/fail label." + ], + "evidence_links": { + "accepted_typed_failure_policy": ["typed-failure-decision-accepted"], + "typed_failure_rationale": ["typed-failure-decision-rationale"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "typed-failure-decision-accepted", + "claim_id": "accepted_typed_failure_policy", + "requirement": "cite", + "quote": "preserve typed outcomes" + }, + { + "evidence_id": "typed-failure-decision-rationale", + "claim_id": "typed_failure_rationale", + "requirement": "explain", + "quote": "keep missing evidence, wrong answers, blocked adapter setup, and unencoded dimensions" + } + ], + "negative_traps": [ + { + "trap_id": "missing-rationale-pass-fail-shortcut", + "type": "decoy_evidence", + "evidence_ids": ["typed-failure-missing-rationale-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the accepted typed-outcome decision." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites the accepted decision and rationale evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Avoids the pass/fail shortcut that omits rationale." + }, + "uncertainty_handling": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Does not hedge because sufficient decision evidence exists." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Explains the decision in a form useful for future benchmark reports." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": [ + "typed-failure-decision-accepted", + "typed-failure-decision-rationale" + ], + "historical_evidence_ids": [], + "stale_trap_ids": [], + "conflicts": [], + "update_rationale": { + "claim_id": "typed_failure_rationale", + "evidence_ids": ["typed-failure-decision-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "project_decisions", + "accepted_decision", + "rationale" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json new file mode 100644 index 0000000..f3e459b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/current_validation_gate.json @@ -0,0 +1,259 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "project-decision-current-validation-gate-001", + "suite": "project_decisions", + "title": "Recover the current validation gate instead of an old gate", + "corpus": { + "corpus_id": "real-world-memory-project-decisions-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "validation-gate-old-lint-test", + "kind": "decision", + "text": "Historical validation gate: earlier runner work used lint and test as the main local proof before review.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "current_validation_gate", + "evidence_id": "validation-gate-old-lint-test" + } + }, + "created_at": "2026-06-08T18:00:00Z" + }, + { + "evidence_id": "validation-gate-current-decodex", + "kind": "decision", + "text": "Current validation gate: before pushing a refreshed PR head, run cargo make fmt, cargo make lint-fix, and cargo make checks.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "current_validation_gate", + "evidence_id": "validation-gate-current-decodex" + } + }, + "created_at": "2026-06-10T02:00:00Z" + }, + { + "evidence_id": "validation-gate-current-rationale", + "kind": "decision", + "text": "Gate rationale: formatting, automatic lint repair, and full checks prevent avoidable review churn before Decodex review handoff.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "current_validation_gate", + "evidence_id": "validation-gate-current-rationale" + } + }, + "created_at": "2026-06-10T02:05:00Z" + }, + { + "evidence_id": "validation-gate-uncited-policy-trap", + "kind": "decision", + "text": "Uncited current-policy trap: describe the current validation gate from memory without citing the current gate evidence.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "current_validation_gate", + "evidence_id": "validation-gate-uncited-policy-trap" + } + }, + "created_at": "2026-06-10T02:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_project_decisions", + "answer": { + "content": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks before pushing a refreshed PR head. The older lint-and-test gate is historical; the current gate adds formatting, automatic lint repair, and full checks to prevent avoidable review churn before Decodex review handoff.", + "claims": [ + { + "claim_id": "current_validation_gate", + "text": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks before pushing a refreshed PR head.", + "evidence_ids": [ + "validation-gate-current-decodex", + "validation-gate-old-lint-test", + "validation-gate-current-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "validation_gate_rationale", + "text": "The gate prevents avoidable review churn before Decodex review handoff.", + "evidence_ids": ["validation-gate-current-rationale"], + "confidence": "high" + } + ], + "evidence_ids": [ + "validation-gate-current-decodex", + "validation-gate-old-lint-test", + "validation-gate-current-rationale" + ], + "latency_ms": 1.4, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "old-validation-gate", + "ts": "2026-06-08T18:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": ["validation-gate-old-lint-test"], + "summary": "The earlier validation gate centered on lint and test." + }, + { + "event_id": "current-validation-gate", + "ts": "2026-06-10T02:00:00Z", + "actor": "operator", + "action": "updated_policy", + "evidence_ids": [ + "validation-gate-current-decodex", + "validation-gate-current-rationale" + ], + "summary": "The current Decodex gate requires fmt, lint-fix, and checks before push or handoff." + } + ], + "prompt": { + "role": "user", + "content": "What is the current validation gate, and how is it different from the old gate?", + "job_mode": "decide", + "constraints": [ + "cite_evidence", + "use_current_policy", + "distinguish_current_from_historical" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_validation_gate", + "text": "The current validation gate is cargo make fmt, cargo make lint-fix, and cargo make checks before pushing a refreshed PR head." + }, + { + "claim_id": "validation_gate_rationale", + "text": "The gate prevents avoidable review churn before Decodex review handoff." + } + ], + "must_not_include": [ + "The current gate only requires lint and test." + ], + "evidence_links": { + "current_validation_gate": [ + "validation-gate-current-decodex", + "validation-gate-old-lint-test", + "validation-gate-current-rationale" + ], + "validation_gate_rationale": ["validation-gate-current-rationale"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "validation-gate-current-decodex", + "claim_id": "current_validation_gate", + "requirement": "cite", + "quote": "run cargo make fmt, cargo make lint-fix, and cargo make checks" + }, + { + "evidence_id": "validation-gate-old-lint-test", + "claim_id": "current_validation_gate", + "requirement": "use", + "quote": "Historical validation gate" + }, + { + "evidence_id": "validation-gate-current-rationale", + "claim_id": "validation_gate_rationale", + "requirement": "explain", + "quote": "prevent avoidable review churn" + } + ], + "negative_traps": [ + { + "trap_id": "uncited-current-policy-claim", + "type": "unsupported_prior", + "evidence_ids": ["validation-gate-uncited-policy-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Reports the current gate and the historical old gate correctly." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites current policy, historical policy, and rationale evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids uncited current-policy assertions." + }, + "uncertainty_handling": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Does not hedge because current policy evidence exists." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Distinguishes current and historical policy with update rationale." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["validation-gate-current-decodex"], + "historical_evidence_ids": ["validation-gate-old-lint-test"], + "stale_trap_ids": ["uncited-current-policy-claim"], + "conflicts": [ + { + "conflict_id": "validation-gate-updated", + "claim_id": "current_validation_gate", + "current_evidence_id": "validation-gate-current-decodex", + "historical_evidence_id": "validation-gate-old-lint-test", + "resolved_by_evidence_id": "validation-gate-current-rationale" + } + ], + "update_rationale": { + "claim_id": "validation_gate_rationale", + "evidence_ids": ["validation-gate-current-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "project_decisions", + "validation_gate", + "current_policy" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/private_manifest_caveat.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/private_manifest_caveat.json new file mode 100644 index 0000000..ed196f8 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/private_manifest_caveat.json @@ -0,0 +1,251 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "project-decision-private-manifest-caveat-001", + "suite": "project_decisions", + "title": "State the bounded private-manifest caveat instead of overclaiming", + "corpus": { + "corpus_id": "real-world-memory-project-decisions-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "private-manifest-current-caveat", + "kind": "decision", + "text": "Current caveat: no private production corpus manifest is available, so project_decisions reports cannot claim private-corpus validation.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "private_manifest_caveat", + "evidence_id": "private-manifest-current-caveat" + } + }, + "created_at": "2026-06-10T04:00:00Z" + }, + { + "evidence_id": "private-manifest-uncertainty-policy", + "kind": "decision", + "text": "Uncertainty policy: when evidence is missing, answer with the bounded caveat and do not invent a private-manifest pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "private_manifest_caveat", + "evidence_id": "private-manifest-uncertainty-policy" + } + }, + "created_at": "2026-06-10T04:05:00Z" + }, + { + "evidence_id": "private-manifest-supported-scope", + "kind": "decision", + "text": "Supported scope: the checked-in project_decisions fixtures prove only synthetic fixture-backed decision recovery for ELF.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "private_manifest_caveat", + "evidence_id": "private-manifest-supported-scope" + } + }, + "created_at": "2026-06-10T04:10:00Z" + }, + { + "evidence_id": "private-manifest-overconfident-trap", + "kind": "decision", + "text": "Overconfident unsupported answer: the private production corpus passed and proves project_decisions parity.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "private_manifest_caveat", + "evidence_id": "private-manifest-overconfident-trap" + } + }, + "created_at": "2026-06-10T04:15:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_project_decisions", + "answer": { + "content": "No private production corpus manifest is available, so the project_decisions report cannot claim private-corpus validation. The supported scope is synthetic fixture-backed decision recovery for ELF, and the correct uncertainty policy is to keep that bounded caveat instead of inventing a private-manifest pass.", + "claims": [ + { + "claim_id": "private_manifest_caveat", + "text": "No private production corpus manifest is available, so project_decisions reports cannot claim private-corpus validation.", + "evidence_ids": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy" + ], + "confidence": "high" + }, + { + "claim_id": "supported_project_decision_scope", + "text": "The checked-in project_decisions fixtures prove only synthetic fixture-backed decision recovery for ELF.", + "evidence_ids": ["private-manifest-supported-scope"], + "confidence": "high" + } + ], + "evidence_ids": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy", + "private-manifest-supported-scope" + ], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "private-manifest-caveat-recorded", + "ts": "2026-06-10T04:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy", + "private-manifest-supported-scope" + ], + "summary": "The suite records a bounded caveat for missing private-manifest evidence." + } + ], + "prompt": { + "role": "user", + "content": "Does the new project_decisions suite prove private-corpus validation or external parity?", + "job_mode": "answer", + "constraints": [ + "cite_evidence", + "state_caveats", + "avoid_overconfident_unsupported_answers" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "private_manifest_caveat", + "text": "No private production corpus manifest is available, so project_decisions reports cannot claim private-corpus validation." + }, + { + "claim_id": "supported_project_decision_scope", + "text": "The checked-in project_decisions fixtures prove only synthetic fixture-backed decision recovery for ELF." + } + ], + "must_not_include": [ + "The private production corpus passed.", + "project_decisions parity is proven." + ], + "evidence_links": { + "private_manifest_caveat": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy" + ], + "supported_project_decision_scope": ["private-manifest-supported-scope"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "private-manifest-current-caveat", + "claim_id": "private_manifest_caveat", + "requirement": "cite", + "quote": "no private production corpus manifest is available" + }, + { + "evidence_id": "private-manifest-uncertainty-policy", + "claim_id": "private_manifest_caveat", + "requirement": "use", + "quote": "do not invent a private-manifest pass" + }, + { + "evidence_id": "private-manifest-supported-scope", + "claim_id": "supported_project_decision_scope", + "requirement": "cite", + "quote": "only synthetic fixture-backed decision recovery" + } + ], + "negative_traps": [ + { + "trap_id": "overconfident-private-manifest-pass", + "type": "unsupported_prior", + "evidence_ids": ["private-manifest-overconfident-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States that private-corpus validation and parity are not proven." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites caveat, uncertainty policy, and supported-scope evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids the overconfident private-corpus pass trap." + }, + "uncertainty_handling": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Uses a bounded caveat when private-manifest evidence is missing." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Preserves the correct claim boundary for aggregate report interpretation." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "missing required caveat" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "No private production corpus manifest is available", + "synthetic fixture-backed decision recovery" + ], + "fallback_action": "continue_with_caveat" + }, + "memory_evolution": { + "current_evidence_ids": [ + "private-manifest-current-caveat", + "private-manifest-uncertainty-policy", + "private-manifest-supported-scope" + ], + "historical_evidence_ids": [], + "stale_trap_ids": ["overconfident-private-manifest-pass"], + "conflicts": [], + "update_rationale": { + "claim_id": "private_manifest_caveat", + "evidence_ids": ["private-manifest-uncertainty-policy"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "project_decisions", + "caveat", + "uncertainty" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/reversed_live_baseline_suite_win.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/reversed_live_baseline_suite_win.json new file mode 100644 index 0000000..21605eb --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/reversed_live_baseline_suite_win.json @@ -0,0 +1,259 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "project-decision-reversal-live-baseline-001", + "suite": "project_decisions", + "title": "Distinguish a superseded live-baseline claim from the current suite boundary", + "corpus": { + "corpus_id": "real-world-memory-project-decisions-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "live-baseline-suite-win-old", + "kind": "decision", + "text": "Historical decision: a draft allowed live-baseline passes to be cited as real-world job suite wins.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "reversed_live_baseline_suite_win", + "evidence_id": "live-baseline-suite-win-old" + } + }, + "created_at": "2026-06-08T12:00:00Z" + }, + { + "evidence_id": "live-baseline-suite-win-current", + "kind": "decision", + "text": "Current decision: live-baseline passes are retrieval and lifecycle evidence only; real-world job suite wins require fixture-backed real_world_job reports.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "reversed_live_baseline_suite_win", + "evidence_id": "live-baseline-suite-win-current" + } + }, + "created_at": "2026-06-09T12:00:00Z" + }, + { + "evidence_id": "live-baseline-reversal-rationale", + "kind": "decision", + "text": "Reversal rationale: query-level live-baseline checks cannot prove durable decision recovery, rationale recovery, or unsupported-claim handling.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "reversed_live_baseline_suite_win", + "evidence_id": "live-baseline-reversal-rationale" + } + }, + "created_at": "2026-06-09T12:05:00Z" + }, + { + "evidence_id": "live-baseline-stale-reuse-trap", + "kind": "decision", + "text": "Stale reuse trap: cite the historical live-baseline-as-suite-win draft as the current project decision.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "reversed_live_baseline_suite_win", + "evidence_id": "live-baseline-stale-reuse-trap" + } + }, + "created_at": "2026-06-09T12:10:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_project_decisions", + "answer": { + "content": "The current decision is that live-baseline passes are retrieval and lifecycle evidence only; real-world job suite wins require fixture-backed real_world_job reports. The earlier draft that allowed live-baseline suite wins is historical, and it changed because query-level checks do not prove durable decision recovery, rationale recovery, or unsupported-claim handling.", + "claims": [ + { + "claim_id": "current_live_baseline_boundary", + "text": "Live-baseline passes are retrieval and lifecycle evidence only, not real-world job suite wins.", + "evidence_ids": [ + "live-baseline-suite-win-current", + "live-baseline-suite-win-old", + "live-baseline-reversal-rationale" + ], + "confidence": "high" + }, + { + "claim_id": "live_baseline_reversal_rationale", + "text": "The decision changed because query-level checks do not prove durable decision recovery, rationale recovery, or unsupported-claim handling.", + "evidence_ids": ["live-baseline-reversal-rationale"], + "confidence": "high" + } + ], + "evidence_ids": [ + "live-baseline-suite-win-current", + "live-baseline-suite-win-old", + "live-baseline-reversal-rationale" + ], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "draft-live-baseline-suite-win", + "ts": "2026-06-08T12:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": ["live-baseline-suite-win-old"], + "summary": "A draft treated live-baseline passes as real-world job suite wins." + }, + { + "event_id": "current-live-baseline-boundary", + "ts": "2026-06-09T12:00:00Z", + "actor": "agent", + "action": "updated_memory", + "evidence_ids": [ + "live-baseline-suite-win-current", + "live-baseline-reversal-rationale" + ], + "summary": "The current decision limited live-baseline evidence to retrieval and lifecycle checks." + } + ], + "prompt": { + "role": "user", + "content": "Can we still cite live-baseline passes as real-world job suite wins, or was that reversed?", + "job_mode": "decide", + "constraints": [ + "cite_evidence", + "distinguish_current_from_historical", + "state_rationale" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "current_live_baseline_boundary", + "text": "Live-baseline passes are retrieval and lifecycle evidence only, not real-world job suite wins." + }, + { + "claim_id": "live_baseline_reversal_rationale", + "text": "The decision changed because query-level checks do not prove durable decision recovery, rationale recovery, or unsupported-claim handling." + } + ], + "must_not_include": [ + "Live-baseline passes are real-world job suite wins." + ], + "evidence_links": { + "current_live_baseline_boundary": [ + "live-baseline-suite-win-current", + "live-baseline-suite-win-old", + "live-baseline-reversal-rationale" + ], + "live_baseline_reversal_rationale": ["live-baseline-reversal-rationale"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "live-baseline-suite-win-current", + "claim_id": "current_live_baseline_boundary", + "requirement": "cite", + "quote": "real-world job suite wins require fixture-backed real_world_job reports" + }, + { + "evidence_id": "live-baseline-suite-win-old", + "claim_id": "current_live_baseline_boundary", + "requirement": "use", + "quote": "Historical decision" + }, + { + "evidence_id": "live-baseline-reversal-rationale", + "claim_id": "live_baseline_reversal_rationale", + "requirement": "explain", + "quote": "cannot prove durable decision recovery, rationale recovery, or unsupported-claim handling" + } + ], + "negative_traps": [ + { + "trap_id": "stale-live-baseline-suite-win-reuse", + "type": "stale_fact", + "evidence_ids": ["live-baseline-stale-reuse-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Reports the current boundary and marks the older decision historical." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites current, historical, and rationale evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not reuse the stale draft as the current decision." + }, + "uncertainty_handling": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Does not overstate live-baseline evidence." + }, + "lifecycle_behavior": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Shows the decision reversal and available update rationale." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": ["The fixture does not provide that evidence."], + "fallback_action": "state_blocker" + }, + "memory_evolution": { + "current_evidence_ids": ["live-baseline-suite-win-current"], + "historical_evidence_ids": ["live-baseline-suite-win-old"], + "stale_trap_ids": ["stale-live-baseline-suite-win-reuse"], + "conflicts": [ + { + "conflict_id": "live-baseline-suite-win-reversed", + "claim_id": "current_live_baseline_boundary", + "current_evidence_id": "live-baseline-suite-win-current", + "historical_evidence_id": "live-baseline-suite-win-old", + "resolved_by_evidence_id": "live-baseline-reversal-rationale" + } + ], + "update_rationale": { + "claim_id": "live_baseline_reversal_rationale", + "evidence_ids": ["live-baseline-reversal-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "project_decisions", + "reversal", + "current_vs_historical" + ] +} diff --git a/apps/elf-eval/fixtures/real_world_memory/project_decisions/tradeoff_fixture_backed_first.json b/apps/elf-eval/fixtures/real_world_memory/project_decisions/tradeoff_fixture_backed_first.json new file mode 100644 index 0000000..268e675 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/project_decisions/tradeoff_fixture_backed_first.json @@ -0,0 +1,256 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "project-decision-tradeoff-fixture-backed-001", + "suite": "project_decisions", + "title": "Explain the rationale and caveat for fixture-backed project decision jobs", + "corpus": { + "corpus_id": "real-world-memory-project-decisions-2026-06-10", + "profile": "synthetic", + "items": [ + { + "evidence_id": "tradeoff-fixture-backed-first", + "kind": "decision", + "text": "Accepted tradeoff: encode project_decisions first as offline fixture-backed jobs, not as live external-adapter parity runs.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "tradeoff_fixture_backed_first", + "evidence_id": "tradeoff-fixture-backed-first" + } + }, + "created_at": "2026-06-10T03:00:00Z" + }, + { + "evidence_id": "tradeoff-fixture-backed-rationale", + "kind": "decision", + "text": "Tradeoff rationale: fixture-backed jobs can lock evidence, negative traps, and typed outcomes now while external adapters remain unrun for this suite.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "tradeoff_fixture_backed_first", + "evidence_id": "tradeoff-fixture-backed-rationale" + } + }, + "created_at": "2026-06-10T03:05:00Z" + }, + { + "evidence_id": "tradeoff-fixture-backed-caveat", + "kind": "decision", + "text": "Caveat: this suite must not claim external-project parity until external adapters actually run the project_decisions jobs.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "tradeoff_fixture_backed_first", + "evidence_id": "tradeoff-fixture-backed-caveat" + } + }, + "created_at": "2026-06-10T03:10:00Z" + }, + { + "evidence_id": "tradeoff-external-parity-trap", + "kind": "decision", + "text": "Unsupported prior: qmd, agentmemory, and mem0 already passed the project_decisions suite.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_memory_fixture/v1", + "ref": { + "fixture": "tradeoff_fixture_backed_first", + "evidence_id": "tradeoff-external-parity-trap" + } + }, + "created_at": "2026-06-10T03:15:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_project_decisions", + "answer": { + "content": "The accepted tradeoff is to encode project_decisions first as offline fixture-backed jobs. The rationale is that fixture-backed jobs can lock evidence, negative traps, and typed outcomes now while external adapters remain unrun. The caveat is that this suite must not claim external-project parity until external adapters actually run these jobs.", + "claims": [ + { + "claim_id": "fixture_backed_tradeoff", + "text": "Encode project_decisions first as offline fixture-backed jobs.", + "evidence_ids": ["tradeoff-fixture-backed-first"], + "confidence": "high" + }, + { + "claim_id": "fixture_backed_tradeoff_rationale", + "text": "Fixture-backed jobs can lock evidence, negative traps, and typed outcomes now while external adapters remain unrun.", + "evidence_ids": ["tradeoff-fixture-backed-rationale"], + "confidence": "high" + }, + { + "claim_id": "fixture_backed_parity_caveat", + "text": "Do not claim external-project parity until external adapters run the project_decisions jobs.", + "evidence_ids": ["tradeoff-fixture-backed-caveat"], + "confidence": "high" + } + ], + "evidence_ids": [ + "tradeoff-fixture-backed-first", + "tradeoff-fixture-backed-rationale", + "tradeoff-fixture-backed-caveat" + ], + "latency_ms": 1.3, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "fixture-backed-first-decision", + "ts": "2026-06-10T03:00:00Z", + "actor": "agent", + "action": "made_decision", + "evidence_ids": [ + "tradeoff-fixture-backed-first", + "tradeoff-fixture-backed-rationale", + "tradeoff-fixture-backed-caveat" + ], + "summary": "The project_decisions suite was encoded as fixture-backed evidence first with a parity caveat." + } + ], + "prompt": { + "role": "user", + "content": "Why are project_decisions fixtures offline first, and what claim boundary should the report preserve?", + "job_mode": "decide", + "constraints": [ + "cite_evidence", + "state_rationale", + "state_caveats", + "do_not_claim_external_parity" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "fixture_backed_tradeoff", + "text": "Encode project_decisions first as offline fixture-backed jobs." + }, + { + "claim_id": "fixture_backed_tradeoff_rationale", + "text": "Fixture-backed jobs can lock evidence, negative traps, and typed outcomes now while external adapters remain unrun." + }, + { + "claim_id": "fixture_backed_parity_caveat", + "text": "Do not claim external-project parity until external adapters run the project_decisions jobs." + } + ], + "must_not_include": [ + "qmd, agentmemory, and mem0 already passed the project_decisions suite." + ], + "evidence_links": { + "fixture_backed_tradeoff": ["tradeoff-fixture-backed-first"], + "fixture_backed_tradeoff_rationale": ["tradeoff-fixture-backed-rationale"], + "fixture_backed_parity_caveat": ["tradeoff-fixture-backed-caveat"] + }, + "answer_type": "decision_record", + "accepted_alternates": [], + "requires_caveat": true, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "tradeoff-fixture-backed-first", + "claim_id": "fixture_backed_tradeoff", + "requirement": "cite", + "quote": "offline fixture-backed jobs" + }, + { + "evidence_id": "tradeoff-fixture-backed-rationale", + "claim_id": "fixture_backed_tradeoff_rationale", + "requirement": "explain", + "quote": "lock evidence, negative traps, and typed outcomes" + }, + { + "evidence_id": "tradeoff-fixture-backed-caveat", + "claim_id": "fixture_backed_parity_caveat", + "requirement": "cite", + "quote": "must not claim external-project parity" + } + ], + "negative_traps": [ + { + "trap_id": "external-parity-without-adapter-run", + "type": "unsupported_prior", + "evidence_ids": ["tradeoff-external-parity-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "States the fixture-backed-first decision." + }, + "evidence_grounding": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Cites decision, rationale, and caveat evidence." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Avoids unsupported external parity claims." + }, + "uncertainty_handling": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "States the external-adapter caveat instead of overclaiming." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Preserves the report boundary for future adapter work." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true", + "missing required caveat" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": true, + "acceptable_phrases": [ + "must not claim external-project parity", + "external adapters remain unrun" + ], + "fallback_action": "continue_with_caveat" + }, + "memory_evolution": { + "current_evidence_ids": [ + "tradeoff-fixture-backed-first", + "tradeoff-fixture-backed-rationale", + "tradeoff-fixture-backed-caveat" + ], + "historical_evidence_ids": [], + "stale_trap_ids": ["external-parity-without-adapter-run"], + "conflicts": [], + "update_rationale": { + "claim_id": "fixture_backed_tradeoff_rationale", + "evidence_ids": ["tradeoff-fixture-backed-rationale"], + "available": true + }, + "temporal_validity": { + "required": false, + "encoded": false, + "follow_up": null + } + }, + "tags": [ + "synthetic", + "project_decisions", + "tradeoff_rationale", + "no_external_parity_claim" + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs index f5a5fee..ac3079b 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -736,6 +736,10 @@ struct JobReport { job_id: String, title: String, status: TypedStatus, + answer_type: String, + requires_caveat: bool, + requires_refusal: bool, + can_answer_unknown: bool, normalized_score: f64, hard_fail_hits: Vec, expected_evidence: Vec, @@ -2600,6 +2604,10 @@ fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { job_id: job.job_id.clone(), title: job.title.clone(), status: scoring.status, + answer_type: job.expected_answer.answer_type.clone(), + requires_caveat: job.expected_answer.requires_caveat, + requires_refusal: job.expected_answer.requires_refusal, + can_answer_unknown: job.allowed_uncertainty.can_answer_unknown, normalized_score: round3(scoring.normalized_score), hard_fail_hits: scoring.hard_fail_hits, expected_evidence: expected_evidence_report(job), @@ -3629,9 +3637,9 @@ fn render_markdown_suites(out: &mut String, report: &RealWorldReport) { fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) { out.push_str("## Jobs\n\n"); - out.push_str("| Suite | Job | Status | Score | Evidence Recall | Irrelevant Context | Expected Evidence | Produced Evidence | Trace Failure Stage | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost |\n"); + out.push_str("| Suite | Job | Status | Answer Type | Caveat Required | Refusal Required | Unknown Allowed | Score | Evidence Recall | Irrelevant Context | Expected Evidence | Produced Evidence | Trace Failure Stage | Stale Answers | Conflicts | Update Rationale | Temporal Gap | Unsupported Claims | Wrong Results | Latency | Cost |\n"); out.push_str( - "| --- | --- | --- | ---: | ---: | ---: | --- | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- |\n", + "| --- | --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | --- | --- | --- | ---: | ---: | --- | --- | ---: | ---: | ---: | --- |\n", ); for job in &report.jobs { @@ -3644,10 +3652,14 @@ fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) { let produced = job.produced_evidence.join(", "); out.push_str(&format!( - "| {} | {} | `{}` | `{:.3}` | `{:.3}` | `{:.3}` | `{}` | `{}` | `{}` | {} | {} | `{}` | `{}` | {} | {} | `{}` | `{}` |\n", + "| {} | {} | `{}` | `{}` | `{}` | `{}` | `{}` | `{:.3}` | `{:.3}` | `{:.3}` | `{}` | `{}` | `{}` | {} | {} | `{}` | `{}` | {} | {} | `{}` | `{}` |\n", md_cell(job.suite_id.as_str()), md_cell(job.job_id.as_str()), status_str(job.status), + md_inline(job.answer_type.as_str()), + bool_display(job.requires_caveat), + bool_display(job.requires_refusal), + bool_display(job.can_answer_unknown), job.normalized_score, job.retrieval_quality.expected_evidence_recall, job.retrieval_quality.irrelevant_context_ratio, diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index cc665cb..36b8d4d 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -37,6 +37,10 @@ fn operator_debug_fixture_dir() -> PathBuf { .join("operator_debugging_ux") } +fn project_decisions_fixture_dir() -> PathBuf { + real_world_memory_fixture_dir().join("project_decisions") +} + fn retrieval_fixture_dir() -> PathBuf { Path::new(env!("CARGO_MANIFEST_DIR")) .join("fixtures") @@ -154,7 +158,7 @@ fn smoke_fixture_produces_typed_json_report() -> Result<()> { fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = run_json_report_from(fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(27)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(32)); Ok(()) } @@ -331,6 +335,88 @@ fn knowledge_fixtures_report_page_metrics() -> Result<()> { Ok(()) } +#[test] +fn project_decisions_fixtures_report_decision_policy_cases() -> Result<()> { + let report = run_json_report_from(project_decisions_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + + let suites = array_at(&report, "/suites")?; + let project_decisions = find_by_field(suites, "/suite_id", "project_decisions")?; + + assert_eq!(project_decisions.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(project_decisions.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!( + project_decisions.pointer("/update_rationale_available_count").and_then(Value::as_u64), + Some(5) + ); + + let jobs = array_at(&report, "/jobs")?; + let accepted = find_by_field(jobs, "/job_id", "project-decision-accepted-typed-failures-001")?; + let reversal = find_by_field(jobs, "/job_id", "project-decision-reversal-live-baseline-001")?; + let validation = + find_by_field(jobs, "/job_id", "project-decision-current-validation-gate-001")?; + let tradeoff = find_by_field(jobs, "/job_id", "project-decision-tradeoff-fixture-backed-001")?; + let caveat = find_by_field(jobs, "/job_id", "project-decision-private-manifest-caveat-001")?; + + assert_eq!(accepted.pointer("/answer_type").and_then(Value::as_str), Some("decision_record")); + assert_eq!( + accepted.pointer("/expected_evidence").and_then(Value::as_array).map(Vec::len), + Some(2) + ); + assert_eq!( + reversal.pointer("/evolution/historical_evidence/0").and_then(Value::as_str), + Some("live-baseline-suite-win-old") + ); + assert_eq!( + validation.pointer("/evolution/current_evidence/0").and_then(Value::as_str), + Some("validation-gate-current-decodex") + ); + assert_eq!(tradeoff.pointer("/requires_caveat").and_then(Value::as_bool), Some(true)); + assert_eq!(caveat.pointer("/can_answer_unknown").and_then(Value::as_bool), Some(true)); + + for job in jobs { + let expected_evidence = array_at(job, "/expected_evidence")?; + + assert!( + !expected_evidence.is_empty(), + "project decision job {} must declare required evidence", + job.pointer("/job_id").and_then(Value::as_str).unwrap_or("") + ); + } + for entry in fs::read_dir(project_decisions_fixture_dir())? { + let path = entry?.path(); + + if path.extension().and_then(|ext| ext.to_str()) != Some("json") { + continue; + } + + let fixture = serde_json::from_str::(&fs::read_to_string(path)?)?; + let required_evidence = array_at(&fixture, "/required_evidence")?; + let negative_traps = array_at(&fixture, "/negative_traps")?; + + assert!(!required_evidence.is_empty()); + assert!(!negative_traps.is_empty()); + } + + Ok(()) +} + #[test] fn generated_json_report_renders_markdown() -> Result<()> { let report = run_json_report()?; @@ -363,6 +449,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("work_resume")); assert!(markdown.contains("Capture And Integration Coverage")); assert!(markdown.contains("fixture-backed")); + assert!(markdown.contains("Answer Type")); + assert!(markdown.contains("Caveat Required")); + assert!(markdown.contains("Refusal Required")); assert!(markdown.contains("agentmemory-style hook capture")); assert!(markdown.contains("xy844-current-worktree")); assert!(markdown.contains("Existing live-baseline reports remain valid")); @@ -417,33 +506,30 @@ fn assert_root_knowledge_summary(report: &Value) { ); } -#[test] -fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { - let report = run_json_report_from(real_world_memory_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(27)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(25)); +fn assert_root_aggregate_summary(report: &Value) { + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(32)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(30)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(3)); assert_eq!( report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(0.938) + Some(0.952) ); assert_eq!( report.pointer("/summary/irrelevant_context_ratio").and_then(Value::as_f64), - Some(0.02) + Some(0.015) ); assert_eq!(report.pointer("/summary/stale_retrieval_count").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/stale_answer_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), - Some(4) + Some(6) ); assert_eq!( report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), - Some(4) + Some(9) ); assert_eq!( report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), @@ -463,12 +549,12 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(55) + Some(69) ); - assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(52)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.945)); - assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.945)); - assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.945)); + assert_eq!(report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), Some(66)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(0.957)); + assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(0.957)); + assert_eq!(report.pointer("/summary/quote_coverage").and_then(Value::as_f64), Some(0.957)); assert_eq!( report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), Some(1) @@ -492,13 +578,16 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { Some(1) ); - assert_root_knowledge_summary(&report); + assert_root_knowledge_summary(report); +} - let suites = array_at(&report, "/suites")?; +fn assert_root_aggregate_suites(report: &Value) -> Result<()> { + let suites = array_at(report, "/suites")?; for suite_id in [ "trust_source_of_truth", "work_resume", + "project_decisions", "retrieval", "capture_integration", "personalization", @@ -514,11 +603,23 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { assert_eq!(memory_evolution.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + let project_decisions = find_by_field(suites, "/suite_id", "project_decisions")?; + + assert_eq!(project_decisions.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!( + project_decisions.pointer("/update_rationale_available_count").and_then(Value::as_u64), + Some(5) + ); + let debug_suite = find_by_field(suites, "/suite_id", "operator_debugging_ux")?; assert_eq!(debug_suite.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - let jobs = array_at(&report, "/jobs")?; + Ok(()) +} + +fn assert_root_aggregate_jobs(report: &Value) -> Result<()> { + let jobs = array_at(report, "/jobs")?; let rebuild = find_by_field(jobs, "/job_id", "trust-sot-rebuild-001")?; let redaction = find_by_field(jobs, "/job_id", "capture-redaction-exclusion-001")?; let personalization = find_by_field(jobs, "/job_id", "personalization-scoped-preference-001")?; @@ -536,6 +637,17 @@ fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { Ok(()) } +#[test] +fn real_world_memory_fixtures_report_aggregate_metrics() -> Result<()> { + let report = run_json_report_from(real_world_memory_fixture_dir())?; + + assert_root_aggregate_summary(&report); + assert_root_aggregate_suites(&report)?; + assert_root_aggregate_jobs(&report)?; + + Ok(()) +} + #[test] fn retrieval_fixtures_report_quality_and_trace_attribution() -> Result<()> { let report = run_json_report_from(retrieval_fixture_dir())?; diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index 305ec55..e38a803 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -146,6 +146,9 @@ including the retrieval-quality slice below. The suite currently encodes: Postgres-held chunk embeddings before answering. - `work_resume`: stale worktree resume, Decodex/Linear lane status, failed command recovery, PR review blocker recovery, and exact next-action extraction. +- `project_decisions`: accepted durable decisions, superseded/reversed decisions, + old-versus-current validation gates, tradeoff rationale, and bounded caveat or + uncertainty handling. - `retrieval`: alternate phrasing, distractor-heavy retrieval, multi-hop routing, current-versus-obsolete selection, and minimal sufficient context. - `memory_evolution`: TTL/delete suppression plus current-versus-historical preference, @@ -162,10 +165,35 @@ unsupported-claim count, stale retrieval count, stale-answer count, conflict det count, update rationale availability, temporal validity `not_encoded` count, scope correctness, redaction leak count, capture/integration behavior classes, Qdrant rebuild case/pass counts, expected evidence recall, irrelevant context ratio, -latency/cost, and trace explainability counters. The fixtures include negative traps +latency/cost, answer-type plus caveat/refusal/uncertainty flags, and trace +explainability counters. The fixtures include negative traps for stale blockers, unsupported prior claims, stale deleted facts, stale historical facts, cross-project preference leakage, private/redacted text leakage, obsolete -retrieval context, and distractor context. +retrieval context, project-decision stale reuse, missing rationale, uncited current +policy claims, overconfident unsupported decision answers, and distractor context. + +Current checked-in project-decisions increment: + +```sh +cargo make real-world-memory-project-decisions +``` + +This parses `apps/elf-eval/fixtures/real_world_memory/project_decisions/`, writes +`tmp/real-world-memory/project-decisions/report.json`, and renders +`tmp/real-world-memory/project-decisions/report.md`. The fixture set covers: + +- accepted decision recovery with required rationale; +- superseded decision recovery where historical evidence must not become the current + answer; +- old-versus-current validation gate recovery; +- fixture-backed-first tradeoff rationale with an external-adapter parity caveat; +- missing private-manifest uncertainty where the correct answer is a bounded caveat. + +The report exposes `answer_type`, `requires_caveat`, `requires_refusal`, and +`can_answer_unknown` per job, and the memory-evolution table shows current evidence, +historical evidence, conflict detections, and update-rationale availability. These jobs +are fixture-backed only; they do not claim external adapter parity or private-corpus +validation. Narrow memory evolution increment: diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index d1aefae..69ac5eb 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -440,6 +440,9 @@ Reports MUST include: - run id, runner version, corpus profile, job ids, suite ids, project adapter metadata; - per-job status, normalized score, hard-fail hits, evidence ids used, trap ids used; +- per-job `answer_type`, required caveat/refusal flags, and whether an unknown answer + is allowed, so current-decision, historical-decision, rationale, and caveat cases are + distinguishable in generated reports; - expected evidence recall and irrelevant context ratio at job, suite, and summary levels when the runner can derive them from fixture evidence ids; - trace explainability metadata when an adapter or fixture can identify retrieval