Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions Makefile.toml
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,9 @@ args = [
# | real-world-memory | composite | |
# | real-world-memory-json | command | |
# | real-world-memory-report | command | |
# | real-world-memory-project-decisions | composite | |
# | real-world-memory-project-decisions-json | command | |
# | real-world-memory-project-decisions-report | command | |
# | real-world-memory-evolution | composite | |
# | real-world-memory-evolution-json | command | |
# | real-world-memory-evolution-report | command | |
Expand Down Expand Up @@ -505,6 +508,55 @@ args = [
"tmp/real-world-memory/real-world-memory-report.md",
]

# Composite entry point for the project-decisions fixture suite. It declares no
# command of its own; it only pulls in the report task through `dependencies`.
[tasks.real-world-memory-project-decisions]
dependencies = ["real-world-memory-project-decisions-report"]
workspace = false

# Invokes the `run` subcommand of the `real_world_job_benchmark` binary
# (package `elf-eval`) against the project-decisions fixture directory.
# The `--out` path below is the same path the
# `real-world-memory-project-decisions-report` task passes as `--report`,
# so the two must be kept in sync.
[tasks.real-world-memory-project-decisions-json]
command = "cargo"
workspace = false
args = [
    # Cargo part: select the package and binary to run.
    "run", "-p", "elf-eval", "--bin", "real_world_job_benchmark",
    # Everything after `--` is forwarded to the binary instead of cargo.
    "--",
    # Binary part: subcommand first, then one flag/value pair per line.
    "run",
    "--fixtures", "apps/elf-eval/fixtures/real_world_memory/project_decisions",
    "--out", "tmp/real-world-memory/project-decisions/report.json",
    "--run-id", "real-world-memory-project-decisions",
    "--adapter-id", "fixture_project_decisions",
    "--adapter-name", "ELF project decision fixture",
]

# Invokes the `publish` subcommand of the `real_world_job_benchmark` binary
# (package `elf-eval`) on the JSON report and targets a Markdown path.
# Depends on `real-world-memory-project-decisions-json`, whose `--out` path is
# the `--report` path used here.
[tasks.real-world-memory-project-decisions-report]
command = "cargo"
dependencies = ["real-world-memory-project-decisions-json"]
workspace = false
args = [
    # Cargo part: select the package and binary to run.
    "run", "-p", "elf-eval", "--bin", "real_world_job_benchmark",
    # Everything after `--` is forwarded to the binary instead of cargo.
    "--",
    # Binary part: subcommand first, then one flag/value pair per line.
    "publish",
    "--report", "tmp/real-world-memory/project-decisions/report.json",
    "--out", "tmp/real-world-memory/project-decisions/report.md",
]

[tasks.real-world-memory-evolution]
workspace = false
dependencies = [
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
{
"schema": "elf.real_world_job/v1",
"job_id": "project-decision-accepted-typed-failures-001",
"suite": "project_decisions",
"title": "Recover an accepted benchmark reporting decision with its rationale",
"corpus": {
"corpus_id": "real-world-memory-project-decisions-2026-06-10",
"profile": "synthetic",
"items": [
{
"evidence_id": "typed-failure-decision-accepted",
"kind": "decision",
"text": "Accepted decision: real-world benchmark reports must preserve typed outcomes: pass, wrong_result, lifecycle_fail, incomplete, blocked, not_encoded, and unsupported_claim.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_memory_fixture/v1",
"ref": {
"fixture": "accepted_typed_failure_reporting",
"evidence_id": "typed-failure-decision-accepted"
}
},
"created_at": "2026-06-09T09:00:00Z"
},
{
"evidence_id": "typed-failure-decision-rationale",
"kind": "decision",
"text": "Rationale: typed outcomes keep missing evidence, wrong answers, blocked adapter setup, and unencoded dimensions from being hidden inside one aggregate score.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_memory_fixture/v1",
"ref": {
"fixture": "accepted_typed_failure_reporting",
"evidence_id": "typed-failure-decision-rationale"
}
},
"created_at": "2026-06-09T09:05:00Z"
},
{
"evidence_id": "typed-failure-missing-rationale-trap",
"kind": "decision",
"text": "Rejected shortcut: collapse all benchmark outcomes into a single pass/fail label and omit the reason for typed failures.",
"source_ref": {
"schema": "source_ref/v1",
"resolver": "real_world_memory_fixture/v1",
"ref": {
"fixture": "accepted_typed_failure_reporting",
"evidence_id": "typed-failure-missing-rationale-trap"
}
},
"created_at": "2026-06-09T09:10:00Z"
}
],
"adapter_response": {
"adapter_id": "fixture_project_decisions",
"answer": {
"content": "The accepted decision is to preserve typed benchmark outcomes instead of flattening them, because the typed states keep missing evidence, wrong answers, blocked setup, and unencoded dimensions visible.",
"claims": [
{
"claim_id": "accepted_typed_failure_policy",
"text": "Real-world benchmark reports must preserve typed outcomes.",
"evidence_ids": ["typed-failure-decision-accepted"],
"confidence": "high"
},
{
"claim_id": "typed_failure_rationale",
"text": "Typed outcomes keep missing evidence, wrong answers, blocked setup, and unencoded dimensions visible.",
"evidence_ids": ["typed-failure-decision-rationale"],
"confidence": "high"
}
],
"evidence_ids": [
"typed-failure-decision-accepted",
"typed-failure-decision-rationale"
],
"latency_ms": 1.1,
"cost": {
"currency": "USD",
"amount": 0.0,
"input_tokens": 0,
"output_tokens": 0
}
}
}
},
"timeline": [
{
"event_id": "accepted-typed-failures",
"ts": "2026-06-09T09:00:00Z",
"actor": "agent",
"action": "made_decision",
"evidence_ids": [
"typed-failure-decision-accepted",
"typed-failure-decision-rationale"
],
"summary": "The benchmark report format was accepted with typed outcomes and rationale."
}
],
"prompt": {
"role": "user",
"content": "Why did we choose typed benchmark outcomes instead of a single pass/fail label?",
"job_mode": "decide",
"constraints": [
"cite_evidence",
"state_rationale",
"avoid_uncited_policy_claims"
]
},
"expected_answer": {
"must_include": [
{
"claim_id": "accepted_typed_failure_policy",
"text": "Real-world benchmark reports must preserve typed outcomes."
},
{
"claim_id": "typed_failure_rationale",
"text": "Typed outcomes keep missing evidence, wrong answers, blocked setup, and unencoded dimensions visible."
}
],
"must_not_include": [
"Collapse all benchmark outcomes into a single pass/fail label."
],
"evidence_links": {
"accepted_typed_failure_policy": ["typed-failure-decision-accepted"],
"typed_failure_rationale": ["typed-failure-decision-rationale"]
},
"answer_type": "decision_record",
"accepted_alternates": [],
"requires_caveat": false,
"requires_refusal": false
},
"required_evidence": [
{
"evidence_id": "typed-failure-decision-accepted",
"claim_id": "accepted_typed_failure_policy",
"requirement": "cite",
"quote": "preserve typed outcomes"
},
{
"evidence_id": "typed-failure-decision-rationale",
"claim_id": "typed_failure_rationale",
"requirement": "explain",
"quote": "keep missing evidence, wrong answers, blocked adapter setup, and unencoded dimensions"
}
],
"negative_traps": [
{
"trap_id": "missing-rationale-pass-fail-shortcut",
"type": "decoy_evidence",
"evidence_ids": ["typed-failure-missing-rationale-trap"],
"failure_if_used": true
}
],
"scoring_rubric": {
"dimensions": {
"answer_correctness": {
"weight": 0.25,
"max_points": 1.0,
"criteria": "States the accepted typed-outcome decision."
},
"evidence_grounding": {
"weight": 0.3,
"max_points": 1.0,
"criteria": "Cites the accepted decision and rationale evidence."
},
"trap_avoidance": {
"weight": 0.25,
"max_points": 1.0,
"criteria": "Avoids the pass/fail shortcut that omits rationale."
},
"uncertainty_handling": {
"weight": 0.1,
"max_points": 1.0,
"criteria": "Does not hedge because sufficient decision evidence exists."
},
"workflow_helpfulness": {
"weight": 0.1,
"max_points": 1.0,
"criteria": "Explains the decision in a form useful for future benchmark reports."
}
},
"pass_threshold": 0.8,
"hard_fail_rules": [
"unsupported high-confidence claim about a required decision or fact",
"use of a negative trap marked failure_if_used = true"
]
},
"allowed_uncertainty": {
"can_answer_unknown": false,
"acceptable_phrases": ["The fixture does not provide that evidence."],
"fallback_action": "state_blocker"
},
"memory_evolution": {
"current_evidence_ids": [
"typed-failure-decision-accepted",
"typed-failure-decision-rationale"
],
"historical_evidence_ids": [],
"stale_trap_ids": [],
"conflicts": [],
"update_rationale": {
"claim_id": "typed_failure_rationale",
"evidence_ids": ["typed-failure-decision-rationale"],
"available": true
},
"temporal_validity": {
"required": false,
"encoded": false,
"follow_up": null
}
},
"tags": [
"synthetic",
"project_decisions",
"accepted_decision",
"rationale"
]
}
Loading