diff --git a/Makefile.toml b/Makefile.toml index 68d657ad..ad4ecba1 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -355,6 +355,57 @@ args = [ ] +# Real-world job benchmark smoke +# | task | type | cwd | +# | --------------------------- | --------- | --- | +# | real-world-job-smoke | composite | | +# | real-world-job-smoke-json | command | | +# | real-world-job-smoke-report | command | | + +[tasks.real-world-job-smoke] +workspace = false +dependencies = [ + "real-world-job-smoke-report", +] + +[tasks.real-world-job-smoke-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_job/smoke", + "--out", + "tmp/real-world-job/real-world-job-smoke-report.json", +] + +[tasks.real-world-job-smoke-report] +workspace = false +dependencies = [ + "real-world-job-smoke-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/real-world-job/real-world-job-smoke-report.json", + "--out", + "tmp/real-world-job/real-world-job-smoke-report.md", +] + + # Meta # | task | type | cwd | # | ------ | --------- | --- | diff --git a/apps/elf-eval/fixtures/real_world_job/smoke/work_resume_smoke.json b/apps/elf-eval/fixtures/real_world_job/smoke/work_resume_smoke.json new file mode 100644 index 00000000..31289144 --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_job/smoke/work_resume_smoke.json @@ -0,0 +1,183 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "work-resume-smoke-001", + "suite": "work_resume", + "title": "Resume an ELF retained lane without repeating completed work", + "corpus": { + "corpus_id": "real-world-job-smoke-coding-agent-2026-06-09", + "profile": "synthetic", + "items": [ + { + "evidence_id": "issue-xy812-resume", + "kind": "issue", + "text": "XY-812 resume lane uses branch y/elf-xy-812. 
The next command is `cargo make trace-gate`; the stale blocker cleared after PR #108 merged.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_smoke", + "evidence_id": "issue-xy812-resume" + } + }, + "created_at": "2026-06-09T00:00:00Z" + }, + { + "evidence_id": "stale-command-trap", + "kind": "decision", + "text": "Old decision: run `cargo make lint` next for XY-812.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "work_resume_smoke", + "evidence_id": "stale-command-trap" + } + }, + "created_at": "2026-06-08T00:00:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_smoke", + "answer": { + "content": "Resume XY-812 on branch y/elf-xy-812 and run `cargo make trace-gate` next; the stale blocker cleared after PR #108 merged.", + "claims": [ + { + "claim_id": "resume_next_command", + "text": "Run `cargo make trace-gate` next for XY-812.", + "evidence_ids": ["issue-xy812-resume"], + "confidence": "high" + }, + { + "claim_id": "stale_blocker_cleared", + "text": "The stale blocker cleared after PR #108 merged.", + "evidence_ids": ["issue-xy812-resume"], + "confidence": "high" + } + ], + "evidence_ids": ["issue-xy812-resume"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "xy812-stale-decision", + "ts": "2026-06-08T00:00:00Z", + "actor": "agent", + "action": "recorded_old_next_command", + "evidence_ids": ["stale-command-trap"], + "summary": "The old next command said to run cargo make lint." + }, + { + "event_id": "xy812-current-resume", + "ts": "2026-06-09T00:00:00Z", + "actor": "agent", + "action": "updated_resume_evidence", + "evidence_ids": ["issue-xy812-resume"], + "summary": "The current resume evidence changed the next command to cargo make trace-gate and cleared the stale blocker." 
+ } + ], + "prompt": { + "role": "user", + "content": "Resume XY-812 and tell me the next command without repeating completed work.", + "job_mode": "resume", + "constraints": [ + "cite_evidence", + "avoid_repeating_completed_work", + "state_blockers" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "resume_next_command", + "text": "Run `cargo make trace-gate` next for XY-812." + }, + { + "claim_id": "stale_blocker_cleared", + "text": "The stale blocker cleared after PR #108 merged." + } + ], + "must_not_include": [ + "Run `cargo make lint` next for XY-812.", + "The stale blocker is still active." + ], + "evidence_links": { + "resume_next_command": ["issue-xy812-resume"], + "stale_blocker_cleared": ["issue-xy812-resume"] + }, + "answer_type": "resume_summary", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "issue-xy812-resume", + "claim_id": "resume_next_command", + "requirement": "cite", + "quote": "The next command is `cargo make trace-gate`" + }, + { + "evidence_id": "issue-xy812-resume", + "claim_id": "stale_blocker_cleared", + "requirement": "use", + "quote": "the stale blocker cleared after PR #108 merged" + } + ], + "negative_traps": [ + { + "trap_id": "old-lint-command", + "type": "stale_fact", + "evidence_ids": ["stale-command-trap"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "Includes the current next command and current blocker state." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Uses the current issue evidence for every required claim." + }, + "trap_avoidance": { + "weight": 0.2, + "max_points": 1.0, + "criteria": "Does not use stale command evidence." + }, + "workflow_helpfulness": { + "weight": 0.15, + "max_points": 1.0, + "criteria": "Advances the resume job without repeated completed work." 
+ } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [ + "The fixture does not provide that evidence." + ], + "fallback_action": "state_blocker" + }, + "tags": [ + "synthetic", + "smoke", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark.rs new file mode 100644 index 00000000..7b5de20c --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark.rs @@ -0,0 +1,1586 @@ +#![allow(clippy::single_component_path_imports, unused_crate_dependencies)] + +//! Offline runner and publisher for real-world job benchmark fixtures. + +use std::{ + collections::{BTreeMap, BTreeSet}, + fs, + path::{Path, PathBuf}, +}; + +use clap::{Parser, Subcommand}; +use color_eyre::{Result, eyre}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use time::{OffsetDateTime, format_description::well_known::Rfc3339}; + +use elf_cli::VERSION; + +const JOB_SCHEMA: &str = "elf.real_world_job/v1"; +const REPORT_SCHEMA: &str = "elf.real_world_job_report/v1"; +const DEFAULT_FIXTURE_PATH: &str = "apps/elf-eval/fixtures/real_world_job/smoke"; +const DEFAULT_REPORT_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.json"; +const DEFAULT_MARKDOWN_PATH: &str = "tmp/real-world-job/real-world-job-smoke-report.md"; +const DEFAULT_RUN_ID: &str = "real-world-job-smoke"; +const DEFAULT_ADAPTER_ID: &str = "fixture_smoke"; +const DEFAULT_ADAPTER_NAME: &str = "ELF fixture smoke"; +const NOT_ENCODED_REASON: &str = "No checked-in real_world_job fixture is encoded for this suite."; +const SUITES: &[&str] = &[ + "trust_source_of_truth", + "work_resume", + "project_decisions", + "retrieval", + "memory_evolution", + "consolidation", + "knowledge_compilation", + "operator_debugging_ux", + 
"capture_integration", + "production_ops", + "personalization", +]; + +#[derive(Debug, Parser)] +#[command( + version = elf_cli::VERSION, + rename_all = "kebab", + styles = elf_cli::styles(), +)] +struct Args { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Subcommand)] +#[command(rename_all = "kebab")] +enum Command { + /// Parse and score real_world_job fixtures, then emit a JSON report. + Run(RunArgs), + /// Render Markdown from a generated real_world_job JSON report. + Publish(PublishArgs), +} + +#[derive(Debug, Parser)] +struct RunArgs { + /// Fixture file or directory containing real_world_job JSON fixtures. + #[arg(long, value_name = "PATH", default_value = DEFAULT_FIXTURE_PATH)] + fixtures: PathBuf, + /// Write report JSON to this file. Omit to print to stdout. + #[arg(long, value_name = "FILE")] + out: Option, + /// Stable run id recorded in the generated report. + #[arg(long, default_value = DEFAULT_RUN_ID)] + run_id: String, + /// Adapter id recorded for the offline smoke response. + #[arg(long, default_value = DEFAULT_ADAPTER_ID)] + adapter_id: String, + /// Human-readable adapter name recorded in the generated report. + #[arg(long, default_value = DEFAULT_ADAPTER_NAME)] + adapter_name: String, +} + +#[derive(Debug, Parser)] +struct PublishArgs { + /// Generated real_world_job JSON report. + #[arg(long, value_name = "FILE", default_value = DEFAULT_REPORT_PATH)] + report: PathBuf, + /// Write Markdown to this file. Omit to print to stdout. 
+ #[arg(long, value_name = "FILE", default_value = DEFAULT_MARKDOWN_PATH)] + out: Option, +} + +#[derive(Debug, Deserialize)] +struct RealWorldJob { + schema: String, + job_id: String, + suite: String, + title: String, + corpus: Corpus, + #[serde(default)] + timeline: Vec, + prompt: Prompt, + expected_answer: ExpectedAnswer, + #[serde(default)] + required_evidence: Vec, + #[serde(default)] + negative_traps: Vec, + scoring_rubric: ScoringRubric, + allowed_uncertainty: AllowedUncertainty, + #[serde(default)] + tags: Vec, +} + +#[derive(Debug, Deserialize)] +struct Corpus { + corpus_id: String, + profile: CorpusProfile, + #[serde(default)] + items: Vec, + + adapter_response: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum CorpusProfile { + Synthetic, + PrivateSanitized, + GeneratedPublic, + ExternalAdapter, +} +impl CorpusProfile { + fn as_str(&self) -> &'static str { + match self { + Self::Synthetic => "synthetic", + Self::PrivateSanitized => "private_sanitized", + Self::GeneratedPublic => "generated_public", + Self::ExternalAdapter => "external_adapter", + } + } +} + +#[derive(Debug, Deserialize)] +struct CorpusItem { + evidence_id: String, + kind: String, + + text: Option, + + local_ref: Option, + #[serde(default)] + source_ref: Value, + + created_at: Option, +} + +#[derive(Debug, Deserialize)] +struct TimelineEvent { + event_id: String, + ts: String, + actor: String, + action: String, + #[serde(default)] + evidence_ids: Vec, + summary: String, +} + +#[derive(Debug, Deserialize)] +struct Prompt { + role: String, + content: String, + job_mode: String, + #[serde(default)] + constraints: Vec, +} + +#[derive(Debug, Deserialize)] +struct ExpectedAnswer { + #[serde(default)] + must_include: Vec, + #[serde(default)] + must_not_include: Vec, + #[serde(default)] + evidence_links: BTreeMap, + answer_type: String, + #[serde(default)] + accepted_alternates: Vec, + #[serde(default)] + requires_caveat: bool, + 
#[serde(default)] + requires_refusal: bool, +} + +#[derive(Clone, Debug, Deserialize)] +#[serde(untagged)] +enum ExpectedClaim { + Text(String), + Object { claim_id: Option, text: String }, +} +impl ExpectedClaim { + fn claim_id(&self) -> Option<&str> { + match self { + Self::Text(_) => None, + Self::Object { claim_id, .. } => claim_id.as_deref(), + } + } + + fn text(&self) -> &str { + match self { + Self::Text(text) => text, + Self::Object { text, .. } => text, + } + } +} + +#[derive(Clone, Debug, Deserialize)] +#[serde(untagged)] +enum EvidenceLink { + One(String), + Many(Vec), +} +impl EvidenceLink { + fn ids(&self) -> BTreeSet { + match self { + Self::One(id) => BTreeSet::from([id.clone()]), + Self::Many(ids) => ids.iter().cloned().collect(), + } + } +} + +#[derive(Debug, Deserialize)] +struct RequiredEvidence { + evidence_id: String, + claim_id: String, + requirement: String, + + quote: Option, + + selector: Option, +} + +#[derive(Debug, Deserialize)] +struct NegativeTrap { + trap_id: String, + #[serde(rename = "type")] + trap_type: String, + #[serde(default)] + evidence_ids: Vec, + #[serde(default)] + failure_if_used: bool, +} + +#[derive(Debug, Deserialize)] +struct ScoringRubric { + #[serde(default)] + dimensions: BTreeMap, + pass_threshold: f64, + #[serde(default)] + hard_fail_rules: Vec, +} + +#[derive(Debug, Deserialize)] +struct RubricDimension { + weight: f64, + max_points: f64, + criteria: Value, +} + +#[derive(Debug, Deserialize)] +struct AllowedUncertainty { + can_answer_unknown: bool, + #[serde(default)] + acceptable_phrases: Vec, + fallback_action: String, +} + +#[derive(Clone, Debug, Deserialize)] +struct AdapterResponse { + adapter_id: Option, + answer: ProducedAnswer, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ProducedAnswer { + content: String, + #[serde(default)] + claims: Vec, + #[serde(default)] + evidence_ids: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + latency_ms: Option, + #[serde(skip_serializing_if = 
"Option::is_none")] + cost: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct ProducedClaim { + #[serde(skip_serializing_if = "Option::is_none")] + claim_id: Option, + text: String, + #[serde(default)] + evidence_ids: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + confidence: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct CostReport { + #[serde(skip_serializing_if = "Option::is_none")] + currency: Option, + #[serde(skip_serializing_if = "Option::is_none")] + amount: Option, + #[serde(skip_serializing_if = "Option::is_none")] + input_tokens: Option, + #[serde(skip_serializing_if = "Option::is_none")] + output_tokens: Option, +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +enum TypedStatus { + Pass, + WrongResult, + LifecycleFail, + Incomplete, + Blocked, + NotEncoded, + UnsupportedClaim, +} + +#[derive(Debug, Deserialize, Serialize)] +struct RealWorldReport { + schema: String, + run_id: String, + generated_at: String, + runner_version: String, + corpus_profile: String, + adapter: AdapterReport, + summary: ReportSummary, + suites: Vec, + jobs: Vec, + unsupported_claims: Vec, + not_encoded_suites: Vec, + private_corpus_redaction: PrivateCorpusRedaction, +} + +#[derive(Debug, Deserialize, Serialize)] +struct AdapterReport { + adapter_id: String, + name: String, + behavior: String, + storage: TypedStatus, + runtime: TypedStatus, + notes: String, +} + +#[derive(Debug, Default, Deserialize, Serialize)] +struct ReportSummary { + job_count: usize, + encoded_suite_count: usize, + pass: usize, + wrong_result: usize, + lifecycle_fail: usize, + incomplete: usize, + blocked: usize, + not_encoded: usize, + unsupported_claim: usize, + unsupported_claim_count: usize, + wrong_result_count: usize, + mean_score: f64, + mean_latency_ms: Option, + total_cost: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +struct SuiteReport { + suite_id: 
String, + status: TypedStatus, + encoded_job_count: usize, + score_mean: Option, + unsupported_claim_count: usize, + wrong_result_count: usize, + reason: String, +} + +#[derive(Debug, Deserialize, Serialize)] +struct JobReport { + suite_id: String, + job_id: String, + title: String, + status: TypedStatus, + normalized_score: f64, + hard_fail_hits: Vec, + expected_evidence: Vec, + produced_answer: String, + produced_evidence: Vec, + unsupported_claim_count: usize, + wrong_result_count: usize, + latency_ms: Option, + cost: Option, + trap_ids_used: Vec, + dimension_scores: Vec, + reason: String, +} + +#[derive(Debug, Deserialize, Serialize)] +struct ExpectedEvidenceReport { + evidence_id: String, + claim_id: String, + requirement: String, +} + +#[derive(Debug, Deserialize, Serialize)] +struct DimensionScoreReport { + dimension: String, + score: f64, + max_points: f64, + weight: f64, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct UnsupportedClaimReport { + suite_id: String, + job_id: String, + claim_id: Option, + claim_text: String, + reason: String, + evidence_ids: Vec, +} + +#[derive(Debug, Deserialize, Serialize)] +struct PrivateCorpusRedaction { + policy: String, + private_fixture_count: usize, +} + +#[derive(Debug)] +struct JobScoring { + status: TypedStatus, + normalized_score: f64, + hard_fail_hits: Vec, + unsupported_claims: Vec, + wrong_result_count: usize, + trap_ids_used: Vec, + dimension_scores: Vec, + reason: String, +} + +#[derive(Debug, Default)] +struct FailureCounts { + missing_claims: usize, + forbidden_claims: usize, + missing_evidence: usize, + trap_uses: usize, + unsupported_claims: usize, +} + +fn main() -> Result<()> { + color_eyre::install()?; + + match Args::parse().command { + Command::Run(args) => run_command(args), + Command::Publish(args) => publish_command(args), + } +} + +fn run_command(args: RunArgs) -> Result<()> { + let jobs = load_jobs(&args.fixtures)?; + let report = build_report(&jobs, &args)?; + let json = 
serde_json::to_string_pretty(&report)?; + + write_or_print(args.out.as_deref(), json.as_str()) +} + +fn publish_command(args: PublishArgs) -> Result<()> { + let raw = fs::read_to_string(&args.report)?; + let report = serde_json::from_str::(&raw)?; + let markdown = render_markdown(&report, &args.report); + + write_or_print(args.out.as_deref(), markdown.as_str()) +} + +fn load_jobs(path: &Path) -> Result> { + let paths = fixture_paths(path)?; + let mut jobs = Vec::with_capacity(paths.len()); + + for fixture in paths { + let raw = fs::read_to_string(&fixture)?; + let job = serde_json::from_str::(&raw) + .map_err(|err| eyre::eyre!("Failed to parse {}: {err}", fixture.display()))?; + + validate_job(&job, &fixture)?; + + jobs.push(job); + } + + Ok(jobs) +} + +fn fixture_paths(path: &Path) -> Result> { + if path.is_file() { + return Ok(vec![path.to_path_buf()]); + } + if !path.is_dir() { + return Err(eyre::eyre!("Fixture path does not exist: {}", path.display())); + } + + let mut paths = Vec::new(); + + collect_fixture_paths(path, &mut paths)?; + + paths.sort(); + + if paths.is_empty() { + return Err(eyre::eyre!("No JSON fixtures found in {}.", path.display())); + } + + Ok(paths) +} + +fn collect_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { + for entry in fs::read_dir(path)? 
{ + let entry = entry?; + let entry_path = entry.path(); + + if entry_path.is_dir() { + collect_fixture_paths(entry_path.as_path(), paths)?; + } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { + paths.push(entry_path); + } + } + + Ok(()) +} + +fn validate_job(job: &RealWorldJob, path: &Path) -> Result<()> { + if job.schema != JOB_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {JOB_SCHEMA}.", + path.display(), + job.schema + )); + } + + validate_job_identity(job, path)?; + + if !SUITES.contains(&job.suite.as_str()) { + return Err(eyre::eyre!("{} uses unknown suite {}.", path.display(), job.suite)); + } + + validate_corpus_items(job, path)?; + validate_timeline(job, path)?; + validate_prompt(job, path)?; + validate_expected_answer(job, path)?; + validate_required_evidence(job, path)?; + validate_scoring_rubric(job, path)?; + validate_allowed_uncertainty(job, path)?; + + Ok(()) +} + +fn validate_job_identity(job: &RealWorldJob, path: &Path) -> Result<()> { + if job.job_id.trim().is_empty() + || job.suite.trim().is_empty() + || job.title.trim().is_empty() + || job.corpus.corpus_id.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete job identity.", path.display())); + } + + for tag in &job.tags { + if tag.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty tag.", path.display())); + } + } + + if let Some(adapter_response) = &job.corpus.adapter_response + && adapter_response.adapter_id.as_deref().is_some_and(str::is_empty) + { + return Err(eyre::eyre!("{} has an empty adapter_response adapter_id.", path.display())); + } + + Ok(()) +} + +fn validate_corpus_items(job: &RealWorldJob, path: &Path) -> Result<()> { + let mut evidence_ids = BTreeSet::new(); + + for item in &job.corpus.items { + if item.evidence_id.trim().is_empty() { + return Err(eyre::eyre!( + "{} has a corpus item with an empty evidence_id.", + path.display() + )); + } + if item.kind.trim().is_empty() { + return Err(eyre::eyre!( + "{} 
has corpus item {} with an empty kind.", + path.display(), + item.evidence_id + )); + } + if item.text.is_none() && item.local_ref.is_none() { + return Err(eyre::eyre!( + "{} corpus item {} must provide text or local_ref.", + path.display(), + item.evidence_id + )); + } + if !item.source_ref.is_object() { + return Err(eyre::eyre!( + "{} corpus item {} must provide an object source_ref.", + path.display(), + item.evidence_id + )); + } + + if let Some(created_at) = &item.created_at { + validate_optional_rfc3339(created_at, path, item.evidence_id.as_str())?; + } + + evidence_ids.insert(item.evidence_id.clone()); + } + for trap in &job.negative_traps { + if trap.trap_id.trim().is_empty() || trap.trap_type.trim().is_empty() { + return Err(eyre::eyre!("{} has an incomplete negative trap.", path.display())); + } + + for evidence_id in &trap.evidence_ids { + ensure_known_evidence(path, &evidence_ids, evidence_id)?; + } + } + + Ok(()) +} + +fn validate_timeline(job: &RealWorldJob, path: &Path) -> Result<()> { + let evidence_ids = corpus_evidence_ids(job); + + for event in &job.timeline { + if event.event_id.trim().is_empty() + || event.actor.trim().is_empty() + || event.action.trim().is_empty() + || event.summary.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete timeline event.", path.display())); + } + + validate_required_rfc3339(event.ts.as_str(), path, event.event_id.as_str())?; + + for evidence_id in &event.evidence_ids { + ensure_known_evidence(path, &evidence_ids, evidence_id)?; + } + } + + Ok(()) +} + +fn validate_prompt(job: &RealWorldJob, path: &Path) -> Result<()> { + if job.prompt.role.trim().is_empty() + || job.prompt.content.trim().is_empty() + || job.prompt.job_mode.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete prompt.", path.display())); + } + + for constraint in &job.prompt.constraints { + if constraint.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty prompt constraint.", path.display())); + } + } + + 
Ok(()) +} + +fn validate_expected_answer(job: &RealWorldJob, path: &Path) -> Result<()> { + if job.expected_answer.answer_type.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty expected answer type.", path.display())); + } + + for claim in &job.expected_answer.must_include { + if claim.text().trim().is_empty() { + return Err(eyre::eyre!("{} has an empty expected claim.", path.display())); + } + } + for claim in &job.expected_answer.must_not_include { + if claim.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty forbidden claim.", path.display())); + } + } + for phrase in &job.expected_answer.accepted_alternates { + if phrase.is_null() { + return Err(eyre::eyre!("{} has a null accepted alternate.", path.display())); + } + } + + Ok(()) +} + +fn validate_required_evidence(job: &RealWorldJob, path: &Path) -> Result<()> { + let evidence_ids = corpus_evidence_ids(job); + + for evidence in &job.required_evidence { + if evidence.claim_id.trim().is_empty() || evidence.requirement.trim().is_empty() { + return Err(eyre::eyre!("{} has incomplete required evidence.", path.display())); + } + + ensure_known_evidence(path, &evidence_ids, evidence.evidence_id.as_str())?; + + if evidence.quote.is_none() && evidence.selector.is_none() { + return Err(eyre::eyre!( + "{} required evidence {} must provide quote or selector.", + path.display(), + evidence.evidence_id + )); + } + } + for (claim_id, link) in &job.expected_answer.evidence_links { + if claim_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty evidence link claim id.", path.display())); + } + + for evidence_id in link.ids() { + ensure_known_evidence(path, &evidence_ids, evidence_id.as_str())?; + } + } + + Ok(()) +} + +fn validate_scoring_rubric(job: &RealWorldJob, path: &Path) -> Result<()> { + if !(0.0..=1.0).contains(&job.scoring_rubric.pass_threshold) { + return Err(eyre::eyre!("{} has invalid pass_threshold.", path.display())); + } + if job.scoring_rubric.dimensions.is_empty() { + return 
Err(eyre::eyre!("{} has no scoring dimensions.", path.display())); + } + + for (dimension_id, dimension) in &job.scoring_rubric.dimensions { + if dimension_id.trim().is_empty() + || !dimension.weight.is_finite() + || !dimension.max_points.is_finite() + || dimension.weight <= 0.0 + || dimension.max_points <= 0.0 + || dimension.criteria.is_null() + { + return Err(eyre::eyre!( + "{} has invalid scoring dimension {}.", + path.display(), + dimension_id + )); + } + } + for rule in &job.scoring_rubric.hard_fail_rules { + if rule.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty hard fail rule.", path.display())); + } + } + + Ok(()) +} + +fn validate_allowed_uncertainty(job: &RealWorldJob, path: &Path) -> Result<()> { + if job.allowed_uncertainty.fallback_action.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty fallback action.", path.display())); + } + if job.allowed_uncertainty.can_answer_unknown + && job.allowed_uncertainty.acceptable_phrases.is_empty() + { + return Err(eyre::eyre!( + "{} allows unknown answers but defines no acceptable uncertainty phrase.", + path.display() + )); + } + + for phrase in &job.allowed_uncertainty.acceptable_phrases { + if phrase.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty uncertainty phrase.", path.display())); + } + } + + Ok(()) +} + +fn validate_required_rfc3339(value: &str, path: &Path, id: &str) -> Result<()> { + if OffsetDateTime::parse(value, &Rfc3339).is_err() { + return Err(eyre::eyre!("{} has invalid RFC3339 timestamp for {}.", path.display(), id)); + } + + Ok(()) +} + +fn validate_optional_rfc3339(value: &str, path: &Path, id: &str) -> Result<()> { + if !value.trim().is_empty() { + validate_required_rfc3339(value, path, id)?; + } + + Ok(()) +} + +fn ensure_known_evidence(path: &Path, known: &BTreeSet, evidence_id: &str) -> Result<()> { + if !known.contains(evidence_id) { + return Err(eyre::eyre!( + "{} references unknown evidence id {}.", + path.display(), + evidence_id + )); + } + + 
Ok(()) +} + +fn corpus_evidence_ids(job: &RealWorldJob) -> BTreeSet { + job.corpus.items.iter().map(|item| item.evidence_id.clone()).collect() +} + +fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result { + if jobs.is_empty() { + return Err(eyre::eyre!("At least one real_world_job fixture is required.")); + } + + let mut job_reports = Vec::with_capacity(jobs.len()); + let mut unsupported_claims = Vec::new(); + + for job in jobs { + let scoring = score_job(job); + + unsupported_claims.extend(scoring.unsupported_claims.clone()); + job_reports.push(job_report(job, scoring)); + } + + let suites = suite_reports(&job_reports); + let not_encoded_suites = suites + .iter() + .filter(|suite| suite.status == TypedStatus::NotEncoded) + .map(|suite| suite.suite_id.clone()) + .collect::>(); + let summary = report_summary(&job_reports, &suites); + + Ok(RealWorldReport { + schema: REPORT_SCHEMA.to_string(), + run_id: args.run_id.clone(), + generated_at: OffsetDateTime::now_utc().format(&Rfc3339)?, + runner_version: VERSION.to_string(), + corpus_profile: corpus_profile(jobs), + adapter: adapter_report(args), + summary, + suites, + jobs: job_reports, + unsupported_claims, + not_encoded_suites, + private_corpus_redaction: private_corpus_redaction(jobs), + }) +} + +fn score_job(job: &RealWorldJob) -> JobScoring { + let answer = produced_answer(job); + let produced_evidence = produced_evidence_ids(answer); + let missing_claims = missing_required_claims(job, answer); + let forbidden_claims = forbidden_claim_hits(job, answer); + let missing_evidence = missing_required_evidence(job, &produced_evidence); + let trap_ids_used = trap_ids_used(job, &produced_evidence); + let mut unsupported_claims = unsupported_claims(job, answer); + let hard_fail_hits = hard_fail_hits(job, &unsupported_claims, &trap_ids_used); + let counts = FailureCounts { + missing_claims: missing_claims.len(), + forbidden_claims: forbidden_claims.len(), + missing_evidence: missing_evidence.len(), + trap_uses: 
trap_ids_used.len(), + unsupported_claims: unsupported_claims.len(), + }; + let dimension_scores = dimension_scores(job, &counts); + let normalized_score = normalized_score(&dimension_scores); + let wrong_result_count = counts.missing_claims + + counts.forbidden_claims + + counts.missing_evidence + + counts.trap_uses; + let status = job_status( + normalized_score, + job.scoring_rubric.pass_threshold, + wrong_result_count, + unsupported_claims.len(), + ); + let reason = job_reason(status, &counts, normalized_score); + + for claim in &mut unsupported_claims { + claim.suite_id = job.suite.clone(); + claim.job_id = job.job_id.clone(); + } + + JobScoring { + status, + normalized_score, + hard_fail_hits, + unsupported_claims, + wrong_result_count, + trap_ids_used, + dimension_scores, + reason, + } +} + +fn produced_answer(job: &RealWorldJob) -> &ProducedAnswer { + job.corpus + .adapter_response + .as_ref() + .map(|response| &response.answer) + .unwrap_or_else(|| synthetic_answer(job)) +} + +fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { + let _ = job; + + static EMPTY_ANSWER: std::sync::OnceLock = std::sync::OnceLock::new(); + + EMPTY_ANSWER.get_or_init(|| ProducedAnswer { + content: String::new(), + claims: Vec::new(), + evidence_ids: Vec::new(), + latency_ms: None, + cost: None, + }) +} + +fn produced_evidence_ids(answer: &ProducedAnswer) -> BTreeSet { + let mut evidence = answer.evidence_ids.iter().cloned().collect::>(); + + for claim in &answer.claims { + evidence.extend(claim.evidence_ids.iter().cloned()); + } + + evidence +} + +fn missing_required_claims(job: &RealWorldJob, answer: &ProducedAnswer) -> Vec { + job.expected_answer + .must_include + .iter() + .filter(|claim| !claim_is_present(claim, answer)) + .map(|claim| claim.text().to_string()) + .collect() +} + +fn claim_is_present(claim: &ExpectedClaim, answer: &ProducedAnswer) -> bool { + if let Some(claim_id) = claim.claim_id() + && answer.claims.iter().any(|produced| produced.claim_id.as_deref() 
== Some(claim_id)) + { + return true; + } + + answer.content.contains(claim.text()) +} + +fn forbidden_claim_hits(job: &RealWorldJob, answer: &ProducedAnswer) -> Vec { + job.expected_answer + .must_not_include + .iter() + .filter(|claim| answer.content.contains(claim.as_str())) + .cloned() + .collect() +} + +fn missing_required_evidence( + job: &RealWorldJob, + produced_evidence: &BTreeSet, +) -> Vec { + job.required_evidence + .iter() + .filter(|evidence| { + is_required_use(evidence) && !produced_evidence.contains(&evidence.evidence_id) + }) + .map(|evidence| evidence.evidence_id.clone()) + .collect() +} + +fn is_required_use(evidence: &RequiredEvidence) -> bool { + matches!(evidence.requirement.as_str(), "cite" | "use" | "explain") +} + +fn trap_ids_used(job: &RealWorldJob, produced_evidence: &BTreeSet) -> Vec { + job.negative_traps + .iter() + .filter(|trap| trap.failure_if_used) + .filter(|trap| { + trap.evidence_ids.iter().any(|evidence_id| produced_evidence.contains(evidence_id)) + }) + .map(|trap| trap.trap_id.clone()) + .collect() +} + +fn unsupported_claims(job: &RealWorldJob, answer: &ProducedAnswer) -> Vec { + answer.claims.iter().filter_map(|claim| unsupported_claim(job, claim)).collect() +} + +fn unsupported_claim(job: &RealWorldJob, claim: &ProducedClaim) -> Option { + let Some(claim_id) = claim.claim_id.as_deref() else { + return Some(unsupported_claim_report(claim, "claim has no claim_id")); + }; + let Some(allowed) = job.expected_answer.evidence_links.get(claim_id).map(EvidenceLink::ids) + else { + return Some(unsupported_claim_report( + claim, + "claim_id is not present in expected_answer.evidence_links", + )); + }; + + if claim.evidence_ids.is_empty() { + return Some(unsupported_claim_report(claim, "claim has no produced evidence ids")); + } + if !claim.evidence_ids.iter().any(|evidence_id| allowed.contains(evidence_id)) { + return Some(unsupported_claim_report( + claim, + "claim evidence is not allowed for this claim_id", + )); + } + + None +} 
+
+/// Builds an `UnsupportedClaimReport` for `claim` with the given rejection `reason`.
+///
+/// The claim text is truncated to 240 characters. `suite_id` and `job_id` are left empty
+/// here. NOTE(review): presumably the caller fills them in once the owning job is known;
+/// confirm against the call site.
+fn unsupported_claim_report(claim: &ProducedClaim, reason: &str) -> UnsupportedClaimReport {
+    UnsupportedClaimReport {
+        suite_id: String::new(),
+        job_id: String::new(),
+        claim_id: claim.claim_id.clone(),
+        claim_text: bounded_text(claim.text.as_str(), 240),
+        reason: reason.to_string(),
+        evidence_ids: claim.evidence_ids.clone(),
+    }
+}
+
+/// Lists the hard-fail rules that this job's answer triggered.
+///
+/// Rules are checked in a fixed order: unsupported claims, negative-trap use, missing
+/// required caveat, missing required refusal. An empty result means no rule fired.
+fn hard_fail_hits(
+    job: &RealWorldJob,
+    unsupported_claims: &[UnsupportedClaimReport],
+    trap_ids_used: &[String],
+) -> Vec<String> {
+    let mut hits = Vec::new();
+
+    if !unsupported_claims.is_empty() {
+        hits.push(
+            "unsupported high-confidence claim about a required decision or fact".to_string(),
+        );
+    }
+    if !trap_ids_used.is_empty() {
+        hits.push("use of a negative trap marked failure_if_used = true".to_string());
+    }
+    if job.expected_answer.requires_caveat && !answer_has_required_caveat(job, produced_answer(job))
+    {
+        hits.push("missing required caveat".to_string());
+    }
+    if job.expected_answer.requires_refusal && !answer_looks_like_refusal(produced_answer(job)) {
+        hits.push("missing required refusal".to_string());
+    }
+
+    hits
+}
+
+/// Returns `true` when the answer contains any `allowed_uncertainty.acceptable_phrases`
+/// entry as a case-sensitive substring.
+fn answer_has_required_caveat(job: &RealWorldJob, answer: &ProducedAnswer) -> bool {
+    job.allowed_uncertainty
+        .acceptable_phrases
+        .iter()
+        .any(|phrase| answer.content.contains(phrase.as_str()))
+}
+
+/// Keyword heuristic for refusals: the ASCII-lowercased answer mentions `cannot`,
+/// `can't`, or `refuse`.
+fn answer_looks_like_refusal(answer: &ProducedAnswer) -> bool {
+    let lower = answer.content.to_ascii_lowercase();
+
+    ["cannot", "can't", "refuse"].iter().any(|marker| lower.contains(marker))
+}
+
+/// Scores every dimension declared in the job's scoring rubric.
+fn dimension_scores(job: &RealWorldJob, counts: &FailureCounts) -> Vec<DimensionScoreReport> {
+    job.scoring_rubric
+        .dimensions
+        .iter()
+        .map(|(dimension_id, dimension)| DimensionScoreReport {
+            dimension: dimension_id.clone(),
+            score: dimension_score(dimension_id, dimension.max_points, counts),
+            max_points: dimension.max_points,
+            weight: dimension.weight,
+        })
+        .collect()
+}
+
+/// All-or-nothing score for one dimension: `max_points` unless a failure signal that is
+/// relevant to the dimension is present, in which case `0.0`.
+///
+/// `lifecycle_behavior` never fails here; unknown dimension ids fail on missing claims,
+/// unsupported claims, or trap use.
+fn dimension_score(dimension_id: &str, max_points: f64, counts: &FailureCounts) -> f64 {
+    let failed = match dimension_id {
+        "answer_correctness" | "workflow_helpfulness" =>
+            counts.missing_claims > 0 || counts.forbidden_claims > 0,
+        "evidence_grounding" => counts.missing_evidence > 0 || counts.unsupported_claims > 0,
+        "trap_avoidance" => counts.trap_uses > 0,
+        "uncertainty_handling" => counts.unsupported_claims > 0,
+        "lifecycle_behavior" => false,
+        "debuggability" | "latency_resource" | "personalization_fit" =>
+            counts.missing_claims > 0 || counts.unsupported_claims > 0,
+        _ => counts.missing_claims > 0 || counts.unsupported_claims > 0 || counts.trap_uses > 0,
+    };
+
+    if failed { 0.0 } else { max_points }
+}
+
+/// Weighted mean of the per-dimension score ratios (`score / max_points`).
+///
+/// Returns `0.0` when the total weight is not positive. A dimension whose `max_points`
+/// is not positive contributes a ratio of `0.0` instead of dividing by zero, so a
+/// malformed rubric cannot turn the result into NaN or infinity.
+fn normalized_score(scores: &[DimensionScoreReport]) -> f64 {
+    let total_weight = scores.iter().map(|score| score.weight).sum::<f64>();
+
+    if total_weight.is_nan() || total_weight <= 0.0 {
+        return 0.0;
+    }
+
+    let weighted = scores
+        .iter()
+        .map(|score| {
+            let ratio = if score.max_points > 0.0 { score.score / score.max_points } else { 0.0 };
+
+            ratio * score.weight
+        })
+        .sum::<f64>();
+
+    weighted / total_weight
+}
+
+/// Derives the typed job status.
+///
+/// Unsupported claims take precedence over wrong-result signals; a job with neither
+/// passes only when `normalized_score` reaches `pass_threshold`.
+fn job_status(
+    normalized_score: f64,
+    pass_threshold: f64,
+    wrong_result_count: usize,
+    unsupported_claim_count: usize,
+) -> TypedStatus {
+    if unsupported_claim_count > 0 {
+        TypedStatus::UnsupportedClaim
+    } else if wrong_result_count == 0 && normalized_score >= pass_threshold {
+        TypedStatus::Pass
+    } else {
+        TypedStatus::WrongResult
+    }
+}
+
+/// Renders the human-readable reason that accompanies a job status.
+///
+/// The wrong-result signal count is the sum of missing claims, forbidden claims,
+/// missing evidence, and trap uses.
+fn job_reason(status: TypedStatus, counts: &FailureCounts, normalized_score: f64) -> String {
+    let wrong_result_signals = counts.missing_claims
+        + counts.forbidden_claims
+        + counts.missing_evidence
+        + counts.trap_uses;
+
+    match status {
+        TypedStatus::Pass => format!("Job passed with normalized_score {normalized_score:.3}."),
+        TypedStatus::UnsupportedClaim => format!(
+            "Job produced {} unsupported claim(s), {} wrong-result signal(s), and normalized_score {normalized_score:.3}.",
+            counts.unsupported_claims, wrong_result_signals
+        ),
+        TypedStatus::WrongResult => format!(
+            "Job produced {} wrong-result signal(s) and normalized_score {normalized_score:.3}.",
+            wrong_result_signals
+        ),
+        _ => "Job did not reach a runnable scoring state.".to_string(),
+    }
+}
+
+/// Assembles the per-job report from the fixture and its scoring result.
+///
+/// The normalized score is rounded to three decimals for publication.
+fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport {
+    let answer = produced_answer(job);
+
+    JobReport {
+        suite_id: job.suite.clone(),
+        job_id: job.job_id.clone(),
+        title: job.title.clone(),
+        status: scoring.status,
+        normalized_score: round3(scoring.normalized_score),
+        hard_fail_hits: scoring.hard_fail_hits,
+        expected_evidence: expected_evidence_report(job),
+        produced_answer: answer.content.clone(),
+        produced_evidence: produced_evidence_ids(answer).into_iter().collect(),
+        unsupported_claim_count: scoring.unsupported_claims.len(),
+        wrong_result_count: scoring.wrong_result_count,
+        latency_ms: answer.latency_ms,
+        cost: answer.cost.clone(),
+        trap_ids_used: scoring.trap_ids_used,
+        dimension_scores: scoring.dimension_scores,
+        reason: scoring.reason,
+    }
+}
+
+/// Copies the job's required-evidence entries into their report form.
+fn expected_evidence_report(job: &RealWorldJob) -> Vec<ExpectedEvidenceReport> {
+    job.required_evidence
+        .iter()
+        .map(|evidence| ExpectedEvidenceReport {
+            evidence_id: evidence.evidence_id.clone(),
+            claim_id: evidence.claim_id.clone(),
+            requirement: evidence.requirement.clone(),
+        })
+        .collect()
+}
+
+/// Produces one suite report per entry of `SUITES`, in declaration order.
+fn suite_reports(jobs: &[JobReport]) -> Vec<SuiteReport> {
+    SUITES.iter().map(|suite_id| suite_report(suite_id, jobs)).collect()
+}
+
+/// Aggregates the job reports that belong to `suite_id`.
+///
+/// A suite without any job is reported as `NotEncoded` with no mean score.
+fn suite_report(suite_id: &str, jobs: &[JobReport]) -> SuiteReport {
+    let suite_jobs = jobs.iter().filter(|job| job.suite_id == suite_id).collect::<Vec<_>>();
+
+    if suite_jobs.is_empty() {
+        return SuiteReport {
+            suite_id: suite_id.to_string(),
+            status: TypedStatus::NotEncoded,
+            encoded_job_count: 0,
+            score_mean: None,
+            unsupported_claim_count: 0,
+            wrong_result_count: 0,
+            reason: NOT_ENCODED_REASON.to_string(),
+        };
+    }
+
+    let status = aggregate_status(&suite_jobs);
+    let score_sum = suite_jobs.iter().map(|job| job.normalized_score).sum::<f64>();
+    let unsupported_claim_count = suite_jobs.iter().map(|job| job.unsupported_claim_count).sum();
+    let wrong_result_count = suite_jobs.iter().map(|job| job.wrong_result_count).sum();
+
+    SuiteReport {
+        suite_id: suite_id.to_string(),
+        status,
+        encoded_job_count: suite_jobs.len(),
+        score_mean: Some(round3(score_sum / suite_jobs.len() as f64)),
+        unsupported_claim_count,
+        wrong_result_count,
+        reason: suite_reason(status, suite_jobs.len()),
+    }
+}
+
+/// Picks the suite status from its jobs by severity.
+///
+/// The first status in the precedence list that any job carries wins; a suite with no
+/// matching job status is `NotEncoded`.
+fn aggregate_status(jobs: &[&JobReport]) -> TypedStatus {
+    // Most severe first; `Pass` only wins when nothing worse is present.
+    const PRECEDENCE: [TypedStatus; 6] = [
+        TypedStatus::UnsupportedClaim,
+        TypedStatus::LifecycleFail,
+        TypedStatus::WrongResult,
+        TypedStatus::Incomplete,
+        TypedStatus::Blocked,
+        TypedStatus::Pass,
+    ];
+
+    PRECEDENCE
+        .into_iter()
+        .find(|status| jobs.iter().any(|job| job.status == *status))
+        .unwrap_or(TypedStatus::NotEncoded)
+}
+
+/// Renders the human-readable reason that accompanies a suite status.
+fn suite_reason(status: TypedStatus, encoded_job_count: usize) -> String {
+    match status {
+        TypedStatus::Pass => format!("All {encoded_job_count} encoded job(s) passed."),
+        TypedStatus::UnsupportedClaim =>
+            "At least one encoded job produced an unsupported claim.".to_string(),
+        TypedStatus::WrongResult => "At least one encoded job returned a wrong result.".to_string(),
+        TypedStatus::LifecycleFail =>
+            "At least one encoded lifecycle-scored job failed lifecycle behavior.".to_string(),
+        TypedStatus::Incomplete => "At least one encoded job could not complete.".to_string(),
+        TypedStatus::Blocked => "At least one encoded job is blocked.".to_string(),
+        TypedStatus::NotEncoded => NOT_ENCODED_REASON.to_string(),
+    }
+}
+
+/// Builds the run-level summary from all job and suite reports.
+///
+/// `not_encoded` starts as the number of not-encoded suites and is additionally
+/// incremented for any job whose own status is `NotEncoded`.
+/// NOTE(review): this mixes suite and job units in one counter; confirm that jobs can
+/// never carry `NotEncoded`, or split the counter.
+fn report_summary(jobs: &[JobReport], suites: &[SuiteReport]) -> ReportSummary {
+    let not_encoded_suites =
+        suites.iter().filter(|suite| suite.status == TypedStatus::NotEncoded).count();
+    let mut summary = ReportSummary {
+        job_count: jobs.len(),
+        encoded_suite_count: suites.len() - not_encoded_suites,
+        not_encoded: not_encoded_suites,
+        unsupported_claim_count: jobs.iter().map(|job| job.unsupported_claim_count).sum(),
+        wrong_result_count: jobs.iter().map(|job| job.wrong_result_count).sum(),
+        mean_score: mean_score(jobs),
+        mean_latency_ms: mean_latency(jobs),
+        total_cost: total_cost(jobs),
+        ..ReportSummary::default()
+    };
+
+    for job in jobs {
+        match job.status {
+            TypedStatus::Pass => summary.pass += 1,
+            TypedStatus::WrongResult => summary.wrong_result += 1,
+            TypedStatus::LifecycleFail => summary.lifecycle_fail += 1,
+            TypedStatus::Incomplete => summary.incomplete += 1,
+            TypedStatus::Blocked => summary.blocked += 1,
+            TypedStatus::NotEncoded => summary.not_encoded += 1,
+            TypedStatus::UnsupportedClaim => summary.unsupported_claim += 1,
+        }
+    }
+
+    summary
+}
+
+/// Mean normalized score over all jobs, rounded to three decimals; `0.0` for no jobs.
+fn mean_score(jobs: &[JobReport]) -> f64 {
+    if jobs.is_empty() {
+        return 0.0;
+    }
+
+    round3(jobs.iter().map(|job| job.normalized_score).sum::<f64>() / jobs.len() as f64)
+}
+
+/// Mean latency over the jobs that report one, rounded to three decimals.
+///
+/// Returns `None` when no job carries a latency.
+fn mean_latency(jobs: &[JobReport]) -> Option<f64> {
+    let latencies = jobs.iter().filter_map(|job| job.latency_ms).collect::<Vec<_>>();
+
+    if latencies.is_empty() {
+        return None;
+    }
+
+    Some(round3(latencies.iter().sum::<f64>() / latencies.len() as f64))
+}
+
+/// Sums the cost fields over the jobs that report a cost.
+///
+/// Returns `None` when no job carries a cost. The currency is the first one found.
+/// NOTE(review): amounts are summed without checking that all jobs share that currency.
+fn total_cost(jobs: &[JobReport]) -> Option<CostReport> {
+    let costs = jobs.iter().filter_map(|job| job.cost.as_ref()).collect::<Vec<_>>();
+
+    if costs.is_empty() {
+        return None;
+    }
+
+    let currency = costs.iter().find_map(|cost| cost.currency.clone());
+    let amount = sum_optional_f64(costs.iter().filter_map(|cost| cost.amount));
+    let input_tokens = sum_optional_u64(costs.iter().filter_map(|cost| cost.input_tokens));
+    let output_tokens = sum_optional_u64(costs.iter().filter_map(|cost| cost.output_tokens));
+
+    Some(CostReport { currency, amount, input_tokens, output_tokens })
+}
+
+/// Sums the values and rounds to three decimals; `None` when the iterator is empty.
+fn sum_optional_f64(values: impl Iterator<Item = f64>) -> Option<f64> {
+    let values = values.collect::<Vec<_>>();
+
+    if values.is_empty() { None } else { Some(round3(values.iter().sum::<f64>())) }
+}
+
+/// Sums the values; `None` when the iterator is empty.
+fn sum_optional_u64(values: impl Iterator<Item = u64>) -> Option<u64> {
+    let values = values.collect::<Vec<_>>();
+
+    if values.is_empty() { None } else { Some(values.iter().sum()) }
+}
+
+/// Names the corpus profile shared by all jobs.
+///
+/// Returns the common profile when every job agrees, `"mixed"` when at least two jobs
+/// differ, and `"unknown"` when there are no jobs at all.
+fn corpus_profile(jobs: &[RealWorldJob]) -> String {
+    let mut profiles = jobs.iter().map(|job| job.corpus.profile.as_str());
+    let Some(first) = profiles.next() else {
+        return "unknown".to_string();
+    };
+
+    if profiles.all(|profile| profile == first) {
+        first.to_string()
+    } else {
+        "mixed".to_string()
+    }
+}
+
+/// Describes the adapter for the report header.
+///
+/// Behavior, storage, and runtime are fixed values because this runner only scores
+/// checked-in fixture responses.
+fn adapter_report(args: &RunArgs) -> AdapterReport {
+    AdapterReport {
+        adapter_id: args.adapter_id.clone(),
+        name: args.adapter_name.clone(),
+        behavior: "offline_fixture_response".to_string(),
+        storage: TypedStatus::NotEncoded,
+        runtime: TypedStatus::NotEncoded,
+        notes: "Smoke runner scores checked-in fixture responses; it does not exercise a live external adapter.".to_string(),
+    }
+}
+
+/// Counts private-sanitized fixtures and selects the matching redaction policy text.
+fn private_corpus_redaction(jobs: &[RealWorldJob]) -> PrivateCorpusRedaction {
+    let private_fixture_count = jobs
+        .iter()
+        .filter(|job| matches!(job.corpus.profile, CorpusProfile::PrivateSanitized))
+        .count();
+    let policy = if private_fixture_count == 0 {
+        "no_private_corpus".to_string()
+    } else {
+        "publish evidence ids and bounded score summaries only; do not publish private text"
+            .to_string()
+    };
+
+    PrivateCorpusRedaction { policy, private_fixture_count }
+}
+
+/// Renders the whole Markdown report: header, suites, jobs, unsupported claims, and
+/// result semantics, in that order.
+fn render_markdown(report: &RealWorldReport, report_path: &Path) -> String {
+    let report_path = report_path.display().to_string();
+    let mut out = String::new();
+
+    render_markdown_header(&mut out, report, report_path.as_str());
+    render_markdown_suites(&mut out, report);
+    render_markdown_jobs(&mut out, report);
+    render_markdown_unsupported_claims(&mut out, report);
+    render_markdown_semantics(&mut out, report);
+
+    out
+}
+
+/// Appends the title, the document preamble, and the `## Summary` bullet list.
+fn render_markdown_header(out: &mut String, report: &RealWorldReport, report_path: &str) {
+    let summary = &report.summary;
+
+    out.push_str("# Real-World Job Benchmark Report\n\n");
+    out.push_str(
+        "Goal: Publish a Markdown summary for one generated real_world_job benchmark report.\n",
+    );
+    out.push_str(
+        "Read this when: You need a durable smoke report for real-world agent memory job fixtures.\n",
+    );
+    out.push_str(&format!("Inputs: `{}`.\n", md_inline(report_path)));
+    out.push_str("Depends on: `apps/elf-eval/fixtures/real_world_job/`, `docs/spec/real_world_agent_memory_benchmark_v1.md`, and `Makefile.toml`.\n");
+    out.push_str(
+        "Verification: Compare this Markdown summary with the source JSON before committing.\n\n",
+    );
+    out.push_str("## Summary\n\n");
+    out.push_str(&format!("- Run ID: `{}`\n", md_inline(report.run_id.as_str())));
+    out.push_str(&format!("- Generated at: `{}`\n", md_inline(report.generated_at.as_str())));
+    out.push_str(&format!("- Runner version: `{}`\n", md_inline(report.runner_version.as_str())));
+    out.push_str(&format!("- Corpus profile: `{}`\n", md_inline(report.corpus_profile.as_str())));
+    out.push_str(&format!(
+        "- Adapter: `{}` ({})\n",
+        md_inline(report.adapter.adapter_id.as_str()),
+        md_inline(report.adapter.behavior.as_str())
+    ));
+    out.push_str(&format!("- Jobs: `{}`\n", summary.job_count));
+    out.push_str(&format!("- Encoded suites: `{}`\n", summary.encoded_suite_count));
+    out.push_str(&format!("- Not-encoded suites: `{}`\n", report.not_encoded_suites.len()));
+    out.push_str(&format!("- Status summary: `{}` pass, `{}` wrong_result, `{}` lifecycle_fail, `{}` incomplete, `{}` blocked, `{}` unsupported_claim\n", summary.pass, summary.wrong_result, summary.lifecycle_fail, summary.incomplete, summary.blocked, summary.unsupported_claim));
+    out.push_str(&format!("- Unsupported claim count: `{}`\n", summary.unsupported_claim_count));
+    out.push_str(&format!("- Wrong-result count: `{}`\n", summary.wrong_result_count));
+    out.push_str(&format!("- Mean score: `{:.3}`\n", summary.mean_score));
+    out.push_str(&format!("- Mean latency: `{}`\n", optional_f64(summary.mean_latency_ms, " ms")));
+    out.push_str(&format!("- Cost: `{}`\n", cost_display(summary.total_cost.as_ref())));
+    out.push_str(&format!(
+        "- Private corpus redaction: `{}`\n\n",
+        md_inline(report.private_corpus_redaction.policy.as_str())
+    ));
+}
+
+/// Appends the `## Suites` table, one row per suite report.
+fn render_markdown_suites(out: &mut String, report: &RealWorldReport) {
+    out.push_str("## Suites\n\n");
+    out.push_str(
+        "| Suite | Status | Jobs | Score | Unsupported Claims | Wrong Results | Reason |\n",
+    );
+    out.push_str("| --- | --- | ---: | ---: | ---: | ---: | --- |\n");
+
+    for suite in &report.suites {
+        out.push_str(&format!(
+            "| {} | `{}` | {} | `{}` | {} | {} | {} |\n",
+            md_cell(suite.suite_id.as_str()),
+            status_str(suite.status),
+            suite.encoded_job_count,
+            optional_f64(suite.score_mean, ""),
+            suite.unsupported_claim_count,
+            suite.wrong_result_count,
+            md_cell(suite.reason.as_str())
+        ));
+    }
+
+    out.push('\n');
+}
+
+/// Appends the `## Jobs` table, one row per job report.
+fn render_markdown_jobs(out: &mut String, report: &RealWorldReport) {
+    out.push_str("## Jobs\n\n");
+    out.push_str("| Suite | Job | Status | Score | Expected Evidence | Produced Evidence | Unsupported Claims | Wrong Results | Latency | Cost |\n");
+    out.push_str("| --- | --- | --- | ---: | --- | --- | ---: | ---: | ---: | --- |\n");
+
+    for job in &report.jobs {
+        let expected = job
+            .expected_evidence
+            .iter()
+            .map(|evidence| evidence.evidence_id.as_str())
+            .collect::<Vec<_>>()
+            .join(", ");
+        let produced = job.produced_evidence.join(", ");
+
+        // Evidence ids sit inside table cells, so `|` must be escaped as well; `md_inline`
+        // alone would let an id containing a pipe split the row.
+        out.push_str(&format!(
+            "| {} | {} | `{}` | `{:.3}` | `{}` | `{}` | {} | {} | `{}` | `{}` |\n",
+            md_cell(job.suite_id.as_str()),
+            md_cell(job.job_id.as_str()),
+            status_str(job.status),
+            job.normalized_score,
+            md_cell(expected.as_str()),
+            md_cell(produced.as_str()),
+            job.unsupported_claim_count,
+            job.wrong_result_count,
+            optional_f64(job.latency_ms, " ms"),
+            cost_display(job.cost.as_ref())
+        ));
+    }
+
+    out.push('\n');
+}
+
+/// Appends the `## Unsupported Claims` section: a table with one row per claim, or a
+/// single sentence when there are none.
+fn render_markdown_unsupported_claims(out: &mut String, report: &RealWorldReport) {
+    out.push_str("## Unsupported Claims\n\n");
+
+    if report.unsupported_claims.is_empty() {
+        out.push_str("No unsupported claims were produced by encoded jobs.\n\n");
+
+        return;
+    }
+
+    out.push_str("| Suite | Job | Claim | Evidence | Reason |\n");
+    out.push_str("| --- | --- | --- | --- | --- |\n");
+
+    for claim in &report.unsupported_claims {
+        // Evidence ids are table-cell content too, hence `md_cell` rather than `md_inline`.
+        out.push_str(&format!(
+            "| {} | {} | {} | `{}` | {} |\n",
+            md_cell(claim.suite_id.as_str()),
+            md_cell(claim.job_id.as_str()),
+            md_cell(claim.claim_text.as_str()),
+            md_cell(claim.evidence_ids.join(", ").as_str()),
+            md_cell(claim.reason.as_str())
+        ));
+    }
+
+    out.push('\n');
+}
+
+/// Appends the `## Result Semantics` explanation and the `## Not-Encoded Suites` list.
+fn render_markdown_semantics(out: &mut String, report: &RealWorldReport) {
+    out.push_str("## Result Semantics\n\n");
+    out.push_str(
+        "This report uses `docs/spec/real_world_agent_memory_benchmark_v1.md` status terms.\n",
+    );
+    out.push_str("It is a real-world job fixture report, not a Docker live-baseline report.\n");
+    out.push_str("Existing live-baseline reports remain valid for their encoded retrieval and lifecycle checks and are not reinterpreted as real-world suite wins.\n\n");
+    out.push_str(
+        "- `pass`: encoded jobs met their pass threshold with required evidence and no hard-fail rule.\n",
+    );
+    out.push_str(
+        "- `wrong_result`: a job completed but missed required answer or evidence expectations.\n",
+    );
+    out.push_str("- `unsupported_claim`: a job produced a substantive claim not supported by the fixture evidence links.\n");
+    out.push_str("- `not_encoded`: a suite has no checked-in real_world_job fixture, so no pass/fail claim is allowed.\n\n");
+    out.push_str("## Not-Encoded Suites\n\n");
+
+    if report.not_encoded_suites.is_empty() {
+        out.push_str("All declared suites have at least one encoded job.\n");
+
+        return;
+    }
+
+    for suite in &report.not_encoded_suites {
+        out.push_str(&format!("- `{}`\n", md_inline(suite.as_str())));
+    }
+}
+
+/// Maps a status to the snake_case term used in the published report.
+fn status_str(status: TypedStatus) -> &'static str {
+    match status {
+        TypedStatus::Pass => "pass",
+        TypedStatus::WrongResult => "wrong_result",
+        TypedStatus::LifecycleFail => "lifecycle_fail",
+        TypedStatus::Incomplete => "incomplete",
+        TypedStatus::Blocked => "blocked",
+        TypedStatus::NotEncoded => "not_encoded",
+        TypedStatus::UnsupportedClaim => "unsupported_claim",
+    }
+}
+
+/// Writes `content` to `path`, creating missing parent directories, or prints it to
+/// stdout when no path is given.
+fn write_or_print(path: Option<&Path>, content: &str) -> Result<()> {
+    let Some(path) = path else {
+        println!("{content}");
+
+        return Ok(());
+    };
+
+    // A bare file name has an empty parent; there is nothing to create in that case.
+    if let Some(parent) = path.parent()
+        && !parent.as_os_str().is_empty()
+    {
+        fs::create_dir_all(parent)?;
+    }
+
+    fs::write(path, content)?;
+
+    println!("Wrote {}", path.display());
+
+    Ok(())
+}
+
+/// Formats an optional number with three decimals plus `suffix`, or `-` when absent.
+fn optional_f64(value: Option<f64>, suffix: &str) -> String {
+    match value {
+        Some(value) => format!("{value:.3}{suffix}"),
+        None => "-".to_string(),
+    }
+}
+
+/// Formats a cost as `amount currency`, as the bare amount when no currency is known,
+/// or as `-` when the cost or its amount is absent.
+fn cost_display(cost: Option<&CostReport>) -> String {
+    let Some(cost) = cost else {
+        return "-".to_string();
+    };
+
+    match (cost.amount, cost.currency.as_deref()) {
+        (Some(amount), Some(currency)) => format!("{amount:.3} {currency}"),
+        (Some(amount), None) => format!("{amount:.3}"),
+        (None, _) => "-".to_string(),
+    }
+}
+
+/// Truncates `value` to at most `max_chars` characters, appending `...` when text was
+/// cut off. Counting is per `char`, so multi-byte characters are never split.
+fn bounded_text(value: &str, max_chars: usize) -> String {
+    let mut chars = value.chars();
+    let text = chars.by_ref().take(max_chars).collect::<String>();
+
+    if chars.next().is_some() { format!("{text}...") } else { text }
+}
+
+/// Makes `value` safe for a single-line Markdown code span.
+///
+/// Backticks become apostrophes and every line break becomes one space. Carriage
+/// returns are handled too, so CRLF input yields a single space rather than a stray
+/// `\r` that could still break a table row.
+fn md_inline(value: &str) -> String {
+    value.replace('`', "'").replace("\r\n", " ").replace(['\r', '\n'], " ")
+}
+
+/// Makes `value` safe for a Markdown table cell: `md_inline` plus escaped pipes.
+fn md_cell(value: &str) -> String {
+    md_inline(value).replace('|', "\\|")
+}
+
+/// Rounds to three decimal places.
+fn round3(value: f64) -> f64 {
+    (value * 1_000.0).round() / 1_000.0
+}
diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
new file mode 100644
index 00000000..5020ed77
--- /dev/null
+++ b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -0,0 +1,137 @@
+#![allow(unused_crate_dependencies)]
+
+//! Integration tests for the real-world job smoke benchmark runner.
+
+use std::{
+    env, fs,
+    path::{Path, PathBuf},
+    process::{self, Command},
+};
+
+use color_eyre::{Result, eyre};
+use serde_json::Value;
+
+/// Directory that holds the checked-in smoke fixtures.
+fn fixture_dir() -> PathBuf {
+    fixture_root().join("smoke")
+}
+
+/// Parent directory of all real-world job fixtures; the smoke fixtures are nested below it.
+fn fixture_root() -> PathBuf {
+    Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join("real_world_job")
+}
+
+/// Runs the `run` subcommand against `fixtures` and parses its stdout as JSON.
+///
+/// Panics with the runner's stderr when the process exits unsuccessfully.
+fn run_json_report_from(fixtures: PathBuf) -> Result<Value> {
+    let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+        .arg("run")
+        .arg("--fixtures")
+        .arg(fixtures)
+        .output()?;
+
+    assert!(
+        output.status.success(),
+        "real_world_job runner failed: {}",
+        String::from_utf8_lossy(&output.stderr),
+    );
+
+    Ok(serde_json::from_slice(&output.stdout)?)
+}
+
+/// Runs the runner against the smoke fixture directory.
+fn run_json_report() -> Result<Value> {
+    run_json_report_from(fixture_dir())
+}
+
+/// Resolves a JSON pointer that must address an array.
+fn array_at<'a>(value: &'a Value, pointer: &str) -> Result<&'a Vec<Value>> {
+    value
+        .pointer(pointer)
+        .and_then(Value::as_array)
+        .ok_or_else(|| eyre::eyre!("missing array at {pointer}"))
+}
+
+/// Finds the first item whose string value at the JSON pointer `field` equals `expected`.
+fn find_by_field<'a>(items: &'a [Value], field: &str, expected: &str) -> Result<&'a Value> {
+    items
+        .iter()
+        .find(|item| item.pointer(field).and_then(Value::as_str) == Some(expected))
+        .ok_or_else(|| eyre::eyre!("missing item with {field} = {expected}"))
+}
+
+#[test]
+fn smoke_fixture_produces_typed_json_report() -> Result<()> {
+    let report = run_json_report()?;
+
+    assert_eq!(
+        report.pointer("/schema").and_then(Value::as_str),
+        Some("elf.real_world_job_report/v1")
+    );
+    assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(1));
+    assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(1));
+    assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0));
+    assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0));
+
+    let jobs = array_at(&report, "/jobs")?;
+    let job = find_by_field(jobs, "/job_id", "work-resume-smoke-001")?;
+
+    assert_eq!(job.pointer("/suite_id").and_then(Value::as_str), Some("work_resume"));
+    assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("pass"));
+    assert_eq!(job.pointer("/latency_ms").and_then(Value::as_f64), Some(1.2));
+    assert_eq!(job.pointer("/cost/amount").and_then(Value::as_f64), Some(0.0));
+
+    let expected_evidence = array_at(job, "/expected_evidence")?;
+    let produced_evidence = array_at(job, "/produced_evidence")?;
+
+    assert_eq!(expected_evidence.len(), 2);
+    assert_eq!(produced_evidence.len(), 1);
+    assert_eq!(produced_evidence.first().and_then(Value::as_str), Some("issue-xy812-resume"));
+
+    let suites = array_at(&report, "/suites")?;
+    let encoded_suite = find_by_field(suites, "/suite_id", "work_resume")?;
+    let unencoded_suite = find_by_field(suites, "/suite_id", "retrieval")?;
+
+    assert_eq!(encoded_suite.pointer("/status").and_then(Value::as_str), Some("pass"));
+    assert_eq!(unencoded_suite.pointer("/status").and_then(Value::as_str), Some("not_encoded"));
+
+    Ok(())
+}
+
+#[test]
+fn runner_discovers_nested_fixture_layout() -> Result<()> {
+    // Pointing at the fixture root must still find the job nested under `smoke/`.
+    let report = run_json_report_from(fixture_root())?;
+
+    assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(1));
+
+    Ok(())
+}
+
+#[test]
+fn generated_json_report_renders_markdown() -> Result<()> {
+    let report = run_json_report()?;
+    let temp_dir = env::temp_dir().join(format!("elf-real-world-job-test-{}", process::id()));
+
+    fs::create_dir_all(&temp_dir)?;
+
+    let report_path = temp_dir.join("report.json");
+    let markdown_path = temp_dir.join("report.md");
+
+    fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+
+    let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+        .arg("publish")
+        .arg("--report")
+        .arg(&report_path)
+        .arg("--out")
+        .arg(&markdown_path)
+        .output()?;
+
+    assert!(
+        output.status.success(),
+        "real_world_job publisher failed: {}",
+        String::from_utf8_lossy(&output.stderr),
+    );
+
+    let markdown = fs::read_to_string(&markdown_path)?;
+
+    // Best-effort cleanup before the content assertions so that a passing or failing
+    // comparison does not leave the per-process directory behind in the system temp dir.
+    let _ = fs::remove_dir_all(&temp_dir);
+
+    assert!(markdown.contains("# Real-World Job Benchmark Report"));
+    assert!(markdown.contains("work_resume"));
+    assert!(markdown.contains("issue-xy812-resume"));
+    assert!(markdown.contains("Existing live-baseline reports remain valid"));
+
+    Ok(())
+}
diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index c47f491b..9717c2de 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -41,6 +41,8 @@ cleanup, use `docs/guide/single_user_production.md`. - Add a dated report when a new run changes README-level claims. - Keep generated raw JSON under `tmp/live-baseline/`; commit only reviewed Markdown summaries and durable scripts. +- Keep generated real-world job smoke JSON and Markdown under `tmp/real-world-job/`; + commit fixture schemas, smoke fixtures, runner code, and durable docs only. - Link the newest decision-relevant report from README and this index. - When benchmark semantics change, update `live_baseline_benchmark.md` and the relevant spec before publishing a new result. diff --git a/docs/guide/benchmarking/live_baseline_benchmark.md b/docs/guide/benchmarking/live_baseline_benchmark.md index d1238181..c29f6125 100644 --- a/docs/guide/benchmarking/live_baseline_benchmark.md +++ b/docs/guide/benchmarking/live_baseline_benchmark.md @@ -240,6 +240,29 @@ The publisher summarizes one generated aggregate JSON report. For a combined rep that compares multiple runs, use the generated Markdown as input evidence and then add the interpretation manually under `docs/guide/benchmarking/`. +## Real-World Job Smoke + +The live-baseline runner and real-world job runner publish separate report schemas. +Live-baseline reports remain evidence for Docker retrieval and lifecycle checks only. +They are not real-world suite wins.
+ +To run the checked-in real-world job smoke fixture and render its Markdown report: + +```sh +cargo make real-world-job-smoke +``` + +Artifacts: + +```text +tmp/real-world-job/real-world-job-smoke-report.json +tmp/real-world-job/real-world-job-smoke-report.md +``` + +The smoke fixture lives under `apps/elf-eval/fixtures/real_world_job/smoke/` and uses +`docs/spec/real_world_agent_memory_benchmark_v1.md` status terms, including +`unsupported_claim`. Suites without checked-in jobs are reported as `not_encoded`. + ## Clean Up ```sh diff --git a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md index df11d9ef..7cf0f637 100644 --- a/docs/guide/benchmarking/real_world_agent_memory_benchmark.md +++ b/docs/guide/benchmarking/real_world_agent_memory_benchmark.md @@ -112,6 +112,19 @@ Recommended first increments: 3. Encode one `memory_evolution` job that proves update/delete/supersession behavior. 4. Add report output for `unsupported_claim` before broadening the suite count. +Current checked-in smoke increment: + +```sh +cargo make real-world-job-smoke +``` + +This parses `apps/elf-eval/fixtures/real_world_job/smoke/`, writes +`tmp/real-world-job/real-world-job-smoke-report.json`, and renders +`tmp/real-world-job/real-world-job-smoke-report.md`. The smoke report includes suite +id, job id, expected evidence, produced answer/evidence, unsupported-claim count, +wrong-result count, latency/cost fields when available, and typed suite/job statuses. +Untouched suites remain `not_encoded`. + Do not generate large fixtures or update production-adoption verdicts while adding the contract. The current adoption gate remains an existing benchmark decision until new real-world job reports are implemented and published.