diff --git a/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md new file mode 100644 index 0000000..862395b --- /dev/null +++ b/docs/guide/benchmarking/2026-06-11-measurement-coverage-audit.md @@ -0,0 +1,243 @@ +# ELF Benchmark Measurement Coverage Audit - June 11, 2026 + +Goal: Record what is actually measured today, where competitor comparisons are still +not comparable, and which measurement reports should guide future ELF iteration. +Read this when: You need to answer whether ELF has enough empirical evidence to +claim a win, tie, loss, or non-claim against tracked memory, RAG, graph, and +agent-continuity projects. +Inputs: Fresh local runs of `cargo make real-world-memory` and +`cargo make real-world-memory-live-adapters` on commit `286af8b`, plus +`apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json`, +`2026-06-11-competitor-strength-evidence-matrix.md`, and +`2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md`. +Outputs: Fresh measured counters, scenario coverage, project coverage, and the next +measurement reports needed before stronger ELF claims. + +## Executive Judgment + +The benchmark program is useful and already prevents misleading claims, but the +current measured comparison is not complete enough to say ELF beats or ties every +tracked project's strongest scenario. + +What is proven today: + +- ELF has a strong fixture-backed real-world benchmark contract: 38 jobs, 36 pass, + 2 blocked operator boundaries, and no wrong results in the fixture aggregate. +- ELF and qmd have comparable full-suite live real-world sweeps. They are effectively + tied on pass/fail shape: each has 38 jobs, 18 pass, 5 wrong_result, 2 blocked, and + 13 not_encoded. +- ELF is ahead on production-operation evidence among tracked systems because it has + checked-in provider synthetic, stress, backfill, backup/restore, and Qdrant rebuild + evidence. 
+- The current comparison still undermeasures most competitor strengths. OpenViking + trajectory, mem0/OpenMemory entity history and UI, Letta core-vs-archival memory, + Graphiti/Zep temporal graph behavior, graph/RAG navigation, agentmemory and + claude-mem capture/continuity, and knowledge-page workflows remain non-claims. + +So the current adoption decision can remain "credible for bounded personal +production," but the competitiveness objective remains open. + +## Fresh Runs + +These commands were run from an isolated report worktree based on `origin/main`: + +| Command | Result | Runtime | +| --- | --- | ---: | +| `cargo make real-world-memory` | pass | 42.38 seconds | +| `cargo make real-world-memory-live-adapters` | pass | 121.93 seconds | + +The live adapter run emitted repeated Qdrant client/server compatibility warnings, but +the command completed successfully and produced ELF and qmd JSON/Markdown reports. +Treat those warnings as a measurement-harness risk to keep visible, not as a current run +failure. + +## Fixture Aggregate + +`cargo make real-world-memory` produced: + +| Metric | Value | +| --- | ---: | +| Jobs | `38` | +| Encoded suites | `11` | +| Pass | `36` | +| Blocked | `2` | +| Wrong result | `0` | +| Lifecycle fail | `0` | +| Incomplete | `0` | +| Not encoded | `0` | +| Unsupported claim | `0` | +| Mean score | `0.947` | +| Mean latency | `4.411 ms` | +| Expected evidence recall | `77/77` | +| Evidence coverage | `84/84` | +| Source-ref coverage | `84/84` | +| Quote coverage | `84/84` | + +This proves fixture contract breadth and scoring behavior. It does not prove every +live adapter or competitor runtime can complete those jobs.
+ +## Live ELF/qmd Sweep + +`cargo make real-world-memory-live-adapters` produced: + +| Adapter | Jobs | Pass | Wrong result | Blocked | Not encoded | Mean score | Mean latency | Evidence recall | Evidence coverage | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| ELF live service adapter | `38` | `18` | `5` | `2` | `13` | `0.525` | `5.100 ms` | `41/77` | `48/84` | +| qmd live CLI adapter | `38` | `18` | `5` | `2` | `13` | `0.512` | `719.758 ms` | `41/77` | `48/84` | + +This supports a narrow tie on the currently encoded live real-world suite shape. It +does not support a broad ELF-over-qmd claim because qmd remains the stronger +retrieval-debug UX reference and its deep profile is still not encoded. + +### Live Suite Breakdown + +ELF and qmd had the same suite status shape: + +| Suite | Jobs | Status breakdown | +| --- | ---: | --- | +| `trust_source_of_truth` | `1` | `pass:1` | +| `work_resume` | `5` | `pass:5` | +| `retrieval` | `5` | `pass:5` | +| `project_decisions` | `5` | `pass:5` | +| `personalization` | `1` | `pass:1` | +| `memory_evolution` | `6` | `pass:1`, `wrong_result:5` | +| `capture_integration` | `2` | `not_encoded:2` | +| `consolidation` | `4` | `not_encoded:4` | +| `knowledge_compilation` | `2` | `not_encoded:2` | +| `operator_debugging_ux` | `1` | `not_encoded:1` | +| `production_ops` | `6` | `blocked:2`, `not_encoded:4` | + +The five live wrong results are all memory-evolution jobs. The live adapters retrieve +current evidence but do not yet provide the required historical conflict evidence +links for current-vs-historical reasoning. + +## External Adapter Ledger + +The checked-in manifest records 21 adapter records across 17 unique project names. + +| Evidence class | Adapter records | Meaning | +| --- | ---: | --- | +| `fixture_backed` | `1` | ELF fixture scoring only. | +| `live_baseline_only` | `6` | Docker same-corpus or lifecycle evidence without real-world job scoring. 
| +| `live_real_world` | `2` | ELF and qmd live real-world sweeps. | +| `research_gate` | `12` | Setup, source, resource, or output-contract gate only. | + +| Overall status | Adapter records | +| --- | ---: | +| `pass` | `1` | +| `wrong_result` | `6` | +| `lifecycle_fail` | `1` | +| `blocked` | `6` | +| `not_encoded` | `7` | + +The generated JSON report also emits `external_project_count: 19`, while the unique +project-name count from the manifest is 17 (16 excluding ELF). The runner currently computes that field +as adapter records whose project is not `ELF`, not as unique external project names. +Use the unique manifest project-name count, not `external_project_count`, as the project coverage count. + +## Project Coverage + +| Project | Best current evidence | Current measured state | Strongest unproven scenario | Next measurement before claim | +| --- | --- | --- | --- | --- | +| ELF | `fixture_backed` plus `live_real_world` | Fixture aggregate passes except 2 blocked operator boundaries; live full sweep is `wrong_result`. | Full live memory evolution, live consolidation, live knowledge pages, live capture, live production ops. | Memory-evolution diagnostic report, then live operator/capture/consolidation reports. | +| qmd | `live_real_world` plus `live_baseline_only` | Same live sweep shape as ELF; same-corpus baseline passes. | Deep retrieval-debug ergonomics and trace replay. | qmd/ELF deep retrieval-debug profile with expansion, fusion, rerank, and dropped-candidate traces. | +| agentmemory | `live_baseline_only` | `lifecycle_fail`. | Durable coding-agent continuity and capture hooks. | Durable lifecycle and work-resume/capture adapter report. | +| mem0/OpenMemory | `live_baseline_only` | `wrong_result`. | Entity history, lifecycle UI, OpenMemory inspection. | Same-corpus repair first, then entity-history and UI-readback report. | +| memsearch | `live_baseline_only` | `wrong_result`; source-of-truth is `incomplete`. | Markdown canonical store and local reindex clarity.
| Reindex/update/delete/reload plus source-of-truth report. | +| OpenViking | `live_baseline_only` plus `research_gate` | Same-corpus retrieval is `wrong_result`; trajectory is `not_encoded`. | Hierarchical staged context trajectory. | Evidence-bearing retrieval fix, then staged trajectory report. | +| claude-mem | `live_baseline_only` | `wrong_result`. | Progressive disclosure and automatic capture review. | Work-resume, operator-debugging, and capture/write-policy report. | +| RAGFlow | `research_gate` | `blocked`. | RAG app workflow with document/chunk references. | Tiny Docker evidence-smoke with `reference.chunks` mapped to evidence ids. | +| LightRAG | `research_gate` | `blocked`. | Graph/RAG context export with source-path citations. | Docker context-export report with explicit provider config and source citation mapping. | +| GraphRAG | `research_gate` | `blocked`. | Graph summaries and document/text-unit evidence tables. | Cost-bounded Docker adapter report over a tiny corpus. | +| Graphiti/Zep | `research_gate` | `blocked`. | Temporal graph facts and validity windows. | Docker-local temporal graph adapter report for current and historical facts. | +| Letta | `research_gate` | `not_encoded`. | Core memory blocks versus archival memory. | Contained export contract, then core-vs-archival and decision-memory report. | +| LangGraph | `research_gate` | `not_encoded`; direct memory backend is unsupported. | Checkpoint replay and fork/regression debugging. | Treat as benchmark-infra reference unless a memory-output contract emerges. | +| nanograph | `research_gate` | `not_encoded`; full memory backend is unsupported. | Typed graph schema and query ergonomics. | Typed relation query report only if evidence ids can be emitted. | +| llm-wiki | `research_gate` | `not_encoded`. | Wiki/page generation, query-save, lint and repair loops. | Contained page-generation report with citation and unsupported-claim lint. 
| +| gbrain | `research_gate` | `not_encoded`; setup path is blocked. | Compiled truth pages, timelines, and brain navigation. | Docker-local brain repo setup proof, then compiled-truth/timeline report. | +| graphify | `research_gate` | `blocked`. | Graph-compressed navigation with `graph.json` and `GRAPH_REPORT`. | Docker graph/report output report mapped to benchmark evidence ids. | + +## Scenario Coverage And Claims + +| Scenario | Current measured position | Claim allowed today | Missing measurement | +| --- | --- | --- | --- | +| Retrieval/debug | ELF and qmd live retrieval pass; qmd same-corpus baseline passes. | Tie on encoded live retrieval; no ELF-over-qmd UX claim. | qmd/ELF deep trace replay and debug ergonomics scoring. | +| Work resume | ELF and qmd live pass. | ELF is credible on encoded work resume. | agentmemory, claude-mem, and OpenViking comparable continuity adapters. | +| Project decisions | ELF and qmd live pass. | ELF is credible on encoded project-decision recovery. | Letta core/archival decision memory comparison. | +| Source of truth | ELF and qmd live pass; ELF has stronger production restore/rebuild evidence. | ELF has strongest measured source-of-truth discipline. | memsearch source-of-truth reindex/reload evidence. | +| Memory evolution | ELF and qmd live fail 5/6 jobs; fixture aggregate passes. | No live superiority claim. | Historical conflict evidence links and Graphiti/Zep temporal comparison. | +| Consolidation | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live proposal generation with lineage, confidence, and review-action audit. | +| Knowledge pages | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | Live page rebuild/lint plus llm-wiki, gbrain, GraphRAG, and graphify comparisons. | +| Operator debugging | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. 
| Trace hydration, stage attribution, dropped-candidate, and repair-action scoring. | +| Capture/write policy | Fixture aggregate passes; live adapters are not encoded. | Fixture-only claim. | agentmemory/claude-mem style capture with redaction and evidence binding. | +| Production ops | ELF has separate production-provider/backfill/restore evidence; live sweep is not a full production-ops pass. | Bounded personal-production adoption claim with caveats. | Private corpus manifest and credentialed provider gates. | +| Personalization | ELF and qmd live pass one scoped preference job. | Narrow encoded pass only. | mem0/OpenMemory and Letta entity/preference history comparison. | +| Context trajectory | Not comparable. | No claim. | OpenViking staged hierarchy/trajectory scoring. | +| Core-vs-archival memory | Not comparable. | No claim. | Letta contained export and ELF core-block benchmark. | +| Graph/RAG navigation | Research gates and blocked adapters only. | No claim. | RAGFlow, LightRAG, GraphRAG, Graphiti/Zep, and graphify Docker reports. | + +## Next Measurement Reports + +Order these by decision value, not implementation convenience: + +1. ELF/qmd retrieval-debug deep profile + - Why: qmd is the closest measured live competitor and still stronger as a + debugging reference. + - Output: trace-level comparison of expansion, dense/sparse retrieval, fusion, + rerank, dropped candidates, and command-line replay. + +2. ELF/qmd live memory-evolution diagnostic + - Why: both systems currently fail 5/6 live memory-evolution jobs. + - Output: per-job evidence-link failure analysis for current-vs-historical facts, + supersession, and relation temporal validity. + +3. Live operator-debugging and capture/write-policy report + - Why: these are daily-use agent-memory qualities, currently fixture-only or + not_encoded in live sweeps. + - Output: trace hydration, raw-SQL avoidance, redaction, exclusion, write-policy, + and repair-action scoring. + +4. 
Continuity and context-trajectory report + - Why: agentmemory, claude-mem, and OpenViking represent real user expectations + around automatic capture, progressive disclosure, and staged context. + - Output: comparable work-resume/capture/trajectory jobs or typed blockers. + +5. Personalization and core-memory report + - Why: mem0/OpenMemory and Letta represent product expectations ELF should absorb + before claiming better personalization or operating context. + - Output: entity history, preference correction, UI/readback, core-vs-archival, + and project-decision scoring. + +6. Knowledge and graph/RAG report pack + - Why: llm-wiki, gbrain, graphify, GraphRAG, LightRAG, RAGFlow, and Graphiti/Zep + cover knowledge synthesis and graph navigation that ELF currently cannot claim. + - Output: Docker-contained artifacts mapped to evidence ids, or typed setup and + resource blockers. + +Before publishing the next aggregate report, clarify or rename the generated +`external_project_count` field so readers do not confuse non-ELF adapter records with +unique external projects. + +## Fail Criteria + +Use these criteria for future reports: + +- `pass`: comparable scenario is encoded, run, and evidence-backed. +- `wrong_result`: the system ran but answered with wrong, stale, unsupported, or + insufficiently evidenced memory. +- `not_encoded`: the runner does not yet exercise the scenario. This is not a win or + loss. +- `blocked`: safe measurement needs missing credentials, private data, resource + envelope acceptance, setup proof, or an export contract. +- `unsupported`: the project shape is not a direct memory-system comparison target. +- Fixture evidence cannot be promoted into live runtime evidence. +- Live baseline evidence cannot be promoted into real-world job evidence. +- Research-gate evidence cannot be promoted into pass/fail product quality evidence. 
+ +## Bottom Line + +ELF is on a strong path because its benchmark methodology is stricter than a normal +leaderboard, and its production evidence is unusually concrete. The next work is not +to declare victory. The next work is to measure the strongest user-facing patterns in +adjacent projects, then decide which ones ELF should absorb behind fresh benchmark +gates. diff --git a/docs/guide/benchmarking/index.md b/docs/guide/benchmarking/index.md index b273eae..fd2569d 100644 --- a/docs/guide/benchmarking/index.md +++ b/docs/guide/benchmarking/index.md @@ -54,6 +54,10 @@ cleanup, use `docs/guide/single_user_production.md`. - `2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md`: current optimization-direction report that translates measured benchmark data and competitor strengths into prioritized ELF iteration themes and explicit non-claims. +- `2026-06-11-measurement-coverage-audit.md`: fresh coverage audit that separates + current measured ELF/qmd data, fixture evidence, external adapter ledger coverage, + scenario non-claims, and the next measurement reports needed before stronger + competitor claims. - `real_world_agent_memory_benchmark.md`: operator overview for the v1 real-world agent memory benchmark contract, including suite taxonomy, typed report states, knowledge-compilation fixture tasks, and the production-ops fixture target. 
diff --git a/docs/research/2026-06-11-measurement-coverage-audit.json b/docs/research/2026-06-11-measurement-coverage-audit.json new file mode 100644 index 0000000..b04d86e --- /dev/null +++ b/docs/research/2026-06-11-measurement-coverage-audit.json @@ -0,0 +1,124 @@ +{ + "schema": "elf.benchmark_measurement_coverage_audit/v1", + "run_id": "2026-06-11-measurement-coverage-audit", + "commit": "286af8b", + "created_at": "2026-06-11", + "scope": "ELF memory-system competitiveness measurement coverage, external competitor comparison evidence, and next report directions", + "commands": [ + { + "command": "cargo make real-world-memory", + "status": "pass", + "runtime_seconds": 42.38, + "artifact": "tmp/real-world-memory/real-world-memory-report.json" + }, + { + "command": "cargo make real-world-memory-live-adapters", + "status": "pass", + "runtime_seconds": 121.93, + "artifact": "tmp/real-world-memory/live-adapters/" + } + ], + "fixture_aggregate": { + "job_count": 38, + "encoded_suite_count": 11, + "pass": 36, + "wrong_result": 0, + "lifecycle_fail": 0, + "incomplete": 0, + "blocked": 2, + "not_encoded": 0, + "unsupported_claim": 0, + "mean_score": 0.947, + "mean_latency_ms": 4.411, + "expected_evidence_total": 77, + "expected_evidence_matched": 77, + "evidence_required_count": 84, + "evidence_covered_count": 84 + }, + "live_real_world_adapters": [ + { + "adapter": "ELF live service adapter", + "job_count": 38, + "encoded_suite_count": 11, + "pass": 18, + "wrong_result": 5, + "blocked": 2, + "not_encoded": 13, + "mean_score": 0.525, + "mean_latency_ms": 5.1, + "expected_evidence_total": 77, + "expected_evidence_matched": 41, + "evidence_required_count": 84, + "evidence_covered_count": 48 + }, + { + "adapter": "qmd live CLI adapter", + "job_count": 38, + "encoded_suite_count": 11, + "pass": 18, + "wrong_result": 5, + "blocked": 2, + "not_encoded": 13, + "mean_score": 0.512, + "mean_latency_ms": 719.758, + "expected_evidence_total": 77, + "expected_evidence_matched": 41, 
+ "evidence_required_count": 84, + "evidence_covered_count": 48 + } + ], + "live_suite_breakdown": [ + {"suite": "trust_source_of_truth", "jobs": 1, "status_counts": {"pass": 1}}, + {"suite": "work_resume", "jobs": 5, "status_counts": {"pass": 5}}, + {"suite": "retrieval", "jobs": 5, "status_counts": {"pass": 5}}, + {"suite": "project_decisions", "jobs": 5, "status_counts": {"pass": 5}}, + {"suite": "personalization", "jobs": 1, "status_counts": {"pass": 1}}, + {"suite": "memory_evolution", "jobs": 6, "status_counts": {"pass": 1, "wrong_result": 5}}, + {"suite": "capture_integration", "jobs": 2, "status_counts": {"not_encoded": 2}}, + {"suite": "consolidation", "jobs": 4, "status_counts": {"not_encoded": 4}}, + {"suite": "knowledge_compilation", "jobs": 2, "status_counts": {"not_encoded": 2}}, + {"suite": "operator_debugging_ux", "jobs": 1, "status_counts": {"not_encoded": 1}}, + {"suite": "production_ops", "jobs": 6, "status_counts": {"blocked": 2, "not_encoded": 4}} + ], + "adapter_ledger": { + "adapter_records": 21, + "unique_project_names": 17, + "external_project_count_note": "The generated report field external_project_count currently counts non-ELF adapter records, not unique external project names.", + "evidence_class_counts": { + "fixture_backed": 1, + "live_baseline_only": 6, + "live_real_world": 2, + "research_gate": 12 + }, + "overall_status_counts": { + "pass": 1, + "wrong_result": 6, + "lifecycle_fail": 1, + "blocked": 6, + "not_encoded": 7 + } + }, + "claim_boundary": { + "elf_vs_qmd": "tie_on_current_encoded_live_real_world_shape_not_overall_win", + "elf_personal_production": "credible_with_bounded_caveats", + "broad_competitor_superiority": "not_proven", + "major_unmeasured_strengths": [ + "qmd_deep_retrieval_debug", + "OpenViking_context_trajectory", + "mem0_OpenMemory_entity_history_ui", + "agentmemory_claude_mem_capture_continuity", + "Letta_core_vs_archival_memory", + "Graphiti_Zep_temporal_graph", + "RAG_graph_navigation", + 
"llm_wiki_gbrain_graphify_knowledge_workflows" + ] + }, + "next_reports": [ + "ELF/qmd retrieval-debug deep profile", + "ELF/qmd live memory-evolution diagnostic", + "Live operator-debugging and capture/write-policy report", + "Continuity and context-trajectory report", + "Personalization and core-memory report", + "Knowledge and graph/RAG report pack" + ] +}