From 314b3c83b85e302e4aae6b77cdc9bbf8c2825031 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Tue, 16 Jun 2026 23:42:48 +0530 Subject: [PATCH] Harden real ODL text aliases Signed-off-by: docushell-admin --- .../grounding/opendataloader-json/README.md | 21 +- .../grounding/opendataloader-json/src/lib.rs | 206 ++++++++++++++++-- crates/ethos-cli/tests/verify.rs | 77 +++++++ docs/execution-status.md | 4 +- 4 files changed, 278 insertions(+), 30 deletions(-) diff --git a/adapters/grounding/opendataloader-json/README.md b/adapters/grounding/opendataloader-json/README.md index 9bc0fac..f74862b 100644 --- a/adapters/grounding/opendataloader-json/README.md +++ b/adapters/grounding/opendataloader-json/README.md @@ -16,16 +16,17 @@ type, text), and optional tables/cells (id, page, bbox, row/col, spans, bbox, te For real OpenDataLoader 2.4.x JSON, it maps the top-level `kids` tree into grounding elements (`id` → `odl-{id}` for numeric and non-empty string ids, missing ids → -`odl-el-N`, `page number` → `page-N`, `bounding box` → centipoints, `content` → text). -Nested `kids`, `list items`, and `rows[].cells` containers are traversed in document order. -Pure structural wrappers that only carry child containers are traversed without becoming -grounding elements. Child containers must use array shapes, and malformed child containers -or non-string `content` values are rejected instead of being silently skipped. Real ODL JSON -does not include parser version or page dimensions, so the adapter reports parser version as -`unknown` and derives page extents from observed bounding boxes. Coordinate origin remains -unknown. Real ODL-style table nodes with explicit `page number`, `bounding box`, and -`rows[].cells[]` cell page/bbox/content fields are mapped to deterministic grounding tables; -row and column addresses are derived from row/cell order. +`odl-el-N`, `page number` → `page-N`, `bounding box` → centipoints, `content` or +unambiguous `text` → text). Nested `kids`/`children`, `list items`/`list_items`, and +`rows[].cells` containers are traversed in document order. Pure structural wrappers that +only carry child containers are traversed without becoming grounding elements. Child +containers must use array shapes, and malformed child containers, non-string text values, +or conflicting `content`/`text` values are rejected instead of being silently skipped. Real +ODL JSON does not include parser version or page dimensions, so the adapter reports parser +version as `unknown` and derives page extents from observed bounding boxes. Coordinate +origin remains unknown. Real ODL-style table nodes with explicit `page number`, `bounding +box`, and `rows[].cells[]` cell page/bbox/text fields are mapped to deterministic grounding +tables; row and column addresses are derived from row/cell order. ## Declared capabilities (honest downgrades) diff --git a/adapters/grounding/opendataloader-json/src/lib.rs b/adapters/grounding/opendataloader-json/src/lib.rs index 1c1d290..389aa91 100644 --- a/adapters/grounding/opendataloader-json/src/lib.rs +++ b/adapters/grounding/opendataloader-json/src/lib.rs @@ -610,6 +610,7 @@ fn real_node_has_element_fields(node: &Value) -> bool { || node.get("page number").is_some() || node.get("bounding box").is_some() || node.get("content").is_some() + || node.get("text").is_some() } fn real_content_kind(node: &Value) -> Result { @@ -660,12 +661,9 @@ fn real_content_text(node: &Value) -> Result, AdapterError> { } fn collect_real_text<'a>(node: &'a Value, parts: &mut Vec<&'a str>) -> Result<(), AdapterError> { - if let Some(content) = node.get("content") { - let content = content - .as_str() - .ok_or_else(|| err("content must be a string"))?; - if !content.is_empty() { - parts.push(content); + if let Some(text) = real_own_text(node)? { + if !text.is_empty() { + parts.push(text); } } for child in real_child_elements(node)? { @@ -674,20 +672,58 @@ fn collect_real_text<'a>(node: &'a Value, parts: &mut Vec<&'a str>) -> Result<() Ok(()) } +fn real_own_text(node: &Value) -> Result, AdapterError> { + let content = real_string_alias(node, "content", "content must be a string")?; + let text = real_string_alias(node, "text", "text must be a string")?; + match (content, text) { + (None, None) => Ok(None), + (Some(content), None) => Ok(Some(content)), + (None, Some(text)) => Ok(Some(text)), + (Some(content), Some(text)) => { + let content_empty = content.is_empty(); + let text_empty = text.is_empty(); + if content_empty && text_empty { + Ok(None) + } else if content_empty { + Ok(Some(text)) + } else if text_empty || content == text { + Ok(Some(content)) + } else { + Err(err("content and text fields disagree")) + } + } + } +} + +fn real_string_alias<'a>( + node: &'a Value, + field: &str, + message: &str, +) -> Result, AdapterError> { + let Some(value) = node.get(field) else { + return Ok(None); + }; + value.as_str().map(Some).ok_or_else(|| err(message)) +} + fn real_child_elements(node: &Value) -> Result, AdapterError> { let mut children = Vec::new(); - if let Some(kids_value) = node.get("kids") { - let kids = kids_value - .as_array() - .ok_or_else(|| err("kids must be an array"))?; - children.extend(kids); - } - if let Some(items_value) = node.get("list items") { - let items = items_value - .as_array() - .ok_or_else(|| err("list items must be an array"))?; - children.extend(items); - } + children.extend(real_child_alias_array( + node, + &[ + ("kids", "kids must be an array"), + ("children", "children must be an array"), + ], + "ambiguous kids/children containers", + )?); + children.extend(real_child_alias_array( + node, + &[ + ("list items", "list items must be an array"), + ("list_items", "list_items must be an array"), + ], + "ambiguous list item containers", + )?); if let Some(rows_value) = node.get("rows") { let rows = rows_value .as_array() @@ -707,6 +743,26 @@ fn real_child_elements(node: &Value) -> Result, AdapterError> { Ok(children) } +fn real_child_alias_array<'a>( + node: &'a Value, + fields: &[(&str, &str)], + ambiguous_message: &str, +) -> Result, AdapterError> { + let mut found = None; + for (field, type_message) in fields { + let Some(value) = node.get(*field) else { + continue; + }; + if found.is_some() { + return Err(err(ambiguous_message)); + } + found = Some(value.as_array().ok_or_else(|| err(type_message))?); + } + Ok(found + .map(|items| items.iter().collect()) + .unwrap_or_default()) +} + fn parse_table_cells(table: &Value) -> Result, AdapterError> { let mut cells = Vec::new(); for cell in table @@ -1065,6 +1121,100 @@ mod tests { assert_eq!(tables[0].cells[1].text, "Cell B"); } + #[test] + fn maps_real_text_and_child_aliases_in_preorder() { + let src = OdlJsonSource::from_json_str( + r#"{ + "file name": "aliases.pdf", + "number of pages": 1, + "kids": [ + { + "type": "section", + "id": "section-a", + "page number": 1, + "bounding box": [10, 10, 220, 90], + "text": "Alias section", + "children": [ + { + "type": "paragraph", + "id": "child-a", + "page number": 1, + "bounding box": [20, 30, 210, 55], + "text": "Child text" + } + ] + }, + { + "type": "list", + "id": "list-a", + "page number": 1, + "bounding box": [10, 100, 220, 160], + "list_items": [ + { + "type": "list_item", + "id": "item-a", + "page number": 1, + "bounding box": [20, 115, 210, 140], + "text": "Alias item" + } + ] + }, + { + "type": "table", + "id": "table-a", + "page number": 1, + "bounding box": [10, 170, 220, 230], + "rows": [ + { + "cells": [ + { + "type": "table_cell", + "page number": 1, + "bounding box": [20, 185, 210, 215], + "text": "Alias cell" + } + ] + } + ] + } + ] + }"#, + ) + .unwrap(); + + let elements = src.elements(); + let ids = elements + .iter() + .map(|element| element.id.as_str()) + .collect::>(); + assert_eq!( + ids, + vec![ + "odl-section-a", + "odl-child-a", + "odl-list-a", + "odl-item-a", + "odl-table-a", + "odl-el-1", + ] + ); + assert_eq!( + elements[0].text.as_deref(), + Some("Alias section\nChild text") + ); + assert_eq!(elements[1].text.as_deref(), Some("Child text")); + assert_eq!(elements[2].text.as_deref(), Some("Alias item")); + assert_eq!(elements[3].text.as_deref(), Some("Alias item")); + assert_eq!(elements[5].text.as_deref(), Some("Alias cell")); + + let tables = src.tables(); + assert_eq!(tables.len(), 1); + assert_eq!(tables[0].id, "odl-table-a"); + assert_eq!(tables[0].cells.len(), 1); + assert_eq!(tables[0].cells[0].text, "Alias cell"); + assert_eq!(tables[0].cells[0].bbox, [2000, 18500, 21000, 21500]); + } + #[test] fn maps_real_structural_containers_without_table_capability() { let src = OdlJsonSource::from_json_str( @@ -1342,6 +1492,26 @@ mod tests { r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"type":"paragraph","page number":1,"bounding box":[1,1,2,2],"content":7}]}"#, "content must be a string", ); + assert_error_contains( + r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"type":"paragraph","page number":1,"bounding box":[1,1,2,2],"text":7}]}"#, + "text must be a string", + ); + assert_error_contains( + r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"type":"paragraph","page number":1,"bounding box":[1,1,2,2],"content":"A","text":"B"}]}"#, + "content and text fields disagree", + ); + assert_error_contains( + r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"children":{}}]}"#, + "children must be an array", + ); + assert_error_contains( + r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"kids":[],"children":[]}]}"#, + "ambiguous kids/children containers", + ); + assert_error_contains( + r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"type":"list","page number":1,"bounding box":[1,1,2,2],"list items":[],"list_items":[]}]}"#, + "ambiguous list item containers", + ); assert_error_contains( r#"{"file name":"bad.pdf","number of pages":1,"kids":[{}]}"#, "content node has no element fields or child containers", diff --git a/crates/ethos-cli/tests/verify.rs b/crates/ethos-cli/tests/verify.rs index a24e694..7a70c98 100644 --- a/crates/ethos-cli/tests/verify.rs +++ b/crates/ethos-cli/tests/verify.rs @@ -2009,6 +2009,83 @@ fn real_opendataloader_style_table_cell_claim_grounds() { assert_eq!(report["all_evidence_grounded"], true); } +#[test] +fn real_opendataloader_text_and_child_alias_claim_grounds() { + let grounding = temp_json( + "real-odl-style-aliases", + r#"{ + "file name": "aliases.pdf", + "number of pages": 1, + "kids": [ + { + "type": "section", + "id": "parent", + "page number": 1, + "bounding box": [10, 10, 240, 80], + "text": "Parent text", + "children": [ + { + "type": "paragraph", + "id": "alias-child", + "page number": 1, + "bounding box": [20, 20, 230, 50], + "text": "Child alias grounds" + } + ] + } + ] + }"#, + ); + let citations = temp_json( + "real-odl-style-alias-citations", + r#"{ + "claims": [ + { + "kind": "quote", + "text": "Child alias grounds", + "citation": { + "element_id": "odl-alias-child" + } + } + ] + }"#, + ); + let report = parse_success(&[ + "verify", + grounding.to_str().unwrap(), + "--grounding", + "opendataloader-json", + "--citations", + citations.to_str().unwrap(), + ]); + + assert_eq!(report["checks"][0]["status"], "grounded"); + assert_eq!( + report["checks"][0]["match_method"], + "normalized_text_contains" + ); + assert_eq!(report["checks"][0]["evidence"]["page"], "page-1"); + assert_eq!( + report["checks"][0]["evidence"]["text"], + "Child alias grounds" + ); + assert_eq!( + report["checks"][0]["evidence"]["bbox"], + serde_json::json!([2000, 2000, 23000, 5000]) + ); + assert_eq!( + report["capability_limits"], + serde_json::json!([ + "missing_fingerprint", + "missing_spans", + "missing_char_offsets", + "missing_tables", + "unknown_coordinate_origin" + ]) + ); + assert_eq!(report["all_evidence_grounded"], true); +} + #[test] fn foreign_source_without_fingerprint_blocks_fingerprint_pinned_citations() { let grounding = odl_example(); diff --git a/docs/execution-status.md b/docs/execution-status.md index b87ed09..18a026b 100644 --- a/docs/execution-status.md +++ b/docs/execution-status.md @@ -18,7 +18,7 @@ The committed implementation now includes: - Quantized page/span extraction at the backend boundary, plus a basic deterministic layout pass that assembles paragraph `text_block` elements, fixture-backed alpha heading and flat list-item elements, and simple column reading order for the current born-digital fixtures. Current alpha layout confidence is explicit for heading signals, and below-threshold layout confidence emits deterministic `low_confidence_reading_order` diagnostics instead of staying silent. Fixture validation binds selected `fixture.json` expectations to committed extraction/layout goldens and binds current alpha text/Markdown exports to committed layout output so current read-order, element-type, heading-export, list-item, and export cases fail closed on drift. - An internal layout evaluator scaffold exists at `fixtures/evaluate_layout_alpha.py` and `make layout-evaluator-alpha`. It reads committed `fixture.json`, `extraction.json`, `layout.json`, `text.txt`, and `markdown.md` files, summarizes alpha element-type and subset coverage, and fails closed on missing layout expectations, dangling/invalid warning references, confidence-policy drift, export-golden drift, invalid span expectation metadata, expected page/span-text/font-id drift, expected rotation drift, or drift in fixture-backed reading order / heading / list-item / hyphenation / ligature cases. - Schema/example/profile validation is green through `schemas/validate_examples.py` using `jsonschema` draft 2020-12 validation, including the crop descriptor artifact contract plus referential-integrity and bbox sanity checks outside JSON Schema. -- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases, and maps explicit real OpenDataLoader-style row/cell structures to table-cell grounding. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, split-quote, not-found, stale-fingerprint, unsupported non-v1 claim, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths. +- `ethos verify` now produces non-empty quote, value, presence, and table-cell verification checks over native Ethos document JSON and synthetic OpenDataLoader-style JSON through `--grounding opendataloader-json`; it also verifies quote/value/presence citations over pinned real OpenDataLoader 2.4.7 JSON, including grounded and ungrounded cases, maps explicit real OpenDataLoader-style row/cell structures to table-cell grounding, and normalizes conservative real-style text/child-container aliases when page/bbox/text data remains explicit. Citation/config inputs are rejected when they drift outside the closed schemas. The public demo harness covers grounded, ungrounded, split-quote, not-found, stale-fingerprint, unsupported non-v1 claim, capability-limited, malformed-citation, malformed OpenDataLoader-style input, and summary-format reject paths. - Verification semantics are now trust-honest at alpha scope: quote containment is explicitly labeled, value/table-cell checks require normalized equality, fingerprint-pinned citations fail closed when source fingerprints are unavailable, and structured capability limits explain why a run is downgraded. - `make verify-alpha` is the current alpha trust-loop command: it checks native examples, split-quote evidence matching, unsupported non-v1 claim reporting, synthetic OpenDataLoader-style examples, pinned real OpenDataLoader grounded/ungrounded examples, schema validation, verify-alpha case inventory coverage, usage diagnostics for malformed citations and malformed OpenDataLoader-style structures, byte-identical repeated verification reports, byte-identical native crop descriptors, summary diagnostics for an ungrounded native case, and foreign fixture manifest hash binding. - An internal Python surface scaffold exists under `python/ethos_pdf`. It shells out to a caller-provided local `ethos` CLI binary for `ethos doc parse` JSON, Markdown, and text output, and has stdlib unit tests that use a fake local command. This is pre-alpha scaffolding for Milestone B API shape work, not a public installation or publication path. @@ -57,7 +57,7 @@ Milestone A has an accepted internal Gate Zero decision for roadmap control, so | Python surface scaffold | Landed: internal stdlib wrapper over a caller-provided local `ethos doc parse` command, with explicit JSON/Markdown/text methods, page selection passthrough, diagnostics passthrough, timeout handling, command failure reporting, and mocked-command unit coverage | Native binding work, broader API design, and public setup path remain future work | | Font policy groundwork | Partially landed: substitution table and profile policy are present; fixture output uses deterministic substitution IDs, and committed embedded-font fixture metadata now binds expected extraction font identity | Bundled fallback asset hashing and broader font/CID validation remain open | | Schema/example validation | Landed: schemas, examples, deterministic profile, referential integrity, and bbox sanity pass the `jsonschema` validation gate | Contract changes still require explicit versioning and compatibility review | -| Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping and explicit real ODL-style row/cell table grounding, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, split-quote fixture coverage, explicit unsupported non-v1 claim reporting, OpenDataLoader-style structure diagnostics for malformed bbox and unknown-page references, verify-alpha case inventory checks, and demo fixtures | Still needed: additional adapter hardening against broader real output shapes, future claim-kind expansion outside the current v1 alpha policy, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding | +| Trust-layer implementation | Landed: `ethos verify` quote/value/presence/table-cell checks, explicit quote-containment labeling, normalized equality for value/table-cell checks, stale and unverifiable fingerprint handling, unsupported claim reporting, structured capability limits, native Ethos JSON path, ODL-style adapter path with synthetic table/cell mapping, explicit real ODL-style row/cell table grounding, conservative real-style text/child-container alias normalization, pinned real OpenDataLoader 2.4.7 grounded/ungrounded fixtures, foreign fixture manifest hash validation, crop-ref evidence plumbing, stable logical native crop refs, native crop descriptor artifacts, raw BGRA crop rendering in `ethos-pdf`, CLI PNG crop artifact production for bound native source PDFs, same-host rendered crop repeatability check, rendered-crop run comparison helper, strict citation/config input validation, citation input schema, split-quote fixture coverage, explicit unsupported non-v1 claim reporting, OpenDataLoader-style structure diagnostics for malformed bbox and unknown-page references, verify-alpha case inventory checks, and demo fixtures | Still needed: additional adapter hardening against broader real output shapes, future claim-kind expansion outside the current v1 alpha policy, and a decision on whether cross-platform rendered crop artifact equality is worth pursuing after the current macOS/Linux bbox drift finding | | WS-HARNESS readiness | Partially landed: readiness path is green for frozen corpus/hardware and pinned competitors, Gate Zero evidence preflight validates the current `ethos-bench` handoff, and gates fail closed if those records regress | Public-safe comparison report flow, release/package approval, claim-wording approval, and future evidence-refresh workflow still need hardening | | Determinism workflow | Landed: macOS arm64, Linux x64, and Windows x64 matrix entries run core contract tests; PDFium-backed corpus work stays gated on an explicitly configured pinned runtime; static workflow tests guard the matrix | Windows PDFium runtime provisioning and broader cross-platform corpus validation remain future work |