Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions adapters/grounding/opendataloader-json/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,17 @@ type, text), and optional tables/cells (id, page, bbox, row/col, spans, bbox, te

For real OpenDataLoader 2.4.x JSON, it maps the top-level `kids` tree into grounding
elements (`id` → `odl-{id}` for numeric and non-empty string ids, missing ids →
`odl-el-N`, `page number` → `page-N`, `bounding box` → centipoints, `content` → text).
Nested `kids`, `list items`, and `rows[].cells` containers are traversed in document order.
Pure structural wrappers that only carry child containers are traversed without becoming
grounding elements. Child containers must use array shapes, and malformed child containers
or non-string `content` values are rejected instead of being silently skipped. Real ODL JSON
does not include parser version or page dimensions, so the adapter reports parser version as
`unknown` and derives page extents from observed bounding boxes. Coordinate origin remains
unknown. Real ODL-style table nodes with explicit `page number`, `bounding box`, and
`rows[].cells[]` cell page/bbox/content fields are mapped to deterministic grounding tables;
row and column addresses are derived from row/cell order.
`odl-el-N`, `page number` → `page-N`, `bounding box` → centipoints, `content` or
unambiguous `text` → text). Nested `kids`/`children`, `list items`/`list_items`, and
`rows[].cells` containers are traversed in document order. Pure structural wrappers that
only carry child containers are traversed without becoming grounding elements. Child
containers must use array shapes, and malformed child containers, nodes that supply both
spellings of the same container (`kids` with `children`, or `list items` with `list_items`),
non-string text values, or conflicting `content`/`text` values are rejected instead of being
silently skipped. Real
ODL JSON does not include parser version or page dimensions, so the adapter reports parser
version as `unknown` and derives page extents from observed bounding boxes. Coordinate
origin remains unknown. Real ODL-style table nodes with explicit `page number`, `bounding
box`, and `rows[].cells[]` cell page/bbox/text fields are mapped to deterministic grounding
tables; row and column addresses are derived from row/cell order.

## Declared capabilities (honest downgrades)

Expand Down
206 changes: 188 additions & 18 deletions adapters/grounding/opendataloader-json/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,7 @@ fn real_node_has_element_fields(node: &Value) -> bool {
|| node.get("page number").is_some()
|| node.get("bounding box").is_some()
|| node.get("content").is_some()
|| node.get("text").is_some()
}

fn real_content_kind(node: &Value) -> Result<String, AdapterError> {
Expand Down Expand Up @@ -660,12 +661,9 @@ fn real_content_text(node: &Value) -> Result<Option<String>, AdapterError> {
}

fn collect_real_text<'a>(node: &'a Value, parts: &mut Vec<&'a str>) -> Result<(), AdapterError> {
if let Some(content) = node.get("content") {
let content = content
.as_str()
.ok_or_else(|| err("content must be a string"))?;
if !content.is_empty() {
parts.push(content);
if let Some(text) = real_own_text(node)? {
if !text.is_empty() {
parts.push(text);
}
}
for child in real_child_elements(node)? {
Expand All @@ -674,20 +672,58 @@ fn collect_real_text<'a>(node: &'a Value, parts: &mut Vec<&'a str>) -> Result<()
Ok(())
}

/// Resolves the text carried directly by `node` from its `content` and `text` fields.
///
/// `content` is validated before `text`, so a non-string `content` is reported first.
/// When only one field is present its value is returned unchanged (including an empty
/// string). When both are present:
///
/// * both empty: `Ok(None)`
/// * exactly one non-empty: that value
/// * both non-empty and equal: the `content` value
/// * both non-empty and different: an error
///
/// # Errors
///
/// Fails if either field is present but not a string, or if the two fields hold
/// different non-empty strings.
fn real_own_text(node: &Value) -> Result<Option<&str>, AdapterError> {
    let from_content = real_string_alias(node, "content", "content must be a string")?;
    let from_text = real_string_alias(node, "text", "text must be a string")?;

    // With at most one field present there is nothing to reconcile.
    let (Some(primary), Some(alias)) = (from_content, from_text) else {
        return Ok(from_content.or(from_text));
    };

    match (primary.is_empty(), alias.is_empty()) {
        (true, true) => Ok(None),
        (true, false) => Ok(Some(alias)),
        (false, true) => Ok(Some(primary)),
        (false, false) if primary == alias => Ok(Some(primary)),
        (false, false) => Err(err("content and text fields disagree")),
    }
}

/// Reads the optional string field `field` from `node`.
///
/// Returns `Ok(None)` when the field is absent and `Ok(Some(..))` with the borrowed
/// string when it is present and is a string.
///
/// # Errors
///
/// Fails with `message` when the field is present but is not a string.
fn real_string_alias<'a>(
    node: &'a Value,
    field: &str,
    message: &str,
) -> Result<Option<&'a str>, AdapterError> {
    node.get(field)
        .map(|raw| raw.as_str().ok_or_else(|| err(message)))
        .transpose()
}

fn real_child_elements(node: &Value) -> Result<Vec<&Value>, AdapterError> {
let mut children = Vec::new();
if let Some(kids_value) = node.get("kids") {
let kids = kids_value
.as_array()
.ok_or_else(|| err("kids must be an array"))?;
children.extend(kids);
}
if let Some(items_value) = node.get("list items") {
let items = items_value
.as_array()
.ok_or_else(|| err("list items must be an array"))?;
children.extend(items);
}
children.extend(real_child_alias_array(
node,
&[
("kids", "kids must be an array"),
("children", "children must be an array"),
],
"ambiguous kids/children containers",
)?);
children.extend(real_child_alias_array(
node,
&[
("list items", "list items must be an array"),
("list_items", "list_items must be an array"),
],
"ambiguous list item containers",
)?);
if let Some(rows_value) = node.get("rows") {
let rows = rows_value
.as_array()
Expand All @@ -707,6 +743,26 @@ fn real_child_elements(node: &Value) -> Result<Vec<&Value>, AdapterError> {
Ok(children)
}

/// Collects the children stored under one of several alternative field names.
///
/// `fields` pairs each accepted field name with the error message to use when that
/// field is present but is not an array. Fields are examined in the order given, and
/// at most one of them may be present on `node`.
///
/// Returns the array's items in order, or an empty vector when none of the fields is
/// present.
///
/// # Errors
///
/// Fails with the field's own message when a present field is not an array, and with
/// `ambiguous_message` when a second listed field is present after one has already
/// been accepted.
fn real_child_alias_array<'a>(
    node: &'a Value,
    fields: &[(&str, &str)],
    ambiguous_message: &str,
) -> Result<Vec<&'a Value>, AdapterError> {
    // Only fields that actually exist on the node take part in the checks below.
    let present = fields
        .iter()
        .filter_map(|(field, type_message)| node.get(*field).map(|value| (value, *type_message)));

    let mut selected = None;
    for (value, type_message) in present {
        // The ambiguity check comes before the shape check of the later field.
        if selected.is_some() {
            return Err(err(ambiguous_message));
        }
        match value.as_array() {
            Some(items) => selected = Some(items),
            None => return Err(err(type_message)),
        }
    }

    Ok(selected.map_or_else(Vec::new, |items| items.iter().collect()))
}

fn parse_table_cells(table: &Value) -> Result<Vec<GroundingCell>, AdapterError> {
let mut cells = Vec::new();
for cell in table
Expand Down Expand Up @@ -1065,6 +1121,100 @@ mod tests {
assert_eq!(tables[0].cells[1].text, "Cell B");
}

// Checks that real-ODL-style input using the `text`, `children`, and `list_items`
// spellings is accepted, and that the resulting elements are listed in preorder
// (each parent immediately followed by its descendants).
#[test]
fn maps_real_text_and_child_aliases_in_preorder() {
// Fixture: a section with a `children` array, a list with a `list_items` array, and a
// table whose single cell carries `text` and has no `id`.
let src = OdlJsonSource::from_json_str(
r#"{
"file name": "aliases.pdf",
"number of pages": 1,
"kids": [
{
"type": "section",
"id": "section-a",
"page number": 1,
"bounding box": [10, 10, 220, 90],
"text": "Alias section",
"children": [
{
"type": "paragraph",
"id": "child-a",
"page number": 1,
"bounding box": [20, 30, 210, 55],
"text": "Child text"
}
]
},
{
"type": "list",
"id": "list-a",
"page number": 1,
"bounding box": [10, 100, 220, 160],
"list_items": [
{
"type": "list_item",
"id": "item-a",
"page number": 1,
"bounding box": [20, 115, 210, 140],
"text": "Alias item"
}
]
},
{
"type": "table",
"id": "table-a",
"page number": 1,
"bounding box": [10, 170, 220, 230],
"rows": [
{
"cells": [
{
"type": "table_cell",
"page number": 1,
"bounding box": [20, 185, 210, 215],
"text": "Alias cell"
}
]
}
]
}
]
}"#,
)
.unwrap();

// Element ids are the fixture ids with an `odl-` prefix; the id-less table cell is
// expected last under the generated id `odl-el-1`.
let elements = src.elements();
let ids = elements
.iter()
.map(|element| element.id.as_str())
.collect::<Vec<_>>();
assert_eq!(
ids,
vec![
"odl-section-a",
"odl-child-a",
"odl-list-a",
"odl-item-a",
"odl-table-a",
"odl-el-1",
]
);
// The section's text is its own `text` followed by its child's text, joined by a newline.
assert_eq!(
elements[0].text.as_deref(),
Some("Alias section\nChild text")
);
assert_eq!(elements[1].text.as_deref(), Some("Child text"));
// The list node has no text of its own, so its text equals its single item's text.
assert_eq!(elements[2].text.as_deref(), Some("Alias item"));
assert_eq!(elements[3].text.as_deref(), Some("Alias item"));
// elements[4] (the table element) has no text assertion; only the cell is checked.
assert_eq!(elements[5].text.as_deref(), Some("Alias cell"));

// The table is also exposed through `tables()`, with the cell text taken from `text`.
let tables = src.tables();
assert_eq!(tables.len(), 1);
assert_eq!(tables[0].id, "odl-table-a");
assert_eq!(tables[0].cells.len(), 1);
assert_eq!(tables[0].cells[0].text, "Alias cell");
// Expected bbox is the fixture's [20, 185, 210, 215] with every coordinate scaled by 100.
assert_eq!(tables[0].cells[0].bbox, [2000, 18500, 21000, 21500]);
}

#[test]
fn maps_real_structural_containers_without_table_capability() {
let src = OdlJsonSource::from_json_str(
Expand Down Expand Up @@ -1342,6 +1492,26 @@ mod tests {
r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"type":"paragraph","page number":1,"bounding box":[1,1,2,2],"content":7}]}"#,
"content must be a string",
);
assert_error_contains(
r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"type":"paragraph","page number":1,"bounding box":[1,1,2,2],"text":7}]}"#,
"text must be a string",
);
assert_error_contains(
r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"type":"paragraph","page number":1,"bounding box":[1,1,2,2],"content":"A","text":"B"}]}"#,
"content and text fields disagree",
);
assert_error_contains(
r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"children":{}}]}"#,
"children must be an array",
);
assert_error_contains(
r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"kids":[],"children":[]}]}"#,
"ambiguous kids/children containers",
);
assert_error_contains(
r#"{"file name":"bad.pdf","number of pages":1,"kids":[{"type":"list","page number":1,"bounding box":[1,1,2,2],"list items":[],"list_items":[]}]}"#,
"ambiguous list item containers",
);
assert_error_contains(
r#"{"file name":"bad.pdf","number of pages":1,"kids":[{}]}"#,
"content node has no element fields or child containers",
Expand Down
77 changes: 77 additions & 0 deletions crates/ethos-cli/tests/verify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2009,6 +2009,83 @@ fn real_opendataloader_style_table_cell_claim_grounds() {
assert_eq!(report["all_evidence_grounded"], true);
}

// End-to-end check: a quote claim citing an element that is nested under a `children`
// array and carries its text in `text` is reported as grounded by the `verify` command.
#[test]
fn real_opendataloader_text_and_child_alias_claim_grounds() {
// NOTE(review): `temp_json` is defined outside this view; it presumably writes the given
// JSON under the given name and returns a path (`to_str()` is called on the result below).
let grounding = temp_json(
"real-odl-style-aliases",
r#"{
"file name": "aliases.pdf",
"number of pages": 1,
"kids": [
{
"type": "section",
"id": "parent",
"page number": 1,
"bounding box": [10, 10, 240, 80],
"text": "Parent text",
"children": [
{
"type": "paragraph",
"id": "alias-child",
"page number": 1,
"bounding box": [20, 20, 230, 50],
"text": "Child alias grounds"
}
]
}
]
}"#,
);
// The single claim quotes the child's text and cites it as `odl-alias-child`, i.e. the
// fixture id `alias-child` with the `odl-` prefix.
let citations = temp_json(
"real-odl-style-alias-citations",
r#"{
"claims": [
{
"kind": "quote",
"text": "Child alias grounds",
"citation": {
"element_id": "odl-alias-child"
}
}
]
}"#,
);
// NOTE(review): `parse_success` is defined outside this view; it presumably runs the CLI
// with these arguments and returns the parsed JSON report, which is indexed below.
let report = parse_success(&[
"verify",
grounding.to_str().unwrap(),
"--grounding",
"opendataloader-json",
"--citations",
citations.to_str().unwrap(),
]);

assert_eq!(report["checks"][0]["status"], "grounded");
assert_eq!(
report["checks"][0]["match_method"],
"normalized_text_contains"
);
// Evidence must point at the child element itself, not at its parent section.
assert_eq!(report["checks"][0]["evidence"]["page"], "page-1");
assert_eq!(
report["checks"][0]["evidence"]["text"],
"Child alias grounds"
);
// Expected bbox is the child's [20, 20, 230, 50] with every coordinate scaled by 100.
assert_eq!(
report["checks"][0]["evidence"]["bbox"],
serde_json::json!([2000, 2000, 23000, 5000])
);
// The report is expected to list exactly these capability limits, in this order.
assert_eq!(
report["capability_limits"],
serde_json::json!([
"missing_fingerprint",
"missing_spans",
"missing_char_offsets",
"missing_tables",
"unknown_coordinate_origin"
])
);
assert_eq!(report["all_evidence_grounded"], true);
}

#[test]
fn foreign_source_without_fingerprint_blocks_fingerprint_pinned_citations() {
let grounding = odl_example();
Expand Down
Loading
Loading