From b06b39d72341756fc131a4a69852ddd5d2277a0e Mon Sep 17 00:00:00 2001 From: Matthew James Briggs Date: Sat, 20 Jun 2026 16:58:47 +0200 Subject: [PATCH 1/5] chore: delete claude removing ci file that did not work --- .github/workflows/replace-claude.yaml | 67 --------------------------- 1 file changed, 67 deletions(-) delete mode 100644 .github/workflows/replace-claude.yaml diff --git a/.github/workflows/replace-claude.yaml b/.github/workflows/replace-claude.yaml deleted file mode 100644 index 86ef70ae..00000000 --- a/.github/workflows/replace-claude.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: Take Ownership of Claude PRs -on: - pull_request: - types: [opened, synchronize] - -# Rational: claude is a tool, not an author. -jobs: - rewrite-author: - # Adjust this filter if your bot profile matches a specific string - # or leave it wide open to catch any AI-assisted PR branch - if: contains(github.event.pull_request.labels.*.name, 'claude') || github.event.pull_request.user.login == 'YOUR_GITHUB_USERNAME' - runs-on: ubuntu-latest - permissions: - contents: write - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - fetch-depth: 0 - - - name: Checkout PR Branch - run: git checkout ${{ github.event.pull_request.head.ref }} - - - name: Fetch Dynamic User Profile Info - id: get_user - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # 1. Capture the exact username who opened the PR - USERNAME="${{ github.event.pull_request.user.login }}" - - # 2. Get the user ID to construct a valid GitHub noreply email - USER_ID=$(curl -s -H "Authorization: token $GH_TOKEN" "https://github.com" | jq '.id') - - # 3. Query the GitHub API for the user's public Name profile field - PUBLIC_NAME=$(curl -s -H "Authorization: token $GH_TOKEN" "https://github.com" | jq -r '.name') - - # Fallback to username if your GitHub "Name" profile field is blank - if [ "$PUBLIC_NAME" == "null" ] || [ -z "$PUBLIC_NAME" ]; then - PUBLIC_NAME="$USERNAME" - fi - - # Construct your standard, privacy-protected GitHub email template - NOREPLY_EMAIL="${USER_ID}+${USERNAME}@users.noreply.github.com" - - # Export variables securely to subsequent steps - echo "git_name=$PUBLIC_NAME" >> $GITHUB_OUTPUT - echo "git_email=$NOREPLY_EMAIL" >> $GITHUB_OUTPUT - - - name: Configure Git Credentials Dynamically - run: | - git config --global user.name "${{ steps.get_user.outputs.git_name }}" - git config --global user.email "${{ steps.get_user.outputs.git_email }}" - - - name: Rewrite Commit Authors - run: | - # Dynamically swaps out the author metadata using the fetched credentials - git filter-branch -f --env-filter ' - export GIT_AUTHOR_NAME="${{ steps.get_user.outputs.git_name }}" - export GIT_AUTHOR_EMAIL="${{ steps.get_user.outputs.git_email }}" - export GIT_COMMITTER_NAME="${{ steps.get_user.outputs.git_name }}" - export GIT_COMMITTER_EMAIL="${{ steps.get_user.outputs.git_email }}" - ' origin/${{ github.event.pull_request.base.ref }}..HEAD - - - name: Force Push Clean Commits - run: git push origin ${{ github.event.pull_request.head.ref }} --force From ea4e6e3c750962b94af61e1174333a18ea67555d Mon Sep 17 00:00:00 2001 From: Matthew James Briggs Date: Sat, 20 Jun 2026 08:47:25 +0000 Subject: [PATCH 2/5] docs: diff design for api round-trip classifier (#211) Document the multiset-first, layered diff design for audit/classify.py, superseding the naive "walk both trees in parallel until the first mismatch" sketch. A positional walk cannot survive a deletion (the dominant mx::api failure signal): one drop desynchronizes every later sibling, so only the first divergence is trustworthy. Replace it with a collections.Counter multiset difference that enumerates all missing element classes in O(n), reorder-invariant, stdlib-only. Includes a cited research appendix surveying tree edit distance, XML-specific diff algorithms, sequence alignment, Python libraries, and yield-based ranking. --- docs/ai/design/api-roundtrip-classifier.md | 235 +++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 docs/ai/design/api-roundtrip-classifier.md diff --git a/docs/ai/design/api-roundtrip-classifier.md b/docs/ai/design/api-roundtrip-classifier.md new file mode 100644 index 00000000..dcf3e6fd --- /dev/null +++ b/docs/ai/design/api-roundtrip-classifier.md @@ -0,0 +1,235 @@ +# api round-trip failure classifier — diff design + +Design notes for the `audit/classify.py` classifier (issue #211, Phase 2 of #208). +This document explains *how the classifier diffs the expected/actual dump pairs* +and why. It supersedes the original "walk both trees in parallel until the first +mismatch" sketch in the #211 issue body, which has a structural defect described +below. + +## The defect in the naive design + +Both `compareElements` (`src/private/mxtest/corert/Compare.cpp`) and the original +#211 step 3b do the same thing: a **positional, paired, depth-first walk** of the +two trees that stops at the **first** divergence. That is correct as a pass/fail +gate (corert only needs to know *whether* the documents differ), but it is wrong +as an *analysis* tool. + +`mx::api` is a deliberate subset of MusicXML: it drops whole subtrees and reorders +children **by design**. The dominant failure signal is therefore a *deletion* — +an element present in `expected` that never appears in `actual`. A positional walk +cannot survive a deletion: + +``` +expected children: [credit] [credit] [part-list] [part] ... +actual children: [part-list] [part] [???] ... <- credit dropped +``` + +The moment `actual` is missing one child, the two child-lists shift out of +alignment, and every subsequent sibling is compared against the wrong partner. +You can only trust the **first** reported divergence; everything after it is noise. +A single drop poisons the rest of the file. + +This is a well-documented property of diffing: a naive index-by-index comparison +breaks because one insert/delete shifts every later index, so the differ keeps +matching content at positions that no longer correspond. The principled fixes are +**alignment** (LCS / Myers) or, when position is irrelevant, an **order-free +multiset** comparison. See the research summary at the end of this document. + +## What we actually need + +The classifier's job is *not* to produce a minimal edit script. It is to answer: + +1. **Which element classes (tag names) are present in `expected` but missing from + `actual`?** — the complete set, not just the first. +2. **How many *distinct* element classes are missing per file?** — the headline + ranking metric. +3. **Which files are missing exactly one distinct class** (once or many times)? — + the low-hanging fruit: a single api addition flips them to PASS. +4. A cheap **"distance to passing"** scalar for secondary tie-breaking. + +None of these require positions or a minimal edit script. They require a *complete +inventory of deletions*, which is exactly what a multiset difference gives in +linear time, fully invariant to reordering. + +Exact order-insensitive structural diffing (unordered tree edit distance) is +NP-complete, and ordered tree edit distance (Zhang-Shasha / RTED / APTED) is +O(n³)/O(n²) — both far more than this problem needs. We deliberately do **not** +use them. + +## The layered design + +All layers operate on the two normalized dump documents (`*.expected.xml`, +`*.actual.xml`) using only the Python standard library +(`xml.etree.ElementTree`, `collections.Counter`, `difflib`). No `lxml`, `xmldiff`, +`zss`, or `apted` dependency is introduced — the multiset primitive covers the need +in O(n). + +### Layer 1 — multiset tag diff (primary signal) + +Recurse both trees, building a `Counter` of element tag names: + +```python +from collections import Counter + +def tag_multiset(root): + return Counter(el.tag for el in root.iter()) + +missing = tag_multiset(expected) - tag_multiset(actual) # in expected, not in actual +added = tag_multiset(actual) - tag_multiset(expected) # spurious (rare; a real bug) +``` + +`Counter` subtraction keeps only positive counts, so `missing` is precisely the +multiset of element classes the round-trip dropped. From it: + +| field | meaning | +|---|---| +| `missing_elements` | `sorted(missing)` — the distinct dropped tag names | +| `missing_element_counts` | `dict(missing)` — how many of each were dropped | +| `distinct_missing_count` | `len(missing)` — **the headline ranking metric** | +| `added_elements` | `sorted(added)` — spurious tags the api invented (bug signal) | + +This is O(n), reorder-invariant, and enumerates **every** dropped class — it does +not stop at the first one. It is the direct fix for the cascade defect. + +### Layer 2 — path-qualified multiset (attribution) + +Bare tag names are ambiguous (`` under `` vs. elsewhere). For +`blocking_features` attribution and to line up with `first_divergence_path`, build +a second multiset keyed by `parent/tag` (or the full root-to-node path). The +headline `distinct_missing_count` stays on bare tags; path qualification is used +when mapping a drop back to an `api.features.xml` feature. + +### Layer 3 — cross-reference `api.features.xml` + +For each missing tag, look up `support` in `api.features.xml`: + +- **Category B (drop-only)** becomes *provable across the whole file*: it holds iff + **every** tag in `missing` has `support="none"`. The naive walk could never + establish this — it only saw the first drop. This is the single biggest + correctness win of the redesign. +- A missing tag with `support="full"` or `"partial"` is a genuine correctness bug + or a partial-drop — surfaced explicitly instead of being hidden behind whatever + the first divergence happened to be. + +### Layer 4 — sequence alignment per parent (category C only) + +Reorder-only (category C) is the case where a parent's child *multiset* is equal +between expected and actual but the child *sequence* differs. Detect with +`difflib.SequenceMatcher` opcodes over the two child-tag sequences: a pure +permutation (no net insert/delete) confirms reorder-only. Stdlib, zero dependency. +This is the one place per-parent alignment earns its keep; it is **not** used for +the deletion inventory (Layer 1 already has that). + +### Layer 5 — edit-distance scalar (secondary) + +Use `sum(missing.values())` (total dropped instances) as a cheap "distance to +passing." It is a *secondary* tie-breaker only. We do **not** key ranking on it: +a scalar conflates "one big dropped subtree" with "many tiny diffs" and names no +feature. Full tree edit distance would also pull in a heavyweight dependency for +no added decision value here. + +## Ranking output (the worklist) + +The low-hanging fruit is the set of files with `distinct_missing_count == 1`. +Group those by their single missing tag; the resulting histogram is the +prioritized api-addition worklist. + +Two refinements from the regression-triage literature (e.g. BuildSheriff, ICSE'22: +failures cluster around a few root causes; fixing one unblocks a cluster): + +1. **Rank by files *unblocked*, not occurrence count.** A tag appearing 10,000× + in one file is lower yield than a tag that is the *sole* blocker of 200 files. + The y-axis is "count of files whose single blocking class is tag X." +2. **Histogram on the categorical root cause (missing tag), not the edit-distance + scalar.** Keep the scalar as a within-cluster severity tie-breaker. + +This belongs to Phase 3 (#212) ranking, but the per-file fields it needs +(`distinct_missing_count`, `missing_elements`, single-blocker flag) are emitted +here in Phase 2. + +## JSON schema additions + +The #211 schema gains these per-file fields (existing fields unchanged; all +present on every entry, `null`/`[]`/`{}` when not applicable): + +```jsonc +"missing_elements": ["credit", "defaults"], // sorted distinct dropped tags +"missing_element_counts": {"credit": 3, "defaults": 1}, +"distinct_missing_count": 2, // headline ranking metric +"added_elements": [], // spurious tags in actual (bug) +"is_single_blocker": false, // distinct_missing_count == 1 +"total_missing_instances": 4 // sum of counts; severity scalar +``` + +`first_divergence_element` / `first_divergence_path` / `mismatch_type` are retained +for continuity with the harness detail string, but are explicitly **not** relied on +for completeness — they describe only the first positional divergence. + +## Dependency decision + +Stdlib only: `xml.etree.ElementTree` + `collections.Counter` + `difflib`. Rationale: +the need is "enumerate missing distinct tag names + counts + single-blocker files," +which multiset diff answers exactly in O(n). Structure-aware edit-script libraries +(`xmldiff`, which needs the `lxml` C extension; `apted`) deliver minimal edit +scripts and move detection we don't need; `zss` returns only a scalar distance. + +--- + +## Research appendix (problem-space survey) + +Findings backing the design above, with confidence and sources. UNVERIFIED marks a +claim from training knowledge that the fetched sources did not directly confirm +(usually because a primary PDF was unreachable). + +### Alignment vs. positional walk +- A naive index-by-index walk cascades after one insert/delete; Myers models diff + as shortest-path search on an edit graph where diagonal (match) moves are free, so + shared content anchors the alignment. *(High — jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/; xmailserver.org/diff2.pdf)* +- Microsoft XML Diff documents three child-matching modes — By Index (naive), LCS + (survives ins/del), Unordered (set-like). *(Med-high — documentation.help/Microsoft-XML-Diff)* + +### Tree edit distance (decided against) +- Zhang-Shasha O(n⁴)/O(nm); Klein O(n³ log n); Demaine'07 O(n³); RTED O(n³)/O(n²); + APTED O(n³)/O(n²) with an edit *mapping* (unmapped source nodes = deletions). + *(High — erikdemaine.org/papers/TreeEdit_ICALP2007/paper.pdf; arxiv 1201.0230; + dl.acm.org/doi/10.1145/2699485; github.com/DatabaseGroup/apted)* +- **Unordered** tree edit distance is NP-complete / MAX-SNP-hard even for binary + trees over a 2-symbol alphabet. *(High — Bille survey, sciencedirect + S0304397505000174; arxiv 2108.00953)* + +### XML-specific algorithms +- Chawathe et al. 1996 (basis of Python `xmldiff`): match phase + edit-script phase, + ordered, emits INSERT/UPDATE/MOVE/DELETE — enumerates deletions and reports a + reorder as one MOVE. *(High on ops; exact complexity bound UNVERIFIED — primary + PDF 503'd. sigmodrecord.org/1996/06/24/...; github.com/cfpb/xtdiff)* +- X-Diff: unordered model (reorder is a non-change), bottom-up signature hashing + + min-cost sibling matching; harder to compute, no MOVE op. *(Ordered/unordered: + high — pages.cs.wisc.edu/~yuanwang/xdiff.html; "no MOVE": UNVERIFIED)* + +### Sequence alignment & multiset +- Myers O(ND) powers git/GNU diff; per-parent LCS over child sequences is an + established lightweight alternative to TED (XMLMind). *(High — xmailserver.org/diff2.pdf; + xmlmind.com/.../xmlDiff_algorithm.html)* +- `collections.Counter` is a multiset; `a - b` keeps only positive counts = + elements in `a` not covered by `b`, directly enumerable; building is O(n), + set-ops O(n+m). *(High on semantics — docs.python.org/3/library/collections.html; + the explicit O(n) is standard reasoning, not doc-quoted → UNVERIFIED as a doc claim)* + +### Python libraries +- `xmldiff` 3.0 (2026-06-11, MIT, active): Chawathe; enumerable `DeleteNode`/ + `MoveNode`/… actions; **requires lxml (C ext)**. +- `zss` (2018, BSD): Zhang-Shasha; **scalar distance only, no edit script**. +- `apted` (2017, MIT, pure Python): edit mapping (deletions = unmapped source + nodes); needs bracket-notation input. +- `lxml`: no semantic diff, only C14N canonicalization. +- stdlib `difflib.SequenceMatcher` (opcodes) and `collections.Counter` (multiset): + zero-dependency, both enumerate deletions. *(All high — verified against live + PyPI/GitHub/ReadTheDocs)* + +### Yield-based ranking +- BuildSheriff (ICSE'22, 163K builds): failures cluster around a few root causes; + triaging by shared cause saves cost — validates "many files, few blockers." + *(Med-high — chenbihuan.github.io/paper/icse22-zhang-buildsheriff.pdf)* +- Rank by **files unblocked**, not occurrence count; histogram on categorical root + cause, with edit distance only as a secondary severity scalar. *(Synthesized from + the triage literature above + frequency×impact practice)* From 2f11da6cf7e58ed457cf68cf9421e3059af5917d Mon Sep 17 00:00:00 2001 From: Matthew James Briggs Date: Sat, 20 Jun 2026 09:06:18 +0000 Subject: [PATCH 3/5] feat: dump and classify api round-trip failures (#210, #211) Phase 1 (#210): add a --dump flag to the api round-trip harness' discovery mode. For every non-PASS, non-SKIP file it re-runs the pipeline and writes the fully-normalized expected (and, when produced, actual) documents to /.expected.xml / .actual.xml. Pipeline errors (LOADFAIL/GETDATAFAIL/ CREATEFAIL) have no actual document, so a .status sidecar records the exact code. runRoundtrip() is left untouched; dumpDocuments() replicates the normalization sequence. New make target: dump-api-roundtrip. Phase 2 (#211): add audit/classify.py and the `classify` subcommand. It diffs each dumped pair as an order-free element multiset (Counter(expected) - Counter(actual)), which enumerates every dropped element class in O(n) and is reorder-invariant -- fixing the positional-walk cascade where one drop desynchronizes all later siblings. It cross-references data/api.features.xml and assigns each file a root-cause category (B drop-only, C reorder-only, D enum-bug, E missing-attribute, F pipeline-error, or unknown), emitting build/api/classified.json plus a stdout worklist ranked by files unblocked. New make targets: classify-api-roundtrip, test-audit (wired into CI). Tests in audit/tests/test_classify.py cover all categories, the multiset completeness fix, the single-blocker low-hanging-fruit metric, and ranking. --- .github/workflows/ci.yaml | 3 + Makefile | 39 +- audit/README.md | 26 + audit/__main__.py | 25 +- audit/classify.py | 502 ++++++++++++++++++ audit/tests/__init__.py | 0 audit/tests/test_classify.py | 203 +++++++ docs/ai/design/api-roundtrip-classifier.md | 23 + .../mxtest/api/CorpusRoundtripMain.cpp | 162 +++++- 9 files changed, 977 insertions(+), 6 deletions(-) create mode 100644 audit/classify.py create mode 100644 audit/tests/__init__.py create mode 100644 audit/tests/test_classify.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c57f6c7b..a46368e7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -58,6 +58,9 @@ jobs: - name: Generator tests run: make test-gen + - name: Audit tests + run: make test-audit + - name: plates --check (all targets) run: make gen-check diff --git a/Makefile b/Makefile index 85ceef82..f9732431 100644 --- a/Makefile +++ b/Makefile @@ -65,12 +65,13 @@ FIND_CPP := find src \ .DEFAULT_GOAL := help .PHONY: help sdk fmt check core-dev check-core-dev test-core-dev test-cpp-unit \ - validate-cpp probe-cpp coverage-core-dev test-gen gen-check \ + validate-cpp probe-cpp coverage-core-dev test-gen test-audit gen-check \ gen-quality gen-lint \ gen gen-cpp gen-go gen-c gen-schema \ audit audit-force \ build-go build-c test-go test-c \ - lib dev test run-examples test-api-roundtrip discover-api-roundtrip coverage-api \ + lib dev test run-examples test-api-roundtrip discover-api-roundtrip \ + dump-api-roundtrip classify-api-roundtrip coverage-api \ clean clean-docker check-docker docker-volume help: @@ -84,6 +85,8 @@ help: @echo ' make run-examples Build and run all three api example programs.' @echo ' make test-api-roundtrip Run the corpus api roundtrip in regression mode (CI gate).' @echo ' make discover-api-roundtrip Run discovery mode over the full corpus (manual only).' + @echo ' make dump-api-roundtrip Dump normalized expected/actual XML for failures.' + @echo ' make classify-api-roundtrip Classify dumped failures by root cause (Python).' @echo ' make coverage-api Instrumented api/impl/utility build + gcovr report.' @echo '' @echo ' C++ core:' @@ -97,6 +100,7 @@ help: @echo '' @echo ' Generator:' @echo ' make test-gen Run the generator (parser + IR + plates + press) Python tests.' + @echo ' make test-audit Run the audit tool Python tests (incl. failure classifier).' @echo ' make gen-check plates --check for every target (renames, collisions).' @echo ' make gen Run the generator for every target (cpp/go/c/schema).' @echo ' make gen-cpp Run the generator for the C++ target (src/private/mx/core/generated).' @@ -177,6 +181,24 @@ test-api-roundtrip: dev discover-api-roundtrip: dev $(BUILD_ROOT)/api/mxtest-api-roundtrip discovery $(CURDIR)/data +# Dump normalized expected/actual XML for every failing api round-trip. +# Output goes to build/api/roundtrip-dump/ (build dir, already gitignored). +# Feeds the classifier: make dump-api-roundtrip && make classify-api-roundtrip +dump-api-roundtrip: dev + mkdir -p $(BUILD_ROOT)/api/roundtrip-dump + $(BUILD_ROOT)/api/mxtest-api-roundtrip discovery $(CURDIR)/data \ + --dump $(CURDIR)/$(BUILD_ROOT)/api/roundtrip-dump + +# Classify api round-trip failures by root cause. +# Reads the dump produced by dump-api-roundtrip; writes build/api/classified.json. +# Fast (pure Python); kept separate from the slow dump step so classification +# logic can be re-run without re-dumping. Pass DUMP_DIR=path to override. +DUMP_DIR ?= $(BUILD_ROOT)/api/roundtrip-dump +classify-api-roundtrip: + python3 -m audit classify $(DUMP_DIR) \ + --data $(CURDIR)/data \ + --out $(BUILD_ROOT)/api/classified.json + # Instrumented api coverage: build mx+mxtest with --coverage, run all # suites, produce gcovr report for src/private/mx/{api,impl,utility}/. coverage-api: @@ -299,6 +321,10 @@ coverage-core-dev: test-gen: python3 -m unittest discover -s gen/tests -t . $(ARGS) +# Audit tool Python tests (feature-audit + the round-trip failure classifier). +test-audit: + python3 -m unittest discover -s audit/tests -t . $(ARGS) + # plates --check for every target: validates renames and detects identifier # collisions (a CI gate, like test-gen). gen-check: @@ -406,6 +432,12 @@ test-api-roundtrip: $(DOCKER_STAMP) docker-volume discover-api-roundtrip: $(DOCKER_STAMP) docker-volume $(DOCKER_RUN) make discover-api-roundtrip BUILD_TYPE=$(BUILD_TYPE) +dump-api-roundtrip: $(DOCKER_STAMP) docker-volume + $(DOCKER_RUN) make dump-api-roundtrip BUILD_TYPE=$(BUILD_TYPE) + +classify-api-roundtrip: $(DOCKER_STAMP) docker-volume + $(DOCKER_RUN) make classify-api-roundtrip BUILD_TYPE=$(BUILD_TYPE) + coverage-api: $(DOCKER_STAMP) docker-volume @rm -rf $(COV_DIR)/api $(DOCKER_RUN) make coverage-api BUILD_TYPE=$(BUILD_TYPE) ARGS='$(ARGS)' @@ -437,6 +469,9 @@ coverage-core-dev: $(DOCKER_STAMP) docker-volume test-gen: $(DOCKER_STAMP) $(DOCKER_RUN) make test-gen ARGS='$(ARGS)' +test-audit: $(DOCKER_STAMP) + $(DOCKER_RUN) make test-audit ARGS='$(ARGS)' + gen-check: $(DOCKER_STAMP) $(DOCKER_RUN) make gen-check diff --git a/audit/README.md b/audit/README.md index 728e3a68..14f89745 100644 --- a/audit/README.md +++ b/audit/README.md @@ -37,6 +37,32 @@ python3 -m audit all [--force] # both common case (a new corpus file was added) only writes the new sidecar. Use `--force` when the output format itself changes. +## Classifying api round-trip failures + +``` +make dump-api-roundtrip # C++: write normalized expected/actual XML pairs +make classify-api-roundtrip # Python: classify those failures by root cause + +python3 -m audit classify [--data DIR] [--out FILE] +``` + +`classify` reads the dump directory produced by `make dump-api-roundtrip` +(`build/api/roundtrip-dump/`), diffs each expected/actual pair as an order-free +element **multiset** (`Counter(expected) - Counter(actual)`), cross-references +`data/api.features.xml`, and assigns each non-passing file a root-cause category +(drop-only, reorder-only, enum bug, missing attribute, pipeline error). It writes +`build/api/classified.json` and prints a worklist of the features blocking the +most files. The two steps are kept separate: dumping is slow (runs the C++ +pipeline over the whole corpus), classifying is fast (pure Python), so the +classification logic can be iterated without re-dumping. See +`docs/ai/design/api-roundtrip-classifier.md`. + +## Tests + +``` +make test-audit # python3 -m unittest discover -s audit/tests -t . +``` + ## Audited set The audited files are exactly those the `corert` round-trip suite processes (see diff --git a/audit/__main__.py b/audit/__main__.py index f2d7a088..743a70e2 100644 --- a/audit/__main__.py +++ b/audit/__main__.py @@ -5,6 +5,9 @@ corpus file (skips existing unless --force) python3 -m audit corpus (re)build data/corpus.xml from the corpus python3 -m audit all [--force] run `files` then `corpus` + python3 -m audit classify [--data DIR] [--out FILE] + classify api round-trip failures by root + cause from a dump directory (see #211) See audit/README.md. The audited set mirrors the corert round-trip suite. """ @@ -15,11 +18,12 @@ import sys from pathlib import Path -from . import CORPUS_SUMMARY_NAME, featuresfile, summary +from . import CORPUS_SUMMARY_NAME, classify, featuresfile, summary from .discover import discover from .extract import extract_file DATA_ROOT = Path(__file__).resolve().parent.parent / "data" +BUILD_ROOT = Path(__file__).resolve().parent.parent / "build" def _cmd_files(force: bool) -> int: @@ -44,6 +48,12 @@ def _cmd_corpus() -> int: return 0 +def _cmd_classify(dump_dir: str, data: str | None, out: str | None) -> int: + data_root = Path(data).resolve() if data else DATA_ROOT + out_path = Path(out).resolve() if out else BUILD_ROOT / "api" / "classified.json" + return classify.run(Path(dump_dir).resolve(), data_root, out_path) + + def main(argv: list[str]) -> int: parser = argparse.ArgumentParser(prog="python3 -m audit") sub = parser.add_subparsers(dest="cmd", required=True) @@ -56,6 +66,17 @@ def main(argv: list[str]) -> int: p_all = sub.add_parser("all", help="run files then corpus") p_all.add_argument("--force", action="store_true", help="overwrite existing sidecars") + p_classify = sub.add_parser( + "classify", help="classify api round-trip failures from a dump directory" + ) + p_classify.add_argument("dump_dir", help="directory of *.expected.xml/*.actual.xml dumps") + p_classify.add_argument( + "--data", default=None, help="data root (default: repo data/ directory)" + ) + p_classify.add_argument( + "--out", default=None, help="output JSON path (default: build/api/classified.json)" + ) + args = parser.parse_args(argv) if args.cmd == "files": return _cmd_files(args.force) @@ -64,6 +85,8 @@ def main(argv: list[str]) -> int: if args.cmd == "all": rc = _cmd_files(args.force) return rc or _cmd_corpus() + if args.cmd == "classify": + return _cmd_classify(args.dump_dir, args.data, args.out) return 2 diff --git a/audit/classify.py b/audit/classify.py new file mode 100644 index 00000000..08f7c537 --- /dev/null +++ b/audit/classify.py @@ -0,0 +1,502 @@ +"""Classify api round-trip failures by root cause. + +Reads the dump directory produced by ``make dump-api-roundtrip`` (Phase 1, +issue #210): pairs of ``.expected.xml`` / ``.actual.xml`` plus a +``.status`` sidecar for pipeline errors. Cross-references +``data/api.features.xml`` and the per-file ``*.features.xml`` sidecars, assigns +each non-passing file one primary root-cause category plus any secondaries, and +writes a machine-readable JSON report (consumed by Phase 3 ranking) with a +human-readable summary on stdout. + +The diff is **multiset-based**, not a positional walk. ``mx::api`` drops and +reorders subtrees by design, so the dominant signal is a *deletion*; a naive +index-by-index walk desynchronizes after the first drop and only its first +divergence is trustworthy. The multiset difference +``Counter(expected) - Counter(actual)`` enumerates **every** dropped element +class in O(n), fully reorder-invariant. See +``docs/ai/design/api-roundtrip-classifier.md`` for the full rationale. + +Categories: + + A already-passing expected/actual identical (defensive; not normally dumped) + B drop-only every missing element class has support="none" + C reorder-only same tag multiset, but a parent's child order differs + D enum-bug text/attr value is a known missing enum member + E missing-attribute a partial feature's attribute was dropped + F pipeline-error LOADFAIL / GETDATAFAIL / CREATEFAIL (no actual produced) + unknown a FAIL that matched none of the above +""" + +from __future__ import annotations + +import difflib +import json +import sys +import xml.etree.ElementTree as ET +from collections import Counter +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path + +from . import FEATURES_SUFFIX, featuresfile + +# Filename suffixes written by the dump harness (#210). +_EXPECTED_SUFFIX = ".expected.xml" +_ACTUAL_SUFFIX = ".actual.xml" +_STATUS_SUFFIX = ".status" + +# Pipeline-error status codes (no actual document was produced). +_PIPELINE_STATUSES = frozenset({"LOADFAIL", "GETDATAFAIL", "CREATEFAIL"}) + +# Human-readable category labels for the stdout summary. +_CATEGORY_LABELS = { + "A": "already passing", + "B": "drop-only divergence", + "C": "reorder-only divergence", + "D": "enum bug", + "E": "missing attribute/element", + "F": "pipeline error", + "unknown": "unknown", +} + +# Categories that are actionable feature gaps (ranked in the worklist). +_ACTIONABLE = frozenset({"B", "D", "E"}) + + +# --------------------------------------------------------------------------- # +# api.features.xml support index +# --------------------------------------------------------------------------- # + + +@dataclass +class ApiFeature: + """One ```` from ``data/api.features.xml`` with support detail.""" + + name: str + support: str # "full" / "partial" / "none" / "" (unknown) + attributes: dict[str, str] = field(default_factory=dict) # attr name -> support + enum_missing: set[str] = field(default_factory=set) # normalized missing members + + +def _normalize_enum(value: str | None) -> str: + """Bridge XML kebab-case values and api/core camelCase enum symbols. + + ``sharp-sharp`` (XML text) and ``sharpSharp`` (enum member) both normalize to + ``sharpsharp`` so they compare equal. + """ + if not value: + return "" + return value.strip().lower().replace("-", "").replace("_", "") + + +def load_api_features(path: Path) -> dict[str, ApiFeature]: + """Parse ``api.features.xml`` into a name -> ApiFeature map.""" + features: dict[str, ApiFeature] = {} + root = ET.parse(path).getroot() + for feature in root.findall("./features/feature"): + name = feature.get("name", "") + if not name: + continue + feat = ApiFeature(name=name, support=feature.get("support", "")) + for attr in feature.findall("./attributes/attribute"): + aname = attr.get("name", "") + if aname: + feat.attributes[aname] = attr.get("support", "") + for enum in feature.findall("./enums/enum"): + for missing in enum.findall("./missing"): + feat.enum_missing.add(_normalize_enum(missing.text)) + features[name] = feat + return features + + +# --------------------------------------------------------------------------- # +# Dump directory discovery +# --------------------------------------------------------------------------- # + + +@dataclass +class DumpEntry: + """One corpus file's presence in the dump directory.""" + + rel: str # corpus-relative path, forward slashes (e.g. "lysuite/Saltarello.xml") + expected: Path | None = None + actual: Path | None = None + status_code: str | None = None # from the .status sidecar, if any + + +def _flat_to_rel(flat: str) -> str: + """Reverse the dump harness's path flattening: ``a__b.xml`` -> ``a/b.xml``.""" + return flat.replace("__", "/") + + +def parse_dump_dir(dump_dir: Path) -> list[DumpEntry]: + """Group the dump directory's files into one DumpEntry per corpus file.""" + entries: dict[str, DumpEntry] = {} + + def _entry(flat: str) -> DumpEntry: + return entries.setdefault(flat, DumpEntry(rel=_flat_to_rel(flat))) + + for p in sorted(dump_dir.iterdir()): + if not p.is_file(): + continue + name = p.name + if name.endswith(_EXPECTED_SUFFIX): + _entry(name[: -len(_EXPECTED_SUFFIX)]).expected = p + elif name.endswith(_ACTUAL_SUFFIX): + _entry(name[: -len(_ACTUAL_SUFFIX)]).actual = p + elif name.endswith(_STATUS_SUFFIX): + _entry(name[: -len(_STATUS_SUFFIX)]).status_code = ( + p.read_text(encoding="utf-8").strip() or None + ) + return [entries[k] for k in sorted(entries)] + + +# --------------------------------------------------------------------------- # +# XML helpers and the positional first-divergence walk +# --------------------------------------------------------------------------- # + + +def _local(tag: str) -> str: + """Strip a Clark-notation namespace ({uri}local -> local).""" + if tag.startswith("{"): + return tag.rsplit("}", 1)[1] + return tag + + +def tag_multiset(root: ET.Element) -> Counter: + """Count every element's local tag name in the tree (the multiset primitive).""" + return Counter(_local(el.tag) for el in root.iter()) + + +def _values_equiv(left: str, right: str) -> bool: + """Mirror corert ``isEquivalent``: numeric values compare by value.""" + if left == right: + return True + try: + return abs(float(left) - float(right)) < 1e-8 + except (ValueError, OverflowError): + return False + + +@dataclass +class Divergence: + """The first positional divergence, retained for continuity (not completeness).""" + + mismatch_type: str # element-name / text / attribute / attribute-count / child-count + path: str + element: str # leaf local name at the divergence + expected_value: str | None = None + actual_value: str | None = None + attr_name: str | None = None + missing_attrs: list[str] = field(default_factory=list) + parent_expected_children: list[str] = field(default_factory=list) + parent_actual_children: list[str] = field(default_factory=list) + + +def first_divergence(exp: ET.Element, act: ET.Element, path: str) -> Divergence | None: + """Find the first positional divergence, mirroring ``compareElements``. + + ``path`` includes ``exp`` itself. Tags of ``exp``/``act`` are assumed equal + (the caller pairs them); the root pair is the only entry point. + """ + et = (exp.text or "").strip() + at = (act.text or "").strip() + if not _values_equiv(et, at): + return Divergence("text", path, _local(exp.tag), expected_value=et, actual_value=at) + + ea = {_local(k): v for k, v in exp.attrib.items()} + aa = {_local(k): v for k, v in act.attrib.items()} + for aname in sorted(set(ea) & set(aa)): + if not _values_equiv(ea[aname], aa[aname]): + return Divergence( + "attribute", path, _local(exp.tag), + expected_value=ea[aname], actual_value=aa[aname], attr_name=aname, + ) + missing_attrs = sorted(set(ea) - set(aa)) + extra_attrs = sorted(set(aa) - set(ea)) + if missing_attrs or extra_attrs: + return Divergence( + "attribute-count", path, _local(exp.tag), + attr_name=missing_attrs[0] if missing_attrs else None, + missing_attrs=missing_attrs, + ) + + ec = [c for c in exp if isinstance(c.tag, str)] + ac = [c for c in act if isinstance(c.tag, str)] + exp_children = [_local(c.tag) for c in ec] + act_children = [_local(c.tag) for c in ac] + for i in range(min(len(ec), len(ac))): + if exp_children[i] != act_children[i]: + return Divergence( + "element-name", f"{path}/{exp_children[i]}", exp_children[i], + parent_expected_children=exp_children, parent_actual_children=act_children, + ) + deeper = first_divergence(ec[i], ac[i], f"{path}/{exp_children[i]}") + if deeper is not None: + return deeper + if len(ec) != len(ac): + return Divergence( + "child-count", path, _local(exp.tag), + parent_expected_children=exp_children, parent_actual_children=act_children, + ) + return None + + +def _is_permutation(a: list[str], b: list[str]) -> bool: + """True if the two child-tag sequences are a pure reorder (same multiset).""" + if Counter(a) != Counter(b) or a == b: + return False + # Confirm via difflib opcodes: a permutation has no shared-multiset 'replace' + # that adds/removes a tag (handled by the Counter check above); this just + # asserts the sequences genuinely differ in order. + return any(tag != "equal" for tag, *_ in difflib.SequenceMatcher(a=a, b=b).get_opcodes()) + + +# --------------------------------------------------------------------------- # +# Per-file classification +# --------------------------------------------------------------------------- # + + +def _blank_record(rel: str) -> dict: + """A per-file record with every field present (filled in by the caller).""" + return { + "file": rel, + "status": None, + "primary_category": None, + "secondary_categories": [], + "first_divergence_element": None, + "first_divergence_path": None, + "mismatch_type": None, + "missing_elements": [], + "missing_element_counts": {}, + "distinct_missing_count": 0, + "added_elements": [], + "is_single_blocker": False, + "total_missing_instances": 0, + "blocking_features": [], + "pipeline_error_kind": None, + } + + +def classify_entry( + entry: DumpEntry, + api_features: dict[str, ApiFeature], + data_root: Path, + warn, +) -> dict: + """Classify one dump entry into a per-file JSON record.""" + rec = _blank_record(entry.rel) + + # Pipeline error: only an expected file (and maybe a .status sidecar). + if entry.actual is None: + kind = entry.status_code if entry.status_code in _PIPELINE_STATUSES else None + rec["status"] = kind or "PIPELINE_ERROR" + rec["primary_category"] = "F" + rec["pipeline_error_kind"] = kind + return rec + + if entry.expected is None: + warn(f"{entry.rel}: actual present but no expected; skipping") + rec["status"] = "FAIL" + rec["primary_category"] = "unknown" + return rec + + rec["status"] = "FAIL" + try: + exp_root = ET.parse(entry.expected).getroot() + act_root = ET.parse(entry.actual).getroot() + except ET.ParseError as exc: + warn(f"{entry.rel}: XML parse error ({exc}); marking unknown") + rec["primary_category"] = "unknown" + return rec + + # Layer 1: multiset tag diff -- the complete, reorder-invariant inventory. + missing = tag_multiset(exp_root) - tag_multiset(act_root) + added = tag_multiset(act_root) - tag_multiset(exp_root) + rec["missing_elements"] = sorted(missing) + rec["missing_element_counts"] = {k: missing[k] for k in sorted(missing)} + rec["distinct_missing_count"] = len(missing) + rec["added_elements"] = sorted(added) + rec["total_missing_instances"] = sum(missing.values()) + rec["is_single_blocker"] = len(missing) == 1 + + # First positional divergence (continuity only; the harness reports the same). + div = first_divergence(exp_root, act_root, f"/{_local(exp_root.tag)}") + if div is not None: + rec["mismatch_type"] = div.mismatch_type + rec["first_divergence_element"] = div.element + rec["first_divergence_path"] = div.path + + # No divergence and nothing dropped/added -> the file actually round-trips. + if div is None and not missing and not added: + rec["status"] = "PASS" + rec["primary_category"] = "A" + return rec + + def support_of(tag: str) -> str | None: + feat = api_features.get(tag) + return feat.support if feat else None + + cats: list[str] = [] + + # B -- drop-only: provable across the whole file via the multiset. + if missing and not added and all(support_of(t) == "none" for t in missing): + cats.append("B") + + # C -- reorder-only: same global multiset, a parent's child order differs. + if ( + not missing + and not added + and div is not None + and div.mismatch_type in ("element-name", "child-count") + and _is_permutation(div.parent_expected_children, div.parent_actual_children) + ): + cats.append("C") + + # D -- enum bug: a value mismatch that is a known missing enum member. + if div is not None and div.mismatch_type in ("text", "attribute"): + feat = api_features.get(div.element) + if ( + feat is not None + and feat.support == "partial" + and feat.enum_missing + and _normalize_enum(div.expected_value) in feat.enum_missing + ): + cats.append("D") + + # E -- missing attribute: a partial feature's modeled attribute was dropped. + if div is not None and div.mismatch_type in ("attribute", "attribute-count"): + feat = api_features.get(div.element) + if ( + feat is not None + and feat.support == "partial" + and div.attr_name is not None + and div.attr_name in feat.attributes + ): + cats.append("E") + + # Primary = first match in priority order; the rest are secondary. + primary = next((c for c in ("B", "C", "D", "E") if c in cats), None) + if primary is None: + warn(f"{entry.rel}: unclassified FAIL (missing={rec['missing_elements']}, " + f"mismatch={rec['mismatch_type']})") + rec["primary_category"] = "unknown" + else: + rec["primary_category"] = primary + rec["secondary_categories"] = [c for c in cats if c != primary] + + # Blocking features: what, if fully supported, would unblock this file. + if primary == "B": + rec["blocking_features"] = sorted(missing) + elif primary in ("D", "E") and div is not None and div.element: + rec["blocking_features"] = [div.element] + + # Cross-reference the per-file sidecar: a missing or unreadable sidecar is a + # warning, never an abort. When present, sanity-check that the divergence + # element really is part of the file's surface (catches namespace/local-name + # drift between the dump XML and the audited surface). + sidecar = (data_root / entry.rel).with_suffix("") + sidecar = sidecar.with_name(sidecar.name + FEATURES_SUFFIX) + if not sidecar.exists(): + warn(f"{entry.rel}: no feature sidecar at {sidecar}; cross-reference skipped") + else: + try: + surface = featuresfile.read(sidecar) + if div is not None and div.element and div.element not in surface.elements: + warn(f"{entry.rel}: divergence element '{div.element}' not in file surface") + except ET.ParseError as exc: + warn(f"{entry.rel}: unreadable feature sidecar ({exc}); cross-reference skipped") + + return rec + + +# --------------------------------------------------------------------------- # +# Aggregation, ranking, output +# --------------------------------------------------------------------------- # + + +def _rank_blocking_features(records: list[dict]) -> list[tuple[str, int, int]]: + """Rank actionable (B/D/E) blocking features by files unblocked. + + Returns (feature, files_unblocked, single_blocker_count) sorted descending by + files unblocked, then by single-blocker count, then name. + """ + files = Counter() + single = Counter() + for rec in records: + if rec["primary_category"] not in _ACTIONABLE: + continue + blockers = rec["blocking_features"] + for feat in set(blockers): + files[feat] += 1 + if rec["is_single_blocker"] and len(blockers) == 1: + single[blockers[0]] += 1 + ranked = [(f, files[f], single[f]) for f in files] + ranked.sort(key=lambda t: (-t[1], -t[2], t[0])) + return ranked + + +def build_report( + dump_dir: Path, data_root: Path, api_features_path: Path, warn +) -> dict: + """Run classification over the dump directory and return the JSON report.""" + api_features = load_api_features(api_features_path) + entries = parse_dump_dir(dump_dir) + records = [classify_entry(e, api_features, data_root, warn) for e in entries] + + by_category: Counter = Counter(r["primary_category"] for r in records) + return { + "dump_dir": str(dump_dir.resolve()), + "data_root": str(data_root.resolve()), + "generated": datetime.now(timezone.utc).isoformat(), + "summary": { + "total": len(records), + "by_category": dict(sorted(by_category.items())), + }, + "files": records, + } + + +def print_summary(report: dict, out_path: Path) -> None: + """Print the human-readable summary to stdout.""" + records = report["files"] + counts = report["summary"]["by_category"] + total = report["summary"]["total"] + print(f"Classified {total} files from {report['dump_dir']}\n") + + for cat in ("A", "B", "C", "D", "E", "F", "unknown"): + n = counts.get(cat, 0) + if n == 0 and cat == "A": + continue + marker = "?" if cat == "unknown" else cat + print(f" {marker} {_CATEGORY_LABELS[cat]:<28}{n:>4}") + + ranked = _rank_blocking_features(records) + if ranked: + print("\nTop blocking features (ranked by files unblocked; B+D+E):") + for feat, files, single in ranked[:15]: + print(f" {feat:<24}{files:>4} files ({single} single-blocker)") + + print(f"\nOutput: {out_path}") + + +def run(dump_dir: Path, data_root: Path, out_path: Path) -> int: + """Entry point for the ``classify`` subcommand.""" + if not dump_dir.is_dir(): + print(f"classify: dump dir not found: {dump_dir}", file=sys.stderr) + return 2 + api_features_path = data_root / "api.features.xml" + if not api_features_path.exists(): + print(f"classify: api.features.xml not found: {api_features_path}", file=sys.stderr) + return 2 + + def warn(msg: str) -> None: + print(f"classify: warning: {msg}", file=sys.stderr) + + report = build_report(dump_dir, data_root, api_features_path, warn) + + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8") + + print_summary(report, out_path) + return 0 diff --git a/audit/tests/__init__.py b/audit/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/audit/tests/test_classify.py b/audit/tests/test_classify.py new file mode 100644 index 00000000..b72f6d0a --- /dev/null +++ b/audit/tests/test_classify.py @@ -0,0 +1,203 @@ +"""Tests for the api round-trip failure classifier (audit/classify.py, #211). + +Builds synthetic dump directories that mirror the #210 dump format and asserts +the classifier's category assignment, the multiset diff (the core fix: it must +enumerate *every* missing element class, not just the first), and the ranking. + +Run with: python3 -m unittest audit.tests.test_classify +""" + +from __future__ import annotations + +import tempfile +import unittest +from pathlib import Path + +from audit import classify + +# A small api.features.xml fixture with controlled support levels. +API_FEATURES = """ + + mx::api + 4.0 + + + + + + + + + + + + + + + sharpSharp + + + + + + + + + + +""" + + +def _wrap(body: str) -> str: + return f"{body}" + + +class ClassifierTest(unittest.TestCase): + def setUp(self) -> None: + self._tmp = tempfile.TemporaryDirectory() + self.root = Path(self._tmp.name) + self.data = self.root / "data" + self.dump = self.root / "dump" + self.data.mkdir() + self.dump.mkdir() + (self.data / "api.features.xml").write_text(API_FEATURES, encoding="utf-8") + self.warnings: list[str] = [] + + def tearDown(self) -> None: + self._tmp.cleanup() + + def _pair(self, rel: str, expected: str, actual: str | None) -> None: + flat = rel.replace("/", "__") + (self.dump / f"{flat}.expected.xml").write_text(expected, encoding="utf-8") + if actual is not None: + (self.dump / f"{flat}.actual.xml").write_text(actual, encoding="utf-8") + + def _status(self, rel: str, code: str) -> None: + flat = rel.replace("/", "__") + (self.dump / f"{flat}.status").write_text(code + "\n", encoding="utf-8") + + def _classify(self) -> dict[str, dict]: + report = classify.build_report( + self.dump, self.data, self.data / "api.features.xml", self.warnings.append + ) + self.report = report + return {r["file"]: r for r in report["files"]} + + # --- individual categories ------------------------------------------- # + + def test_drop_only_enumerates_all_missing(self) -> None: + # credit AND defaults are both dropped; a positional walk would only see + # the first. The multiset must report both. + self._pair( + "wild/drop.xml", + _wrap("ab" + "C"), + _wrap("C"), + ) + rec = self._classify()["wild/drop.xml"] + self.assertEqual(rec["primary_category"], "B") + self.assertEqual(rec["missing_elements"], ["credit", "defaults"]) + self.assertEqual(rec["distinct_missing_count"], 2) + self.assertFalse(rec["is_single_blocker"]) + self.assertEqual(rec["blocking_features"], ["credit", "defaults"]) + + def test_drop_only_single_blocker(self) -> None: + self._pair( + "wild/single.xml", + _wrap("aC"), + _wrap("C"), + ) + rec = self._classify()["wild/single.xml"] + self.assertEqual(rec["primary_category"], "B") + self.assertEqual(rec["distinct_missing_count"], 1) + self.assertTrue(rec["is_single_blocker"]) + + def test_reorder_only(self) -> None: + self._pair( + "wild/reorder.xml", + _wrap("12"), + _wrap("21"), + ) + rec = self._classify()["wild/reorder.xml"] + self.assertEqual(rec["primary_category"], "C") + self.assertEqual(rec["missing_elements"], []) + self.assertEqual(rec["added_elements"], []) + + def test_enum_bug(self) -> None: + self._pair( + "wild/enum.xml", + _wrap("sharp-sharp"), + _wrap("sharp"), + ) + rec = self._classify()["wild/enum.xml"] + self.assertEqual(rec["primary_category"], "D") + self.assertEqual(rec["mismatch_type"], "text") + self.assertEqual(rec["first_divergence_element"], "accidental") + + def test_missing_attribute(self) -> None: + self._pair( + "wild/attr.xml", + _wrap(''), + _wrap(""), + ) + rec = self._classify()["wild/attr.xml"] + self.assertEqual(rec["primary_category"], "E") + self.assertEqual(rec["mismatch_type"], "attribute-count") + + def test_pipeline_error_with_status(self) -> None: + self._pair("wild/load.xml", _wrap(""), None) + self._status("wild/load.xml", "LOADFAIL") + rec = self._classify()["wild/load.xml"] + self.assertEqual(rec["primary_category"], "F") + self.assertEqual(rec["status"], "LOADFAIL") + self.assertEqual(rec["pipeline_error_kind"], "LOADFAIL") + + def test_pipeline_error_without_status(self) -> None: + self._pair("wild/nostatus.xml", _wrap(""), None) + rec = self._classify()["wild/nostatus.xml"] + self.assertEqual(rec["primary_category"], "F") + self.assertEqual(rec["status"], "PIPELINE_ERROR") + self.assertIsNone(rec["pipeline_error_kind"]) + + def test_unknown_full_support_mismatch(self) -> None: + self._pair( + "wild/unknown.xml", + _wrap("C"), + _wrap("D"), + ) + rec = self._classify()["wild/unknown.xml"] + self.assertEqual(rec["primary_category"], "unknown") + + def test_every_record_has_all_fields(self) -> None: + self._pair("wild/single.xml", + _wrap("a"), _wrap("")) + rec = next(iter(self._classify().values())) + for key in ( + "file", "status", "primary_category", "secondary_categories", + "first_divergence_element", "first_divergence_path", "mismatch_type", + "missing_elements", "missing_element_counts", "distinct_missing_count", + "added_elements", "is_single_blocker", "total_missing_instances", + "blocking_features", "pipeline_error_kind", + ): + self.assertIn(key, rec) + + def test_missing_sidecar_warns_not_aborts(self) -> None: + self._pair("wild/single.xml", + _wrap("a"), _wrap("")) + self._classify() # no sidecars created -> warnings, no exception + self.assertTrue(any("no feature sidecar" in w for w in self.warnings)) + + def test_ranking_orders_by_files_unblocked(self) -> None: + # credit blocks two files, defaults blocks one. + self._pair("wild/a.xml", _wrap("x"), _wrap("")) + self._pair("wild/b.xml", _wrap("x"), _wrap("")) + self._pair("wild/c.xml", _wrap("x"), _wrap("")) + records = list(self._classify().values()) + ranked = classify._rank_blocking_features(records) + self.assertEqual(ranked[0][0], "credit") + self.assertEqual(ranked[0][1], 2) # files unblocked + self.assertEqual(ranked[0][2], 2) # single-blocker count + + +if __name__ == "__main__": + unittest.main() diff --git a/docs/ai/design/api-roundtrip-classifier.md b/docs/ai/design/api-roundtrip-classifier.md index dcf3e6fd..e72c2efd 100644 --- a/docs/ai/design/api-roundtrip-classifier.md +++ b/docs/ai/design/api-roundtrip-classifier.md @@ -165,6 +165,29 @@ present on every entry, `null`/`[]`/`{}` when not applicable): for continuity with the harness detail string, but are explicitly **not** relied on for completeness — they describe only the first positional divergence. +### Pipeline errors and the `.status` sidecar + +A flat dump filename (`a__b.xml.expected.xml`) cannot, on its own, distinguish the +three pipeline-error statuses (`LOADFAIL` / `GETDATAFAIL` / `CREATEFAIL`): all +three produce only an `.expected.xml` (the api pipeline never reached a written +document). So the dump step (#210) writes a tiny `.status` sidecar next to +the expected file recording the exact code. The classifier reads it to populate +`pipeline_error_kind` and `status`; when the sidecar is absent it falls back to a +generic `status = "PIPELINE_ERROR"` with `pipeline_error_kind = null`. The +presence/absence of `.actual.xml` remains the FAIL-vs-pipeline-error signal; the +`.status` sidecar only refines *which* pipeline error. The sidecar lives in the +gitignored dump dir and is never checked in. + +### A note on `support="full"`/`"partial"` drops + +A dropped element whose `api.features.xml` support is `full` or `partial` does +**not** satisfy category B (drop-only) — B requires *every* missing class to be +`support="none"`. Such a file falls through to `unknown` and is logged, because a +class that is supposed to round-trip but vanished is a genuine correctness bug or +a partial-drop, not an expected feature gap. This is the intended behavior: the +multiset makes the distinction provable across the whole file rather than hiding +it behind whatever the first positional divergence happened to be. + ## Dependency decision Stdlib only: `xml.etree.ElementTree` + `collections.Counter` + `difflib`. Rationale: diff --git a/src/private/mxtest/api/CorpusRoundtripMain.cpp b/src/private/mxtest/api/CorpusRoundtripMain.cpp index 5ef42074..22b3c192 100644 --- a/src/private/mxtest/api/CorpusRoundtripMain.cpp +++ b/src/private/mxtest/api/CorpusRoundtripMain.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -266,6 +267,114 @@ std::string statusString(RoundtripResult::Status s) return "UNKNOWN"; } +// Flatten a corpus-relative path to a single filename component by replacing +// every directory separator with "__". `lysuite/Saltarello.xml` becomes +// `lysuite__Saltarello.xml`. The classifier (audit/classify.py) reverses this. +std::string toFlatName(const std::string &rel) +{ + std::string out; + out.reserve(rel.size() + 8); + for (const char c : rel) + { + if (c == '/' || c == '\\') + out += "__"; + else + out += c; + } + return out; +} + +// Write the fully-normalized expected (and, when available, actual) documents +// for one non-PASS file into `dumpDir` for offline diff analysis. This re-runs +// just enough of the pipeline to reproduce the documents `compareElements()` +// saw; `runRoundtrip()` is intentionally left untouched (it reports pass/fail +// and never needs to surface the documents). The duplicated pipeline work is +// acceptable: --dump is a developer tool, not a hot path. +// +// FAIL files yield both `.expected.xml` and `.actual.xml`. +// LOADFAIL/GETDATAFAIL/CREATEFAIL files have no actual document; they yield the +// expected file plus a `.status` sidecar recording the exact status code +// (the flat filename alone cannot distinguish the three pipeline-error kinds). +void dumpDocuments(const std::string &absolutePath, const std::string &relPath, const std::string &dumpDir, + RoundtripResult::Status status) +{ + namespace fs = std::filesystem; + const std::string flat = toFlatName(relPath); + const std::string expectedPath = (fs::path(dumpDir) / (flat + ".expected.xml")).string(); + const std::string actualPath = (fs::path(dumpDir) / (flat + ".actual.xml")).string(); + + // Expected side: load the original input, normalize, then apply fixups -- + // the same order runRoundtrip() uses for the expected document. + pugi::xml_document expectedDoc; + if (!expectedDoc.load_file(absolutePath.c_str(), pugi::parse_default | pugi::parse_doctype)) + { + std::cerr << "dump: cannot load expected for " << relPath << "\n"; + return; + } + mxtest::corert::normalizeForComparison(expectedDoc); + mxtest::corert::Fixer fixer(absolutePath); + fixer.applyToExpected(expectedDoc); + if (!expectedDoc.save_file(expectedPath.c_str())) + std::cerr << "dump: failed to write " << expectedPath << "\n"; + + // Pipeline errors produced no actual document. Record the status so the + // classifier can report the precise kind, then stop. + if (status == RoundtripResult::Status::loadFail || status == RoundtripResult::Status::getDataFail || + status == RoundtripResult::Status::createFail) + { + const std::string statusPath = (fs::path(dumpDir) / (flat + ".status")).string(); + std::ofstream statusFile(statusPath); + statusFile << statusString(status) << "\n"; + std::cerr << "dump: no actual for " << relPath << " (" << statusString(status) << ")\n"; + return; + } + + // Actual side: re-run the api pipeline (load -> getData -> createFromScore + // -> writeToStream), strip the provenance stamp, normalize. Mirrors + // runRoundtrip() lines for the actual document. + auto &mgr = mx::api::DocumentManager::getInstance(); + const auto idResult = mgr.createFromFile(absolutePath); + if (!idResult.ok()) + { + std::cerr << "dump: no actual for " << relPath << " (createFromFile failed)\n"; + return; + } + const int docId = idResult.value(); + const auto scoreResult = mgr.getData(docId); + mgr.destroyDocument(docId); + if (!scoreResult.ok()) + { + std::cerr << "dump: no actual for " << relPath << " (getData failed)\n"; + return; + } + const auto id2Result = mgr.createFromScore(scoreResult.value()); + if (!id2Result.ok()) + { + std::cerr << "dump: no actual for " << relPath << " (createFromScore failed)\n"; + return; + } + const int docId2 = id2Result.value(); + std::ostringstream ss; + const auto writeResult = mgr.writeToStream(docId2, ss); + mgr.destroyDocument(docId2); + if (!writeResult.ok()) + { + std::cerr << "dump: no actual for " << relPath << " (writeToStream failed)\n"; + return; + } + + pugi::xml_document actualDoc; + if (!actualDoc.load_string(ss.str().c_str(), pugi::parse_default | pugi::parse_doctype)) + { + std::cerr << "dump: no actual for " << relPath << " (output did not parse)\n"; + return; + } + stripMxAttribution(actualDoc.document_element()); + mxtest::corert::normalizeForComparison(actualDoc); + if (!actualDoc.save_file(actualPath.c_str())) + std::cerr << "dump: failed to write " << actualPath << "\n"; +} + } // namespace int main(int argc, char *argv[]) @@ -274,7 +383,7 @@ int main(int argc, char *argv[]) { std::cerr << "usage:\n" << " " << argv[0] << " regression \n" - << " " << argv[0] << " discovery \n"; + << " " << argv[0] << " discovery [--dump ]\n"; return 2; } @@ -318,14 +427,61 @@ int main(int argc, char *argv[]) } else if (mode == "discovery") { - const auto files = discoverFiles(dataRoot); + // discovery [--dump ] (flag may precede or follow the root) + std::string discDataRoot; + std::optional dumpDir; + for (int i = 2; i < argc; ++i) + { + const std::string a = argv[i]; + if (a == "--dump") + { + if (i + 1 >= argc) + { + std::cerr << "--dump requires a directory" << std::endl; + return 2; + } + dumpDir = argv[++i]; + } + else if (discDataRoot.empty()) + { + discDataRoot = a; + } + else + { + std::cerr << "unexpected argument: " << a << std::endl; + return 2; + } + } + if (discDataRoot.empty()) + { + std::cerr << "discovery mode requires " << std::endl; + return 2; + } + if (dumpDir) + { + std::error_code ec; + std::filesystem::create_directories(*dumpDir, ec); + if (ec) + { + std::cerr << "cannot create dump dir: " << *dumpDir << ": " << ec.message() << std::endl; + return 2; + } + } + + const auto files = discoverFiles(discDataRoot); int pass = 0, fail = 0, skip = 0, loadFail = 0, getDataFail = 0, createFail = 0; for (const auto &absPath : files) { namespace fs = std::filesystem; - const std::string rel = fs::relative(fs::path(absPath), fs::path(dataRoot)).generic_string(); + const std::string rel = fs::relative(fs::path(absPath), fs::path(discDataRoot)).generic_string(); const auto r = runRoundtrip(absPath); std::cout << statusString(r.status) << "\t" << rel << "\t" << r.detail << "\n"; + + // Dump normalized expected/actual for every non-PASS, non-SKIP file. + if (dumpDir && r.status != RoundtripResult::Status::pass && r.status != RoundtripResult::Status::skip) + { + dumpDocuments(absPath, rel, *dumpDir, r.status); + } switch (r.status) { case RoundtripResult::Status::pass: From 2f3675adc41a689f491a0d560b10712e31a9c826 Mon Sep 17 00:00:00 2001 From: Matthew James Briggs Date: Sat, 20 Jun 2026 10:31:06 +0000 Subject: [PATCH 4/5] docs: add explain-api-roundtrip skill (#211) Drives the failure classifier (dump -> classify) and turns build/api/classified.json into a prioritized, plain-language explanation of what is wrong with the mx::api round-trip and what to fix next, grouped by failure mode (crash, supported-element drop, reorder, by-design drop, audit blind spot). --- .claude/skills/explain-api-roundtrip/SKILL.md | 167 ++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 .claude/skills/explain-api-roundtrip/SKILL.md diff --git a/.claude/skills/explain-api-roundtrip/SKILL.md b/.claude/skills/explain-api-roundtrip/SKILL.md new file mode 100644 index 00000000..ec97f7e2 --- /dev/null +++ b/.claude/skills/explain-api-roundtrip/SKILL.md @@ -0,0 +1,167 @@ +--- +name: explain-api-roundtrip +description: > + Use this skill to explain, in plain language, what is wrong with the `mx::api` round-trip and what + it needs next. It drives the failure classifier (dump -> classify) over the corpus, then reads + build/api/classified.json and turns it into a prioritized, human-readable explanation grouped by + failure mode (hard crash, dropped supported elements, reorder, by-design drop, audit blind spot). + Invoke for requests like "what's broken about mx::api", "explain the round-trip failures", + "what does the api need next", or "triage the api round-trip". +argument-hint: "" +disable-model-invocation: false +user-invocable: true +--- +# Explain the `mx::api` round-trip + +`mx::api` is a deliberate subset of MusicXML, so some round-trip data loss is by design. This skill +separates the by-design losses from the real defects and produces a plain-English answer to two +questions: what is broken, and what should we fix next. + +It is the read-out layer on top of the failure classifier (issue #211): the classifier produces a +machine-readable `build/api/classified.json`; this skill interprets it for a human. + +## How it works + +The pipeline has two steps, kept separate on purpose (see `audit/README.md`): + +1. `make dump-api-roundtrip` — runs the api pipeline over every corpus file and writes the + normalized expected/actual XML pairs to `build/api/roundtrip-dump/`. Slow: it builds the C++ + harness and runs ~800 files. Re-run only when api/impl code changed. +2. `make classify-api-roundtrip` — pure Python, fast. Diffs each pair as an element multiset + (`Counter(expected) - Counter(actual)`), cross-references `data/api.features.xml`, and writes + `build/api/classified.json` plus a stdout summary. + +The categories in the JSON (`primary_category`): + +| id | meaning | +|----|---------| +| B | drop-only: every missing element is `support="none"` — a by-design subset drop | +| C | reorder-only: same elements, different order | +| D | enum bug: a value maps to a known-missing enum member | +| E | missing attribute: a `partial` feature dropped one attribute | +| F | pipeline error: LOADFAIL/GETDATAFAIL/CREATEFAIL — no output produced (a crash) | +| unknown | a FAIL that matched none of the above — usually a `support="full"` element that was dropped (a real bug) or an element not tracked in `api.features.xml` | + +## Procedure + +### Step 1 — produce the data + +If `build/api/roundtrip-dump/` is empty or stale (api/impl changed since it was written), run: + +``` +make dump-api-roundtrip +``` + +Then always run: + +``` +make classify-api-roundtrip +``` + +Read the stdout summary it prints — that is the top-level shape (counts per category + the worklist +of features blocking the most files). + +### Step 2 — mine `build/api/classified.json` + +Run these read-only analyses (they join the classifier output against the support index). Adjust the +path if `--out` was overridden. + +Top dropped elements, with their audited support level (the key signal — `support="full"` drops are +bugs, `support="none"` drops are by design): + +``` +python3 - <<'PY' +import json, re +from collections import Counter +d = json.load(open("build/api/classified.json")) +support = {m.group(1): m.group(2) for m in + re.finditer(r'name="([^"]+)" support="([a-z]+)"', open("data/api.features.xml").read())} +miss = Counter() +for r in d["files"]: + for tag in r["missing_element_counts"]: + miss[tag] += 1 # files affected +for tag, n in miss.most_common(25): + print(f"{n:>4} files {tag:<18} support={support.get(tag, 'NOT-IN-INDEX')}") +PY +``` + +Pipeline-error (crash) cluster — group by file/feature to find the common root: + +``` +python3 - <<'PY' +import json +d = json.load(open("build/api/classified.json")) +for r in d["files"]: + if r["primary_category"] == "F": + print(r["pipeline_error_kind"], r["file"]) +PY +``` + +Reorder cluster — where in the tree the order diverges: + +``` +python3 - <<'PY' +import json +from collections import Counter +d = json.load(open("build/api/classified.json")) +paths = Counter(r["first_divergence_path"] for r in d["files"] if r["primary_category"] == "C") +for path, n in paths.most_common(10): + print(f"{n:>4} {path}") +PY +``` + +What is driving the `unknown` bucket (the warnings on stderr from Step 1 also list this; this is the +programmatic view): + +``` +python3 - <<'PY' +import json, re +from collections import Counter +d = json.load(open("build/api/classified.json")) +support = {m.group(1): m.group(2) for m in + re.finditer(r'name="([^"]+)" support="([a-z]+)"', open("data/api.features.xml").read())} +full_drop, untracked = Counter(), Counter() +for r in d["files"]: + if r["primary_category"] != "unknown": + continue + for tag in r["missing_elements"]: + s = support.get(tag) + if s in ("full", "partial"): + full_drop[tag] += 1 # claimed supported but dropped -> bug + elif s is None: + untracked[tag] += 1 # not in api.features.xml -> audit gap +print("supported-but-dropped:", full_drop.most_common(10)) +print("untracked:", untracked.most_common(10)) +PY +``` + +To drill into one file, look at its record (`missing_elements`, `mismatch_type`, +`first_divergence_path`) and diff the pair directly: +`diff build/api/roundtrip-dump/.expected.xml build/api/roundtrip-dump/.actual.xml` +where `` is the corpus path with `/` replaced by `__`. + +### Step 3 — write the explanation + +Synthesize the findings into plain language grouped by failure mode, ordered by severity. Use this +structure (fill the numbers and element names from Step 2; do not invent them): + +1. Frame it: `mx::api` is a subset, so some loss is by design — separate that from the real defects. +2. Hard crashes (category F). Highest severity: no output at all. Name the cluster (the crash + analysis usually points at one feature). These are bugs. +3. Dropped supported elements (the `support="full"`/`partial` rows from the top-dropped table and the + `supported-but-dropped` view). Either an impl round-trip bug or `api.features.xml` overstates + support — say which needs checking, per element. +4. Reorder (category C). Lower severity: content intact, order wrong. Name the divergence path. +5. By-design drops (category B): mention briefly — these are expected subset behavior, not bugs. +6. Audit blind spots: the `untracked` view — elements dropped but not in `api.features.xml`, so they + can't be categorized. Recommend running `api-feature-audit` to close the gap. + +Then give a prioritized "what it needs" list. Be honest about caveats: the comparison is strict +full-DOM, and if the pinned baseline (`roundtrip-baseline.txt`) is ungrown, almost the whole corpus +shows as failing — these are the raw landscape, not a regression. + +## Hand-off + +- To fix a dropped/under-supported element or a crash: use the `add-mx-api-feature` skill. +- To correct or extend support levels in `data/api.features.xml`: use the `api-feature-audit` skill. +- The findings belong under the tracking issue #208; file specifics with the `open-mx-issue` skill. From db56b6a9cc8d802c7a70ebe56933552e8773ddd6 Mon Sep 17 00:00:00 2001 From: Matthew James Briggs Date: Sat, 20 Jun 2026 17:11:03 +0200 Subject: [PATCH 5/5] Update .claude/skills/explain-api-roundtrip/SKILL.md --- .claude/skills/explain-api-roundtrip/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/skills/explain-api-roundtrip/SKILL.md b/.claude/skills/explain-api-roundtrip/SKILL.md index ec97f7e2..c8caf15d 100644 --- a/.claude/skills/explain-api-roundtrip/SKILL.md +++ b/.claude/skills/explain-api-roundtrip/SKILL.md @@ -160,7 +160,7 @@ Then give a prioritized "what it needs" list. Be honest about caveats: the compa full-DOM, and if the pinned baseline (`roundtrip-baseline.txt`) is ungrown, almost the whole corpus shows as failing — these are the raw landscape, not a regression. -## Hand-off +## Hand-off Fixes (if requested) - To fix a dropped/under-supported element or a crash: use the `add-mx-api-feature` skill. - To correct or extend support levels in `data/api.features.xml`: use the `api-feature-audit` skill.