From 137408d994595e1d6bbe1997c1bcca400d35dd41 Mon Sep 17 00:00:00 2001 From: Peter Jacobson Date: Fri, 12 Jun 2026 09:00:21 +1000 Subject: [PATCH] Parser review: strict VERT metadata summary, surfaced partial-load warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adversarial pass over io/readers (truncation, corruption, partial files), pinned by the new tests/test_parser_adversarial.py corpus tests: - Createc VERT metadata fast path used the lenient (and deprecated) np.fromstring, which silently stops at the first unparsable token: a corrupt or mid-row-truncated table summarised as healthy (full row count, no warnings) while the full parse raised — so browse and the metadata cache showed a spectrum the viewer could not load. The summary now validates rows exactly as strictly as the full parse (consistent column count, every token a float), matching the Nanonis reader's existing strict summary. The old test that pinned the leniency is updated to pin the symmetric contract instead. - scan.warnings was consumed nowhere: the SXM/SM4 readers degrade gracefully on partial files (a scan still being written loads its complete planes only) and record exactly why — and the explanation was then dropped, leaving the user with silently missing channels. ViewerScanData now carries scan_warnings and the viewer shows them in the status bar at load. - Nanonis spec: a valid single-column file failed the column-count check (loadtxt collapses one column to 1-D and it was reshaped to one wide row); the column-header length now disambiguates. Verified sound, now pinned by corpus tests across every real fixture: metadata <-> full-parse agreement for VERT / Nanonis spec / Createc DAT; Createc DAT truncation diagnostics ("corrupt or truncated", byte counts, format token) and dimension guards; Nanonis spec strict two-path failures; SXM/SM4 partial-file tolerance returning only complete, fully-finite planes with explicit warnings; VERT row-boundary truncation warning (partial sweeps); 64 KiB chunk-boundary DATA-marker handling in the streaming header reader. Disproved during review: the VERT Vpoint time-trace threshold units (Vpoint.V is in mV, consistent with the data column). Co-Authored-By: Claude Fable 5 --- probeflow/gui/dialogs/image_viewer.py | 12 ++ probeflow/gui/viewer/scan_load.py | 6 + probeflow/io/readers/createc_vert.py | 36 ++-- probeflow/io/readers/nanonis_spec.py | 6 +- tests/test_parser_adversarial.py | 277 ++++++++++++++++++++++++++ tests/test_spec_io.py | 15 +- 6 files changed, 333 insertions(+), 19 deletions(-) create mode 100644 tests/test_parser_adversarial.py diff --git a/probeflow/gui/dialogs/image_viewer.py b/probeflow/gui/dialogs/image_viewer.py index 3615eb9..7704d14 100644 --- a/probeflow/gui/dialogs/image_viewer.py +++ b/probeflow/gui/dialogs/image_viewer.py @@ -305,6 +305,18 @@ def _load_current_source(self, entry: SxmFile, reset_zoom: bool = True): self._scan_plane_units = data.plane_units self._source_processing_history = data.processing_history self._rebuild_processing_history() + # Readers degrade gracefully on partial/odd files (e.g. a scan still + # being written yields only its complete planes) and record why on + # the Scan; without this the user just sees missing channels with no + # explanation (2026-06-12 parser review). + if data.scan_warnings and hasattr(self, "_status_lbl"): + extra = ( + f" (+{len(data.scan_warnings) - 1} more)" + if len(data.scan_warnings) > 1 else "" + ) + self._status_lbl.setText( + f"Loaded with warnings: {data.scan_warnings[0]}{extra}" + ) def _rebuild_processing_history(self) -> None: if self._source_processing_history is None: diff --git a/probeflow/gui/viewer/scan_load.py b/probeflow/gui/viewer/scan_load.py index 55dd060..24632fd 100644 --- a/probeflow/gui/viewer/scan_load.py +++ b/probeflow/gui/viewer/scan_load.py @@ -31,6 +31,11 @@ class ViewerScanData: source_format: str n_planes: int processing_history: Optional["ProcessingHistory"] = field(default=None) + # Reader warnings (e.g. "payload contains 1 complete plane(s) but the + # header indicates 4 — file may have been incompletely written"). The + # readers degrade gracefully on partial files and record why; the viewer + # must surface this or the user just sees missing channels. + scan_warnings: list[str] = field(default_factory=list) def load_scan_for_viewer(path: Path, channel_idx: int) -> ViewerScanData: @@ -67,6 +72,7 @@ def load_scan_for_viewer(path: Path, channel_idx: int) -> ViewerScanData: source_format=scan.source_format, n_planes=scan.n_planes, processing_history=history, + scan_warnings=[str(w) for w in (getattr(scan, "warnings", ()) or ())], ) except Exception: return ViewerScanData( diff --git a/probeflow/io/readers/createc_vert.py b/probeflow/io/readers/createc_vert.py index 01604c3..7a5249f 100644 --- a/probeflow/io/readers/createc_vert.py +++ b/probeflow/io/readers/createc_vert.py @@ -352,6 +352,16 @@ def _summarise_numeric_table( path: Path, data_text: str, ) -> tuple[int, int, float | None, float | None]: + """Stream row/column counts and the bias range without keeping arrays. + + Validates every row as strictly as the full :func:`_parse_numeric_table` + (token count consistent with the first row, every token a float). The + previous implementation used the lenient ``np.fromstring(sep=...)``, + which silently stops at the first unparsable token — a corrupt or + truncated table summarised as healthy here while the full parse raised, + so browse metadata (and the metadata cache) disagreed with what the + viewer could actually load (2026-06-12 parser review). + """ n_rows = 0 col_count: int | None = None bias_min: float | None = None @@ -361,20 +371,22 @@ def _summarise_numeric_table( stripped = line.strip() if not stripped: continue - try: - values = np.fromstring( - stripped.replace(",", ".").replace("\t", " "), - sep=" ", - dtype=np.float64, + parts = stripped.replace(",", ".").split() + if col_count is None: + col_count = len(parts) + elif len(parts) != col_count: + raise ValueError( + f"{path.name}: data row {n_rows} has {len(parts)} column(s), " + f"expected {col_count}" ) + try: + values = [float(part) for part in parts] except ValueError as exc: - raise ValueError(f"{path.name}: failed to parse data row - {exc}") from exc - if values.size == 0: - continue - if col_count is None: - col_count = int(values.size) - if values.size > 1: - bias = float(values[1]) + raise ValueError( + f"{path.name}: failed to parse data row {n_rows} - {exc}" + ) from exc + if len(values) > 1: + bias = values[1] bias_min = bias if bias_min is None else min(bias_min, bias) bias_max = bias if bias_max is None else max(bias_max, bias) n_rows += 1 diff --git a/probeflow/io/readers/nanonis_spec.py b/probeflow/io/readers/nanonis_spec.py index 6935d33..e360c61 100644 --- a/probeflow/io/readers/nanonis_spec.py +++ b/probeflow/io/readers/nanonis_spec.py @@ -113,8 +113,10 @@ def read_nanonis_spec(path: Union[str, Path]) -> SpecData: try: arr = np.loadtxt(StringIO(clean), dtype=np.float64, delimiter="\t") if arr.ndim == 1: - # Either a single row or a single column — reshape as one row. - arr = arr.reshape(1, -1) + # loadtxt collapses both a single row and a single column to 1-D; + # the column-header length disambiguates (a one-column file with + # N rows must become (N, 1), not one N-wide row). + arr = arr.reshape(-1, 1) if len(columns) == 1 else arr.reshape(1, -1) except ValueError as exc: raise ValueError( f"{path.name}: failed to parse data block — {exc}" diff --git a/tests/test_parser_adversarial.py b/tests/test_parser_adversarial.py new file mode 100644 index 0000000..949cd5e --- /dev/null +++ b/tests/test_parser_adversarial.py @@ -0,0 +1,277 @@ +"""Adversarial parser tests: truncated, corrupted, and partial files. + +2026-06-12 parser review. Two contracts: + +* **Metadata ↔ full-parse agreement** — for any file, the metadata fast path + and the full reader must agree: both succeed with consistent counts, or + both fail. The review found the Createc VERT metadata path summarising a + corrupt table as healthy (lenient ``np.fromstring``) while the full parse + raised — so browse (and the metadata cache) showed a spectrum the viewer + could not load. +* **Failures are loud and informative; partial files are explicit** — + decoders either raise a ValueError naming the file and the problem, or + degrade to the complete subset of the data with warnings recorded (the + scan-still-being-written case), never silent garbage. +""" + +from __future__ import annotations + +import shutil +from pathlib import Path + +import numpy as np +import pytest + +TESTDATA = Path(__file__).resolve().parent.parent / "test_data" + +VERT_FIXTURES = sorted(TESTDATA.glob("*.VERT")) +DAT_SCAN_FIXTURES = sorted(TESTDATA.glob("createc_scan_*.dat")) +NANONIS_SPEC_FIXTURES = sorted( + p for p in TESTDATA.glob("nanonis_*.dat") if p.is_file()) +SXM_FIXTURE = TESTDATA / "sxm_moire_10nm.sxm" +SM4_FIXTURE = TESTDATA / "VT260430_0004.sm4" + + +# ── Metadata ↔ full-parse agreement across the real corpus ──────────────────── + +class TestMetadataFullAgreement: + @pytest.mark.parametrize( + "path", VERT_FIXTURES, ids=lambda p: p.name) + def test_vert_corpus(self, path): + from probeflow.io.readers.createc_vert import read_createc_vert_report + + full = read_createc_vert_report(path, include_arrays=True) + meta = read_createc_vert_report(path, include_arrays=False) + assert meta.raw_table_shape == full.raw_table_shape + assert meta.column_names == full.column_names + assert meta.bias_min_mv == pytest.approx(full.bias_min_mv) + assert meta.bias_max_mv == pytest.approx(full.bias_max_mv) + + @pytest.mark.parametrize( + "path", NANONIS_SPEC_FIXTURES, ids=lambda p: p.name) + def test_nanonis_spec_corpus(self, path): + from probeflow.io.spectroscopy import read_spec_file, read_spec_metadata + + full = read_spec_file(path) + meta = read_spec_metadata(path) + assert meta.metadata["n_points"] == full.metadata["n_points"] + assert tuple(meta.channels) == tuple(full.channel_order) + + @pytest.mark.parametrize( + "path", DAT_SCAN_FIXTURES, ids=lambda p: p.name) + def test_createc_scan_corpus(self, path): + from probeflow.core.metadata import read_scan_metadata + from probeflow.core.scan_loader import load_scan + + full = load_scan(path) + meta = read_scan_metadata(path) + assert meta.shape == full.planes[0].shape + assert tuple(meta.plane_names) == tuple(full.plane_names) + + +# ── Createc VERT: corruption and truncation ─────────────────────────────────── + +@pytest.mark.skipif(not VERT_FIXTURES, reason="no VERT fixtures") +class TestVertCorruption: + @pytest.fixture + def vert_bytes(self): + return VERT_FIXTURES[0].read_bytes() + + def _write(self, tmp_path, data: bytes) -> Path: + p = tmp_path / "mutated.VERT" + p.write_bytes(data) + return p + + def test_corrupt_row_fails_in_both_paths(self, tmp_path, vert_bytes): + """The bug: a garbage token mid-table raised in the full parse but + summarised as healthy in the metadata path.""" + import re + + from probeflow.io.readers.createc_vert import read_createc_vert_report + + text = vert_bytes.decode("latin-1") + lines = text.splitlines(keepends=True) + for i in range(len(lines) // 2, len(lines)): + if re.match(r"\s*-?\d", lines[i]): + lines[i] = lines[i][:10] + "GARBAGE_##\n" + break + p = self._write(tmp_path, "".join(lines).encode("latin-1")) + + with pytest.raises(ValueError, match="mutated.VERT"): + read_createc_vert_report(p, include_arrays=True) + with pytest.raises(ValueError, match="mutated.VERT"): + read_createc_vert_report(p, include_arrays=False) + + def test_mid_row_truncation_fails_in_both_paths(self, tmp_path, vert_bytes): + from probeflow.io.readers.createc_vert import read_createc_vert_report + + p = self._write(tmp_path, vert_bytes[: int(len(vert_bytes) * 0.5) + 7]) + results = [] + for include in (True, False): + try: + read_createc_vert_report(p, include_arrays=include) + results.append("ok") + except ValueError: + results.append("raise") + assert results[0] == results[1], ( + f"metadata/full divergence on truncated file: {results}" + ) + + def test_row_boundary_truncation_warns_about_row_count( + self, tmp_path, vert_bytes): + """Cutting whole trailing rows is the partial-sweep case: both paths + load what is there and warn that the params line promised more.""" + from probeflow.io.readers.createc_vert import read_createc_vert_report + + text = vert_bytes.decode("latin-1") + head, _, _tail = text.rpartition("\n") + # Drop the last 100 lines cleanly. + kept = "\n".join(head.splitlines()[:-100]) + "\n" + p = self._write(tmp_path, kept.encode("latin-1")) + + for include in (True, False): + rep = read_createc_vert_report(p, include_arrays=include) + assert any("point(s), parsed" in w for w in rep.warnings), ( + "partial table not flagged" + ) + + def test_missing_data_marker(self, tmp_path, vert_bytes): + from probeflow.io.readers.createc_vert import read_createc_vert_report + + pos = vert_bytes.find(b"DATA") + p = self._write(tmp_path, vert_bytes[:pos]) + with pytest.raises(ValueError, match="missing DATA marker"): + read_createc_vert_report(p) + + def test_header_only_chunked_reader_handles_boundary(self, tmp_path): + """parse_createc_vert_header streams in 64 KiB chunks; a DATA marker + straddling the chunk boundary must still be found.""" + from probeflow.io.readers.createc_vert import parse_createc_vert_header + + filler = b"Comment=" + b"x" * (65536 - 10) + b"\r\n" + pad = b"A=1\r\n" * 3 + blob = filler + pad + # Position DATA so it straddles the 64 KiB boundary. + blob = blob[:65534] + b"\r\nDATA\r\nparams\r\n0 0 0 0\r\n" + p = tmp_path / "straddle.VERT" + p.write_bytes(b"[ParVERT30]\r\nKey=1\r\n" + blob) + hdr = parse_createc_vert_header(p) + assert hdr.get("Key") == "1" + + +# ── Createc DAT scans: corruption and truncation ────────────────────────────── + +@pytest.mark.skipif(not DAT_SCAN_FIXTURES, reason="no DAT fixtures") +class TestCreatecDatCorruption: + @pytest.fixture + def dat_bytes(self): + return DAT_SCAN_FIXTURES[0].read_bytes() + + def test_truncated_payload_names_the_problem(self, tmp_path, dat_bytes): + from probeflow.io.readers.createc_dat import read_createc_dat_report + + p = tmp_path / "trunc.dat" + p.write_bytes(dat_bytes[: len(dat_bytes) // 2]) + with pytest.raises(ValueError, match="corrupt or truncated"): + read_createc_dat_report(p) + + def test_missing_marker(self, tmp_path, dat_bytes): + from probeflow.io.readers.createc_dat import read_createc_dat_report + + pos = dat_bytes.find(b"DATA") + p = tmp_path / "nomarker.dat" + p.write_bytes(dat_bytes[:pos]) + with pytest.raises(ValueError, match="missing DATA marker"): + read_createc_dat_report(p) + + def test_zero_dimensions_rejected(self, tmp_path, dat_bytes): + from probeflow.io.readers.createc_dat import read_createc_dat_report + + mutated = dat_bytes.replace(b"Num.X", b"Nim.X") # hide every spelling + p = tmp_path / "nodims.dat" + p.write_bytes(mutated) + with pytest.raises(ValueError, match="invalid dimensions"): + read_createc_dat_report(p) + + +# ── Partial-file tolerance (scan still being written) ───────────────────────── + +class TestPartialFileTolerance: + @pytest.mark.skipif(not SXM_FIXTURE.exists(), reason="no SXM fixture") + def test_truncated_sxm_loads_complete_planes_with_warning(self, tmp_path): + from probeflow.core.scan_loader import load_scan + + raw = SXM_FIXTURE.read_bytes() + p = tmp_path / "half.sxm" + p.write_bytes(raw[: len(raw) // 2]) + full = load_scan(SXM_FIXTURE) + part = load_scan(p) + + assert 0 < part.n_planes < full.n_planes + for plane in part.planes: + assert np.isfinite(plane).all(), "partial plane contains garbage" + assert any("incompletely written" in w for w in part.warnings), ( + "partial load not explained in scan.warnings" + ) + + @pytest.mark.skipif(not SM4_FIXTURE.exists(), reason="no SM4 fixture") + def test_truncated_sm4_loads_complete_pages_with_warnings(self, tmp_path): + from probeflow.core.scan_loader import load_scan + + raw = SM4_FIXTURE.read_bytes() + p = tmp_path / "half.sm4" + p.write_bytes(raw[: len(raw) // 2]) + part = load_scan(p) + + assert part.n_planes >= 1 + for plane in part.planes: + assert np.isfinite(plane).all() + assert part.warnings, "skipped pages/objects not recorded" + + @pytest.mark.skipif(not SXM_FIXTURE.exists(), reason="no SXM fixture") + def test_viewer_scan_data_carries_the_warnings(self, tmp_path): + """The reader explains the partial load; the viewer load path must + carry that explanation instead of dropping it (scan.warnings was + consumed nowhere before this review).""" + from probeflow.gui.viewer.scan_load import load_scan_for_viewer + + raw = SXM_FIXTURE.read_bytes() + p = tmp_path / "half.sxm" + p.write_bytes(raw[: len(raw) // 2]) + data = load_scan_for_viewer(p, 0) + assert data.raw_arr is not None + assert data.scan_warnings + assert "incompletely written" in data.scan_warnings[0] + + +# ── Nanonis spec edge cases ─────────────────────────────────────────────────── + +class TestNanonisSpecEdges: + def test_single_column_file_parses_as_column(self, tmp_path): + """loadtxt collapses one-column data to 1-D; it must come back as + (N, 1), not one N-wide row that fails the column-count check.""" + from probeflow.io.readers.nanonis_spec import read_nanonis_spec + + p = tmp_path / "single.dat" + p.write_text( + "Experiment\tHistory Data\r\n" + "[DATA]\r\n" + "Current (A)\r\n" + + "".join(f"{v:.3e}\r\n" for v in np.linspace(1e-12, 5e-12, 40)), + encoding="latin-1", + ) + spec = read_nanonis_spec(p) + assert spec.metadata["n_points"] == 40 + assert spec.channels["Current"].shape == (40,) + + @pytest.mark.skipif(not NANONIS_SPEC_FIXTURES, reason="no fixtures") + def test_truncation_fails_in_both_paths(self, tmp_path): + from probeflow.io.spectroscopy import read_spec_file, read_spec_metadata + + raw = NANONIS_SPEC_FIXTURES[0].read_bytes() + p = tmp_path / "trunc.dat" + p.write_bytes(raw[: int(len(raw) * 0.95)]) + with pytest.raises(ValueError): + read_spec_file(p) + with pytest.raises(ValueError): + read_spec_metadata(p) diff --git a/tests/test_spec_io.py b/tests/test_spec_io.py index 4a52d04..6382286 100644 --- a/tests/test_spec_io.py +++ b/tests/test_spec_io.py @@ -131,7 +131,12 @@ def test_mixed_eol_after_data_marker_is_accepted(self, tmp_path): assert report.raw_table_shape == (2, 4) - def test_metadata_fast_path_tolerates_ragged_rows(self, tmp_path): + def test_metadata_fast_path_rejects_ragged_rows_like_the_full_parse( + self, tmp_path): + """A ragged table fails the full parse (np.loadtxt), so the metadata + fast path must reject it too — the old lenient summary reported such + files as healthy in browse (and the metadata cache) while the viewer + could not load them (2026-06-12 parser review).""" text = _createc_vert_text( n_rows=2, channel_code=0b1, @@ -139,10 +144,10 @@ def test_metadata_fast_path_tolerates_ragged_rows(self, tmp_path): ) f = _write_createc_vert(tmp_path, "ragged_summary.VERT", text) - report = read_createc_vert_report(f, include_arrays=False) - - assert report.raw_table_shape == (2, 4) - assert report.bias_min_mv == pytest.approx(-50.1) + with pytest.raises(ValueError, match="column"): + read_createc_vert_report(f, include_arrays=False) + with pytest.raises(ValueError): + read_createc_vert_report(f, include_arrays=True) def test_comma_decimal_data_rows_are_accepted(self, tmp_path): text = _createc_vert_text(