From 3a3906a008e0cee54f1faa6c1337d50755d874a2 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 4 Jun 2026 09:52:28 -0600 Subject: [PATCH 1/9] perofrmance improvements through dataset module --- README.md | 44 ++ benchmarks/benchmark_lazy_tensor.py | 25 +- benchmarks/benchmark_ome_iris.py | 507 +++++++++++++++++++ src/ome_arrow/__init__.py | 6 + src/ome_arrow/dataset.py | 728 ++++++++++++++++++++++++++++ tests/test_dataset.py | 119 +++++ 6 files changed, 1426 insertions(+), 3 deletions(-) create mode 100644 benchmarks/benchmark_ome_iris.py create mode 100644 src/ome_arrow/dataset.py create mode 100644 tests/test_dataset.py diff --git a/README.md b/README.md index e429b07..8121224 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,39 @@ Advanced options: See full docs: [`docs/src/dlpack.md`](docs/src/dlpack.md) +## Typed chunk datasets + +For selective pixel reads, use the typed byte-buffer dataset writer. This stores +image metadata separately from pixel chunks and writes one chunk per Parquet row +group so `read_plane()` and `read_region()` can jump through a physical index +instead of materializing the older nested struct payload. + +```python +import numpy as np +from ome_arrow import OMEArrowDataset, write_ome_arrow_dataset + +arr = np.zeros((1, 1, 1, 1024, 1024), dtype=np.uint16) # TCZYX + +choice = write_ome_arrow_dataset( + [arr], + "image.ome-arrow", + layout="tile", + chunk_shape=(1, 1, 1, 512, 512), + compression="zstd", + chunk_rows_per_row_group=1, +) +print(choice.rationale) + +dataset = OMEArrowDataset("image.ome-arrow") +image_id = dataset.images["image_id"].to_pylist()[0] +plane = dataset.pixels.read_plane(image_id, t=0, c=0, z=0) +crop = dataset.pixels.read_region(image_id, y=slice(128, 384), x=slice(128, 384)) +``` + +Use `chunk_rows_per_row_group=1` for the fastest direct chunk reads. Use a +larger value, such as `8`, to reduce row-group overhead for small chunks when +storage size matters. + ## Tensor ingest (PyTorch/JAX) You can ingest torch or JAX arrays directly with `OMEArrow(...)`. @@ -172,6 +205,17 @@ read paths (TIFF source-backed, Parquet planes, Parquet chunks): uv run python benchmarks/benchmark_lazy_tensor.py --repeats 5 --warmup 1 ``` +For OME-IRIS-style 2D/3D/4D/5D access patterns, run: + +```bash +uv run python benchmarks/benchmark_ome_iris.py --repeats 3 --warmup 1 +``` + +You can pass local real-data fixtures with `--fixture name=/path/to/image.tif`. +The benchmark writes matched temporary OME-Zarr and typed OME-Arrow artifacts, +then compares full-image, plane, crop, subvolume, timepoint, and channel reads +where those axes are present. + Notes: - This benchmark is for local iteration and relative comparisons. diff --git a/benchmarks/benchmark_lazy_tensor.py b/benchmarks/benchmark_lazy_tensor.py index 7abcc72..9581a74 100644 --- a/benchmarks/benchmark_lazy_tensor.py +++ b/benchmarks/benchmark_lazy_tensor.py @@ -23,6 +23,7 @@ import numpy as np from ome_arrow import OMEArrow +from ome_arrow.dataset import OMEArrowDataset, write_ome_arrow_dataset from ome_arrow.export import to_ome_parquet from ome_arrow.ingest import from_numpy @@ -78,7 +79,7 @@ def _time_case( ) -def _build_parquet_fixtures(workdir: Path) -> tuple[Path, Path]: +def _build_parquet_fixtures(workdir: Path) -> tuple[Path, Path, Path]: """Create small planes/chunks parquet fixtures for local benchmarking.""" arr = np.arange(1 * 2 * 3 * 256 * 256, dtype=np.uint16).reshape(1, 2, 3, 256, 256) @@ -92,9 +93,17 @@ def _build_parquet_fixtures(workdir: Path) -> tuple[Path, Path]: planes_path = workdir / "bench_planes.ome.parquet" chunks_path = workdir / "bench_chunks.ome.parquet" + typed_dataset_path = workdir / "bench_typed_dataset" to_ome_parquet(planes_scalar, out_path=str(planes_path), column_name="ome_arrow") to_ome_parquet(chunks_scalar, out_path=str(chunks_path), column_name="ome_arrow") - return planes_path, chunks_path + write_ome_arrow_dataset( + [arr], + typed_dataset_path, + layout="tile", + chunk_shape=(1, 1, 1, 64, 64), + compression="zstd", + ) + return planes_path, chunks_path, typed_dataset_path def _print_results(results: list[BenchmarkResult]) -> None: @@ -262,7 +271,9 @@ def main() -> None: with tempfile.TemporaryDirectory(prefix="ome_arrow_lazy_bench_") as tmp: tmpdir = Path(tmp) - planes_path, chunks_path = _build_parquet_fixtures(tmpdir) + planes_path, chunks_path, typed_dataset_path = _build_parquet_fixtures(tmpdir) + typed_dataset = OMEArrowDataset(typed_dataset_path) + typed_image_id = typed_dataset.images["image_id"].to_pylist()[0] cases: list[tuple[str, Callable[[], np.ndarray]]] = [] if args.tiff_path.exists(): @@ -297,6 +308,14 @@ def main() -> None: .to_numpy(contiguous=True) ), ), + ( + "typed-dataset(tile) -> read_plane", + lambda: typed_dataset.pixels.read_plane( + typed_image_id, + z=1, + c=1, + ), + ), ] ) diff --git a/benchmarks/benchmark_ome_iris.py b/benchmarks/benchmark_ome_iris.py new file mode 100644 index 0000000..c81f0e2 --- /dev/null +++ b/benchmarks/benchmark_ome_iris.py @@ -0,0 +1,507 @@ +"""Lightweight OME-IRIS-aligned benchmark for selective pixel reads. + +This benchmark intentionally follows the simple style of +``benchmarks/benchmark_lazy_tensor.py``. It accepts local image paths, builds +temporary OME-Zarr and typed OME-Arrow dataset artifacts with matched chunking, +and times public read APIs for 2D, 3D, 4D, and 5D-style access patterns. +""" + +from __future__ import annotations + +import argparse +import json +import statistics +import tempfile +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Callable + +import numpy as np + +from ome_arrow import OMEArrow, OMEArrowDataset, write_ome_arrow_dataset + + +@dataclass(frozen=True) +class BenchmarkResult: + """Summary stats for one benchmark case.""" + + dataset: str + case: str + format: str + median_ms: float + min_ms: float + max_ms: float + shape: tuple[int, ...] + dtype: str + bytes_on_disk: int + + +@dataclass(frozen=True) +class Fixture: + """One source image fixture to benchmark.""" + + name: str + path: Path + preferred_chunk_shape: tuple[int, int, int, int, int] + chunk_rows_per_row_group: int + + +def _default_fixtures() -> list[Fixture]: + """Return local fixtures that cover 2D, 3D, 4D, and 5D patterns.""" + return [ + Fixture( + name="2d-human", + path=Path("tests/data/examplehuman/AS_09125_050116030001_D03f00d0.tif"), + preferred_chunk_shape=(1, 1, 1, 256, 256), + chunk_rows_per_row_group=1, + ), + Fixture( + name="3d-nuclei", + path=Path("tests/data/cp-3d-nuclei/nuclei1_out_c00_dr90_image.tif"), + preferred_chunk_shape=(1, 1, 16, 128, 128), + chunk_rows_per_row_group=1, + ), + Fixture( + name="4d-time-volume", + path=Path("tests/data/ome-artificial-5d-datasets/4D-series.ome.tiff"), + preferred_chunk_shape=(1, 1, 2, 128, 128), + chunk_rows_per_row_group=8, + ), + Fixture( + name="5d-multichannel-time-volume", + path=Path( + "tests/data/ome-artificial-5d-datasets/" + "multi-channel-4D-series.ome.tiff" + ), + preferred_chunk_shape=(1, 1, 2, 128, 128), + chunk_rows_per_row_group=8, + ), + ] + + +def _parse_fixture_arg(raw: str) -> Fixture: + """Parse ``name=path`` or bare path CLI fixture input.""" + if "=" in raw: + name, path = raw.split("=", 1) + else: + path = raw + name = Path(path).stem + return Fixture( + name=name, + path=Path(path), + preferred_chunk_shape=(1, 1, 1, 256, 256), + chunk_rows_per_row_group=1, + ) + + +def _dir_size(path: Path) -> int: + """Return total bytes for a file or directory.""" + if not path.exists(): + return 0 + if path.is_file(): + return path.stat().st_size + return sum(p.stat().st_size for p in path.rglob("*") if p.is_file()) + + +def _time_case( + *, + dataset: str, + case: str, + format_name: str, + fn: Callable[[], np.ndarray], + repeats: int, + warmup: int, + bytes_on_disk: int, +) -> BenchmarkResult: + """Time one case and return summary stats.""" + out: np.ndarray | None = None + for _ in range(warmup): + out = fn() + + times_ms: list[float] = [] + for _ in range(repeats): + start = time.perf_counter() + out = fn() + end = time.perf_counter() + times_ms.append((end - start) * 1000.0) + + if out is None: + raise RuntimeError(f"Case did not produce output: {dataset} {case}") + return BenchmarkResult( + dataset=dataset, + case=case, + format=format_name, + median_ms=statistics.median(times_ms), + min_ms=min(times_ms), + max_ms=max(times_ms), + shape=tuple(out.shape), + dtype=np.dtype(out.dtype).name, + bytes_on_disk=int(bytes_on_disk), + ) + + +def _load_source(path: Path) -> tuple[OMEArrow, np.ndarray]: + """Load a source image through the public OMEArrow API.""" + oa = OMEArrow(str(path)) + arr = oa.export(how="numpy", dtype=np.dtype(oa.data.as_py()["pixels_meta"]["type"])) + if not isinstance(arr, np.ndarray): + raise TypeError("OMEArrow numpy export did not return an ndarray") + return oa, arr + + +def _write_artifacts( + fixture: Fixture, + source: OMEArrow, + arr: np.ndarray, + workdir: Path, +) -> tuple[Path, Path, Path, OMEArrowDataset, str, OMEArrowDataset, str]: + """Write matched OME-Zarr and typed OME-Arrow artifacts for one fixture.""" + zarr_path = workdir / f"{fixture.name}.ome.zarr" + typed_block_path = workdir / f"{fixture.name}.block.ome-arrow" + typed_image_path = workdir / f"{fixture.name}.image.ome-arrow" + dtype = np.dtype(arr.dtype) + + source.export( + how="ome-zarr", + out=str(zarr_path), + dtype=dtype, + chunks=fixture.preferred_chunk_shape, + zarr_compressor="zstd", + zarr_level=3, + ) + write_ome_arrow_dataset( + [arr], + typed_block_path, + layout="block", + chunk_shape=fixture.preferred_chunk_shape, + compression="zstd", + chunk_rows_per_row_group=fixture.chunk_rows_per_row_group, + ) + write_ome_arrow_dataset( + [arr], + typed_image_path, + layout="image", + compression="zstd", + ) + typed_block = OMEArrowDataset(typed_block_path) + typed_image = OMEArrowDataset(typed_image_path) + block_image_id = typed_block.images["image_id"].to_pylist()[0] + image_image_id = typed_image.images["image_id"].to_pylist()[0] + return ( + zarr_path, + typed_block_path, + typed_image_path, + typed_block, + str(block_image_id), + typed_image, + str(image_image_id), + ) + + +def _center_crop(size_y: int, size_x: int) -> tuple[slice, slice]: + """Return a modest centered YX crop.""" + height = min(128, size_y) + width = min(128, size_x) + y0 = max(0, (size_y - height) // 2) + x0 = max(0, (size_x - width) // 2) + return slice(y0, y0 + height), slice(x0, x0 + width) + + +def _subvolume_slice(size_z: int) -> slice: + """Return a small centered Z slice.""" + depth = min(4, size_z) + z0 = max(0, (size_z - depth) // 2) + return slice(z0, z0 + depth) + + +def _cases_for_fixture( + fixture: Fixture, + zarr_path: Path, + typed_block_path: Path, + typed_image_path: Path, + typed_block: OMEArrowDataset, + block_image_id: str, + typed_image: OMEArrowDataset, + image_image_id: str, + arr: np.ndarray, + *, + repeats: int, + warmup: int, +) -> list[BenchmarkResult]: + """Build and execute benchmark cases for one fixture.""" + st, sc, sz, sy, sx = (int(v) for v in arr.shape) + z_mid = sz // 2 + t_mid = st // 2 + c_mid = sc // 2 + crop_y, crop_x = _center_crop(sy, sx) + roi = ( + crop_x.start, + crop_y.start, + crop_x.stop - crop_x.start, + crop_y.stop - crop_y.start, + ) + z_sub = _subvolume_slice(sz) + + zarr_size = _dir_size(zarr_path) + typed_block_size = _dir_size(typed_block_path) + typed_image_size = _dir_size(typed_image_path) + + case_defs: list[tuple[str, str, Callable[[], np.ndarray], int]] = [ + ( + "full-image", + "ome-zarr", + lambda: OMEArrow.scan(str(zarr_path)) + .tensor_view(layout="TCZYX") + .to_numpy(contiguous=True), + zarr_size, + ), + ( + "full-image", + "ome-arrow-image", + lambda: typed_image.pixels.read_image(image_image_id), + typed_image_size, + ), + ( + "plane", + "ome-zarr", + lambda: OMEArrow.scan(str(zarr_path)) + .tensor_view(t=0, c=0, z=z_mid, layout="YX") + .to_numpy(contiguous=True), + zarr_size, + ), + ( + "plane", + "ome-arrow-block", + lambda: typed_block.pixels.read_plane(block_image_id, t=0, c=0, z=z_mid), + typed_block_size, + ), + ( + "crop-2d", + "ome-zarr", + lambda: OMEArrow.scan(str(zarr_path)) + .tensor_view(t=0, c=0, z=z_mid, roi=roi, layout="YX") + .to_numpy(contiguous=True), + zarr_size, + ), + ( + "crop-2d", + "ome-arrow-block", + lambda: typed_block.pixels.read_region( + block_image_id, + t=0, + c=0, + z=z_mid, + y=crop_y, + x=crop_x, + ), + typed_block_size, + ), + ] + + if sz > 1: + case_defs.extend( + [ + ( + "subvolume", + "ome-zarr", + lambda: OMEArrow.scan(str(zarr_path)) + .tensor_view(t=0, c=0, z=z_sub, roi=roi, layout="TCZYX") + .to_numpy(contiguous=True), + zarr_size, + ), + ( + "subvolume", + "ome-arrow-block", + lambda: typed_block.pixels.read_region( + block_image_id, + t=0, + c=0, + z=z_sub, + y=crop_y, + x=crop_x, + ), + typed_block_size, + ), + ] + ) + + if st > 1: + case_defs.extend( + [ + ( + "timepoint-plane", + "ome-zarr", + lambda: OMEArrow.scan(str(zarr_path)) + .tensor_view(t=t_mid, c=0, z=z_mid, layout="YX") + .to_numpy(contiguous=True), + zarr_size, + ), + ( + "timepoint-plane", + "ome-arrow-block", + lambda: typed_block.pixels.read_plane( + block_image_id, + t=t_mid, + c=0, + z=z_mid, + ), + typed_block_size, + ), + ] + ) + + if sc > 1: + case_defs.extend( + [ + ( + "channel-plane", + "ome-zarr", + lambda: OMEArrow.scan(str(zarr_path)) + .tensor_view(t=0, c=c_mid, z=z_mid, layout="YX") + .to_numpy(contiguous=True), + zarr_size, + ), + ( + "channel-plane", + "ome-arrow-block", + lambda: typed_block.pixels.read_plane( + block_image_id, + t=0, + c=c_mid, + z=z_mid, + ), + typed_block_size, + ), + ] + ) + + results = [] + for case, format_name, fn, bytes_on_disk in case_defs: + results.append( + _time_case( + dataset=fixture.name, + case=case, + format_name=format_name, + fn=fn, + repeats=repeats, + warmup=warmup, + bytes_on_disk=bytes_on_disk, + ) + ) + return results + + +def _print_results(results: list[BenchmarkResult]) -> None: + """Print benchmark results grouped in a compact table.""" + print("") + print("OME-IRIS-style benchmark (ms)") + print( + f"{'dataset':24} {'case':18} {'format':16} {'median':>9} " + f"{'min':>9} {'max':>9} {'shape':>18} {'dtype':>8} {'MB':>8}" + ) + print("-" * 112) + for r in results: + print( + f"{r.dataset:24} {r.case:18} {r.format:16} " + f"{r.median_ms:9.2f} {r.min_ms:9.2f} {r.max_ms:9.2f} " + f"{r.shape!s:>18} {r.dtype:>8} {r.bytes_on_disk / 1_000_000:8.2f}" + ) + + +def _write_json(path: Path, results: list[BenchmarkResult]) -> None: + """Write benchmark results as JSON.""" + path.write_text( + json.dumps( + { + "results": [ + { + "dataset": r.dataset, + "case": r.case, + "format": r.format, + "median_ms": r.median_ms, + "min_ms": r.min_ms, + "max_ms": r.max_ms, + "shape": list(r.shape), + "dtype": r.dtype, + "bytes_on_disk": r.bytes_on_disk, + } + for r in results + ] + }, + indent=2, + ) + ) + + +def main() -> None: + """Run OME-IRIS-style benchmarks.""" + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--repeats", type=int, default=3) + parser.add_argument("--warmup", type=int, default=1) + parser.add_argument("--json-out", type=Path, default=None) + parser.add_argument( + "--fixture", + action="append", + default=[], + help="Fixture as name=path or path. Can be provided more than once.", + ) + args = parser.parse_args() + + if args.repeats <= 0: + raise ValueError("--repeats must be > 0") + if args.warmup < 0: + raise ValueError("--warmup must be >= 0") + + fixtures = [_parse_fixture_arg(raw) for raw in args.fixture] + if not fixtures: + fixtures = [f for f in _default_fixtures() if f.path.exists()] + if not fixtures: + raise FileNotFoundError("No benchmark fixtures were found.") + + all_results: list[BenchmarkResult] = [] + with tempfile.TemporaryDirectory(prefix="ome_arrow_ome_iris_bench_") as tmp: + workdir = Path(tmp) + for fixture in fixtures: + if not fixture.path.exists(): + print(f"Skipping missing fixture: {fixture.name} -> {fixture.path}") + continue + print(f"Preparing {fixture.name}: {fixture.path}") + source, arr = _load_source(fixture.path) + ( + zarr_path, + typed_block_path, + typed_image_path, + typed_block, + block_image_id, + typed_image, + image_image_id, + ) = _write_artifacts( + fixture, + source, + arr, + workdir, + ) + all_results.extend( + _cases_for_fixture( + fixture, + zarr_path, + typed_block_path, + typed_image_path, + typed_block, + block_image_id, + typed_image, + image_image_id, + arr, + repeats=args.repeats, + warmup=args.warmup, + ) + ) + + _print_results(all_results) + if args.json_out is not None: + _write_json(args.json_out, all_results) + + +if __name__ == "__main__": + main() diff --git a/src/ome_arrow/__init__.py b/src/ome_arrow/__init__.py index 4ca3bb4..57b273b 100644 --- a/src/ome_arrow/__init__.py +++ b/src/ome_arrow/__init__.py @@ -4,6 +4,12 @@ from ome_arrow._version import version as ome_arrow_version from ome_arrow.core import OMEArrow +from ome_arrow.dataset import ( + OMEArrowDataset, + OMEArrowPixels, + choose_chunking, + write_ome_arrow_dataset, +) from ome_arrow.export import ( to_numpy, to_ome_parquet, diff --git a/src/ome_arrow/dataset.py b/src/ome_arrow/dataset.py new file mode 100644 index 0000000..0096c19 --- /dev/null +++ b/src/ome_arrow/dataset.py @@ -0,0 +1,728 @@ +"""Typed chunk-buffer dataset support for OME-Arrow.""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable, Literal, Sequence + +import numpy as np +import pyarrow as pa +import pyarrow.compute as pc +import pyarrow.parquet as pq + +from ome_arrow.export import to_numpy +from ome_arrow.meta import OME_ARROW_TAG_TYPE, OME_ARROW_TAG_VERSION + +Layout = Literal["auto", "image", "tile", "volume", "z-plane", "block", "hybrid"] +AccessPattern = Literal[ + "balanced", + "full_image", + "random_image", + "crop_2d", + "fast_crop_2d", + "full_volume", + "plane_3d", + "subvolume_3d", + "balanced_5d", + "time_series", +] + +IMAGE_TABLE_SCHEMA = pa.schema( + [ + pa.field("image_id", pa.string()), + pa.field("name", pa.string()), + pa.field("image_type", pa.string()), + pa.field("dtype", pa.string()), + pa.field("size_t", pa.int32()), + pa.field("size_c", pa.int32()), + pa.field("size_z", pa.int32()), + pa.field("size_y", pa.int32()), + pa.field("size_x", pa.int32()), + pa.field("layout", pa.string()), + pa.field("chunk_t", pa.int32()), + pa.field("chunk_c", pa.int32()), + pa.field("chunk_z", pa.int32()), + pa.field("chunk_y", pa.int32()), + pa.field("chunk_x", pa.int32()), + pa.field("compression", pa.string()), + pa.field("channels_json", pa.string()), + ] +) + +CHUNK_TABLE_SCHEMA = pa.schema( + [ + pa.field("image_id", pa.string()), + pa.field("chunk_id", pa.int64()), + pa.field("t", pa.int32()), + pa.field("c", pa.int32()), + pa.field("z", pa.int32()), + pa.field("y", pa.int32()), + pa.field("x", pa.int32()), + pa.field("shape_t", pa.int32()), + pa.field("shape_c", pa.int32()), + pa.field("shape_z", pa.int32()), + pa.field("shape_y", pa.int32()), + pa.field("shape_x", pa.int32()), + pa.field("dtype", pa.string()), + pa.field("pixel_bytes", pa.large_binary()), + ] +) + +INDEX_TABLE_SCHEMA = pa.schema( + [ + pa.field("image_id", pa.string()), + pa.field("chunk_id", pa.int64()), + pa.field("row_group_id", pa.int32()), + pa.field("row_index_in_group", pa.int32()), + pa.field("fragment_path", pa.string()), + pa.field("t", pa.int32()), + pa.field("c", pa.int32()), + pa.field("z", pa.int32()), + pa.field("y", pa.int32()), + pa.field("x", pa.int32()), + pa.field("shape_t", pa.int32()), + pa.field("shape_c", pa.int32()), + pa.field("shape_z", pa.int32()), + pa.field("shape_y", pa.int32()), + pa.field("shape_x", pa.int32()), + ] +) + + +@dataclass(frozen=True) +class ChunkChoice: + """Resolved chunking strategy for a typed pixel dataset.""" + + layout: str + chunk_shape: tuple[int, int, int, int, int] + compression: str | None + rationale: str + + +def choose_chunking( + shape: Sequence[int], + dtype: np.dtype | str, + *, + layout: Layout = "auto", + access_pattern: AccessPattern = "balanced", + target_chunk_mb: float = 4.0, + compression: str | None = "zstd", + chunk_shape: Sequence[int] | None = None, +) -> ChunkChoice: + """Choose a chunk layout for a TCZYX image. + + Args: + shape: Image shape as ``(T, C, Z, Y, X)``. + dtype: NumPy dtype or dtype string. + layout: Explicit layout or ``"auto"``. + access_pattern: Preset used when layout/chunk shape are not explicit. + target_chunk_mb: Soft maximum chunk payload size in megabytes. + compression: Parquet compression setting. + chunk_shape: Optional explicit chunk shape as ``(T, C, Z, Y, X)``. + + Returns: + ChunkChoice: Resolved layout, chunk shape, compression, and rationale. + """ + if len(shape) != 5: + raise ValueError("shape must be a five-value TCZYX sequence") + st, sc, sz, sy, sx = (int(v) for v in shape) + if min(st, sc, sz, sy, sx) <= 0: + raise ValueError("shape values must be positive") + dtype_obj = np.dtype(dtype) + + if layout == "auto": + layout_by_pattern = { + "balanced": "block", + "full_image": "image", + "random_image": "image", + "crop_2d": "tile", + "fast_crop_2d": "tile", + "full_volume": "volume", + "plane_3d": "z-plane", + "subvolume_3d": "block", + "balanced_5d": "block", + "time_series": "block", + } + resolved_layout = layout_by_pattern[access_pattern] + else: + resolved_layout = str(layout) + + if chunk_shape is not None: + if len(chunk_shape) != 5: + raise ValueError("chunk_shape must be a five-value TCZYX sequence") + ct, cc, cz, cy, cx = (int(v) for v in chunk_shape) + elif resolved_layout in {"image", "hybrid"}: + ct, cc, cz, cy, cx = st, sc, sz, sy, sx + elif resolved_layout == "volume": + ct, cc, cz, cy, cx = 1, 1, sz, sy, sx + elif resolved_layout == "z-plane": + ct, cc, cz, cy, cx = 1, 1, 1, sy, sx + elif access_pattern == "fast_crop_2d": + ct, cc, cz, cy, cx = 1, 1, 1, min(sy, 256), min(sx, 256) + elif resolved_layout == "tile": + ct, cc, cz, cy, cx = 1, 1, 1, min(sy, 512), min(sx, 512) + elif access_pattern == "time_series": + ct, cc, cz, cy, cx = 1, 1, min(sz, 8), min(sy, 256), min(sx, 256) + elif access_pattern == "balanced_5d": + ct, cc, cz, cy, cx = 1, 1, min(sz, 16), min(sy, 256), min(sx, 256) + else: + ct, cc, cz, cy, cx = 1, 1, min(sz, 16), min(sy, 128), min(sx, 128) + + bounds = (st, sc, sz, sy, sx) + chunk = tuple( + max(1, min(int(v), limit)) for v, limit in zip((ct, cc, cz, cy, cx), bounds) + ) + size_mb = float(np.prod(chunk) * dtype_obj.itemsize) / (1024 * 1024) + if ( + chunk_shape is None + and size_mb > target_chunk_mb + and resolved_layout + not in { + "image", + "volume", + "z-plane", + } + ): + scale = (target_chunk_mb / size_mb) ** 0.5 + cy = max(1, min(sy, int(chunk[3] * scale))) + cx = max(1, min(sx, int(chunk[4] * scale))) + chunk = (chunk[0], chunk[1], chunk[2], cy, cx) + size_mb = float(np.prod(chunk) * dtype_obj.itemsize) / (1024 * 1024) + + rationale = ( + f'Selected layout="{resolved_layout}", chunk_shape={chunk}, ' + f"compression={compression!r} because dtype={dtype_obj.name}, " + f"estimated chunk size is {size_mb:.3g} MB, and " + f'access_pattern="{access_pattern}".' + ) + return ChunkChoice( + layout=resolved_layout, + chunk_shape=chunk, + compression=compression, + rationale=rationale, + ) + + +def _as_tczyx_array(image: Any) -> tuple[np.ndarray, dict[str, Any]]: + """Return a TCZYX array and metadata dictionary for a supported image input.""" + try: + from ome_arrow.core import OMEArrow + except Exception: + OMEArrow = () # type: ignore + + if isinstance(image, np.ndarray): + arr = image + if arr.ndim == 2: + arr = arr.reshape(1, 1, 1, *arr.shape) + elif arr.ndim == 3: + arr = arr.reshape(1, 1, *arr.shape) + elif arr.ndim == 4: + arr = arr.reshape(arr.shape[0], 1, *arr.shape[1:]) + elif arr.ndim != 5: + raise ValueError("NumPy image inputs must be 2D, 3D, 4D, or 5D") + return np.asarray(arr), {} + + if OMEArrow and isinstance(image, OMEArrow): + record = image.data.as_py() + arr = to_numpy(image.data, dtype=np.dtype(record["pixels_meta"]["type"])) + return arr, record + + if isinstance(image, pa.StructScalar): + record = image.as_py() + return to_numpy(image, dtype=np.dtype(record["pixels_meta"]["type"])), record + + if isinstance(image, dict) and "pixels_meta" in image: + return to_numpy(image, dtype=np.dtype(image["pixels_meta"]["type"])), image + + raise TypeError( + "images must contain numpy.ndarray, OMEArrow, pa.StructScalar, " + "or OME-Arrow dict values" + ) + + +def _image_metadata( + *, + image: Any, + arr: np.ndarray, + record: dict[str, Any], + index: int, + choice: ChunkChoice, +) -> dict[str, Any]: + """Build one image metadata row.""" + pm = record.get("pixels_meta") or {} + image_id = record.get("id") or getattr(image, "name", None) or f"image-{index:05d}" + name = record.get("name") or str(image_id) + channels = pm.get("channels") or [] + st, sc, sz, sy, sx = (int(v) for v in arr.shape) + return { + "image_id": str(image_id), + "name": str(name), + "image_type": ( + None if record.get("image_type") is None else str(record.get("image_type")) + ), + "dtype": np.dtype(arr.dtype).name, + "size_t": st, + "size_c": sc, + "size_z": sz, + "size_y": sy, + "size_x": sx, + "layout": choice.layout, + "chunk_t": choice.chunk_shape[0], + "chunk_c": choice.chunk_shape[1], + "chunk_z": choice.chunk_shape[2], + "chunk_y": choice.chunk_shape[3], + "chunk_x": choice.chunk_shape[4], + "compression": choice.compression, + "channels_json": json.dumps(channels), + } + + +def _iter_chunk_rows( + image_id: str, + arr: np.ndarray, + *, + start_chunk_id: int, + choice: ChunkChoice, +) -> Iterable[dict[str, Any]]: + """Yield typed byte-buffer chunk rows for one TCZYX image.""" + ct, cc, cz, cy, cx = choice.chunk_shape + st, sc, sz, sy, sx = arr.shape + chunk_id = int(start_chunk_id) + for t0 in range(0, st, ct): + nt = min(ct, st - t0) + for c0 in range(0, sc, cc): + nc = min(cc, sc - c0) + for z0 in range(0, sz, cz): + nz = min(cz, sz - z0) + for y0 in range(0, sy, cy): + ny = min(cy, sy - y0) + for x0 in range(0, sx, cx): + nx = min(cx, sx - x0) + chunk = np.ascontiguousarray( + arr[ + t0 : t0 + nt, + c0 : c0 + nc, + z0 : z0 + nz, + y0 : y0 + ny, + x0 : x0 + nx, + ] + ) + yield { + "image_id": image_id, + "chunk_id": chunk_id, + "t": t0, + "c": c0, + "z": z0, + "y": y0, + "x": x0, + "shape_t": nt, + "shape_c": nc, + "shape_z": nz, + "shape_y": ny, + "shape_x": nx, + "dtype": np.dtype(arr.dtype).name, + "pixel_bytes": chunk.tobytes(order="C"), + } + chunk_id += 1 + + +def write_ome_arrow_dataset( + images: Sequence[Any], + output_path: str | Path, + *, + layout: Layout = "auto", + access_pattern: AccessPattern = "balanced", + target_chunk_mb: float = 4.0, + chunk_shape: Sequence[int] | None = None, + compression: str | None = "zstd", + build_physical_index: bool = True, + chunk_rows_per_row_group: int = 1, +) -> ChunkChoice: + """Write a metadata table and typed pixel chunk table. + + Args: + images: Sequence of image inputs. NumPy arrays are interpreted as + ``TCZYX`` for 5D, ``TZYX`` for 4D, ``ZYX`` for 3D, and ``YX`` for 2D. + Existing OME-Arrow records preserve their metadata. + output_path: Directory to write. + layout: Physical layout mode or ``"auto"``. + access_pattern: Preset used for automatic layout/chunk shape selection. + target_chunk_mb: Soft maximum chunk payload size in megabytes. + chunk_shape: Optional explicit ``TCZYX`` chunk shape. + compression: Parquet compression setting, e.g. ``"zstd"`` or ``None``. + build_physical_index: Whether to emit row-group index metadata. + chunk_rows_per_row_group: Number of chunk rows to pack into each row + group. ``1`` gives fastest direct chunk reads. Larger values reduce + Parquet row-group overhead and can improve storage for small chunks. + + Returns: + ChunkChoice: The choice used for the first image. All images currently + share the same explicit options, but their edge chunks are clipped. + """ + if not images: + raise ValueError("images must contain at least one image") + if chunk_rows_per_row_group <= 0: + raise ValueError("chunk_rows_per_row_group must be > 0") + out_dir = Path(output_path) + out_dir.mkdir(parents=True, exist_ok=True) + + image_rows: list[dict[str, Any]] = [] + chunk_rows: list[dict[str, Any]] = [] + choices: list[ChunkChoice] = [] + next_chunk_id = 0 + for i, image in enumerate(images): + arr, record = _as_tczyx_array(image) + choice = choose_chunking( + arr.shape, + arr.dtype, + layout=layout, + access_pattern=access_pattern, + target_chunk_mb=target_chunk_mb, + compression=compression, + chunk_shape=chunk_shape, + ) + choices.append(choice) + meta = _image_metadata( + image=image, + arr=arr, + record=record, + index=i, + choice=choice, + ) + image_rows.append(meta) + new_rows = list( + _iter_chunk_rows( + meta["image_id"], + arr, + start_chunk_id=next_chunk_id, + choice=choice, + ) + ) + chunk_rows.extend(new_rows) + next_chunk_id += len(new_rows) + + metadata = { + b"ome.arrow.type": str(OME_ARROW_TAG_TYPE).encode(), + b"ome.arrow.version": str(OME_ARROW_TAG_VERSION).encode(), + b"ome.arrow.layout": b"typed-chunk-dataset", + } + images_table = pa.Table.from_pylist(image_rows, schema=IMAGE_TABLE_SCHEMA) + chunks_table = pa.Table.from_pylist(chunk_rows, schema=CHUNK_TABLE_SCHEMA) + images_table = images_table.replace_schema_metadata(metadata) + chunks_table = chunks_table.replace_schema_metadata(metadata) + + pq.write_table(images_table, out_dir / "images.parquet", compression=compression) + pq.write_table( + chunks_table, + out_dir / "chunks.parquet", + compression=compression, + row_group_size=int(chunk_rows_per_row_group), + ) + + if build_physical_index: + index_rows = [ + { + "image_id": row["image_id"], + "chunk_id": row["chunk_id"], + "row_group_id": i // int(chunk_rows_per_row_group), + "row_index_in_group": i % int(chunk_rows_per_row_group), + "fragment_path": "chunks.parquet", + "t": row["t"], + "c": row["c"], + "z": row["z"], + "y": row["y"], + "x": row["x"], + "shape_t": row["shape_t"], + "shape_c": row["shape_c"], + "shape_z": row["shape_z"], + "shape_y": row["shape_y"], + "shape_x": row["shape_x"], + } + for i, row in enumerate(chunk_rows) + ] + index_table = pa.Table.from_pylist(index_rows, schema=INDEX_TABLE_SCHEMA) + index_table = index_table.replace_schema_metadata(metadata) + pq.write_table(index_table, out_dir / "index.parquet", compression=compression) + + manifest = { + "type": OME_ARROW_TAG_TYPE, + "version": OME_ARROW_TAG_VERSION, + "layout": "typed-chunk-dataset", + "image_count": len(image_rows), + "chunk_count": len(chunk_rows), + "choice": choices[0].__dict__, + } + (out_dir / "_ome_arrow_dataset.json").write_text(json.dumps(manifest, indent=2)) + return choices[0] + + +class OMEArrowPixels: + """Direct typed pixel reader for an OME-Arrow dataset.""" + + def __init__(self, dataset: "OMEArrowDataset") -> None: + """Initialize a pixel reader from its parent dataset.""" + self._dataset = dataset + + def read_image(self, image_id: str) -> np.ndarray: + """Read one image as a dense ``TCZYX`` NumPy array.""" + meta = self._dataset.image_metadata(image_id) + return self._read_region( + str(image_id), + t=slice(0, int(meta["size_t"])), + c=slice(0, int(meta["size_c"])), + z=slice(0, int(meta["size_z"])), + y=slice(0, int(meta["size_y"])), + x=slice(0, int(meta["size_x"])), + ) + + def read_many(self, image_ids: Iterable[str]) -> list[np.ndarray]: + """Read multiple images by identifier.""" + return [self.read_image(str(image_id)) for image_id in image_ids] + + def read_channel(self, image_id: str, channel: int) -> np.ndarray: + """Read one channel as a ``TZYX`` NumPy array.""" + arr = self._read_region( + str(image_id), + c=slice(int(channel), int(channel) + 1), + ) + return arr[:, 0] + + def read_plane( + self, + image_id: str, + *, + t: int = 0, + c: int = 0, + z: int = 0, + ) -> np.ndarray: + """Read one ``YX`` plane.""" + arr = self._read_region( + str(image_id), + t=slice(int(t), int(t) + 1), + c=slice(int(c), int(c) + 1), + z=slice(int(z), int(z) + 1), + ) + return arr[0, 0, 0] + + def read_region( + self, + image_id: str, + *, + y: slice, + x: slice, + t: int | slice | None = None, + c: int | slice | None = None, + z: int | slice | None = None, + ) -> np.ndarray: + """Read a spatial region as a dense ``TCZYX`` NumPy array.""" + return self._read_region( + str(image_id), + t=_as_slice(t), + c=_as_slice(c), + z=_as_slice(z), + y=y, + x=x, + ) + + def _read_region( + self, + image_id: str, + *, + t: slice | None = None, + c: slice | None = None, + z: slice | None = None, + y: slice | None = None, + x: slice | None = None, + ) -> np.ndarray: + meta = self._dataset.image_metadata(image_id) + dtype = np.dtype(meta["dtype"]) + bounds = ( + int(meta["size_t"]), + int(meta["size_c"]), + int(meta["size_z"]), + int(meta["size_y"]), + int(meta["size_x"]), + ) + ts, cs, zs, ys, xs = _normalize_region((t, c, z, y, x), bounds) + out = np.zeros( + ( + ts.stop - ts.start, + cs.stop - cs.start, + zs.stop - zs.start, + ys.stop - ys.start, + xs.stop - xs.start, + ), + dtype=dtype, + ) + + chunks = self._dataset._matching_chunks( + image_id, + t=ts, + c=cs, + z=zs, + y=ys, + x=xs, + ) + parquet_file = self._dataset._chunks_file + for row in chunks: + chunk = self._dataset._read_chunk_row(parquet_file, row) + arr = np.frombuffer(chunk["pixel_bytes"], dtype=np.dtype(chunk["dtype"])) + arr = arr.reshape( + ( + int(chunk["shape_t"]), + int(chunk["shape_c"]), + int(chunk["shape_z"]), + int(chunk["shape_y"]), + int(chunk["shape_x"]), + ) + ) + _copy_overlap(out, arr, chunk, (ts, cs, zs, ys, xs)) + return out + + +class OMEArrowDataset: + """Reader for metadata plus typed byte-buffer OME-Arrow datasets.""" + + def __init__(self, path: str | Path) -> None: + """Open an OME-Arrow dataset directory.""" + self.path = Path(path) + if not self.path.exists(): + raise FileNotFoundError(f"No such dataset: {self.path}") + self.images = pq.read_table(self.path / "images.parquet") + self._chunks_file = pq.ParquetFile(self.path / "chunks.parquet") + index_path = self.path / "index.parquet" + self.index = pq.read_table(index_path) if index_path.exists() else None + self.pixels = OMEArrowPixels(self) + + def image_metadata(self, image_id: str) -> dict[str, Any]: + """Return one image metadata row as a dictionary.""" + mask = pc.equal(self.images["image_id"], str(image_id)) + rows = self.images.filter(mask).to_pylist() + if not rows: + raise KeyError(f"image_id not found: {image_id}") + return rows[0] + + def _matching_chunks( + self, + image_id: str, + *, + t: slice, + c: slice, + z: slice, + y: slice, + x: slice, + ) -> list[dict[str, Any]]: + if self.index is not None: + source = self.index + else: + source = pq.read_table(self.path / "chunks.parquet") + mask = pc.equal(source["image_id"], str(image_id)) + for axis, region in { + "t": t, + "c": c, + "z": z, + "y": y, + "x": x, + }.items(): + start = source[axis] + stop = pc.add(source[axis], source[f"shape_{axis}"]) + axis_mask = pc.and_( + pc.less(start, region.stop), + pc.greater(stop, region.start), + ) + mask = pc.and_(mask, axis_mask) + return source.filter(mask).to_pylist() + + def _read_chunk_row( + self, + parquet_file: pq.ParquetFile, + row: dict[str, Any], + ) -> dict[str, Any]: + if "row_group_id" in row and row["row_group_id"] is not None: + table = parquet_file.read_row_group( + int(row["row_group_id"]), + columns=["chunk_id", "dtype", "pixel_bytes"], + ) + mask = pc.equal(table["chunk_id"], int(row["chunk_id"])) + chunk_rows = table.filter(mask).to_pylist() + if not chunk_rows: + raise ValueError( + f"Chunk {row['chunk_id']} not found in row group " + f"{row['row_group_id']}" + ) + return {**row, **chunk_rows[0]} + + table = pq.read_table( + self.path / "chunks.parquet", + filters=[ + ("image_id", "=", row["image_id"]), + ("chunk_id", "=", row["chunk_id"]), + ], + ) + chunk_rows = table.to_pylist() + if not chunk_rows: + raise ValueError(f"Chunk not found: {row['chunk_id']}") + return chunk_rows[0] + + +def _as_slice(value: int | slice | None) -> slice | None: + """Normalize an optional scalar index to a slice.""" + if value is None: + return None + if isinstance(value, slice): + return value + i = int(value) + return slice(i, i + 1) + + +def _normalize_region( + slices: tuple[slice | None, slice | None, slice | None, slice | None, slice | None], + bounds: tuple[int, int, int, int, int], +) -> tuple[slice, slice, slice, slice, slice]: + """Normalize optional slices against TCZYX image bounds.""" + normalized = [] + for raw_slice, bound in zip(slices, bounds): + axis_slice = slice(0, bound) if raw_slice is None else raw_slice + start, stop, step = axis_slice.indices(bound) + if step != 1: + raise ValueError("OMEArrowDataset pixel reads support only step=1 slices") + stop = max(stop, start) + normalized.append(slice(start, stop)) + return tuple(normalized) # type: ignore[return-value] + + +def _copy_overlap( + out: np.ndarray, + chunk: np.ndarray, + row: dict[str, Any], + region: tuple[slice, slice, slice, slice, slice], +) -> None: + """Copy the overlap between a decoded chunk and requested output region.""" + chunk_starts = ( + int(row["t"]), + int(row["c"]), + int(row["z"]), + int(row["y"]), + int(row["x"]), + ) + chunk_shapes = ( + int(row["shape_t"]), + int(row["shape_c"]), + int(row["shape_z"]), + int(row["shape_y"]), + int(row["shape_x"]), + ) + out_slices = [] + chunk_slices = [] + for start, size, requested in zip(chunk_starts, chunk_shapes, region): + overlap_start = max(start, requested.start) + overlap_stop = min(start + size, requested.stop) + if overlap_stop <= overlap_start: + return + out_slices.append( + slice(overlap_start - requested.start, overlap_stop - requested.start) + ) + chunk_slices.append(slice(overlap_start - start, overlap_stop - start)) + out[tuple(out_slices)] = chunk[tuple(chunk_slices)] diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 0000000..67115c0 --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,119 @@ +"""Tests for typed chunk-buffer OME-Arrow datasets.""" + +import pathlib + +import numpy as np +import pyarrow.parquet as pq + +from ome_arrow import ( + OMEArrow, + OMEArrowDataset, + choose_chunking, + write_ome_arrow_dataset, +) + + +def test_choose_chunking_presets() -> None: + """Resolve layout and chunk shape from access-pattern presets.""" + choice = choose_chunking( + (1, 1, 4, 1024, 1024), + np.uint16, + access_pattern="fast_crop_2d", + ) + + assert choice.layout == "tile" + assert choice.chunk_shape == (1, 1, 1, 256, 256) + assert "fast_crop_2d" in choice.rationale + + +def test_write_dataset_preserves_dtype_and_reads_image(tmp_path: pathlib.Path) -> None: + """Round-trip a typed byte-buffer dataset without widening pixels.""" + arr = np.arange(1 * 2 * 2 * 5 * 6, dtype=np.uint8).reshape(1, 2, 2, 5, 6) + out = tmp_path / "typed_dataset" + + choice = write_ome_arrow_dataset( + [arr], + out, + layout="image", + compression=None, + ) + dataset = OMEArrowDataset(out) + image_id = dataset.images["image_id"].to_pylist()[0] + roundtrip = dataset.pixels.read_image(image_id) + + assert choice.layout == "image" + assert roundtrip.dtype == np.uint8 + np.testing.assert_array_equal(roundtrip, arr) + assert pq.ParquetFile(out / "chunks.parquet").metadata.num_row_groups == 1 + + +def test_dataset_reads_plane_and_region_from_indexed_tiles( + tmp_path: pathlib.Path, +) -> None: + """Read selected planes and crops through the physical chunk index.""" + arr = np.arange(1 * 1 * 2 * 6 * 7, dtype=np.uint16).reshape(1, 1, 2, 6, 7) + out = tmp_path / "tiles" + + write_ome_arrow_dataset( + [arr], + out, + layout="tile", + chunk_shape=(1, 1, 1, 3, 4), + compression="zstd", + ) + dataset = OMEArrowDataset(out) + image_id = dataset.images["image_id"].to_pylist()[0] + + plane = dataset.pixels.read_plane(image_id, z=1) + region = dataset.pixels.read_region( + image_id, + z=1, + y=slice(2, 6), + x=slice(3, 7), + ) + + np.testing.assert_array_equal(plane, arr[0, 0, 1]) + np.testing.assert_array_equal(region, arr[:, :, 1:2, 2:6, 3:7]) + assert pq.ParquetFile(out / "chunks.parquet").metadata.num_row_groups == 8 + + +def test_dataset_reads_from_packed_chunk_row_groups(tmp_path: pathlib.Path) -> None: + """Read indexed chunks when several chunk rows share one row group.""" + arr = np.arange(1 * 1 * 2 * 6 * 7, dtype=np.uint16).reshape(1, 1, 2, 6, 7) + out = tmp_path / "packed_tiles" + + write_ome_arrow_dataset( + [arr], + out, + layout="tile", + chunk_shape=(1, 1, 1, 3, 4), + compression="zstd", + chunk_rows_per_row_group=4, + ) + dataset = OMEArrowDataset(out) + image_id = dataset.images["image_id"].to_pylist()[0] + + region = dataset.pixels.read_region( + image_id, + z=1, + y=slice(2, 6), + x=slice(3, 7), + ) + + np.testing.assert_array_equal(region, arr[:, :, 1:2, 2:6, 3:7]) + assert pq.ParquetFile(out / "chunks.parquet").metadata.num_row_groups == 2 + + +def test_dataset_accepts_ome_arrow_records(tmp_path: pathlib.Path) -> None: + """Write OME-Arrow records to typed byte-buffer datasets.""" + arr = np.arange(1 * 1 * 1 * 3 * 4, dtype=np.uint16).reshape(1, 1, 1, 3, 4) + oa = OMEArrow(arr, dim_order="TCZYX") + out = tmp_path / "records" + + write_ome_arrow_dataset([oa], out, layout="z-plane", compression=None) + dataset = OMEArrowDataset(out) + image_id = dataset.images["image_id"].to_pylist()[0] + + roundtrip = dataset.pixels.read_image(image_id) + assert roundtrip.dtype == np.uint16 + np.testing.assert_array_equal(roundtrip, arr) From 5d6664ab7ecf1a7954eb9b95cbeeb666884ae983 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 4 Jun 2026 10:38:24 -0600 Subject: [PATCH 2/9] set pixel dtype; further benchmarking --- README.md | 29 ++ benchmarks/benchmark_ome_iris.py | 723 +++++++++++++++++++++++++------ src/ome_arrow/dataset.py | 138 ++++++ tests/test_dataset.py | 71 ++- 4 files changed, 826 insertions(+), 135 deletions(-) diff --git a/README.md b/README.md index 8121224..e4b5c20 100644 --- a/README.md +++ b/README.md @@ -147,12 +147,23 @@ dataset = OMEArrowDataset("image.ome-arrow") image_id = dataset.images["image_id"].to_pylist()[0] plane = dataset.pixels.read_plane(image_id, t=0, c=0, z=0) crop = dataset.pixels.read_region(image_id, y=slice(128, 384), x=slice(128, 384)) + +# Dataset-level shortcuts return NumPy by default and can return Torch/JAX +# arrays when those packages are installed. +plane_np = dataset.read_plane(t=0, c=0, z=0) +plane_torch = dataset.read_plane(t=0, c=0, z=0, return_type="torch") +plane_jax = dataset.read_plane(t=0, c=0, z=0, return_type="jax") ``` Use `chunk_rows_per_row_group=1` for the fastest direct chunk reads. Use a larger value, such as `8`, to reduce row-group overhead for small chunks when storage size matters. +The writer preserves source pixel dtype by default. To normalize stored pixel +buffers explicitly, pass `pixel_dtype`, for example `pixel_dtype="uint16"`. +Integer casts clamp by default; pass `clamp=False` to use NumPy casting +behavior directly. + ## Tensor ingest (PyTorch/JAX) You can ingest torch or JAX arrays directly with `OMEArrow(...)`. @@ -216,6 +227,24 @@ The benchmark writes matched temporary OME-Zarr and typed OME-Arrow artifacts, then compares full-image, plane, crop, subvolume, timepoint, and channel reads where those axes are present. +The OME-IRIS-style benchmark separates return/API paths: + +- `ome-zarr-tensor-numpy`: OME-Arrow `tensor_view(...).to_numpy()` over OME-Zarr. +- `ome-zarr-bioio-numpy`: direct BioImage NumPy reads over OME-Zarr. +- `ome-tiff-tensor-numpy`: OME-Arrow `tensor_view(...).to_numpy()` over TIFF. +- `ome-tiff-bioio-numpy`: direct BioImage NumPy reads over TIFF. +- `ome-arrow-src-numpy`: source-dtype typed OME-Arrow dataset NumPy reads. +- `ome-arrow-u16-numpy`: typed OME-Arrow dataset NumPy reads normalized to + `uint16` for apples-to-apples comparisons with normalized paths. +- `ome-tiff-tensor-torch` / `ome-tiff-tensor-jax`: OME-Arrow tensor-view + Torch/JAX returns over TIFF. +- `ome-zarr-tensor-torch` / `ome-zarr-tensor-jax`: OME-Arrow tensor-view + Torch/JAX returns over OME-Zarr. +- `ome-arrow-src-torch` / `ome-arrow-src-jax`: source-dtype typed OME-Arrow + dataset reads with `return_type="torch"` or `return_type="jax"`. +- `ome-arrow-u16-torch` / `ome-arrow-u16-jax`: normalized `uint16` typed + OME-Arrow dataset reads with Torch/JAX returns. + Notes: - This benchmark is for local iteration and relative comparisons. diff --git a/benchmarks/benchmark_ome_iris.py b/benchmarks/benchmark_ome_iris.py index c81f0e2..6d2d5aa 100644 --- a/benchmarks/benchmark_ome_iris.py +++ b/benchmarks/benchmark_ome_iris.py @@ -9,15 +9,18 @@ from __future__ import annotations import argparse +import importlib.util import json import statistics import tempfile import time from dataclasses import dataclass from pathlib import Path -from typing import Callable +from typing import Any, Callable import numpy as np +from bioio import BioImage +from bioio_ome_zarr import Reader as OMEZarrReader from ome_arrow import OMEArrow, OMEArrowDataset, write_ome_arrow_dataset @@ -47,6 +50,19 @@ class Fixture: chunk_rows_per_row_group: int +@dataclass(frozen=True) +class ArrowArtifact: + """Typed OME-Arrow artifacts for one dtype mode.""" + + label: str + block_path: Path + image_path: Path + block: OMEArrowDataset + block_image_id: str + image: OMEArrowDataset + image_image_id: str + + def _default_fixtures() -> list[Fixture]: """Return local fixtures that cover 2D, 3D, 4D, and 5D patterns.""" return [ @@ -71,8 +87,7 @@ def _default_fixtures() -> list[Fixture]: Fixture( name="5d-multichannel-time-volume", path=Path( - "tests/data/ome-artificial-5d-datasets/" - "multi-channel-4D-series.ome.tiff" + "tests/data/ome-artificial-5d-datasets/multi-channel-4D-series.ome.tiff" ), preferred_chunk_shape=(1, 1, 2, 128, 128), chunk_rows_per_row_group=8, @@ -109,25 +124,28 @@ def _time_case( dataset: str, case: str, format_name: str, - fn: Callable[[], np.ndarray], + fn: Callable[[], Any], repeats: int, warmup: int, bytes_on_disk: int, ) -> BenchmarkResult: """Time one case and return summary stats.""" - out: np.ndarray | None = None + out: Any | None = None for _ in range(warmup): out = fn() + _sync_result(out) times_ms: list[float] = [] for _ in range(repeats): start = time.perf_counter() out = fn() + _sync_result(out) end = time.perf_counter() times_ms.append((end - start) * 1000.0) if out is None: raise RuntimeError(f"Case did not produce output: {dataset} {case}") + shape, dtype = _result_shape_dtype(out) return BenchmarkResult( dataset=dataset, case=case, @@ -135,32 +153,144 @@ def _time_case( median_ms=statistics.median(times_ms), min_ms=min(times_ms), max_ms=max(times_ms), - shape=tuple(out.shape), - dtype=np.dtype(out.dtype).name, + shape=shape, + dtype=dtype, bytes_on_disk=int(bytes_on_disk), ) -def _load_source(path: Path) -> tuple[OMEArrow, np.ndarray]: - """Load a source image through the public OMEArrow API.""" +def _sync_result(out: Any) -> None: + """Synchronize lazy array backends for fair timing.""" + block_until_ready = getattr(out, "block_until_ready", None) + if callable(block_until_ready): + block_until_ready() + + +def _result_shape_dtype(out: Any) -> tuple[tuple[int, ...], str]: + """Return shape and dtype strings for NumPy, Torch, and JAX-like arrays.""" + shape = tuple(int(v) for v in getattr(out, "shape", ())) + dtype = getattr(out, "dtype", None) + if dtype is None: + dtype_str = type(out).__name__ + else: + try: + dtype_str = np.dtype(dtype).name + except TypeError: + dtype_str = str(dtype) + return shape, dtype_str + + +def _torch_available() -> bool: + """Return whether torch is importable.""" + return importlib.util.find_spec("torch") is not None + + +def _jax_available() -> bool: + """Return whether JAX is importable.""" + return importlib.util.find_spec("jax") is not None + + +def _read_zarr_full(path: Path) -> np.ndarray: + """Read a Zarr image directly through BioImage as TCZYX NumPy.""" + return np.asarray(BioImage(str(path), reader=OMEZarrReader).data) + + +def _read_zarr_plane(path: Path, *, t: int, c: int, z: int) -> np.ndarray: + """Read one Zarr YX plane directly through BioImage.""" + image = BioImage(str(path), reader=OMEZarrReader) + return np.asarray(image.get_image_data("YX", T=t, C=c, Z=z)) + + +def _read_zarr_crop( + path: Path, + *, + t: int, + c: int, + z: int, + y: slice, + x: slice, +) -> np.ndarray: + """Read one Zarr YX crop directly through BioImage.""" + return _read_zarr_plane(path, t=t, c=c, z=z)[y, x] + + +def _read_zarr_subvolume( + path: Path, + *, + t: int, + c: int, + z: slice, + y: slice, + x: slice, +) -> np.ndarray: + """Read one Zarr subvolume directly through BioImage as TCZYX NumPy.""" + image = BioImage(str(path), reader=OMEZarrReader) + cropped = np.asarray(image.get_image_data("ZYX", T=t, C=c))[z, y, x] + return cropped.reshape(1, 1, z.stop - z.start, y.stop - y.start, x.stop - x.start) + + +def _read_tiff_full(path: Path) -> np.ndarray: + """Read a TIFF image directly through BioImage as TCZYX NumPy.""" + return np.asarray(BioImage(str(path)).data) + + +def _read_tiff_plane(path: Path, *, t: int, c: int, z: int) -> np.ndarray: + """Read one TIFF YX plane directly through BioImage.""" + image = BioImage(str(path)) + return np.asarray(image.get_image_data("YX", T=t, C=c, Z=z)) + + +def _read_tiff_crop( + path: Path, + *, + t: int, + c: int, + z: int, + y: slice, + x: slice, +) -> np.ndarray: + """Read one TIFF YX crop directly through BioImage.""" + return _read_tiff_plane(path, t=t, c=c, z=z)[y, x] + + +def _read_tiff_subvolume( + path: Path, + *, + t: int, + c: int, + z: slice, + y: slice, + x: slice, +) -> np.ndarray: + """Read one TIFF subvolume directly through BioImage as TCZYX NumPy.""" + image = BioImage(str(path)) + cropped = np.asarray(image.get_image_data("ZYX", T=t, C=c))[z, y, x] + return cropped.reshape(1, 1, z.stop - z.start, y.stop - y.start, x.stop - x.start) + + +def _load_source(path: Path) -> tuple[OMEArrow, np.ndarray, np.ndarray]: + """Load source image arrays for normalized and raw dtype comparisons.""" oa = OMEArrow(str(path)) - arr = oa.export(how="numpy", dtype=np.dtype(oa.data.as_py()["pixels_meta"]["type"])) - if not isinstance(arr, np.ndarray): + normalized = oa.export( + how="numpy", + dtype=np.dtype(oa.data.as_py()["pixels_meta"]["type"]), + ) + if not isinstance(normalized, np.ndarray): raise TypeError("OMEArrow numpy export did not return an ndarray") - return oa, arr + raw = _read_tiff_full(path) + return oa, normalized, raw def _write_artifacts( fixture: Fixture, source: OMEArrow, - arr: np.ndarray, + normalized_arr: np.ndarray, + raw_arr: np.ndarray, workdir: Path, -) -> tuple[Path, Path, Path, OMEArrowDataset, str, OMEArrowDataset, str]: +) -> tuple[Path, list[ArrowArtifact]]: """Write matched OME-Zarr and typed OME-Arrow artifacts for one fixture.""" zarr_path = workdir / f"{fixture.name}.ome.zarr" - typed_block_path = workdir / f"{fixture.name}.block.ome-arrow" - typed_image_path = workdir / f"{fixture.name}.image.ome-arrow" - dtype = np.dtype(arr.dtype) + dtype = np.dtype(normalized_arr.dtype) source.export( how="ome-zarr", @@ -170,33 +300,45 @@ def _write_artifacts( zarr_compressor="zstd", zarr_level=3, ) - write_ome_arrow_dataset( - [arr], - typed_block_path, - layout="block", - chunk_shape=fixture.preferred_chunk_shape, - compression="zstd", - chunk_rows_per_row_group=fixture.chunk_rows_per_row_group, - ) - write_ome_arrow_dataset( - [arr], - typed_image_path, - layout="image", - compression="zstd", - ) - typed_block = OMEArrowDataset(typed_block_path) - typed_image = OMEArrowDataset(typed_image_path) - block_image_id = typed_block.images["image_id"].to_pylist()[0] - image_image_id = typed_image.images["image_id"].to_pylist()[0] - return ( - zarr_path, - typed_block_path, - typed_image_path, - typed_block, - str(block_image_id), - typed_image, - str(image_image_id), - ) + + artifacts: list[ArrowArtifact] = [] + for label, arr, pixel_dtype in ( + ("ome-arrow-src", raw_arr, None), + ("ome-arrow-u16", normalized_arr, "uint16"), + ): + typed_block_path = workdir / f"{fixture.name}.{label}.block.ome-arrow" + typed_image_path = workdir / f"{fixture.name}.{label}.image.ome-arrow" + write_ome_arrow_dataset( + [arr], + typed_block_path, + layout="block", + chunk_shape=fixture.preferred_chunk_shape, + compression="zstd", + chunk_rows_per_row_group=fixture.chunk_rows_per_row_group, + pixel_dtype=pixel_dtype, + ) + write_ome_arrow_dataset( + [arr], + typed_image_path, + layout="image", + compression="zstd", + pixel_dtype=pixel_dtype, + ) + typed_block = OMEArrowDataset(typed_block_path) + typed_image = OMEArrowDataset(typed_image_path) + artifacts.append( + ArrowArtifact( + label=label, + block_path=typed_block_path, + image_path=typed_image_path, + block=typed_block, + block_image_id=str(typed_block.image_ids[0]), + image=typed_image, + image_image_id=str(typed_image.image_ids[0]), + ) + ) + + return zarr_path, artifacts def _center_crop(size_y: int, size_x: int) -> tuple[slice, slice]: @@ -218,12 +360,7 @@ def _subvolume_slice(size_z: int) -> slice: def _cases_for_fixture( fixture: Fixture, zarr_path: Path, - typed_block_path: Path, - typed_image_path: Path, - typed_block: OMEArrowDataset, - block_image_id: str, - typed_image: OMEArrowDataset, - image_image_id: str, + arrow_artifacts: list[ArrowArtifact], arr: np.ndarray, *, repeats: int, @@ -244,137 +381,487 @@ def _cases_for_fixture( z_sub = _subvolume_slice(sz) zarr_size = _dir_size(zarr_path) - typed_block_size = _dir_size(typed_block_path) - typed_image_size = _dir_size(typed_image_path) + tiff_size = _dir_size(fixture.path) - case_defs: list[tuple[str, str, Callable[[], np.ndarray], int]] = [ + case_defs: list[tuple[str, str, Callable[[], Any], int]] = [ ( "full-image", - "ome-zarr", - lambda: OMEArrow.scan(str(zarr_path)) - .tensor_view(layout="TCZYX") - .to_numpy(contiguous=True), + "ome-tiff-tensor-numpy", + lambda: ( + OMEArrow.scan(str(fixture.path)) + .tensor_view(layout="TCZYX") + .to_numpy(contiguous=True) + ), + tiff_size, + ), + ( + "full-image", + "ome-tiff-bioio-numpy", + lambda: _read_tiff_full(fixture.path), + tiff_size, + ), + ( + "full-image", + "ome-zarr-tensor-numpy", + lambda: ( + OMEArrow.scan(str(zarr_path)) + .tensor_view(layout="TCZYX") + .to_numpy(contiguous=True) + ), zarr_size, ), ( "full-image", - "ome-arrow-image", - lambda: typed_image.pixels.read_image(image_image_id), - typed_image_size, + "ome-zarr-bioio-numpy", + lambda: _read_zarr_full(zarr_path), + zarr_size, + ), + ( + "plane", + "ome-tiff-tensor-numpy", + lambda: ( + OMEArrow.scan(str(fixture.path)) + .tensor_view(t=0, c=0, z=z_mid, layout="YX") + .to_numpy(contiguous=True) + ), + tiff_size, ), ( "plane", - "ome-zarr", - lambda: OMEArrow.scan(str(zarr_path)) - .tensor_view(t=0, c=0, z=z_mid, layout="YX") - .to_numpy(contiguous=True), + "ome-tiff-bioio-numpy", + lambda: _read_tiff_plane(fixture.path, t=0, c=0, z=z_mid), + tiff_size, + ), + ( + "plane", + "ome-zarr-tensor-numpy", + lambda: ( + OMEArrow.scan(str(zarr_path)) + .tensor_view(t=0, c=0, z=z_mid, layout="YX") + .to_numpy(contiguous=True) + ), zarr_size, ), ( "plane", - "ome-arrow-block", - lambda: typed_block.pixels.read_plane(block_image_id, t=0, c=0, z=z_mid), - typed_block_size, + "ome-zarr-bioio-numpy", + lambda: _read_zarr_plane(zarr_path, t=0, c=0, z=z_mid), + zarr_size, + ), + ( + "crop-2d", + "ome-tiff-tensor-numpy", + lambda: ( + OMEArrow.scan(str(fixture.path)) + .tensor_view(t=0, c=0, z=z_mid, roi=roi, layout="YX") + .to_numpy(contiguous=True) + ), + tiff_size, + ), + ( + "crop-2d", + "ome-tiff-bioio-numpy", + lambda: _read_tiff_crop( + fixture.path, + t=0, + c=0, + z=z_mid, + y=crop_y, + x=crop_x, + ), + tiff_size, ), ( "crop-2d", - "ome-zarr", - lambda: OMEArrow.scan(str(zarr_path)) - .tensor_view(t=0, c=0, z=z_mid, roi=roi, layout="YX") - .to_numpy(contiguous=True), + "ome-zarr-tensor-numpy", + lambda: ( + OMEArrow.scan(str(zarr_path)) + .tensor_view(t=0, c=0, z=z_mid, roi=roi, layout="YX") + .to_numpy(contiguous=True) + ), zarr_size, ), ( "crop-2d", - "ome-arrow-block", - lambda: typed_block.pixels.read_region( - block_image_id, + "ome-zarr-bioio-numpy", + lambda: _read_zarr_crop( + zarr_path, t=0, c=0, z=z_mid, y=crop_y, x=crop_x, ), - typed_block_size, + zarr_size, ), ] + for artifact in arrow_artifacts: + block_size = _dir_size(artifact.block_path) + image_size = _dir_size(artifact.image_path) + case_defs.extend( + [ + ( + "full-image", + f"{artifact.label}-numpy", + lambda artifact=artifact: artifact.image.read_image( + artifact.image_image_id + ), + image_size, + ), + ( + "plane", + f"{artifact.label}-numpy", + lambda artifact=artifact: artifact.block.read_plane( + artifact.block_image_id, + t=0, + c=0, + z=z_mid, + ), + block_size, + ), + ( + "crop-2d", + f"{artifact.label}-numpy", + lambda artifact=artifact: artifact.block.read_region( + artifact.block_image_id, + t=0, + c=0, + z=z_mid, + y=crop_y, + x=crop_x, + ), + block_size, + ), + ] + ) + + if _torch_available(): + case_defs.extend( + [ + ( + "full-image", + "ome-tiff-tensor-torch", + lambda: ( + OMEArrow.scan(str(fixture.path)) + .tensor_view(layout="TCZYX") + .to_torch(mode="numpy") + ), + tiff_size, + ), + ( + "full-image", + "ome-zarr-tensor-torch", + lambda: ( + OMEArrow.scan(str(zarr_path)) + .tensor_view(layout="TCZYX") + .to_torch(mode="numpy") + ), + zarr_size, + ), + ( + "crop-2d", + "ome-tiff-tensor-torch", + lambda: ( + OMEArrow.scan(str(fixture.path)) + .tensor_view(t=0, c=0, z=z_mid, roi=roi, layout="YX") + .to_torch(mode="numpy") + ), + tiff_size, + ), + ( + "crop-2d", + "ome-zarr-tensor-torch", + lambda: ( + OMEArrow.scan(str(zarr_path)) + .tensor_view(t=0, c=0, z=z_mid, roi=roi, layout="YX") + .to_torch(mode="numpy") + ), + zarr_size, + ), + ] + ) + for artifact in arrow_artifacts: + block_size = _dir_size(artifact.block_path) + image_size = _dir_size(artifact.image_path) + case_defs.extend( + [ + ( + "full-image", + f"{artifact.label}-torch", + lambda artifact=artifact: artifact.image.read_image( + artifact.image_image_id, + return_type="torch", + ), + image_size, + ), + ( + "crop-2d", + f"{artifact.label}-torch", + lambda artifact=artifact: artifact.block.read_region( + artifact.block_image_id, + t=0, + c=0, + z=z_mid, + y=crop_y, + x=crop_x, + return_type="torch", + ), + block_size, + ), + ] + ) + + if _jax_available(): + case_defs.extend( + [ + ( + "full-image", + "ome-tiff-tensor-jax", + lambda: ( + OMEArrow.scan(str(fixture.path)) + .tensor_view(layout="TCZYX") + .to_jax(mode="numpy") + ), + tiff_size, + ), + ( + "full-image", + "ome-zarr-tensor-jax", + lambda: ( + OMEArrow.scan(str(zarr_path)) + .tensor_view(layout="TCZYX") + .to_jax(mode="numpy") + ), + zarr_size, + ), + ( + "crop-2d", + "ome-tiff-tensor-jax", + lambda: ( + OMEArrow.scan(str(fixture.path)) + .tensor_view(t=0, c=0, z=z_mid, roi=roi, layout="YX") + .to_jax(mode="numpy") + ), + tiff_size, + ), + ( + "crop-2d", + "ome-zarr-tensor-jax", + lambda: ( + OMEArrow.scan(str(zarr_path)) + .tensor_view(t=0, c=0, z=z_mid, roi=roi, layout="YX") + .to_jax(mode="numpy") + ), + zarr_size, + ), + ] + ) + for artifact in arrow_artifacts: + block_size = _dir_size(artifact.block_path) + image_size = _dir_size(artifact.image_path) + case_defs.extend( + [ + ( + "full-image", + f"{artifact.label}-jax", + lambda artifact=artifact: artifact.image.read_image( + artifact.image_image_id, + return_type="jax", + ), + image_size, + ), + ( + "crop-2d", + f"{artifact.label}-jax", + lambda artifact=artifact: artifact.block.read_region( + artifact.block_image_id, + t=0, + c=0, + z=z_mid, + y=crop_y, + x=crop_x, + return_type="jax", + ), + block_size, + ), + ] + ) + if sz > 1: case_defs.extend( [ ( "subvolume", - "ome-zarr", - lambda: OMEArrow.scan(str(zarr_path)) - .tensor_view(t=0, c=0, z=z_sub, roi=roi, layout="TCZYX") - .to_numpy(contiguous=True), + "ome-tiff-tensor-numpy", + lambda: ( + OMEArrow.scan(str(fixture.path)) + .tensor_view(t=0, c=0, z=z_sub, roi=roi, layout="TCZYX") + .to_numpy(contiguous=True) + ), + tiff_size, + ), + ( + "subvolume", + "ome-tiff-bioio-numpy", + lambda: _read_tiff_subvolume( + fixture.path, + t=0, + c=0, + z=z_sub, + y=crop_y, + x=crop_x, + ), + tiff_size, + ), + ( + "subvolume", + "ome-zarr-tensor-numpy", + lambda: ( + OMEArrow.scan(str(zarr_path)) + .tensor_view(t=0, c=0, z=z_sub, roi=roi, layout="TCZYX") + .to_numpy(contiguous=True) + ), zarr_size, ), ( "subvolume", - "ome-arrow-block", - lambda: typed_block.pixels.read_region( - block_image_id, + "ome-zarr-bioio-numpy", + lambda: _read_zarr_subvolume( + zarr_path, t=0, c=0, z=z_sub, y=crop_y, x=crop_x, ), - typed_block_size, + zarr_size, ), ] ) + for artifact in arrow_artifacts: + block_size = _dir_size(artifact.block_path) + case_defs.append( + ( + "subvolume", + f"{artifact.label}-numpy", + lambda artifact=artifact: artifact.block.read_region( + artifact.block_image_id, + t=0, + c=0, + z=z_sub, + y=crop_y, + x=crop_x, + ), + block_size, + ) + ) if st > 1: case_defs.extend( [ ( "timepoint-plane", - "ome-zarr", - lambda: OMEArrow.scan(str(zarr_path)) - .tensor_view(t=t_mid, c=0, z=z_mid, layout="YX") - .to_numpy(contiguous=True), + "ome-tiff-tensor-numpy", + lambda: ( + OMEArrow.scan(str(fixture.path)) + .tensor_view(t=t_mid, c=0, z=z_mid, layout="YX") + .to_numpy(contiguous=True) + ), + tiff_size, + ), + ( + "timepoint-plane", + "ome-tiff-bioio-numpy", + lambda: _read_tiff_plane(fixture.path, t=t_mid, c=0, z=z_mid), + tiff_size, + ), + ( + "timepoint-plane", + "ome-zarr-tensor-numpy", + lambda: ( + OMEArrow.scan(str(zarr_path)) + .tensor_view(t=t_mid, c=0, z=z_mid, layout="YX") + .to_numpy(contiguous=True) + ), zarr_size, ), ( "timepoint-plane", - "ome-arrow-block", - lambda: typed_block.pixels.read_plane( - block_image_id, + "ome-zarr-bioio-numpy", + lambda: _read_zarr_plane(zarr_path, t=t_mid, c=0, z=z_mid), + zarr_size, + ), + ] + ) + for artifact in arrow_artifacts: + block_size = _dir_size(artifact.block_path) + case_defs.append( + ( + "timepoint-plane", + f"{artifact.label}-numpy", + lambda artifact=artifact: artifact.block.read_plane( + artifact.block_image_id, t=t_mid, c=0, z=z_mid, ), - typed_block_size, - ), - ] - ) + block_size, + ) + ) if sc > 1: case_defs.extend( [ ( "channel-plane", - "ome-zarr", - lambda: OMEArrow.scan(str(zarr_path)) - .tensor_view(t=0, c=c_mid, z=z_mid, layout="YX") - .to_numpy(contiguous=True), + "ome-tiff-tensor-numpy", + lambda: ( + OMEArrow.scan(str(fixture.path)) + .tensor_view(t=0, c=c_mid, z=z_mid, layout="YX") + .to_numpy(contiguous=True) + ), + tiff_size, + ), + ( + "channel-plane", + "ome-tiff-bioio-numpy", + lambda: _read_tiff_plane(fixture.path, t=0, c=c_mid, z=z_mid), + tiff_size, + ), + ( + "channel-plane", + "ome-zarr-tensor-numpy", + lambda: ( + OMEArrow.scan(str(zarr_path)) + .tensor_view(t=0, c=c_mid, z=z_mid, layout="YX") + .to_numpy(contiguous=True) + ), zarr_size, ), ( "channel-plane", - "ome-arrow-block", - lambda: typed_block.pixels.read_plane( - block_image_id, + "ome-zarr-bioio-numpy", + lambda: _read_zarr_plane(zarr_path, t=0, c=c_mid, z=z_mid), + zarr_size, + ), + ] + ) + for artifact in arrow_artifacts: + block_size = _dir_size(artifact.block_path) + case_defs.append( + ( + "channel-plane", + f"{artifact.label}-numpy", + lambda artifact=artifact: artifact.block.read_plane( + artifact.block_image_id, t=0, c=c_mid, z=z_mid, ), - typed_block_size, - ), - ] - ) + block_size, + ) + ) results = [] for case, format_name, fn, bytes_on_disk in case_defs: @@ -397,13 +884,13 @@ def _print_results(results: list[BenchmarkResult]) -> None: print("") print("OME-IRIS-style benchmark (ms)") print( - f"{'dataset':24} {'case':18} {'format':16} {'median':>9} " + f"{'dataset':24} {'case':18} {'format':24} {'median':>9} " f"{'min':>9} {'max':>9} {'shape':>18} {'dtype':>8} {'MB':>8}" ) - print("-" * 112) + print("-" * 120) for r in results: print( - f"{r.dataset:24} {r.case:18} {r.format:16} " + f"{r.dataset:24} {r.case:18} {r.format:24} " f"{r.median_ms:9.2f} {r.min_ms:9.2f} {r.max_ms:9.2f} " f"{r.shape!s:>18} {r.dtype:>8} {r.bytes_on_disk / 1_000_000:8.2f}" ) @@ -467,31 +954,19 @@ def main() -> None: print(f"Skipping missing fixture: {fixture.name} -> {fixture.path}") continue print(f"Preparing {fixture.name}: {fixture.path}") - source, arr = _load_source(fixture.path) - ( - zarr_path, - typed_block_path, - typed_image_path, - typed_block, - block_image_id, - typed_image, - image_image_id, - ) = _write_artifacts( + source, arr, raw_arr = _load_source(fixture.path) + zarr_path, arrow_artifacts = _write_artifacts( fixture, source, arr, + raw_arr, workdir, ) all_results.extend( _cases_for_fixture( fixture, zarr_path, - typed_block_path, - typed_image_path, - typed_block, - block_image_id, - typed_image, - image_image_id, + arrow_artifacts, arr, repeats=args.repeats, warmup=args.warmup, diff --git a/src/ome_arrow/dataset.py b/src/ome_arrow/dataset.py index 0096c19..2e327fc 100644 --- a/src/ome_arrow/dataset.py +++ b/src/ome_arrow/dataset.py @@ -16,6 +16,7 @@ from ome_arrow.meta import OME_ARROW_TAG_TYPE, OME_ARROW_TAG_VERSION Layout = Literal["auto", "image", "tile", "volume", "z-plane", "block", "hybrid"] +ArrayReturn = Literal["numpy", "torch", "jax"] AccessPattern = Literal[ "balanced", "full_image", @@ -242,6 +243,27 @@ def _as_tczyx_array(image: Any) -> tuple[np.ndarray, dict[str, Any]]: ) +def _cast_pixel_dtype( + arr: np.ndarray, + pixel_dtype: np.dtype | str | None, + *, + clamp: bool, +) -> np.ndarray: + """Return ``arr`` in the requested pixel dtype, preserving dtype by default.""" + if pixel_dtype is None: + return arr + + dtype = np.dtype(pixel_dtype) + if arr.dtype == dtype: + return arr + + out = arr + if clamp and np.issubdtype(dtype, np.integer): + info = np.iinfo(dtype) + out = np.clip(out, info.min, info.max) + return out.astype(dtype, copy=False) + + def _image_metadata( *, image: Any, @@ -339,6 +361,8 @@ def write_ome_arrow_dataset( compression: str | None = "zstd", build_physical_index: bool = True, chunk_rows_per_row_group: int = 1, + pixel_dtype: np.dtype | str | None = None, + clamp: bool = True, ) -> ChunkChoice: """Write a metadata table and typed pixel chunk table. @@ -356,6 +380,11 @@ def write_ome_arrow_dataset( chunk_rows_per_row_group: Number of chunk rows to pack into each row group. ``1`` gives fastest direct chunk reads. Larger values reduce Parquet row-group overhead and can improve storage for small chunks. + pixel_dtype: Optional output dtype for stored pixel buffers. ``None`` + preserves the source dtype. Set to ``"uint16"`` to normalize inputs + to the legacy OME-Arrow pixel dtype. + clamp: Whether to clamp values to the output dtype range before casting + when ``pixel_dtype`` is set to an integer dtype. Returns: ChunkChoice: The choice used for the first image. All images currently @@ -374,6 +403,7 @@ def write_ome_arrow_dataset( next_chunk_id = 0 for i, image in enumerate(images): arr, record = _as_tczyx_array(image) + arr = _cast_pixel_dtype(arr, pixel_dtype, clamp=clamp) choice = choose_chunking( arr.shape, arr.dtype, @@ -596,6 +626,11 @@ def __init__(self, path: str | Path) -> None: self.index = pq.read_table(index_path) if index_path.exists() else None self.pixels = OMEArrowPixels(self) + @property + def image_ids(self) -> list[str]: + """Return image identifiers in this dataset.""" + return [str(v) for v in self.images["image_id"].to_pylist()] + def image_metadata(self, image_id: str) -> dict[str, Any]: """Return one image metadata row as a dictionary.""" mask = pc.equal(self.images["image_id"], str(image_id)) @@ -604,6 +639,90 @@ def image_metadata(self, image_id: str) -> dict[str, Any]: raise KeyError(f"image_id not found: {image_id}") return rows[0] + def read_image( + self, + image_id: str | None = None, + *, + return_type: ArrayReturn = "numpy", + ) -> Any: + """Read one image as a dense ``TCZYX`` NumPy array. + + Args: + image_id: Image identifier. When omitted, the first image is read. + return_type: Array backend to return: ``"numpy"``, ``"torch"``, + or ``"jax"``. + + Returns: + Any: Dense TCZYX image in the requested array backend. + """ + arr = self.pixels.read_image(self._resolve_image_id(image_id)) + return _convert_return(arr, return_type) + + def read_many( + self, + image_ids: Iterable[str], + *, + return_type: ArrayReturn = "numpy", + ) -> list[Any]: + """Read multiple images by identifier.""" + arrays = self.pixels.read_many(image_ids) + return [_convert_return(arr, return_type) for arr in arrays] + + def read_channel( + self, + image_id: str | None = None, + channel: int = 0, + *, + return_type: ArrayReturn = "numpy", + ) -> Any: + """Read one channel as a ``TZYX`` NumPy array.""" + arr = self.pixels.read_channel(self._resolve_image_id(image_id), channel) + return _convert_return(arr, return_type) + + def read_plane( + self, + image_id: str | None = None, + *, + t: int = 0, + c: int = 0, + z: int = 0, + return_type: ArrayReturn = "numpy", + ) -> Any: + """Read one ``YX`` plane.""" + arr = self.pixels.read_plane(self._resolve_image_id(image_id), t=t, c=c, z=z) + return _convert_return(arr, return_type) + + def read_region( + self, + image_id: str | None = None, + *, + y: slice, + x: slice, + t: int | slice | None = None, + c: int | slice | None = None, + z: int | slice | None = None, + return_type: ArrayReturn = "numpy", + ) -> Any: + """Read a spatial region as a dense ``TCZYX`` NumPy array.""" + arr = self.pixels.read_region( + self._resolve_image_id(image_id), + y=y, + x=x, + t=t, + c=c, + z=z, + ) + return _convert_return(arr, return_type) + + def _resolve_image_id(self, image_id: str | None) -> str: + """Resolve an optional image ID to the first image when omitted.""" + if image_id is not None: + return str(image_id) + image_ids = self.image_ids + if not image_ids: + raise ValueError("Dataset contains no image IDs.") + return image_ids[0] + def _matching_chunks( self, image_id: str, @@ -677,6 +796,25 @@ def _as_slice(value: int | slice | None) -> slice | None: return slice(i, i + 1) +def _convert_return(arr: np.ndarray, return_type: ArrayReturn) -> Any: + """Convert a NumPy read result to the requested array backend.""" + if return_type == "numpy": + return arr + if return_type == "torch": + try: + import torch + except ImportError as exc: + raise RuntimeError("Torch is not installed.") from exc + return torch.as_tensor(arr) + if return_type == "jax": + try: + import jax.numpy as jnp + except ImportError as exc: + raise RuntimeError("JAX is not installed.") from exc + return jnp.asarray(arr) + raise ValueError("return_type must be one of 'numpy', 'torch', or 'jax'") + + def _normalize_region( slices: tuple[slice | None, slice | None, slice | None, slice | None, slice | None], bounds: tuple[int, int, int, int, int], diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 67115c0..c25ad42 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -4,6 +4,7 @@ import numpy as np import pyarrow.parquet as pq +import pytest from ome_arrow import ( OMEArrow, @@ -38,15 +39,38 @@ def test_write_dataset_preserves_dtype_and_reads_image(tmp_path: pathlib.Path) - compression=None, ) dataset = OMEArrowDataset(out) - image_id = dataset.images["image_id"].to_pylist()[0] - roundtrip = dataset.pixels.read_image(image_id) + image_id = dataset.image_ids[0] + roundtrip = dataset.read_image() assert choice.layout == "image" + assert dataset.image_ids == [image_id] assert roundtrip.dtype == np.uint8 np.testing.assert_array_equal(roundtrip, arr) assert pq.ParquetFile(out / "chunks.parquet").metadata.num_row_groups == 1 +def test_write_dataset_can_normalize_pixel_dtype(tmp_path: pathlib.Path) -> None: + """Explicitly widen stored pixels when callers ask for a target dtype.""" + arr = np.array([[[[[0, 255], [256, 70000]]]]], dtype=np.uint32) + out = tmp_path / "typed_dataset_uint16" + + write_ome_arrow_dataset( + [arr], + out, + layout="image", + compression=None, + pixel_dtype="uint16", + ) + dataset = OMEArrowDataset(out) + roundtrip = dataset.read_image() + + assert roundtrip.dtype == np.uint16 + np.testing.assert_array_equal( + roundtrip, + np.array([[[[[0, 255], [256, 65535]]]]], dtype=np.uint16), + ) + + def test_dataset_reads_plane_and_region_from_indexed_tiles( tmp_path: pathlib.Path, ) -> None: @@ -62,11 +86,9 @@ def test_dataset_reads_plane_and_region_from_indexed_tiles( compression="zstd", ) dataset = OMEArrowDataset(out) - image_id = dataset.images["image_id"].to_pylist()[0] - plane = dataset.pixels.read_plane(image_id, z=1) - region = dataset.pixels.read_region( - image_id, + plane = dataset.read_plane(z=1) + region = dataset.read_region( z=1, y=slice(2, 6), x=slice(3, 7), @@ -91,10 +113,8 @@ def test_dataset_reads_from_packed_chunk_row_groups(tmp_path: pathlib.Path) -> N chunk_rows_per_row_group=4, ) dataset = OMEArrowDataset(out) - image_id = dataset.images["image_id"].to_pylist()[0] - region = dataset.pixels.read_region( - image_id, + region = dataset.read_region( z=1, y=slice(2, 6), x=slice(3, 7), @@ -112,8 +132,37 @@ def test_dataset_accepts_ome_arrow_records(tmp_path: pathlib.Path) -> None: write_ome_arrow_dataset([oa], out, layout="z-plane", compression=None) dataset = OMEArrowDataset(out) - image_id = dataset.images["image_id"].to_pylist()[0] - roundtrip = dataset.pixels.read_image(image_id) + roundtrip = dataset.read_image() assert roundtrip.dtype == np.uint16 np.testing.assert_array_equal(roundtrip, arr) + + +def test_dataset_read_torch_return_type(tmp_path: pathlib.Path) -> None: + """Read typed dataset pixels directly as torch tensors.""" + torch = pytest.importorskip("torch") + arr = np.arange(1 * 1 * 1 * 3 * 4, dtype=np.uint16).reshape(1, 1, 1, 3, 4) + out = tmp_path / "torch_return" + + write_ome_arrow_dataset([arr], out, layout="image", compression=None) + dataset = OMEArrowDataset(out) + + tensor = dataset.read_image(return_type="torch") + assert isinstance(tensor, torch.Tensor) + assert tensor.dtype == torch.uint16 + np.testing.assert_array_equal(tensor.numpy(), arr) + + +def test_dataset_read_jax_return_type(tmp_path: pathlib.Path) -> None: + """Read typed dataset pixels directly as JAX arrays.""" + jnp = pytest.importorskip("jax.numpy") + arr = np.arange(1 * 1 * 1 * 3 * 4, dtype=np.uint16).reshape(1, 1, 1, 3, 4) + out = tmp_path / "jax_return" + + write_ome_arrow_dataset([arr], out, layout="image", compression=None) + dataset = OMEArrowDataset(out) + + jax_arr = dataset.read_image(return_type="jax") + assert isinstance(jax_arr, jnp.ndarray) + assert jax_arr.dtype == jnp.uint16 + np.testing.assert_array_equal(np.asarray(jax_arr), arr) From 550170ed4d49c1db5259c8ac5e6d97166e6859bb Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 4 Jun 2026 14:32:39 -0600 Subject: [PATCH 3/9] address coderabbit review comments --- README.md | 9 +++-- benchmarks/benchmark_lazy_tensor.py | 3 +- benchmarks/benchmark_ome_iris.py | 14 ++++++- src/ome_arrow/__init__.py | 3 ++ src/ome_arrow/dataset.py | 20 +++++++++- tests/test_dataset.py | 62 +++++++++++++++++++++++++++++ 6 files changed, 103 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e4b5c20..8cdaa63 100644 --- a/README.md +++ b/README.md @@ -122,10 +122,11 @@ See full docs: [`docs/src/dlpack.md`](docs/src/dlpack.md) ## Typed chunk datasets -For selective pixel reads, use the typed byte-buffer dataset writer. This stores -image metadata separately from pixel chunks and writes one chunk per Parquet row -group so `read_plane()` and `read_region()` can jump through a physical index -instead of materializing the older nested struct payload. +For selective pixel reads, use the typed byte-buffer dataset writer. By default, +this stores image metadata separately from pixel chunks and writes one chunk per +Parquet row group, so `read_plane()` and `read_region()` can jump through a +physical index instead of materializing the older nested struct payload. You can +change that row-group packing with `chunk_rows_per_row_group`. ```python import numpy as np diff --git a/benchmarks/benchmark_lazy_tensor.py b/benchmarks/benchmark_lazy_tensor.py index 9581a74..86fb14d 100644 --- a/benchmarks/benchmark_lazy_tensor.py +++ b/benchmarks/benchmark_lazy_tensor.py @@ -22,8 +22,7 @@ import numpy as np -from ome_arrow import OMEArrow -from ome_arrow.dataset import OMEArrowDataset, write_ome_arrow_dataset +from ome_arrow import OMEArrow, OMEArrowDataset, write_ome_arrow_dataset from ome_arrow.export import to_ome_parquet from ome_arrow.ingest import from_numpy diff --git a/benchmarks/benchmark_ome_iris.py b/benchmarks/benchmark_ome_iris.py index 6d2d5aa..d3a7a7d 100644 --- a/benchmarks/benchmark_ome_iris.py +++ b/benchmarks/benchmark_ome_iris.py @@ -102,14 +102,25 @@ def _parse_fixture_arg(raw: str) -> Fixture: else: path = raw name = Path(path).stem + fixture_path = Path(path) + _validate_tiff_fixture(fixture_path) return Fixture( name=name, - path=Path(path), + path=fixture_path, preferred_chunk_shape=(1, 1, 1, 256, 256), chunk_rows_per_row_group=1, ) +def _validate_tiff_fixture(path: Path) -> None: + """Validate that a benchmark fixture path targets a TIFF source image.""" + if path.suffix.lower() not in {".tif", ".tiff"}: + raise ValueError( + "OME-IRIS-style benchmark fixtures must be TIFF files because " + f"_load_source uses _read_tiff_full; got {path!s}" + ) + + def _dir_size(path: Path) -> int: """Return total bytes for a file or directory.""" if not path.exists(): @@ -270,6 +281,7 @@ def _read_tiff_subvolume( def _load_source(path: Path) -> tuple[OMEArrow, np.ndarray, np.ndarray]: """Load source image arrays for normalized and raw dtype comparisons.""" + _validate_tiff_fixture(path) oa = OMEArrow(str(path)) normalized = oa.export( how="numpy", diff --git a/src/ome_arrow/__init__.py b/src/ome_arrow/__init__.py index 57b273b..8ee8e4c 100644 --- a/src/ome_arrow/__init__.py +++ b/src/ome_arrow/__init__.py @@ -5,6 +5,9 @@ from ome_arrow._version import version as ome_arrow_version from ome_arrow.core import OMEArrow from ome_arrow.dataset import ( + AccessPattern, + ChunkChoice, + Layout, OMEArrowDataset, OMEArrowPixels, choose_chunking, diff --git a/src/ome_arrow/dataset.py b/src/ome_arrow/dataset.py index 2e327fc..12e853b 100644 --- a/src/ome_arrow/dataset.py +++ b/src/ome_arrow/dataset.py @@ -132,6 +132,17 @@ def choose_chunking( if min(st, sc, sz, sy, sx) <= 0: raise ValueError("shape values must be positive") dtype_obj = np.dtype(dtype) + try: + target_chunk_mb = float(target_chunk_mb) + except (TypeError, ValueError) as exc: + raise ValueError( + "target_chunk_mb must be a positive number; cannot compare " + "target_chunk_mb to computed size_mb." + ) from exc + if target_chunk_mb <= 0: + raise ValueError( + "target_chunk_mb must be > 0 before computing chunk scale from size_mb." + ) if layout == "auto": layout_by_pattern = { @@ -421,6 +432,10 @@ def write_ome_arrow_dataset( index=i, choice=choice, ) + if any(row["image_id"] == meta["image_id"] for row in image_rows): + raise ValueError( + f"Duplicate image_id in OME-Arrow dataset write: {meta['image_id']!r}" + ) image_rows.append(meta) new_rows = list( _iter_chunk_rows( @@ -451,6 +466,7 @@ def write_ome_arrow_dataset( row_group_size=int(chunk_rows_per_row_group), ) + index_path = out_dir / "index.parquet" if build_physical_index: index_rows = [ { @@ -474,7 +490,9 @@ def write_ome_arrow_dataset( ] index_table = pa.Table.from_pylist(index_rows, schema=INDEX_TABLE_SCHEMA) index_table = index_table.replace_schema_metadata(metadata) - pq.write_table(index_table, out_dir / "index.parquet", compression=compression) + pq.write_table(index_table, index_path, compression=compression) + elif index_path.exists(): + index_path.unlink() manifest = { "type": OME_ARROW_TAG_TYPE, diff --git a/tests/test_dataset.py b/tests/test_dataset.py index c25ad42..fa6b011 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -7,9 +7,13 @@ import pytest from ome_arrow import ( + AccessPattern, + ChunkChoice, + Layout, OMEArrow, OMEArrowDataset, choose_chunking, + from_numpy, write_ome_arrow_dataset, ) @@ -27,6 +31,24 @@ def test_choose_chunking_presets() -> None: assert "fast_crop_2d" in choice.rationale +def test_public_dataset_typing_exports() -> None: + """Expose dataset typing helpers from the package root.""" + assert ChunkChoice is not None + assert Layout is not None + assert AccessPattern is not None + + +def test_choose_chunking_rejects_nonpositive_target_chunk_mb() -> None: + """Fail before chunk scaling when target_chunk_mb is invalid.""" + with pytest.raises(ValueError, match=r"target_chunk_mb.*size_mb"): + choose_chunking( + (1, 1, 4, 1024, 1024), + np.uint16, + access_pattern="fast_crop_2d", + target_chunk_mb=0, + ) + + def test_write_dataset_preserves_dtype_and_reads_image(tmp_path: pathlib.Path) -> None: """Round-trip a typed byte-buffer dataset without widening pixels.""" arr = np.arange(1 * 2 * 2 * 5 * 6, dtype=np.uint8).reshape(1, 2, 2, 5, 6) @@ -49,6 +71,46 @@ def test_write_dataset_preserves_dtype_and_reads_image(tmp_path: pathlib.Path) - assert pq.ParquetFile(out / "chunks.parquet").metadata.num_row_groups == 1 +def test_write_dataset_rejects_duplicate_image_ids(tmp_path: pathlib.Path) -> None: + """Reject duplicate image IDs before chunk rows can merge at read time.""" + arr = np.arange(1 * 1 * 1 * 3 * 4, dtype=np.uint16).reshape(1, 1, 1, 3, 4) + first = from_numpy(arr, image_id="duplicate", build_chunks=False) + second = from_numpy(arr + 1, image_id="duplicate", build_chunks=False) + + with pytest.raises(ValueError, match="Duplicate image_id"): + write_ome_arrow_dataset([first, second], tmp_path / "duplicates") + + +def test_write_dataset_removes_stale_index_when_index_disabled( + tmp_path: pathlib.Path, +) -> None: + """Avoid loading stale physical index metadata after rewriting a dataset.""" + arr = np.arange(1 * 1 * 2 * 6 * 7, dtype=np.uint16).reshape(1, 1, 2, 6, 7) + out = tmp_path / "stale_index" + + write_ome_arrow_dataset( + [arr], + out, + layout="tile", + chunk_shape=(1, 1, 1, 3, 4), + compression=None, + build_physical_index=True, + ) + assert (out / "index.parquet").exists() + + write_ome_arrow_dataset( + [arr], + out, + layout="tile", + chunk_shape=(1, 1, 1, 3, 4), + compression=None, + build_physical_index=False, + ) + + assert not (out / "index.parquet").exists() + assert OMEArrowDataset(out).index is None + + def test_write_dataset_can_normalize_pixel_dtype(tmp_path: pathlib.Path) -> None: """Explicitly widen stored pixels when callers ask for a target dtype.""" arr = np.array([[[[[0, 255], [256, 70000]]]]], dtype=np.uint32) From 94eb5410dda7bc1e45f334586208b79639a15f4d Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 4 Jun 2026 15:36:04 -0600 Subject: [PATCH 4/9] address the core --- README.md | 17 +++-- benchmarks/benchmark_ome_iris.py | 86 ++++++++++++++++++++++++-- src/ome_arrow/core.py | 103 ++++++++++++++++++++++++++----- src/ome_arrow/dataset.py | 75 ++++++++++++++++++++++ tests/test_core.py | 14 +++++ tests/test_dataset.py | 45 ++++++++++++++ 6 files changed, 314 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 8cdaa63..97b82f0 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ oa_image.view(how="matplotlib") # (great for ZYX 3D images; install extras: `pip install 'ome-arrow[viz]'`). oa_image.view(how="pyvista") -# Export to OME-Parquet. +# Export to OME-Parquet. This writes the typed chunk dataset layout. # We can also export OME-TIFF, OME-Zarr or NumPy arrays. oa_image.export(how="ome-parquet", out="your_image.ome.parquet") @@ -122,11 +122,12 @@ See full docs: [`docs/src/dlpack.md`](docs/src/dlpack.md) ## Typed chunk datasets -For selective pixel reads, use the typed byte-buffer dataset writer. By default, -this stores image metadata separately from pixel chunks and writes one chunk per -Parquet row group, so `read_plane()` and `read_region()` can jump through a -physical index instead of materializing the older nested struct payload. You can -change that row-group packing with `chunk_rows_per_row_group`. +`OMEArrow.export(how="ome-parquet")` writes the typed byte-buffer dataset layout. +For explicit control over layout and chunks, use the dataset writer directly. +By default, this stores image metadata separately from pixel chunks and writes +one chunk per Parquet row group, so `read_plane()` and `read_region()` can jump +through a physical index instead of materializing the older nested struct +payload. You can change that row-group packing with `chunk_rows_per_row_group`. ```python import numpy as np @@ -237,6 +238,10 @@ The OME-IRIS-style benchmark separates return/API paths: - `ome-arrow-src-numpy`: source-dtype typed OME-Arrow dataset NumPy reads. - `ome-arrow-u16-numpy`: typed OME-Arrow dataset NumPy reads normalized to `uint16` for apples-to-apples comparisons with normalized paths. +- `ome-arrow-u16-raw-numpy`: normalized `uint16` typed OME-Arrow reads with + uncompressed chunk bytes for local speed comparisons. +- `ome-arrow-*-chunks`: Arrow-native raw chunk-row reads that return + `pixel_bytes` without decoding into NumPy. - `ome-tiff-tensor-torch` / `ome-tiff-tensor-jax`: OME-Arrow tensor-view Torch/JAX returns over TIFF. - `ome-zarr-tensor-torch` / `ome-zarr-tensor-jax`: OME-Arrow tensor-view diff --git a/benchmarks/benchmark_ome_iris.py b/benchmarks/benchmark_ome_iris.py index d3a7a7d..a5f5016 100644 --- a/benchmarks/benchmark_ome_iris.py +++ b/benchmarks/benchmark_ome_iris.py @@ -179,6 +179,8 @@ def _sync_result(out: Any) -> None: def _result_shape_dtype(out: Any) -> tuple[tuple[int, ...], str]: """Return shape and dtype strings for NumPy, Torch, and JAX-like arrays.""" + if hasattr(out, "num_rows") and hasattr(out, "num_columns"): + return (int(out.num_rows), int(out.num_columns)), "arrow-table" shape = tuple(int(v) for v in getattr(out, "shape", ())) dtype = getattr(out, "dtype", None) if dtype is None: @@ -314,9 +316,10 @@ def _write_artifacts( ) artifacts: list[ArrowArtifact] = [] - for label, arr, pixel_dtype in ( - ("ome-arrow-src", raw_arr, None), - ("ome-arrow-u16", normalized_arr, "uint16"), + for label, arr, pixel_dtype, compression in ( + ("ome-arrow-src", raw_arr, None, "zstd"), + ("ome-arrow-u16", normalized_arr, "uint16", "zstd"), + ("ome-arrow-u16-raw", normalized_arr, "uint16", None), ): typed_block_path = workdir / f"{fixture.name}.{label}.block.ome-arrow" typed_image_path = workdir / f"{fixture.name}.{label}.image.ome-arrow" @@ -325,7 +328,7 @@ def _write_artifacts( typed_block_path, layout="block", chunk_shape=fixture.preferred_chunk_shape, - compression="zstd", + compression=compression, chunk_rows_per_row_group=fixture.chunk_rows_per_row_group, pixel_dtype=pixel_dtype, ) @@ -333,7 +336,7 @@ def _write_artifacts( [arr], typed_image_path, layout="image", - compression="zstd", + compression=compression, pixel_dtype=pixel_dtype, ) typed_block = OMEArrowDataset(typed_block_path) @@ -521,6 +524,14 @@ def _cases_for_fixture( ), image_size, ), + ( + "full-image", + f"{artifact.label}-chunks", + lambda artifact=artifact: artifact.image.read_chunk_rows( + artifact.image_image_id + ), + image_size, + ), ( "plane", f"{artifact.label}-numpy", @@ -532,6 +543,17 @@ def _cases_for_fixture( ), block_size, ), + ( + "plane", + f"{artifact.label}-chunks", + lambda artifact=artifact: artifact.block.read_chunk_rows( + artifact.block_image_id, + t=0, + c=0, + z=z_mid, + ), + block_size, + ), ( "crop-2d", f"{artifact.label}-numpy", @@ -545,6 +567,19 @@ def _cases_for_fixture( ), block_size, ), + ( + "crop-2d", + f"{artifact.label}-chunks", + lambda artifact=artifact: artifact.block.read_chunk_rows( + artifact.block_image_id, + t=0, + c=0, + z=z_mid, + y=crop_y, + x=crop_x, + ), + block_size, + ), ] ) @@ -768,6 +803,21 @@ def _cases_for_fixture( block_size, ) ) + case_defs.append( + ( + "subvolume", + f"{artifact.label}-chunks", + lambda artifact=artifact: artifact.block.read_chunk_rows( + artifact.block_image_id, + t=0, + c=0, + z=z_sub, + y=crop_y, + x=crop_x, + ), + block_size, + ) + ) if st > 1: case_defs.extend( @@ -821,6 +871,19 @@ def _cases_for_fixture( block_size, ) ) + case_defs.append( + ( + "timepoint-plane", + f"{artifact.label}-chunks", + lambda artifact=artifact: artifact.block.read_chunk_rows( + artifact.block_image_id, + t=t_mid, + c=0, + z=z_mid, + ), + block_size, + ) + ) if sc > 1: case_defs.extend( @@ -874,6 +937,19 @@ def _cases_for_fixture( block_size, ) ) + case_defs.append( + ( + "channel-plane", + f"{artifact.label}-chunks", + lambda artifact=artifact: artifact.block.read_chunk_rows( + artifact.block_image_id, + t=0, + c=c_mid, + z=z_mid, + ), + block_size, + ) + ) results = [] for case, format_name, fn, bytes_on_disk in case_defs: diff --git a/src/ome_arrow/core.py b/src/ome_arrow/core.py index 993c6e6..97f9145 100644 --- a/src/ome_arrow/core.py +++ b/src/ome_arrow/core.py @@ -23,7 +23,6 @@ from ome_arrow.export import ( to_numpy, - to_ome_parquet, to_ome_tiff, to_ome_vortex, to_ome_zarr, @@ -144,6 +143,7 @@ def __init__( self.tcz = tcz self._data: pa.StructScalar | None = None self._struct_array: pa.StructArray | None = None + self._dataset: Any | None = None self._lazy_source: _LazySourceSpec | None = None self._lazy_slices: list[_LazySliceSpec] = [] @@ -176,12 +176,16 @@ def __init__( # --- 2) String path/URL: OME-Zarr / OME-Parquet / OME-TIFF --------------- elif isinstance(data, str): - self.data, self._struct_array = self._load_from_string_source( - data, - column_name=column_name, - row_index=row_index, - image_type=image_type, - ) + dataset = self._try_open_dataset(data) + if dataset is not None: + self._dataset = dataset + else: + self.data, self._struct_array = self._load_from_string_source( + data, + column_name=column_name, + row_index=row_index, + image_type=image_type, + ) # --- 3) NumPy ndarray ---------------------------------------------------- elif isinstance(data, np.ndarray): @@ -278,6 +282,8 @@ def data(self) -> pa.StructScalar: RuntimeError: If the record could not be initialized. """ self._ensure_materialized() + if self._data is None and self._dataset is not None: + self._data = self._dataset_to_scalar() if self._data is None: raise RuntimeError("OMEArrow data is not initialized.") return self._data @@ -295,6 +301,35 @@ def collect(self) -> "OMEArrow": self._ensure_materialized() return self + @staticmethod + def _try_open_dataset(data: str) -> Any | None: + """Open a typed OME-Arrow dataset path if the manifest is present.""" + path = pathlib.Path(data.strip()) + if not (path.exists() and path.is_dir()): + return None + if not (path / "_ome_arrow_dataset.json").exists(): + return None + from ome_arrow.dataset import OMEArrowDataset + + return OMEArrowDataset(path) + + def _dataset_to_scalar(self) -> pa.StructScalar: + """Materialize a dataset-backed image into the legacy scalar shape.""" + if self._dataset is None: + raise RuntimeError("OMEArrow has no dataset backing.") + image_id = self._dataset.image_ids[0] + meta = self._dataset.image_metadata(image_id) + arr = self._dataset.read_image(image_id) + return from_numpy( + arr, + dim_order="TCZYX", + image_id=str(image_id), + name=str(meta.get("name") or image_id), + image_type=meta.get("image_type"), + clamp_to_uint16=False, + dtype_meta=str(meta["dtype"]), + ) + @staticmethod def _load_from_string_source( data: str, @@ -372,6 +407,11 @@ def _ensure_materialized(self) -> None: if self._lazy_source is None: return lazy_source = self._lazy_source + dataset = self._try_open_dataset(lazy_source.data) + if dataset is not None and not self._lazy_slices: + self._dataset = dataset + self._lazy_source = None + return # Intentionally do not clear `_lazy_source` / `_lazy_slices` before load. # If `_load_from_string_source(...)` raises, lazy state is preserved so # callers can inspect/retry without losing the deferred plan. @@ -410,6 +450,8 @@ def _ensure_materialized(self) -> None: def _tensor_source(self) -> pa.StructScalar | pa.StructArray: self._ensure_materialized() + if self._dataset is not None and self._data is None: + return self._dataset_to_scalar() if self._struct_array is not None: return self._struct_array if self._data is None: @@ -554,6 +596,18 @@ def export( # noqa: PLR0911 ValueError: Unknown 'how' or missing required params. """ + mode = how.lower().replace("_", "-") + + if self._dataset is not None and mode == "numpy": + arr = self._dataset.read_image() + dtype_obj = np.dtype(dtype) + if arr.dtype != dtype_obj: + if clamp and np.issubdtype(dtype_obj, np.integer): + info = np.iinfo(dtype_obj) + arr = np.clip(arr, info.min, info.max) + arr = arr.astype(dtype_obj, copy=False) + return arr + self._ensure_materialized() # existing modes @@ -564,8 +618,6 @@ def export( # noqa: PLR0911 if how == "scalar": return self.data - mode = how.lower().replace("_", "-") - # OME-TIFF via BioIO if mode in {"ome-tiff", "ometiff", "tiff"}: if not out: @@ -603,12 +655,20 @@ def export( # noqa: PLR0911 if mode in {"ome-parquet", "omeparquet", "parquet"}: if not out: raise ValueError("export(how='parquet') requires 'out' path.") - to_ome_parquet( - data=self.data, - out_path=out, - column_name=parquet_column_name, - compression=parquet_compression, # default 'zstd' - file_metadata=parquet_metadata, + from ome_arrow.dataset import write_ome_arrow_dataset + + # Kept for call-site compatibility with the old single-file + # Parquet writer; the typed dataset layout does not use a struct + # column name or arbitrary file-level metadata. + _legacy_parquet_options = (parquet_column_name, parquet_metadata) + write_ome_arrow_dataset( + [self.data], + out, + layout="auto", + access_pattern="balanced", + compression=parquet_compression, + pixel_dtype=dtype, + clamp=clamp, ) return out @@ -637,6 +697,19 @@ def info(self) -> Dict[str, Any]: - summary: human-readable text """ self._ensure_materialized() + if self._dataset is not None and self._data is None: + meta = self._dataset.image_metadata(self._dataset.image_ids[0]) + return describe_ome_arrow( + { + "pixels_meta": { + "size_t": meta["size_t"], + "size_c": meta["size_c"], + "size_z": meta["size_z"], + "size_y": meta["size_y"], + "size_x": meta["size_x"], + } + } + ) return describe_ome_arrow(self.data) def view( diff --git a/src/ome_arrow/dataset.py b/src/ome_arrow/dataset.py index 12e853b..aba27a9 100644 --- a/src/ome_arrow/dataset.py +++ b/src/ome_arrow/dataset.py @@ -4,6 +4,7 @@ import json from dataclasses import dataclass +from importlib.metadata import PackageNotFoundError, version from pathlib import Path from typing import Any, Iterable, Literal, Sequence @@ -406,6 +407,8 @@ def write_ome_arrow_dataset( if chunk_rows_per_row_group <= 0: raise ValueError("chunk_rows_per_row_group must be > 0") out_dir = Path(output_path) + if out_dir.exists() and out_dir.is_file(): + out_dir.unlink() out_dir.mkdir(parents=True, exist_ok=True) image_rows: list[dict[str, Any]] = [] @@ -501,6 +504,7 @@ def write_ome_arrow_dataset( "image_count": len(image_rows), "chunk_count": len(chunk_rows), "choice": choices[0].__dict__, + "package_versions": _package_versions(), } (out_dir / "_ome_arrow_dataset.json").write_text(json.dumps(manifest, indent=2)) return choices[0] @@ -574,6 +578,45 @@ def read_region( x=x, ) + def read_chunk_rows( + self, + image_id: str, + *, + y: slice | None = None, + x: slice | None = None, + t: int | slice | None = None, + c: int | slice | None = None, + z: int | slice | None = None, + ) -> pa.Table: + """Read selected raw chunk rows without decoding pixels to NumPy.""" + meta = self._dataset.image_metadata(image_id) + bounds = ( + int(meta["size_t"]), + int(meta["size_c"]), + int(meta["size_z"]), + int(meta["size_y"]), + int(meta["size_x"]), + ) + ts, cs, zs, ys, xs = _normalize_region( + (_as_slice(t), _as_slice(c), _as_slice(z), y, x), + bounds, + ) + chunks = self._dataset._matching_chunks( + str(image_id), + t=ts, + c=cs, + z=zs, + y=ys, + x=xs, + ) + rows = [ + self._dataset._read_chunk_row(self._dataset._chunks_file, row) + for row in chunks + ] + if not rows: + return pa.Table.from_pylist([], schema=CHUNK_TABLE_SCHEMA) + return pa.Table.from_pylist(rows, schema=CHUNK_TABLE_SCHEMA) + def _read_region( self, image_id: str, @@ -732,6 +775,26 @@ def read_region( ) return _convert_return(arr, return_type) + def read_chunk_rows( + self, + image_id: str | None = None, + *, + y: slice | None = None, + x: slice | None = None, + t: int | slice | None = None, + c: int | slice | None = None, + z: int | slice | None = None, + ) -> pa.Table: + """Read selected raw chunk rows without pixel decoding.""" + return self.pixels.read_chunk_rows( + self._resolve_image_id(image_id), + y=y, + x=x, + t=t, + c=c, + z=z, + ) + def _resolve_image_id(self, image_id: str | None) -> str: """Resolve an optional image ID to the first image when omitted.""" if image_id is not None: @@ -833,6 +896,18 @@ def _convert_return(arr: np.ndarray, return_type: ArrayReturn) -> Any: raise ValueError("return_type must be one of 'numpy', 'torch', or 'jax'") +def _package_versions() -> dict[str, str]: + """Return package versions useful for interpreting benchmark artifacts.""" + packages = ("ome-arrow", "numpy", "pyarrow") + versions = {} + for package in packages: + try: + versions[package] = version(package) + except PackageNotFoundError: + versions[package] = "unknown" + return versions + + def _normalize_region( slices: tuple[slice | None, slice | None, slice | None, slice | None, slice | None], bounds: tuple[int, int, int, int, int], diff --git a/tests/test_core.py b/tests/test_core.py index 515ac46..799bef8 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -506,6 +506,20 @@ def test_vortex_custom_column_name(tmp_path: pathlib.Path) -> None: assert reloaded.info() == oa.info() +def test_ome_parquet_export_uses_typed_dataset_path(tmp_path: pathlib.Path) -> None: + """Notebook-style parquet export/reopen uses the typed dataset layout.""" + arr = np.arange(1 * 1 * 2 * 6 * 7, dtype=np.uint16).reshape(1, 1, 2, 6, 7) + out = tmp_path / "example.ome.parquet" + oa = OMEArrow(arr) + + exported = oa.export(how="ome-parquet", out=str(out)) + reloaded = OMEArrow(exported) + + assert (out / "_ome_arrow_dataset.json").exists() + assert reloaded.info() == oa.info() + np.testing.assert_array_equal(reloaded.export(how="numpy"), arr) + + def test_scan_collect_roundtrip() -> None: """Materialize a lazily scanned parquet source via collect().""" oa = OMEArrow.scan("tests/data/JUMP-BR00117006/BR00117006.ome.parquet") diff --git a/tests/test_dataset.py b/tests/test_dataset.py index fa6b011..804a6b9 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,5 +1,6 @@ """Tests for typed chunk-buffer OME-Arrow datasets.""" +import json import pathlib import numpy as np @@ -71,6 +72,18 @@ def test_write_dataset_preserves_dtype_and_reads_image(tmp_path: pathlib.Path) - assert pq.ParquetFile(out / "chunks.parquet").metadata.num_row_groups == 1 +def test_write_dataset_replaces_existing_file_path(tmp_path: pathlib.Path) -> None: + """Replace an old single-file artifact with a typed dataset directory.""" + arr = np.arange(1 * 1 * 1 * 3 * 4, dtype=np.uint16).reshape(1, 1, 1, 3, 4) + out = tmp_path / "image.ome.parquet" + out.write_text("old parquet placeholder") + + write_ome_arrow_dataset([arr], out, layout="image", compression=None) + + assert out.is_dir() + assert (out / "_ome_arrow_dataset.json").exists() + + def test_write_dataset_rejects_duplicate_image_ids(tmp_path: pathlib.Path) -> None: """Reject duplicate image IDs before chunk rows can merge at read time.""" arr = np.arange(1 * 1 * 1 * 3 * 4, dtype=np.uint16).reshape(1, 1, 1, 3, 4) @@ -161,6 +174,38 @@ def test_dataset_reads_plane_and_region_from_indexed_tiles( assert pq.ParquetFile(out / "chunks.parquet").metadata.num_row_groups == 8 +def test_dataset_reads_raw_chunk_rows_and_manifest_versions( + tmp_path: pathlib.Path, +) -> None: + """Read raw Arrow chunk rows without decoding pixels.""" + arr = np.arange(1 * 1 * 2 * 6 * 7, dtype=np.uint16).reshape(1, 1, 2, 6, 7) + out = tmp_path / "raw_chunks" + + write_ome_arrow_dataset( + [arr], + out, + layout="tile", + chunk_shape=(1, 1, 1, 3, 4), + compression=None, + ) + dataset = OMEArrowDataset(out) + + chunks = dataset.read_chunk_rows( + z=1, + y=slice(2, 6), + x=slice(3, 7), + ) + manifest = json.loads((out / "_ome_arrow_dataset.json").read_text()) + + assert chunks.num_rows == 4 + assert ( + chunks.schema.field("pixel_bytes").type + == pq.read_table(out / "chunks.parquet").schema.field("pixel_bytes").type + ) + assert "package_versions" in manifest + assert "pyarrow" in manifest["package_versions"] + + def test_dataset_reads_from_packed_chunk_row_groups(tmp_path: pathlib.Path) -> None: """Read indexed chunks when several chunk rows share one row group.""" arr = np.arange(1 * 1 * 2 * 6 * 7, dtype=np.uint16).reshape(1, 1, 2, 6, 7) From 7b0df16d9d22c38458a294c4389fdae156289652 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 4 Jun 2026 15:42:45 -0600 Subject: [PATCH 5/9] one sentence per line Co-Authored-By: Gregory Way --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 97b82f0..ff52fe2 100644 --- a/README.md +++ b/README.md @@ -124,10 +124,8 @@ See full docs: [`docs/src/dlpack.md`](docs/src/dlpack.md) `OMEArrow.export(how="ome-parquet")` writes the typed byte-buffer dataset layout. For explicit control over layout and chunks, use the dataset writer directly. -By default, this stores image metadata separately from pixel chunks and writes -one chunk per Parquet row group, so `read_plane()` and `read_region()` can jump -through a physical index instead of materializing the older nested struct -payload. You can change that row-group packing with `chunk_rows_per_row_group`. +By default, this stores image metadata separately from pixel chunks and writes one chunk per Parquet row group, so `read_plane()` and `read_region()` can jump through a physical index instead of materializing the older nested struct payload. +You can change that row-group packing with `chunk_rows_per_row_group`. ```python import numpy as np From c824a61a07a46cb6a51f277cc2272f981f04a439 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 4 Jun 2026 22:08:12 -0600 Subject: [PATCH 6/9] minor dataset improvements --- src/ome_arrow/dataset.py | 50 +++++++++++++++++++++++++++++++--------- tests/test_dataset.py | 10 ++++++++ 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/src/ome_arrow/dataset.py b/src/ome_arrow/dataset.py index aba27a9..bf06eb7 100644 --- a/src/ome_arrow/dataset.py +++ b/src/ome_arrow/dataset.py @@ -609,10 +609,7 @@ def read_chunk_rows( y=ys, x=xs, ) - rows = [ - self._dataset._read_chunk_row(self._dataset._chunks_file, row) - for row in chunks - ] + rows = self._dataset._read_chunk_rows(self._dataset._chunks_file, chunks) if not rows: return pa.Table.from_pylist([], schema=CHUNK_TABLE_SCHEMA) return pa.Table.from_pylist(rows, schema=CHUNK_TABLE_SCHEMA) @@ -657,8 +654,7 @@ def _read_region( x=xs, ) parquet_file = self._dataset._chunks_file - for row in chunks: - chunk = self._dataset._read_chunk_row(parquet_file, row) + for chunk in self._dataset._read_chunk_rows(parquet_file, chunks): arr = np.frombuffer(chunk["pixel_bytes"], dtype=np.dtype(chunk["dtype"])) arr = arr.reshape( ( @@ -685,6 +681,9 @@ def __init__(self, path: str | Path) -> None: self._chunks_file = pq.ParquetFile(self.path / "chunks.parquet") index_path = self.path / "index.parquet" self.index = pq.read_table(index_path) if index_path.exists() else None + self._image_metadata_by_id = { + str(row["image_id"]): row for row in self.images.to_pylist() + } self.pixels = OMEArrowPixels(self) @property @@ -694,11 +693,10 @@ def image_ids(self) -> list[str]: def image_metadata(self, image_id: str) -> dict[str, Any]: """Return one image metadata row as a dictionary.""" - mask = pc.equal(self.images["image_id"], str(image_id)) - rows = self.images.filter(mask).to_pylist() - if not rows: - raise KeyError(f"image_id not found: {image_id}") - return rows[0] + try: + return self._image_metadata_by_id[str(image_id)] + except KeyError as exc: + raise KeyError(f"image_id not found: {image_id}") from exc def read_image( self, @@ -866,6 +864,36 @@ def _read_chunk_row( raise ValueError(f"Chunk not found: {row['chunk_id']}") return chunk_rows[0] + def _read_chunk_rows( + self, + parquet_file: pq.ParquetFile, + rows: list[dict[str, Any]], + ) -> list[dict[str, Any]]: + """Read multiple chunk payload rows while batching row-group IO.""" + if not rows: + return [] + if all( + "row_group_id" in row and row["row_group_id"] is not None for row in rows + ): + row_group_ids = sorted({int(row["row_group_id"]) for row in rows}) + table = parquet_file.read_row_groups( + row_group_ids, + columns=["chunk_id", "dtype", "pixel_bytes"], + ) + payload_by_chunk_id = { + int(row["chunk_id"]): row for row in table.to_pylist() + } + out = [] + for row in rows: + chunk_id = int(row["chunk_id"]) + try: + payload = payload_by_chunk_id[chunk_id] + except KeyError as exc: + raise ValueError(f"Chunk not found: {chunk_id}") from exc + out.append({**row, **payload}) + return out + return [self._read_chunk_row(parquet_file, row) for row in rows] + def _as_slice(value: int | slice | None) -> slice | None: """Normalize an optional scalar index to a slice.""" diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 804a6b9..934fa6a 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -220,14 +220,24 @@ def test_dataset_reads_from_packed_chunk_row_groups(tmp_path: pathlib.Path) -> N chunk_rows_per_row_group=4, ) dataset = OMEArrowDataset(out) + dataset._read_chunk_row = ( # type: ignore[method-assign] + lambda _parquet_file, _row: pytest.fail("single chunk-row fallback used") + ) region = dataset.read_region( z=1, y=slice(2, 6), x=slice(3, 7), ) + chunks = dataset.read_chunk_rows( + z=1, + y=slice(2, 6), + x=slice(3, 7), + ) np.testing.assert_array_equal(region, arr[:, :, 1:2, 2:6, 3:7]) + assert chunks.num_rows == 4 + assert all(chunks["pixel_bytes"].to_pylist()) assert pq.ParquetFile(out / "chunks.parquet").metadata.num_row_groups == 2 From f7b4ea21602497003e32b5397154a830eaf80003 Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 5 Jun 2026 09:45:24 -0600 Subject: [PATCH 7/9] nested table improvements --- README.md | 29 ++++++ benchmarks/benchmark_lazy_tensor.py | 23 ++++- src/ome_arrow/__init__.py | 7 +- src/ome_arrow/export.py | 140 ++++++++++++++++++++-------- src/ome_arrow/ingest.py | 88 ++++++++++++----- src/ome_arrow/meta.py | 34 +++++++ src/ome_arrow/tensor.py | 19 +++- tests/test_chunks.py | 63 ++++++++++++- 8 files changed, 337 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index ff52fe2..324f974 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,35 @@ Advanced options: See full docs: [`docs/src/dlpack.md`](docs/src/dlpack.md) +## Inline byte-backed OME values + +The historical nested table stores pixel payloads as numeric lists inside +`chunks[].pixels` and `planes[].pixels`. For faster one-row-per-image Parquet +tables, write inline chunk bytes instead: + +```python +from ome_arrow import from_numpy, to_ome_parquet + +record = from_numpy(arr, dim_order="TCZYX", chunk_encoding="bytes") +to_ome_parquet(record, "image.ome.parquet", column_name="ome_arrow") +``` + +You can also convert an existing OME-Arrow record at write time: + +```python +to_ome_parquet( + record, + "image.ome.parquet", + column_name="ome_arrow", + inline_chunk_encoding="bytes", +) +``` + +This keeps the ergonomic inline OME value while storing chunk payloads as typed +`pixel_bytes: large_binary`. Use it for moderate image-level tables and +whole-image reads. For large 3D/5D selective reads, prefer the typed chunk +dataset API below. + ## Typed chunk datasets `OMEArrow.export(how="ome-parquet")` writes the typed byte-buffer dataset layout. diff --git a/benchmarks/benchmark_lazy_tensor.py b/benchmarks/benchmark_lazy_tensor.py index 86fb14d..5d9c51f 100644 --- a/benchmarks/benchmark_lazy_tensor.py +++ b/benchmarks/benchmark_lazy_tensor.py @@ -78,7 +78,7 @@ def _time_case( ) -def _build_parquet_fixtures(workdir: Path) -> tuple[Path, Path, Path]: +def _build_parquet_fixtures(workdir: Path) -> tuple[Path, Path, Path, Path]: """Create small planes/chunks parquet fixtures for local benchmarking.""" arr = np.arange(1 * 2 * 3 * 256 * 256, dtype=np.uint16).reshape(1, 2, 3, 256, 256) @@ -92,9 +92,16 @@ def _build_parquet_fixtures(workdir: Path) -> tuple[Path, Path, Path]: planes_path = workdir / "bench_planes.ome.parquet" chunks_path = workdir / "bench_chunks.ome.parquet" + inline_bytes_path = workdir / "bench_inline_bytes.ome.parquet" typed_dataset_path = workdir / "bench_typed_dataset" to_ome_parquet(planes_scalar, out_path=str(planes_path), column_name="ome_arrow") to_ome_parquet(chunks_scalar, out_path=str(chunks_path), column_name="ome_arrow") + to_ome_parquet( + chunks_scalar, + out_path=str(inline_bytes_path), + column_name="ome_arrow", + inline_chunk_encoding="bytes", + ) write_ome_arrow_dataset( [arr], typed_dataset_path, @@ -102,7 +109,7 @@ def _build_parquet_fixtures(workdir: Path) -> tuple[Path, Path, Path]: chunk_shape=(1, 1, 1, 64, 64), compression="zstd", ) - return planes_path, chunks_path, typed_dataset_path + return planes_path, chunks_path, inline_bytes_path, typed_dataset_path def _print_results(results: list[BenchmarkResult]) -> None: @@ -270,7 +277,9 @@ def main() -> None: with tempfile.TemporaryDirectory(prefix="ome_arrow_lazy_bench_") as tmp: tmpdir = Path(tmp) - planes_path, chunks_path, typed_dataset_path = _build_parquet_fixtures(tmpdir) + planes_path, chunks_path, inline_bytes_path, typed_dataset_path = ( + _build_parquet_fixtures(tmpdir) + ) typed_dataset = OMEArrowDataset(typed_dataset_path) typed_image_id = typed_dataset.images["image_id"].to_pylist()[0] @@ -307,6 +316,14 @@ def main() -> None: .to_numpy(contiguous=True) ), ), + ( + "scan+parquet(inline-bytes) -> tensor_view(YX)", + lambda: ( + OMEArrow.scan(str(inline_bytes_path)) + .tensor_view(t=0, z=1, c=1, layout="YX") + .to_numpy(contiguous=True) + ), + ), ( "typed-dataset(tile) -> read_plane", lambda: typed_dataset.pixels.read_plane( diff --git a/src/ome_arrow/__init__.py b/src/ome_arrow/__init__.py index 8ee8e4c..bd256a0 100644 --- a/src/ome_arrow/__init__.py +++ b/src/ome_arrow/__init__.py @@ -30,7 +30,12 @@ from_torch_array, to_ome_arrow, ) -from ome_arrow.meta import OME_ARROW_STRUCT, OME_ARROW_TAG_TYPE, OME_ARROW_TAG_VERSION +from ome_arrow.meta import ( + OME_ARROW_BYTE_STRUCT, + OME_ARROW_STRUCT, + OME_ARROW_TAG_TYPE, + OME_ARROW_TAG_VERSION, +) from ome_arrow.tensor import LazyTensorView, TensorView from ome_arrow.utils import describe_ome_arrow, verify_ome_arrow from ome_arrow.view import view_matplotlib, view_pyvista diff --git a/src/ome_arrow/export.py b/src/ome_arrow/export.py index 08d5e59..7f5c8fc 100644 --- a/src/ome_arrow/export.py +++ b/src/ome_arrow/export.py @@ -2,13 +2,50 @@ Module for exporting OME-Arrow data to other formats. """ -from typing import Any, Dict, List, Optional, Sequence, Tuple +from typing import Any, Dict, List, Literal, Optional, Sequence, Tuple import numpy as np import pyarrow as pa import pyarrow.parquet as pq -from ome_arrow.meta import OME_ARROW_STRUCT, OME_ARROW_TAG_TYPE, OME_ARROW_TAG_VERSION +from ome_arrow.meta import ( + OME_ARROW_BYTE_STRUCT, + OME_ARROW_STRUCT, + OME_ARROW_TAG_TYPE, + OME_ARROW_TAG_VERSION, +) + + +def _chunk_payload_array( + chunk: Dict[str, Any], + *, + expected_len: int, + fallback_dtype: np.dtype, + label: str, + strict: bool, +) -> np.ndarray: + """Return one chunk payload as a one-dimensional NumPy array.""" + if chunk.get("pixel_bytes") is not None: + source_dtype = np.dtype(chunk.get("dtype") or fallback_dtype) + payload = np.frombuffer(chunk["pixel_bytes"], dtype=source_dtype) + else: + pix = chunk.get("pixels") + try: + payload = np.asarray(pix) + except Exception as exc: + raise ValueError(f"{label}.pixels is not a sequence") from exc + + if payload.size != expected_len: + if strict: + raise ValueError( + f"{label} pixel payload length {payload.size} != expected " + f"{expected_len}" + ) + if payload.size > expected_len: + payload = payload[:expected_len] + else: + payload = np.pad(payload, (0, expected_len - payload.size)) + return payload def to_numpy( @@ -121,24 +158,14 @@ def _cast_plane(a: np.ndarray) -> np.ndarray: f"> sx={sx}" ) - pix = ch["pixels"] - try: - n = len(pix) - except Exception as e: - raise ValueError(f"chunks[{i}].pixels is not a sequence") from e - expected_len = shape_z * shape_y * shape_x - if n != expected_len: - if strict: - raise ValueError( - f"chunks[{i}].pixels length {n} != expected {expected_len}" - ) - if n > expected_len: - pix = pix[:expected_len] - else: - pix = list(pix) + [0] * (expected_len - n) - - arr3d = np.asarray(pix).reshape(shape_z, shape_y, shape_x) + arr3d = _chunk_payload_array( + ch, + expected_len=expected_len, + fallback_dtype=dtype, + label=f"chunks[{i}]", + strict=strict, + ).reshape(shape_z, shape_y, shape_x) arr3d = _cast_plane(arr3d) out[t, c, z : z + shape_z, y : y + shape_y, x : x + shape_x] = arr3d return out @@ -278,25 +305,16 @@ def _cast_plane(a: np.ndarray) -> np.ndarray: if strict: raise ValueError(msg) continue - pix = ch["pixels"] - try: - n = len(pix) - except Exception as e: - raise ValueError(f"chunks[{i}].pixels is not a sequence") from e expected_len = szc * syc * sxc - if n != expected_len: - if strict: - raise ValueError( - f"chunks[{i}].pixels length {n} != expected {expected_len}" - ) - # Lenient mode: truncate or zero-pad to match the expected size. - if n > expected_len: - pix = pix[:expected_len] - else: - pix = list(pix) + [0] * (expected_len - n) # Convert to a Z/Y/X slab and copy the requested Z slice into the plane. - slab = np.asarray(pix).reshape(szc, syc, sxc) + slab = _chunk_payload_array( + ch, + expected_len=expected_len, + fallback_dtype=dtype, + label=f"chunks[{i}]", + strict=strict, + ).reshape(szc, syc, sxc) slab = _cast_plane(slab) zi = z - z0 plane[y0 : y0 + syc, x0 : x0 + sxc] = slab[zi] @@ -608,6 +626,19 @@ def _level_shapes_tcxyz(levels: int) -> List[Tuple[int, int, int, int, int]]: writer.write_full_volume(arr) +def _chunk_shape_from_grid(grid: Dict[str, Any] | None) -> Tuple[int, int, int] | None: + if not grid: + return None + try: + return ( + int(grid["chunk_z"]), + int(grid["chunk_y"]), + int(grid["chunk_x"]), + ) + except Exception: + return None + + def to_ome_parquet( data: Dict[str, Any] | pa.StructScalar, out_path: str, @@ -615,23 +646,58 @@ def to_ome_parquet( file_metadata: Optional[Dict[str, str]] = None, compression: Optional[str] = "zstd", row_group_size: Optional[int] = None, + inline_chunk_encoding: Literal["existing", "list", "bytes"] = "existing", ) -> None: """ Export an OME-Arrow record to a Parquet file as a single-row, single-column table. - The single column holds a struct with the OME-Arrow schema. + The single column holds a struct with an OME-Arrow schema. """ + if inline_chunk_encoding not in {"existing", "list", "bytes"}: + raise ValueError("inline_chunk_encoding must be one of: existing, list, bytes") # 1) Normalize to a plain Python dict (works better with pyarrow builders, # especially when the struct has a `null`-typed field like "masks"). if isinstance(data, pa.StructScalar): record_dict = data.as_py() + schema = data.type if data.type == OME_ARROW_BYTE_STRUCT else OME_ARROW_STRUCT else: # Validate by round-tripping through a typed scalar, then back to dict. record_dict = {f.name: data.get(f.name) for f in OME_ARROW_STRUCT} record_dict = pa.scalar(record_dict, type=OME_ARROW_STRUCT).as_py() + schema = OME_ARROW_STRUCT + + if inline_chunk_encoding == "bytes" and schema != OME_ARROW_BYTE_STRUCT: + from ome_arrow.ingest import from_numpy + + arr = to_numpy(record_dict, dtype=np.dtype(record_dict["pixels_meta"]["type"])) + byte_scalar = from_numpy( + arr, + dim_order="TCZYX", + image_id=record_dict.get("id"), + name=record_dict.get("name"), + image_type=record_dict.get("image_type"), + acquisition_datetime=record_dict.get("acquisition_datetime"), + clamp_to_uint16=False, + chunk_shape=_chunk_shape_from_grid(record_dict.get("chunk_grid")), + chunk_encoding="bytes", + build_chunks=True, + physical_size_x=record_dict["pixels_meta"].get("physical_size_x") or 1.0, + physical_size_y=record_dict["pixels_meta"].get("physical_size_y") or 1.0, + physical_size_z=record_dict["pixels_meta"].get("physical_size_z") or 1.0, + physical_size_unit=( + record_dict["pixels_meta"].get("physical_size_x_unit") or "µm" + ), + dtype_meta=record_dict["pixels_meta"]["type"], + ) + record_dict = byte_scalar.as_py() + schema = OME_ARROW_BYTE_STRUCT + elif inline_chunk_encoding == "list": + record_dict = {f.name: record_dict.get(f.name) for f in OME_ARROW_STRUCT} + record_dict = pa.scalar(record_dict, type=OME_ARROW_STRUCT).as_py() + schema = OME_ARROW_STRUCT # 2) Build a single-row struct array from the dict, explicitly passing the schema - struct_array = pa.array([record_dict], type=OME_ARROW_STRUCT) # len=1 + struct_array = pa.array([record_dict], type=schema) # len=1 # 3) Wrap into a one-column table table = pa.table({column_name: struct_array}) diff --git a/src/ome_arrow/ingest.py b/src/ome_arrow/ingest.py index de0bb2f..c0b6760 100644 --- a/src/ome_arrow/ingest.py +++ b/src/ome_arrow/ingest.py @@ -8,7 +8,7 @@ import warnings from datetime import datetime, timezone from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple +from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple import bioio_ome_tiff import bioio_tifffile @@ -18,7 +18,13 @@ from bioio import BioImage from bioio_ome_zarr import Reader as OMEZarrReader -from ome_arrow.meta import OME_ARROW_STRUCT, OME_ARROW_TAG_TYPE, OME_ARROW_TAG_VERSION +from ome_arrow.meta import ( + OME_ARROW_BYTE_STRUCT, + OME_ARROW_KNOWN_STRUCTS, + OME_ARROW_STRUCT, + OME_ARROW_TAG_TYPE, + OME_ARROW_TAG_VERSION, +) def _ome_arrow_from_table( @@ -64,11 +70,11 @@ def _struct_matches_ome_fields(t: pa.StructType) -> bool: arr = table[column_name] if not pa.types.is_struct(arr.type): raise ValueError(f"Column '{column_name}' is not a Struct; got {arr.type}.") - if strict_schema and arr.type != OME_ARROW_STRUCT: + if strict_schema and arr.type not in OME_ARROW_KNOWN_STRUCTS: raise ValueError( - f"Column '{column_name}' schema != OME_ARROW_STRUCT.\n" + f"Column '{column_name}' schema is not a known OME-Arrow struct.\n" f"Got: {arr.type}\n" - f"Expect:{OME_ARROW_STRUCT}" + f"Expect:{OME_ARROW_STRUCT} or {OME_ARROW_BYTE_STRUCT}" ) if not strict_schema and not _struct_matches_ome_fields(arr.type): raise ValueError( @@ -80,7 +86,7 @@ def _struct_matches_ome_fields(t: pa.StructType) -> bool: for name in table.column_names: arr = table[name] if pa.types.is_struct(arr.type): - if strict_schema and arr.type == OME_ARROW_STRUCT: + if strict_schema and arr.type in OME_ARROW_KNOWN_STRUCTS: candidate_col = arr autodetected_name = name column_name = name @@ -115,7 +121,7 @@ def _struct_matches_ome_fields(t: pa.StructType) -> bool: struct_array = struct_array.combine_chunks() # 3) Construct a typed StructScalar (preserve zero-copy when possible). - if strict_schema or candidate_col.type == OME_ARROW_STRUCT: + if strict_schema or candidate_col.type in OME_ARROW_KNOWN_STRUCTS: scalar = struct_array[0] else: warnings.warn( @@ -399,6 +405,7 @@ def _build_chunks_from_planes( size_x: int, chunk_shape: Optional[Tuple[int, int, int]], chunk_order: str = "ZYX", + chunk_encoding: Literal["list", "bytes"] = "list", ) -> List[Dict[str, Any]]: """Build chunked pixels from a list of flattened planes. @@ -413,13 +420,16 @@ def _build_chunks_from_planes( chunk_order: Flattening order for chunk pixels (default "ZYX"). Returns: - List[Dict[str, Any]]: Chunk list with pixels stored as flat lists. + List[Dict[str, Any]]: Chunk list with pixels stored as flat lists or + typed byte buffers. Raises: ValueError: If an unsupported chunk_order is requested. """ if str(chunk_order).upper() != "ZYX": raise ValueError("Only chunk_order='ZYX' is supported for now.") + if chunk_encoding not in {"list", "bytes"}: + raise ValueError("chunk_encoding must be either 'list' or 'bytes'") cz, cy, cx = _normalize_chunk_shape(chunk_shape, size_z, size_y, size_x) @@ -449,19 +459,27 @@ def _build_chunks_from_planes( if plane is None: continue slab[zi] = plane[y0 : y0 + sy, x0 : x0 + sx] - chunks.append( - { - "t": t, - "c": c, - "z": z0, - "y": y0, - "x": x0, - "shape_z": sz, - "shape_y": sy, - "shape_x": sx, - "pixels": slab.reshape(-1), - } - ) + row = { + "t": t, + "c": c, + "z": z0, + "y": y0, + "x": x0, + "shape_z": sz, + "shape_y": sy, + "shape_x": sx, + } + if chunk_encoding == "bytes": + slab = np.ascontiguousarray(slab) + row.update( + { + "dtype": np.dtype(slab.dtype).name, + "pixel_bytes": slab.tobytes(order="C"), + } + ) + else: + row["pixels"] = slab.reshape(-1) + chunks.append(row) return chunks @@ -488,6 +506,7 @@ def to_ome_arrow( chunks: Optional[List[Dict[str, Any]]] = None, chunk_shape: Optional[Tuple[int, int, int]] = (1, 512, 512), # (Z, Y, X) chunk_order: str = "ZYX", + chunk_encoding: Literal["list", "bytes"] = "list", build_chunks: bool = True, masks: Any = None, ) -> pa.StructScalar: @@ -518,6 +537,8 @@ def to_ome_arrow( chunks are derived from planes using chunk_shape. chunk_shape: Chunk shape as (Z, Y, X). Defaults to (1, 512, 512). chunk_order: Flattening order for chunk pixels (default "ZYX"). + chunk_encoding: ``"list"`` stores historical numeric pixel lists. + ``"bytes"`` stores compact typed chunk byte buffers. build_chunks: If True, build chunked pixels from planes when chunks is None. masks: Optional placeholder for future annotations. @@ -572,6 +593,11 @@ def to_ome_arrow( } ] + if chunk_encoding not in {"list", "bytes"}: + raise ValueError("chunk_encoding must be either 'list' or 'bytes'") + if chunks is not None and chunks and "pixel_bytes" in chunks[0]: + chunk_encoding = "bytes" + if chunks is None and build_chunks: chunks = _build_chunks_from_planes( planes=planes, @@ -582,7 +608,10 @@ def to_ome_arrow( size_x=size_x, chunk_shape=chunk_shape, chunk_order=chunk_order, + chunk_encoding=chunk_encoding, ) + if chunk_encoding == "bytes" and chunks: + planes = [] chunk_grid = None if chunks is not None: @@ -651,7 +680,8 @@ def to_ome_arrow( "masks": masks, } - return pa.scalar(record, type=OME_ARROW_STRUCT) + schema = OME_ARROW_BYTE_STRUCT if chunk_encoding == "bytes" else OME_ARROW_STRUCT + return pa.scalar(record, type=schema) def from_numpy( @@ -666,6 +696,7 @@ def from_numpy( clamp_to_uint16: bool = True, chunk_shape: Optional[Tuple[int, int, int]] = (1, 512, 512), chunk_order: str = "ZYX", + chunk_encoding: Literal["list", "bytes"] = "list", build_chunks: bool = True, # meta physical_size_x: float = 1.0, @@ -691,6 +722,8 @@ def from_numpy( clamp_to_uint16: If True, clamp/cast planes to uint16 before serialization. chunk_shape: Chunk shape as (Z, Y, X). Defaults to (1, 512, 512). chunk_order: Flattening order for chunk pixels (default "ZYX"). + chunk_encoding: ``"list"`` stores historical numeric pixel lists. + ``"bytes"`` stores compact typed chunk byte buffers. build_chunks: If True, build chunked pixels from planes. physical_size_x: Spatial pixel size (µm) for X. physical_size_y: Spatial pixel size (µm) for Y. @@ -812,6 +845,7 @@ def from_numpy( planes=planes, chunk_shape=chunk_shape, chunk_order=chunk_order, + chunk_encoding=chunk_encoding, build_chunks=build_chunks, masks=None, ) @@ -857,6 +891,7 @@ def _from_array_via_numpy( clamp_to_uint16: bool, chunk_shape: Optional[Tuple[int, int, int]], chunk_order: str, + chunk_encoding: Literal["list", "bytes"], build_chunks: bool, physical_size_x: float, physical_size_y: float, @@ -881,6 +916,7 @@ def _from_array_via_numpy( clamp_to_uint16=clamp_to_uint16, chunk_shape=chunk_shape, chunk_order=chunk_order, + chunk_encoding=chunk_encoding, build_chunks=build_chunks, physical_size_x=physical_size_x, physical_size_y=physical_size_y, @@ -902,6 +938,7 @@ def from_torch_array( clamp_to_uint16: bool = True, chunk_shape: Optional[Tuple[int, int, int]] = (1, 512, 512), chunk_order: str = "ZYX", + chunk_encoding: Literal["list", "bytes"] = "list", build_chunks: bool = True, # meta physical_size_x: float = 1.0, @@ -932,6 +969,8 @@ def from_torch_array( clamp_to_uint16: If True, clamp/cast planes to uint16 before serialization. chunk_shape: Chunk shape as (Z, Y, X). Defaults to (1, 512, 512). chunk_order: Flattening order for chunk pixels (default "ZYX"). + chunk_encoding: ``"list"`` stores historical numeric pixel lists. + ``"bytes"`` stores compact typed chunk byte buffers. build_chunks: If True, build chunked pixels from planes. physical_size_x: Spatial pixel size (µm) for X. physical_size_y: Spatial pixel size (µm) for Y. @@ -977,6 +1016,7 @@ def from_torch_array( clamp_to_uint16=clamp_to_uint16, chunk_shape=chunk_shape, chunk_order=chunk_order, + chunk_encoding=chunk_encoding, build_chunks=build_chunks, physical_size_x=physical_size_x, physical_size_y=physical_size_y, @@ -998,6 +1038,7 @@ def from_jax_array( clamp_to_uint16: bool = True, chunk_shape: Optional[Tuple[int, int, int]] = (1, 512, 512), chunk_order: str = "ZYX", + chunk_encoding: Literal["list", "bytes"] = "list", build_chunks: bool = True, # meta physical_size_x: float = 1.0, @@ -1027,6 +1068,8 @@ def from_jax_array( clamp_to_uint16: If True, clamp/cast planes to uint16 before serialization. chunk_shape: Chunk shape as (Z, Y, X). Defaults to (1, 512, 512). chunk_order: Flattening order for chunk pixels (default "ZYX"). + chunk_encoding: ``"list"`` stores historical numeric pixel lists. + ``"bytes"`` stores compact typed chunk byte buffers. build_chunks: If True, build chunked pixels from planes. physical_size_x: Spatial pixel size (µm) for X. physical_size_y: Spatial pixel size (µm) for Y. @@ -1060,6 +1103,7 @@ def from_jax_array( clamp_to_uint16=clamp_to_uint16, chunk_shape=chunk_shape, chunk_order=chunk_order, + chunk_encoding=chunk_encoding, build_chunks=build_chunks, physical_size_x=physical_size_x, physical_size_y=physical_size_y, diff --git a/src/ome_arrow/meta.py b/src/ome_arrow/meta.py index 2eaf29e..5514d3e 100644 --- a/src/ome_arrow/meta.py +++ b/src/ome_arrow/meta.py @@ -129,3 +129,37 @@ pa.field("masks", pa.null()), # reserved for future annotations ] ) + +# Faster inline OME value variant. This keeps the same top-level OME-Arrow value +# shape, but stores chunk payloads as typed bytes instead of nested numeric +# lists. The historical OME_ARROW_STRUCT remains the compatibility baseline. +OME_ARROW_BYTE_STRUCT: pa.StructType = pa.struct( + [ + ( + pa.field( + "chunks", + pa.list_( + pa.struct( + [ + pa.field("t", pa.int32()), + pa.field("c", pa.int16()), + pa.field("z", pa.int32()), + pa.field("y", pa.int32()), + pa.field("x", pa.int32()), + pa.field("shape_z", pa.int32()), + pa.field("shape_y", pa.int32()), + pa.field("shape_x", pa.int32()), + pa.field("dtype", pa.string()), + pa.field("pixel_bytes", pa.large_binary()), + ] + ) + ), + ) + if field.name == "chunks" + else field + ) + for field in OME_ARROW_STRUCT + ] +) + +OME_ARROW_KNOWN_STRUCTS = (OME_ARROW_STRUCT, OME_ARROW_BYTE_STRUCT) diff --git a/src/ome_arrow/tensor.py b/src/ome_arrow/tensor.py index 2c7813e..0272e69 100644 --- a/src/ome_arrow/tensor.py +++ b/src/ome_arrow/tensor.py @@ -958,6 +958,9 @@ def _read_plane( ) if self._struct_array is not None and self._has_chunks(): + if _chunks_have_pixel_bytes(self._struct_array): + data_py = self._data_py_dict() + return plane_from_chunks(data_py, t=t, z=z, c=c, dtype=self._dtype) chunk_order = str(self._chunk_grid().get("chunk_order") or "ZYX").upper() return _plane_from_chunks_arrow( self._struct_array, @@ -1256,6 +1259,11 @@ def _arrow_values(self) -> pa.Array: raise ValueError("mode='arrow' requires Arrow-backed data.") if self._has_chunks(): + if _chunks_have_pixel_bytes(struct_arr): + raise ValueError( + "mode='arrow' does not support byte-backed inline chunks; " + "use mode='numpy'." + ) return _select_chunk_values( struct_arr, t=t_idx[0], @@ -1511,7 +1519,7 @@ def _ensure_struct_array( UserWarning, stacklevel=3, ) - return pa.array([data.as_py()], type=OME_ARROW_STRUCT) + return pa.array([data.as_py()], type=data.type) if isinstance(data, dict): warnings.warn( "mode='arrow' received a dict; converting to Arrow buffers, " @@ -1598,6 +1606,15 @@ def _select_chunk_values( return pixels_list.values +def _chunks_have_pixel_bytes(struct_arr: pa.StructArray) -> bool: + """Return True when chunk entries use pixel_bytes instead of pixels.""" + try: + chunks_type = struct_arr.type.field("chunks").type.value_type + except KeyError: + return False + return "pixel_bytes" in {field.name for field in chunks_type} + + def _plane_from_chunks_arrow( struct_arr: pa.StructArray, *, diff --git a/tests/test_chunks.py b/tests/test_chunks.py index 76ebb88..fb59521 100644 --- a/tests/test_chunks.py +++ b/tests/test_chunks.py @@ -2,10 +2,14 @@ Tests for chunked pixel support. """ +from pathlib import Path + import numpy as np +import pyarrow.parquet as pq -from ome_arrow.export import plane_from_chunks, to_numpy -from ome_arrow.ingest import to_ome_arrow +from ome_arrow import OME_ARROW_BYTE_STRUCT, OMEArrow +from ome_arrow.export import plane_from_chunks, to_numpy, to_ome_parquet +from ome_arrow.ingest import from_numpy, from_ome_parquet, to_ome_arrow def test_to_numpy_from_chunks(example_correct_data: dict) -> None: @@ -81,3 +85,58 @@ def test_plane_from_chunks(example_correct_data: dict) -> None: plane, np.array([[100, 101, 102, 103], [110, 111, 112, 113], [120, 121, 122, 123]]), ) + + +def test_to_numpy_from_byte_chunks() -> None: + """Decode inline typed chunk bytes without numeric pixel lists.""" + arr = np.arange(1 * 1 * 2 * 3 * 4, dtype=np.uint16).reshape(1, 1, 2, 3, 4) + + scalar = from_numpy( + arr, + dim_order="TCZYX", + chunk_shape=(1, 2, 2), + chunk_encoding="bytes", + ) + record = scalar.as_py() + + assert scalar.type == OME_ARROW_BYTE_STRUCT + assert record["planes"] == [] + assert "pixel_bytes" in record["chunks"][0] + assert "pixels" not in record["chunks"][0] + np.testing.assert_array_equal(to_numpy(scalar), arr) + np.testing.assert_array_equal( + plane_from_chunks(scalar, t=0, c=0, z=1), arr[0, 0, 1] + ) + + +def test_inline_byte_parquet_roundtrip_and_tensor_view(tmp_path: Path) -> None: + """Write/read an ergonomic inline OME value backed by typed chunk bytes.""" + arr = np.arange(1 * 1 * 2 * 3 * 4, dtype=np.uint16).reshape(1, 1, 2, 3, 4) + scalar = from_numpy(arr, dim_order="TCZYX", build_chunks=True) + out = tmp_path / "inline-bytes.ome.parquet" + + to_ome_parquet( + scalar, + str(out), + column_name="ome_arrow", + inline_chunk_encoding="bytes", + ) + + table = pq.read_table(out) + assert table["ome_arrow"].type == OME_ARROW_BYTE_STRUCT + + roundtrip, struct_array = from_ome_parquet( + out, + column_name="ome_arrow", + return_array=True, + ) + np.testing.assert_array_equal(to_numpy(roundtrip), arr) + np.testing.assert_array_equal( + OMEArrow(str(out)).tensor_view(layout="TCZYX").to_numpy(contiguous=True), + arr, + ) + np.testing.assert_array_equal( + OMEArrow(str(out)).tensor_view(t=0, z=1, c=0, layout="YX").to_numpy(), + arr[0, 0, 1], + ) + assert struct_array.type == OME_ARROW_BYTE_STRUCT From 7155c9e99c6ab5b7522518e6e878e343e360475e Mon Sep 17 00:00:00 2001 From: d33bs Date: Sat, 6 Jun 2026 07:30:57 -0600 Subject: [PATCH 8/9] leaf compression --- README.md | 31 ++++ .../benchmark_inline_byte_compression.py | 159 ++++++++++++++++++ src/ome_arrow/export.py | 121 ++++++++++--- src/ome_arrow/ingest.py | 94 ++++++++++- src/ome_arrow/meta.py | 1 + tests/test_chunks.py | 88 ++++++++++ 6 files changed, 472 insertions(+), 22 deletions(-) create mode 100644 benchmarks/benchmark_inline_byte_compression.py diff --git a/README.md b/README.md index 324f974..882f6e2 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,37 @@ This keeps the ergonomic inline OME value while storing chunk payloads as typed whole-image reads. For large 3D/5D selective reads, prefer the typed chunk dataset API below. +Leaf-level chunk compression is also available for inline byte chunks: + +```python +record = from_numpy( + arr, + dim_order="TCZYX", + chunk_encoding="bytes", + chunk_compression="auto", +) + +to_ome_parquet( + record, + "image.ome.parquet", + column_name="ome_arrow", + compression="zstd", +) +``` + +Compression guidance from `benchmarks/benchmark_inline_byte_compression.py`: + +| Data/workload | Suggested setting | Why | +| ------------------------------------------- | ----------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- | +| General inline-byte tables | `chunk_compression="auto"` and Parquet `compression="zstd"` | Compresses chunks only when they shrink, then lets Parquet compress metadata. | +| Faster reads on compressible images | `chunk_compression="fast"` with Parquet `compression=None` | Uses LZ4 only when chunks shrink, keeping decode overhead low. | +| Best storage on compressible 3D/volume data | `chunk_compression="small"` plus Parquet `compression="zstd"` | Uses Zstd level 1 only when chunks shrink, then applies Parquet compression. | +| Noisy/high-entropy images | `chunk_compression="auto"` or no leaf compression; use Parquet `compression="zstd"` | Auto skips chunks that would grow; noisy data often does not compress. | + +Explicit codecs such as `chunk_compression="zstd"` with +`chunk_compression_level=1` and `chunk_compression="lz4"` are also supported +when you want fixed behavior instead of a preset. + ## Typed chunk datasets `OMEArrow.export(how="ome-parquet")` writes the typed byte-buffer dataset layout. diff --git a/benchmarks/benchmark_inline_byte_compression.py b/benchmarks/benchmark_inline_byte_compression.py new file mode 100644 index 0000000..34837f1 --- /dev/null +++ b/benchmarks/benchmark_inline_byte_compression.py @@ -0,0 +1,159 @@ +"""Benchmark inline byte-backed OME values with leaf-level compression.""" + +from __future__ import annotations + +import argparse +import json +import statistics +import tempfile +import time +from dataclasses import asdict, dataclass +from pathlib import Path +from typing import Callable + +import numpy as np + +from ome_arrow import from_numpy +from ome_arrow.export import to_numpy, to_ome_parquet +from ome_arrow.ingest import from_ome_parquet + + +@dataclass(frozen=True) +class Result: + """One inline byte compression benchmark result.""" + + dataset: str + codec: str + parquet_compression: str | None + write_ms: float + read_ms: float + size_mb: float + + +def _time(fn: Callable[[], object], *, repeats: int, warmup: int) -> float: + for _ in range(warmup): + fn() + times = [] + for _ in range(repeats): + start = time.perf_counter() + fn() + times.append((time.perf_counter() - start) * 1000.0) + return statistics.median(times) + + +def _make_arrays() -> dict[str, np.ndarray]: + y, x = np.mgrid[:512, :512] + smooth = ((y * 3 + x * 5) % 4096).astype(np.uint16) + rng = np.random.default_rng(42) + noisy = rng.integers(0, 65535, size=(512, 512), dtype=np.uint16) + volume = np.stack( + [((smooth + z * 17) % 4096).astype(np.uint16) for z in range(16)], + axis=0, + ) + return { + "2d-smooth": smooth.reshape(1, 1, 1, 512, 512), + "2d-noisy": noisy.reshape(1, 1, 1, 512, 512), + "3d-smooth": volume.reshape(1, 1, 16, 512, 512), + } + + +def _cases() -> list[tuple[str, str | None, int | None, str | None]]: + return [ + ("leaf-none/parquet-none", None, None, None), + ("leaf-none/parquet-zstd", None, None, "zstd"), + ("leaf-auto/parquet-none", "auto", None, None), + ("leaf-auto/parquet-zstd", "auto", None, "zstd"), + ("leaf-fast/parquet-none", "fast", None, None), + ("leaf-small/parquet-zstd", "small", None, "zstd"), + ("leaf-lz4/parquet-none", "lz4", None, None), + ("leaf-zstd1/parquet-none", "zstd", 1, None), + ("leaf-zstd3/parquet-none", "zstd", 3, None), + ("leaf-zstd1/parquet-zstd", "zstd", 1, "zstd"), + ("leaf-brotli3/parquet-none", "brotli", 3, None), + ] + + +def run(*, repeats: int, warmup: int) -> list[Result]: + """Run inline byte compression benchmarks.""" + results: list[Result] = [] + with tempfile.TemporaryDirectory(prefix="ome_arrow_inline_compression_") as tmp: + tmpdir = Path(tmp) + for dataset, arr in _make_arrays().items(): + base = from_numpy( + arr, + dim_order="TCZYX", + chunk_shape=(1, 256, 256), + build_chunks=True, + ) + expected = arr + for label, codec, level, parquet_compression in _cases(): + filename_label = label.replace("/", "_") + out = tmpdir / f"{dataset}.{filename_label}.ome.parquet" + + def write() -> None: + to_ome_parquet( + base, + str(out), + column_name="ome_arrow", + compression=parquet_compression, + inline_chunk_encoding="bytes", + inline_chunk_compression=codec, + inline_chunk_compression_level=level, + ) + + write_ms = _time(write, repeats=repeats, warmup=warmup) + + def read() -> np.ndarray: + value = from_ome_parquet(out, column_name="ome_arrow") + decoded = to_numpy(value, dtype=expected.dtype) + if decoded.shape != expected.shape: + raise AssertionError( + f"decoded shape {decoded.shape} != {expected.shape}" + ) + return decoded + + decoded = read() + np.testing.assert_array_equal(decoded, expected) + read_ms = _time(read, repeats=repeats, warmup=warmup) + results.append( + Result( + dataset=dataset, + codec=label, + parquet_compression=parquet_compression, + write_ms=write_ms, + read_ms=read_ms, + size_mb=out.stat().st_size / (1024 * 1024), + ) + ) + return results + + +def main() -> None: + """Run the command-line benchmark.""" + parser = argparse.ArgumentParser() + parser.add_argument("--repeats", type=int, default=3) + parser.add_argument("--warmup", type=int, default=1) + parser.add_argument("--json-out", type=Path, default=None) + args = parser.parse_args() + + results = run(repeats=args.repeats, warmup=args.warmup) + print("") + print("Inline byte compression benchmark") + print( + f"{'dataset':12} {'codec':26} {'write ms':>10} {'read ms':>10} {'size MB':>10}" + ) + print("-" * 72) + for result in results: + print( + f"{result.dataset:12} {result.codec:26} " + f"{result.write_ms:10.2f} {result.read_ms:10.2f} " + f"{result.size_mb:10.2f}" + ) + if args.json_out is not None: + args.json_out.write_text( + json.dumps({"results": [asdict(r) for r in results]}, indent=2) + ) + + +if __name__ == "__main__": + main() diff --git a/src/ome_arrow/export.py b/src/ome_arrow/export.py index 7f5c8fc..d7beee6 100644 --- a/src/ome_arrow/export.py +++ b/src/ome_arrow/export.py @@ -27,7 +27,14 @@ def _chunk_payload_array( """Return one chunk payload as a one-dimensional NumPy array.""" if chunk.get("pixel_bytes") is not None: source_dtype = np.dtype(chunk.get("dtype") or fallback_dtype) - payload = np.frombuffer(chunk["pixel_bytes"], dtype=source_dtype) + payload_bytes = chunk["pixel_bytes"] + compression = chunk.get("compression") + if compression and str(compression).strip().lower() not in {"none", ""}: + payload_bytes = pa.Codec(str(compression)).decompress( + payload_bytes, + expected_len * source_dtype.itemsize, + ) + payload = np.frombuffer(payload_bytes, dtype=source_dtype) else: pix = chunk.get("pixels") try: @@ -639,6 +646,61 @@ def _chunk_shape_from_grid(grid: Dict[str, Any] | None) -> Tuple[int, int, int] return None +def _byte_record_from_numeric_chunks( + record: Dict[str, Any], + *, + compression: str | None, + compression_level: int | None, +) -> Dict[str, Any] | None: + """Convert existing numeric chunks to byte chunks without densifying.""" + chunks = record.get("chunks") or [] + if not chunks or any("pixels" not in chunk for chunk in chunks): + return None + + from ome_arrow.ingest import _encode_chunk_payload, _normalize_chunk_compression + + requested_compression = _normalize_chunk_compression(compression) + dtype = np.dtype(record["pixels_meta"]["type"]) + byte_chunks = [] + for i, chunk in enumerate(chunks): + shape_z = int(chunk["shape_z"]) + shape_y = int(chunk["shape_y"]) + shape_x = int(chunk["shape_x"]) + expected_len = shape_z * shape_y * shape_x + pixels = np.asarray(chunk["pixels"], dtype=dtype) + if pixels.size != expected_len: + raise ValueError( + f"chunks[{i}].pixels length {pixels.size} != expected {expected_len}" + ) + payload = np.ascontiguousarray(pixels.reshape(-1)).tobytes(order="C") + stored_compression, payload = _encode_chunk_payload( + payload, + compression=requested_compression, + compression_level=compression_level, + ) + byte_chunks.append( + { + "t": int(chunk["t"]), + "c": int(chunk["c"]), + "z": int(chunk["z"]), + "y": int(chunk["y"]), + "x": int(chunk["x"]), + "shape_z": shape_z, + "shape_y": shape_y, + "shape_x": shape_x, + "dtype": dtype.name, + "compression": stored_compression, + "pixel_bytes": payload, + } + ) + + return { + **record, + "chunks": byte_chunks, + "planes": [], + } + + def to_ome_parquet( data: Dict[str, Any] | pa.StructScalar, out_path: str, @@ -647,6 +709,8 @@ def to_ome_parquet( compression: Optional[str] = "zstd", row_group_size: Optional[int] = None, inline_chunk_encoding: Literal["existing", "list", "bytes"] = "existing", + inline_chunk_compression: str | None = None, + inline_chunk_compression_level: int | None = None, ) -> None: """ Export an OME-Arrow record to a Parquet file as a single-row, single-column table. @@ -669,27 +733,42 @@ def to_ome_parquet( if inline_chunk_encoding == "bytes" and schema != OME_ARROW_BYTE_STRUCT: from ome_arrow.ingest import from_numpy - arr = to_numpy(record_dict, dtype=np.dtype(record_dict["pixels_meta"]["type"])) - byte_scalar = from_numpy( - arr, - dim_order="TCZYX", - image_id=record_dict.get("id"), - name=record_dict.get("name"), - image_type=record_dict.get("image_type"), - acquisition_datetime=record_dict.get("acquisition_datetime"), - clamp_to_uint16=False, - chunk_shape=_chunk_shape_from_grid(record_dict.get("chunk_grid")), - chunk_encoding="bytes", - build_chunks=True, - physical_size_x=record_dict["pixels_meta"].get("physical_size_x") or 1.0, - physical_size_y=record_dict["pixels_meta"].get("physical_size_y") or 1.0, - physical_size_z=record_dict["pixels_meta"].get("physical_size_z") or 1.0, - physical_size_unit=( - record_dict["pixels_meta"].get("physical_size_x_unit") or "µm" - ), - dtype_meta=record_dict["pixels_meta"]["type"], + direct_record = _byte_record_from_numeric_chunks( + record_dict, + compression=inline_chunk_compression, + compression_level=inline_chunk_compression_level, ) - record_dict = byte_scalar.as_py() + if direct_record is None: + arr = to_numpy( + record_dict, dtype=np.dtype(record_dict["pixels_meta"]["type"]) + ) + byte_scalar = from_numpy( + arr, + dim_order="TCZYX", + image_id=record_dict.get("id"), + name=record_dict.get("name"), + image_type=record_dict.get("image_type"), + acquisition_datetime=record_dict.get("acquisition_datetime"), + clamp_to_uint16=False, + chunk_shape=_chunk_shape_from_grid(record_dict.get("chunk_grid")), + chunk_encoding="bytes", + chunk_compression=inline_chunk_compression, + chunk_compression_level=inline_chunk_compression_level, + build_chunks=True, + physical_size_x=record_dict["pixels_meta"].get("physical_size_x") + or 1.0, + physical_size_y=record_dict["pixels_meta"].get("physical_size_y") + or 1.0, + physical_size_z=record_dict["pixels_meta"].get("physical_size_z") + or 1.0, + physical_size_unit=( + record_dict["pixels_meta"].get("physical_size_x_unit") or "µm" + ), + dtype_meta=record_dict["pixels_meta"]["type"], + ) + record_dict = byte_scalar.as_py() + else: + record_dict = direct_record schema = OME_ARROW_BYTE_STRUCT elif inline_chunk_encoding == "list": record_dict = {f.name: record_dict.get(f.name) for f in OME_ARROW_STRUCT} diff --git a/src/ome_arrow/ingest.py b/src/ome_arrow/ingest.py index c0b6760..070d2d7 100644 --- a/src/ome_arrow/ingest.py +++ b/src/ome_arrow/ingest.py @@ -406,6 +406,8 @@ def _build_chunks_from_planes( chunk_shape: Optional[Tuple[int, int, int]], chunk_order: str = "ZYX", chunk_encoding: Literal["list", "bytes"] = "list", + chunk_compression: str | None = None, + chunk_compression_level: int | None = None, ) -> List[Dict[str, Any]]: """Build chunked pixels from a list of flattened planes. @@ -418,6 +420,9 @@ def _build_chunks_from_planes( size_x: Total X size of the image. chunk_shape: Desired chunk shape as (Z, Y, X). chunk_order: Flattening order for chunk pixels (default "ZYX"). + chunk_encoding: Pixel payload representation. + chunk_compression: Optional leaf-level compression for byte chunks. + chunk_compression_level: Optional codec compression level. Returns: List[Dict[str, Any]]: Chunk list with pixels stored as flat lists or @@ -430,6 +435,7 @@ def _build_chunks_from_planes( raise ValueError("Only chunk_order='ZYX' is supported for now.") if chunk_encoding not in {"list", "bytes"}: raise ValueError("chunk_encoding must be either 'list' or 'bytes'") + chunk_compression = _normalize_chunk_compression(chunk_compression) cz, cy, cx = _normalize_chunk_shape(chunk_shape, size_z, size_y, size_x) @@ -471,10 +477,17 @@ def _build_chunks_from_planes( } if chunk_encoding == "bytes": slab = np.ascontiguousarray(slab) + payload = slab.tobytes(order="C") + stored_compression, payload = _encode_chunk_payload( + payload, + compression=chunk_compression, + compression_level=chunk_compression_level, + ) row.update( { "dtype": np.dtype(slab.dtype).name, - "pixel_bytes": slab.tobytes(order="C"), + "compression": stored_compression, + "pixel_bytes": payload, } ) else: @@ -483,6 +496,53 @@ def _build_chunks_from_planes( return chunks +def _normalize_chunk_compression(compression: str | None) -> str | None: + """Normalize optional leaf-level chunk compression names.""" + if compression is None: + return None + value = str(compression).strip().lower() + if value in {"", "none", "null", "false"}: + return None + if value in {"auto", "balanced", "fast", "small"}: + return value + try: + pa.Codec(value) + except Exception as exc: + raise ValueError(f"Unsupported chunk_compression: {compression!r}") from exc + return value + + +def _encode_chunk_payload( + payload: bytes, + *, + compression: str | None, + compression_level: int | None, +) -> tuple[str | None, bytes]: + """Compress a chunk payload when requested and worthwhile.""" + if compression is None: + return None, payload + + codec_name = compression + level = compression_level + if compression in {"auto", "balanced", "small"}: + codec_name = "zstd" + level = 1 if level is None else level + elif compression == "fast": + codec_name = "lz4" + + codec = ( + pa.Codec(codec_name) + if level is None + else pa.Codec(codec_name, compression_level=level) + ) + compressed = bytes(codec.compress(payload)) + if compression in {"auto", "balanced", "fast", "small"} and len(compressed) >= len( + payload + ): + return None, payload + return codec_name, compressed + + def to_ome_arrow( type_: str = OME_ARROW_TAG_TYPE, version: str = OME_ARROW_TAG_VERSION, @@ -507,6 +567,8 @@ def to_ome_arrow( chunk_shape: Optional[Tuple[int, int, int]] = (1, 512, 512), # (Z, Y, X) chunk_order: str = "ZYX", chunk_encoding: Literal["list", "bytes"] = "list", + chunk_compression: str | None = None, + chunk_compression_level: int | None = None, build_chunks: bool = True, masks: Any = None, ) -> pa.StructScalar: @@ -539,6 +601,9 @@ def to_ome_arrow( chunk_order: Flattening order for chunk pixels (default "ZYX"). chunk_encoding: ``"list"`` stores historical numeric pixel lists. ``"bytes"`` stores compact typed chunk byte buffers. + chunk_compression: Optional leaf-level compression for byte chunks, + such as ``"zstd"`` or ``"lz4"``. + chunk_compression_level: Optional codec compression level. build_chunks: If True, build chunked pixels from planes when chunks is None. masks: Optional placeholder for future annotations. @@ -609,6 +674,8 @@ def to_ome_arrow( chunk_shape=chunk_shape, chunk_order=chunk_order, chunk_encoding=chunk_encoding, + chunk_compression=chunk_compression, + chunk_compression_level=chunk_compression_level, ) if chunk_encoding == "bytes" and chunks: planes = [] @@ -697,6 +764,8 @@ def from_numpy( chunk_shape: Optional[Tuple[int, int, int]] = (1, 512, 512), chunk_order: str = "ZYX", chunk_encoding: Literal["list", "bytes"] = "list", + chunk_compression: str | None = None, + chunk_compression_level: int | None = None, build_chunks: bool = True, # meta physical_size_x: float = 1.0, @@ -724,6 +793,9 @@ def from_numpy( chunk_order: Flattening order for chunk pixels (default "ZYX"). chunk_encoding: ``"list"`` stores historical numeric pixel lists. ``"bytes"`` stores compact typed chunk byte buffers. + chunk_compression: Optional leaf-level compression for byte chunks, + such as ``"zstd"`` or ``"lz4"``. + chunk_compression_level: Optional codec compression level. build_chunks: If True, build chunked pixels from planes. physical_size_x: Spatial pixel size (µm) for X. physical_size_y: Spatial pixel size (µm) for Y. @@ -846,6 +918,8 @@ def from_numpy( chunk_shape=chunk_shape, chunk_order=chunk_order, chunk_encoding=chunk_encoding, + chunk_compression=chunk_compression, + chunk_compression_level=chunk_compression_level, build_chunks=build_chunks, masks=None, ) @@ -892,6 +966,8 @@ def _from_array_via_numpy( chunk_shape: Optional[Tuple[int, int, int]], chunk_order: str, chunk_encoding: Literal["list", "bytes"], + chunk_compression: str | None, + chunk_compression_level: int | None, build_chunks: bool, physical_size_x: float, physical_size_y: float, @@ -917,6 +993,8 @@ def _from_array_via_numpy( chunk_shape=chunk_shape, chunk_order=chunk_order, chunk_encoding=chunk_encoding, + chunk_compression=chunk_compression, + chunk_compression_level=chunk_compression_level, build_chunks=build_chunks, physical_size_x=physical_size_x, physical_size_y=physical_size_y, @@ -939,6 +1017,8 @@ def from_torch_array( chunk_shape: Optional[Tuple[int, int, int]] = (1, 512, 512), chunk_order: str = "ZYX", chunk_encoding: Literal["list", "bytes"] = "list", + chunk_compression: str | None = None, + chunk_compression_level: int | None = None, build_chunks: bool = True, # meta physical_size_x: float = 1.0, @@ -971,6 +1051,9 @@ def from_torch_array( chunk_order: Flattening order for chunk pixels (default "ZYX"). chunk_encoding: ``"list"`` stores historical numeric pixel lists. ``"bytes"`` stores compact typed chunk byte buffers. + chunk_compression: Optional leaf-level compression for byte chunks, + such as ``"zstd"`` or ``"lz4"``. + chunk_compression_level: Optional codec compression level. build_chunks: If True, build chunked pixels from planes. physical_size_x: Spatial pixel size (µm) for X. physical_size_y: Spatial pixel size (µm) for Y. @@ -1017,6 +1100,8 @@ def from_torch_array( chunk_shape=chunk_shape, chunk_order=chunk_order, chunk_encoding=chunk_encoding, + chunk_compression=chunk_compression, + chunk_compression_level=chunk_compression_level, build_chunks=build_chunks, physical_size_x=physical_size_x, physical_size_y=physical_size_y, @@ -1039,6 +1124,8 @@ def from_jax_array( chunk_shape: Optional[Tuple[int, int, int]] = (1, 512, 512), chunk_order: str = "ZYX", chunk_encoding: Literal["list", "bytes"] = "list", + chunk_compression: str | None = None, + chunk_compression_level: int | None = None, build_chunks: bool = True, # meta physical_size_x: float = 1.0, @@ -1070,6 +1157,9 @@ def from_jax_array( chunk_order: Flattening order for chunk pixels (default "ZYX"). chunk_encoding: ``"list"`` stores historical numeric pixel lists. ``"bytes"`` stores compact typed chunk byte buffers. + chunk_compression: Optional leaf-level compression for byte chunks, + such as ``"zstd"`` or ``"lz4"``. + chunk_compression_level: Optional codec compression level. build_chunks: If True, build chunked pixels from planes. physical_size_x: Spatial pixel size (µm) for X. physical_size_y: Spatial pixel size (µm) for Y. @@ -1104,6 +1194,8 @@ def from_jax_array( chunk_shape=chunk_shape, chunk_order=chunk_order, chunk_encoding=chunk_encoding, + chunk_compression=chunk_compression, + chunk_compression_level=chunk_compression_level, build_chunks=build_chunks, physical_size_x=physical_size_x, physical_size_y=physical_size_y, diff --git a/src/ome_arrow/meta.py b/src/ome_arrow/meta.py index 5514d3e..8a80cd0 100644 --- a/src/ome_arrow/meta.py +++ b/src/ome_arrow/meta.py @@ -150,6 +150,7 @@ pa.field("shape_y", pa.int32()), pa.field("shape_x", pa.int32()), pa.field("dtype", pa.string()), + pa.field("compression", pa.string()), pa.field("pixel_bytes", pa.large_binary()), ] ) diff --git a/tests/test_chunks.py b/tests/test_chunks.py index fb59521..ead3a26 100644 --- a/tests/test_chunks.py +++ b/tests/test_chunks.py @@ -6,7 +6,9 @@ import numpy as np import pyarrow.parquet as pq +import pytest +import ome_arrow.export as export_mod from ome_arrow import OME_ARROW_BYTE_STRUCT, OMEArrow from ome_arrow.export import plane_from_chunks, to_numpy, to_ome_parquet from ome_arrow.ingest import from_numpy, from_ome_parquet, to_ome_arrow @@ -109,6 +111,45 @@ def test_to_numpy_from_byte_chunks() -> None: ) +def test_to_numpy_from_leaf_compressed_byte_chunks() -> None: + """Decode inline typed chunk bytes compressed at the chunk leaf.""" + arr = np.arange(1 * 1 * 2 * 16 * 16, dtype=np.uint16).reshape(1, 1, 2, 16, 16) + + scalar = from_numpy( + arr, + dim_order="TCZYX", + chunk_shape=(1, 8, 8), + chunk_encoding="bytes", + chunk_compression="zstd", + chunk_compression_level=1, + ) + record = scalar.as_py() + + assert record["chunks"][0]["compression"] == "zstd" + np.testing.assert_array_equal(to_numpy(scalar), arr) + np.testing.assert_array_equal( + plane_from_chunks(scalar, t=0, c=0, z=1), arr[0, 0, 1] + ) + + +def test_auto_leaf_compression_skips_high_entropy_chunks() -> None: + """Avoid storing larger compressed payloads for noisy chunks.""" + rng = np.random.default_rng(42) + arr = rng.integers(0, 65535, size=(1, 1, 1, 64, 64), dtype=np.uint16) + + scalar = from_numpy( + arr, + dim_order="TCZYX", + chunk_shape=(1, 64, 64), + chunk_encoding="bytes", + chunk_compression="auto", + ) + record = scalar.as_py() + + assert record["chunks"][0]["compression"] is None + np.testing.assert_array_equal(to_numpy(scalar), arr) + + def test_inline_byte_parquet_roundtrip_and_tensor_view(tmp_path: Path) -> None: """Write/read an ergonomic inline OME value backed by typed chunk bytes.""" arr = np.arange(1 * 1 * 2 * 3 * 4, dtype=np.uint16).reshape(1, 1, 2, 3, 4) @@ -140,3 +181,50 @@ def test_inline_byte_parquet_roundtrip_and_tensor_view(tmp_path: Path) -> None: arr[0, 0, 1], ) assert struct_array.type == OME_ARROW_BYTE_STRUCT + + +def test_inline_byte_parquet_converts_existing_chunks_without_densifying( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Convert numeric chunks directly to byte chunks at Parquet write time.""" + arr = np.arange(1 * 1 * 2 * 16 * 16, dtype=np.uint16).reshape(1, 1, 2, 16, 16) + scalar = from_numpy(arr, dim_order="TCZYX", chunk_shape=(1, 8, 8)) + out = tmp_path / "inline-direct-bytes.ome.parquet" + + monkeypatch.setattr( + export_mod, + "to_numpy", + lambda *_args, **_kwargs: pytest.fail("dense conversion was used"), + ) + + to_ome_parquet( + scalar, + str(out), + column_name="ome_arrow", + inline_chunk_encoding="bytes", + inline_chunk_compression="auto", + ) + + roundtrip = from_ome_parquet(out, column_name="ome_arrow") + np.testing.assert_array_equal(to_numpy(roundtrip), arr) + + +def test_inline_leaf_compressed_byte_parquet_roundtrip(tmp_path: Path) -> None: + """Convert nested values to leaf-compressed inline byte chunks.""" + arr = np.arange(1 * 1 * 2 * 16 * 16, dtype=np.uint16).reshape(1, 1, 2, 16, 16) + scalar = from_numpy(arr, dim_order="TCZYX", chunk_shape=(1, 8, 8)) + out = tmp_path / "inline-zstd-bytes.ome.parquet" + + to_ome_parquet( + scalar, + str(out), + column_name="ome_arrow", + inline_chunk_encoding="bytes", + inline_chunk_compression="zstd", + inline_chunk_compression_level=1, + ) + + roundtrip = from_ome_parquet(out, column_name="ome_arrow") + assert roundtrip.as_py()["chunks"][0]["compression"] == "zstd" + np.testing.assert_array_equal(to_numpy(roundtrip), arr) From 99a5bfb878fbaf4f59abda0ff81a74d1c27fb35f Mon Sep 17 00:00:00 2001 From: d33bs Date: Sat, 6 Jun 2026 22:28:20 -0600 Subject: [PATCH 9/9] address gregs comments Co-Authored-By: Gregory Way --- README.md | 95 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index 882f6e2..f20cff0 100644 --- a/README.md +++ b/README.md @@ -122,9 +122,8 @@ See full docs: [`docs/src/dlpack.md`](docs/src/dlpack.md) ## Inline byte-backed OME values -The historical nested table stores pixel payloads as numeric lists inside -`chunks[].pixels` and `planes[].pixels`. For faster one-row-per-image Parquet -tables, write inline chunk bytes instead: +The historical nested table stores pixel payloads as numeric lists inside `chunks[].pixels` and `planes[].pixels`. +For faster one-row-per-image Parquet tables, write inline chunk bytes instead: ```python from ome_arrow import from_numpy, to_ome_parquet @@ -144,10 +143,9 @@ to_ome_parquet( ) ``` -This keeps the ergonomic inline OME value while storing chunk payloads as typed -`pixel_bytes: large_binary`. Use it for moderate image-level tables and -whole-image reads. For large 3D/5D selective reads, prefer the typed chunk -dataset API below. +This keeps the ergonomic inline OME value while storing chunk payloads as typed `pixel_bytes: large_binary`. +Use it for moderate image-level tables and whole-image reads. +For large 3D/5D selective reads, prefer the typed chunk dataset API below. Leaf-level chunk compression is also available for inline byte chunks: @@ -176,12 +174,14 @@ Compression guidance from `benchmarks/benchmark_inline_byte_compression.py`: | Best storage on compressible 3D/volume data | `chunk_compression="small"` plus Parquet `compression="zstd"` | Uses Zstd level 1 only when chunks shrink, then applies Parquet compression. | | Noisy/high-entropy images | `chunk_compression="auto"` or no leaf compression; use Parquet `compression="zstd"` | Auto skips chunks that would grow; noisy data often does not compress. | -Explicit codecs such as `chunk_compression="zstd"` with -`chunk_compression_level=1` and `chunk_compression="lz4"` are also supported -when you want fixed behavior instead of a preset. +Explicit codecs such as `chunk_compression="zstd"` with `chunk_compression_level=1` and `chunk_compression="lz4"` are also supported when you want fixed behavior instead of a preset. ## Typed chunk datasets +Typed chunk datasets are the optimized pixel IO path for OME-Arrow. +Their goal is to keep image metadata small and queryable while storing pixels as typed byte chunks that can be read directly by image, plane, channel, region, or volume. +Use this layout when performance matters for selective reads, larger 3D/5D images, or data engineering workflows that need predictable chunk indexing. + `OMEArrow.export(how="ome-parquet")` writes the typed byte-buffer dataset layout. For explicit control over layout and chunks, use the dataset writer directly. By default, this stores image metadata separately from pixel chunks and writes one chunk per Parquet row group, so `read_plane()` and `read_region()` can jump through a physical index instead of materializing the older nested struct payload. @@ -215,14 +215,12 @@ plane_torch = dataset.read_plane(t=0, c=0, z=0, return_type="torch") plane_jax = dataset.read_plane(t=0, c=0, z=0, return_type="jax") ``` -Use `chunk_rows_per_row_group=1` for the fastest direct chunk reads. Use a -larger value, such as `8`, to reduce row-group overhead for small chunks when -storage size matters. +Use `chunk_rows_per_row_group=1` for the fastest direct chunk reads. +Use a larger value, such as `8`, to reduce row-group overhead for small chunks when storage size matters. -The writer preserves source pixel dtype by default. To normalize stored pixel -buffers explicitly, pass `pixel_dtype`, for example `pixel_dtype="uint16"`. -Integer casts clamp by default; pass `clamp=False` to use NumPy casting -behavior directly. +The writer preserves source pixel dtype by default. +To normalize stored pixel buffers explicitly, pass `pixel_dtype`, for example `pixel_dtype="uint16"`. +Integer casts clamp by default; pass `clamp=False` to use NumPy casting behavior directly. ## Tensor ingest (PyTorch/JAX) @@ -260,8 +258,7 @@ scalar_jax = from_jax_array(jax_array, dim_order="TCYX") Notes: - Torch/JAX support is optional. -- Install extras as needed: - `pip install "ome-arrow[dlpack-torch]"` or `pip install "ome-arrow[dlpack-jax]"`. +- Install extras as needed: `pip install "ome-arrow[dlpack-torch]"` or `pip install "ome-arrow[dlpack-jax]"`. - Torch tensors are detached and converted on CPU for ingest. - `dim_order` is accepted only for NumPy/torch/JAX array inputs. - Ingest now passes flattened NumPy pixel buffers directly to Arrow. @@ -269,23 +266,39 @@ Notes: ## Benchmarking lazy reads -Use the lightweight benchmark utility in `benchmarks/` to compare lazy tensor -read paths (TIFF source-backed, Parquet planes, Parquet chunks): +Use the lightweight benchmark utility in `benchmarks/` to compare lazy tensor read paths (TIFF source-backed, Parquet planes, Parquet chunks): ```bash uv run python benchmarks/benchmark_lazy_tensor.py --repeats 5 --warmup 1 ``` -For OME-IRIS-style 2D/3D/4D/5D access patterns, run: +For OME-IRIS-style 2D/3D/4D/5D access patterns, use `benchmark_ome_iris.py`. +This benchmark is intended to answer practical questions about pixel IO: how fast each format writes a matched artifact, how fast it reads full images or volumes, and how fast it serves selective access patterns such as planes, crops, subvolumes, timepoints, and channels. ```bash uv run python benchmarks/benchmark_ome_iris.py --repeats 3 --warmup 1 ``` -You can pass local real-data fixtures with `--fixture name=/path/to/image.tif`. -The benchmark writes matched temporary OME-Zarr and typed OME-Arrow artifacts, -then compares full-image, plane, crop, subvolume, timepoint, and channel reads -where those axes are present. +By default, the benchmark uses local test-data fixtures when available. +You can also pass real local TIFF fixtures explicitly: + +```bash +uv run python benchmarks/benchmark_ome_iris.py \ + --fixture 2d=/path/to/plate-image.tif \ + --fixture 3d=/path/to/volume.tif \ + --fixture 5d=/path/to/tczyx-image.ome.tif \ + --repeats 3 \ + --warmup 1 \ + --json-out benchmark-results.json +``` + +Each `--fixture` argument is `name=/path/to/image.tif`. +The `name` label is used only in the output table, so choose labels that describe the dimensionality or dataset source. +Inputs must be TIFF files; the benchmark creates temporary matched OME-Zarr and OME-Arrow artifacts for the same source image, then reports latency, returned shape, dtype, and artifact size. +Temporary artifacts are deleted automatically after the run. + +Use the printed table for quick local iteration and `--json-out` when comparing runs over time or attaching results to an issue/PR. +Prefer multiple repeats when making performance claims, because local filesystem cache, codec warmup, and Torch/JAX initialization can affect single-run timings. The OME-IRIS-style benchmark separates return/API paths: @@ -294,37 +307,27 @@ The OME-IRIS-style benchmark separates return/API paths: - `ome-tiff-tensor-numpy`: OME-Arrow `tensor_view(...).to_numpy()` over TIFF. - `ome-tiff-bioio-numpy`: direct BioImage NumPy reads over TIFF. - `ome-arrow-src-numpy`: source-dtype typed OME-Arrow dataset NumPy reads. -- `ome-arrow-u16-numpy`: typed OME-Arrow dataset NumPy reads normalized to - `uint16` for apples-to-apples comparisons with normalized paths. -- `ome-arrow-u16-raw-numpy`: normalized `uint16` typed OME-Arrow reads with - uncompressed chunk bytes for local speed comparisons. -- `ome-arrow-*-chunks`: Arrow-native raw chunk-row reads that return - `pixel_bytes` without decoding into NumPy. -- `ome-tiff-tensor-torch` / `ome-tiff-tensor-jax`: OME-Arrow tensor-view - Torch/JAX returns over TIFF. -- `ome-zarr-tensor-torch` / `ome-zarr-tensor-jax`: OME-Arrow tensor-view - Torch/JAX returns over OME-Zarr. -- `ome-arrow-src-torch` / `ome-arrow-src-jax`: source-dtype typed OME-Arrow - dataset reads with `return_type="torch"` or `return_type="jax"`. -- `ome-arrow-u16-torch` / `ome-arrow-u16-jax`: normalized `uint16` typed - OME-Arrow dataset reads with Torch/JAX returns. +- `ome-arrow-u16-numpy`: typed OME-Arrow dataset NumPy reads normalized to `uint16` for apples-to-apples comparisons with normalized paths. +- `ome-arrow-u16-raw-numpy`: normalized `uint16` typed OME-Arrow reads with uncompressed chunk bytes for local speed comparisons. +- `ome-arrow-*-chunks`: Arrow-native raw chunk-row reads that return `pixel_bytes` without decoding into NumPy. +- `ome-tiff-tensor-torch` / `ome-tiff-tensor-jax`: OME-Arrow tensor-view Torch/JAX returns over TIFF. +- `ome-zarr-tensor-torch` / `ome-zarr-tensor-jax`: OME-Arrow tensor-view Torch/JAX returns over OME-Zarr. +- `ome-arrow-src-torch` / `ome-arrow-src-jax`: source-dtype typed OME-Arrow dataset reads with `return_type="torch"` or `return_type="jax"`. +- `ome-arrow-u16-torch` / `ome-arrow-u16-jax`: normalized `uint16` typed OME-Arrow dataset reads with Torch/JAX returns. Notes: - This benchmark is for local iteration and relative comparisons. - It is not part of CI pass/fail checks. -- CI also runs this benchmark in a dedicated `benchmark_canary` job and - uploads `benchmark-results.json` as a workflow artifact. +- CI also runs this benchmark in a dedicated `benchmark_canary` job and uploads `benchmark-results.json` as a workflow artifact. Recalibrating `benchmarks/ci-baseline.json`: 1. Run the benchmark on `main` a few times (for example 3-5 runs): `uv run python benchmarks/benchmark_lazy_tensor.py --repeats 7 --warmup 2 --json-out benchmark-results.json` 1. For each case, collect the observed `median_ms` values. -1. Update `benchmarks/ci-baseline.json` with stable medians from those runs - (prefer a conservative value near the slower side, not the fastest sample). -1. Keep CI canary tolerance (`regression_factor` + `absolute_slack_ms`) unchanged - unless you have repeated false positives. +1. Update `benchmarks/ci-baseline.json` with stable medians from those runs (prefer a conservative value near the slower side, not the fastest sample). +1. Keep CI canary tolerance (`regression_factor` + `absolute_slack_ms`) unchanged unless you have repeated false positives. ## Contributing, Development, and Testing