WayScience · d33bs · Jun 7, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -69,7 +69,7 @@ oa_image.view(how="matplotlib")
 # (great for ZYX 3D images; install extras: `pip install 'ome-arrow[viz]'`).
 oa_image.view(how="pyvista")
 
-# Export to OME-Parquet.
+# Export to OME-Parquet. This writes the typed chunk dataset layout.
 # We can also export OME-TIFF, OME-Zarr or NumPy arrays.
 oa_image.export(how="ome-parquet", out="your_image.ome.parquet")
 
@@ -120,6 +120,108 @@ Advanced options:
 
 See full docs: [`docs/src/dlpack.md`](docs/src/dlpack.md)
 
+## Inline byte-backed OME values
+
+The historical nested table stores pixel payloads as numeric lists inside `chunks[].pixels` and `planes[].pixels`.
+For faster one-row-per-image Parquet tables, write inline chunk bytes instead:
+
+```python
+from ome_arrow import from_numpy, to_ome_parquet
+
+record = from_numpy(arr, dim_order="TCZYX", chunk_encoding="bytes")
+to_ome_parquet(record, "image.ome.parquet", column_name="ome_arrow")
+```
+
+You can also convert an existing OME-Arrow record at write time:
+
+```python
+to_ome_parquet(
+    record,
+    "image.ome.parquet",
+    column_name="ome_arrow",
+    inline_chunk_encoding="bytes",
+)
+```
+
+This keeps the ergonomic inline OME value while storing chunk payloads as typed `pixel_bytes: large_binary`.
+Use it for moderate image-level tables and whole-image reads.
+For large 3D/5D selective reads, prefer the typed chunk dataset API below.
+
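+Leaf-level chunk compression, which compresses each chunk's bytes individually and is separate from the Parquet file-level `compression` setting, is also available for inline byte chunks: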
+
+```python
+record = from_numpy(
+    arr,
+    dim_order="TCZYX",
+    chunk_encoding="bytes",
+    chunk_compression="auto",
+)
+
+to_ome_parquet(
+    record,
+    "image.ome.parquet",
+    column_name="ome_arrow",
+    compression="zstd",
+)
+```
+
+Compression guidance from `benchmarks/benchmark_inline_byte_compression.py`:
+
+| Data/workload                               | Suggested setting                                                                   | Why                                                                           |
+| ------------------------------------------- | ----------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- |
+| General inline-byte tables                  | `chunk_compression="auto"` and Parquet `compression="zstd"`                         | Compresses chunks only when they shrink, then lets Parquet compress metadata. |
+| Faster reads on compressible images         | `chunk_compression="fast"` with Parquet `compression=None`                          | Uses LZ4 only when chunks shrink, keeping decode overhead low.                |
+| Best storage on compressible 3D/volume data | `chunk_compression="small"` plus Parquet `compression="zstd"`                       | Uses Zstd level 1 only when chunks shrink, then applies Parquet compression.  |
+| Noisy/high-entropy images                   | `chunk_compression="auto"` or no leaf compression; use Parquet `compression="zstd"` | Auto skips chunks that would grow; noisy data often does not compress.        |
+
+Explicit codecs, such as `chunk_compression="zstd"` (optionally with `chunk_compression_level=1`) or `chunk_compression="lz4"`, are also supported when you want fixed behavior instead of a preset.
+
+## Typed chunk datasets
+
+Typed chunk datasets are the optimized pixel IO path for OME-Arrow.
+Their goal is to keep image metadata small and queryable while storing pixels as typed byte chunks that can be read directly by image, plane, channel, region, or volume.
+Use this layout when performance matters for selective reads, larger 3D/5D images, or data engineering workflows that need predictable chunk indexing.
+
+`OMEArrow.export(how="ome-parquet")` writes the typed chunk dataset layout.
+For explicit control over layout and chunks, use the dataset writer directly.
+By default, the dataset writer stores image metadata separately from pixel chunks and writes one chunk per Parquet row group, so `read_plane()` and `read_region()` can jump through a physical index instead of materializing the older nested struct payload.
+You can change that row-group packing with `chunk_rows_per_row_group`.
+
+```python
+import numpy as np
+from ome_arrow import OMEArrowDataset, write_ome_arrow_dataset
+
+arr = np.zeros((1, 1, 1, 1024, 1024), dtype=np.uint16)  # TCZYX
+
+choice = write_ome_arrow_dataset(
+    [arr],
+    "image.ome-arrow",
+    layout="tile",
+    chunk_shape=(1, 1, 1, 512, 512),
+    compression="zstd",
+    chunk_rows_per_row_group=1,
+)
+print(choice.rationale)
+
+dataset = OMEArrowDataset("image.ome-arrow")
+image_id = dataset.images["image_id"].to_pylist()[0]
+plane = dataset.pixels.read_plane(image_id, t=0, c=0, z=0)
+crop = dataset.pixels.read_region(image_id, y=slice(128, 384), x=slice(128, 384))
+
+# Dataset-level shortcuts return NumPy by default and can return Torch/JAX
+# arrays when those packages are installed.
+plane_np = dataset.read_plane(t=0, c=0, z=0)
+plane_torch = dataset.read_plane(t=0, c=0, z=0, return_type="torch")
+plane_jax = dataset.read_plane(t=0, c=0, z=0, return_type="jax")
+```
+
+Use `chunk_rows_per_row_group=1` for the fastest direct chunk reads.
+Use a larger value, such as `8`, to reduce row-group overhead for small chunks when storage size matters.
+
+The writer preserves source pixel dtype by default.
+To normalize stored pixel buffers explicitly, pass `pixel_dtype`, for example `pixel_dtype="uint16"`.
+Integer casts clamp by default; pass `clamp=False` to use NumPy casting behavior directly.
+
 ## Tensor ingest (PyTorch/JAX)
 
 You can ingest torch or JAX arrays directly with `OMEArrow(...)`.
@@ -156,38 +258,76 @@ scalar_jax = from_jax_array(jax_array, dim_order="TCYX")
 Notes:
 
 - Torch/JAX support is optional.
-- Install extras as needed:
-  `pip install "ome-arrow[dlpack-torch]"` or `pip install "ome-arrow[dlpack-jax]"`.
+- Install extras as needed: `pip install "ome-arrow[dlpack-torch]"` or `pip install "ome-arrow[dlpack-jax]"`.
 - Torch tensors are detached and converted on CPU for ingest.
 - `dim_order` is accepted only for NumPy/torch/JAX array inputs.
 - Ingest now passes flattened NumPy pixel buffers directly to Arrow.
 - This avoids materializing Python `list` payloads per plane/chunk.
 
 ## Benchmarking lazy reads
 
-Use the lightweight benchmark utility in `benchmarks/` to compare lazy tensor
-read paths (TIFF source-backed, Parquet planes, Parquet chunks):
+Use the lightweight benchmark utility in `benchmarks/` to compare lazy tensor read paths (TIFF source-backed, Parquet planes, Parquet chunks):
 
 ```bash
 uv run python benchmarks/benchmark_lazy_tensor.py --repeats 5 --warmup 1
 ```
 
+For OME-IRIS-style 2D/3D/4D/5D access patterns, use `benchmark_ome_iris.py`.
+This benchmark is intended to answer practical questions about pixel IO: how fast each format writes a matched artifact, how fast it reads full images or volumes, and how fast it serves selective access patterns such as planes, crops, subvolumes, timepoints, and channels.
+
+```bash
+uv run python benchmarks/benchmark_ome_iris.py --repeats 3 --warmup 1
+```
+
+By default, the benchmark uses local test-data fixtures when available.
+You can also pass real local TIFF fixtures explicitly:
+
+```bash
+uv run python benchmarks/benchmark_ome_iris.py \
+  --fixture 2d=/path/to/plate-image.tif \
+  --fixture 3d=/path/to/volume.tif \
+  --fixture 5d=/path/to/tczyx-image.ome.tif \
+  --repeats 3 \
+  --warmup 1 \
+  --json-out benchmark-results.json
+```
+
+Each `--fixture` argument is `name=/path/to/image.tif`.
+The `name` label is used only in the output table, so choose labels that describe the dimensionality or dataset source.
+Inputs must be TIFF files; the benchmark creates temporary matched OME-Zarr and OME-Arrow artifacts for the same source image, then reports latency, returned shape, dtype, and artifact size.
+Temporary artifacts are deleted automatically after the run.
+
+Use the printed table for quick local iteration and `--json-out` when comparing runs over time or attaching results to an issue/PR.
+Prefer multiple repeats when making performance claims, because local filesystem cache, codec warmup, and Torch/JAX initialization can affect single-run timings.
+
+The OME-IRIS-style benchmark reports each combination of source format, reader API, and return type as a separately named case:
+
+- `ome-zarr-tensor-numpy`: OME-Arrow `tensor_view(...).to_numpy()` over OME-Zarr.
+- `ome-zarr-bioio-numpy`: direct BioImage NumPy reads over OME-Zarr.
+- `ome-tiff-tensor-numpy`: OME-Arrow `tensor_view(...).to_numpy()` over TIFF.
+- `ome-tiff-bioio-numpy`: direct BioImage NumPy reads over TIFF.
+- `ome-arrow-src-numpy`: source-dtype typed OME-Arrow dataset NumPy reads.
+- `ome-arrow-u16-numpy`: typed OME-Arrow dataset NumPy reads normalized to `uint16` for apples-to-apples comparisons with normalized paths.
+- `ome-arrow-u16-raw-numpy`: normalized `uint16` typed OME-Arrow reads with uncompressed chunk bytes for local speed comparisons.
+- `ome-arrow-*-chunks`: Arrow-native raw chunk-row reads that return `pixel_bytes` without decoding into NumPy.
+- `ome-tiff-tensor-torch` / `ome-tiff-tensor-jax`: OME-Arrow tensor-view Torch/JAX returns over TIFF.
+- `ome-zarr-tensor-torch` / `ome-zarr-tensor-jax`: OME-Arrow tensor-view Torch/JAX returns over OME-Zarr.
+- `ome-arrow-src-torch` / `ome-arrow-src-jax`: source-dtype typed OME-Arrow dataset reads with `return_type="torch"` or `return_type="jax"`.
+- `ome-arrow-u16-torch` / `ome-arrow-u16-jax`: normalized `uint16` typed OME-Arrow dataset reads with Torch/JAX returns.
+
 Notes:
 
 - This benchmark is for local iteration and relative comparisons.
 - It is not part of CI pass/fail checks.
-- CI also runs this benchmark in a dedicated `benchmark_canary` job and
-  uploads `benchmark-results.json` as a workflow artifact.
+- CI also runs this benchmark in a dedicated `benchmark_canary` job and uploads `benchmark-results.json` as a workflow artifact.
 
 Recalibrating `benchmarks/ci-baseline.json`:
 
 1. Run the benchmark on `main` a few times (for example 3-5 runs):
    `uv run python benchmarks/benchmark_lazy_tensor.py --repeats 7 --warmup 2 --json-out benchmark-results.json`
 1. For each case, collect the observed `median_ms` values.
-1. Update `benchmarks/ci-baseline.json` with stable medians from those runs
-   (prefer a conservative value near the slower side, not the fastest sample).
-1. Keep CI canary tolerance (`regression_factor` + `absolute_slack_ms`) unchanged
-   unless you have repeated false positives.
+1. Update `benchmarks/ci-baseline.json` with stable medians from those runs (prefer a conservative value near the slower side, not the fastest sample).
+1. Keep CI canary tolerance (`regression_factor` + `absolute_slack_ms`) unchanged unless you have repeated false positives.
 
 ## Contributing, Development, and Testing
 

@@ -0,0 +1,159 @@
+"""Benchmark inline byte-backed OME values with leaf-level compression."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+import tempfile
+import time
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Callable
+
+import numpy as np
+
+from ome_arrow import from_numpy
+from ome_arrow.export import to_numpy, to_ome_parquet
+from ome_arrow.ingest import from_ome_parquet
+
+
+@dataclass(frozen=True)
+class Result:
+    """One inline byte compression benchmark result.
+
+    Each instance records the measurements for a single
+    (dataset, compression case) pair produced by ``run``. Instances are
+    immutable and are converted with ``asdict`` for the ``--json-out`` report.
+    """
+
+    # Key of the synthetic array (from ``_make_arrays``) that was measured.
+    dataset: str
+    # Case label; ``run`` stores the full ``leaf-*/parquet-*`` label here,
+    # not just the leaf codec name.
+    codec: str
+    # Value passed as ``compression`` to ``to_ome_parquet`` for this case
+    # (``None`` in the ``parquet-none`` cases).
+    parquet_compression: str | None
+    # Median duration of the timed write calls, in milliseconds.
+    write_ms: float
+    # Median duration of the timed read-and-decode calls, in milliseconds.
+    read_ms: float
+    # Size of the written Parquet file in bytes divided by 1024 * 1024.
+    size_mb: float
+
+
+def _time(fn: Callable[[], object], *, repeats: int, warmup: int) -> float:
+    """Return the median wall-clock duration of ``fn`` in milliseconds.
+
+    Args:
+        fn: Zero-argument callable to measure; its return value is ignored.
+        repeats: Number of timed calls. Must be at least 1.
+        warmup: Number of untimed calls made before timing starts. Values
+            of zero or less skip the warmup phase.
+
+    Returns:
+        The median of the ``repeats`` timed durations, in milliseconds.
+
+    Raises:
+        ValueError: If ``repeats`` is less than 1.
+    """
+    # Validate before doing any work: without this check the warmup calls
+    # would still run and the median of an empty sample would then fail with
+    # an opaque ``StatisticsError`` (itself a ``ValueError`` subclass).
+    if repeats < 1:
+        raise ValueError(f"repeats must be at least 1, got {repeats}")
+    for _ in range(warmup):
+        fn()
+    durations_ms: list[float] = []
+    for _ in range(repeats):
+        start = time.perf_counter()
+        fn()
+        durations_ms.append((time.perf_counter() - start) * 1000.0)
+    return statistics.median(durations_ms)
+
+
+def _make_arrays() -> dict[str, np.ndarray]:
+    """Build the synthetic ``uint16`` benchmark inputs keyed by dataset name.
+
+    Returns:
+        A mapping with three five-dimensional arrays:
+        ``"2d-smooth"`` (a repeating gradient, shape ``(1, 1, 1, 512, 512)``),
+        ``"2d-noisy"`` (seeded random values, same shape), and
+        ``"3d-smooth"`` (16 shifted copies of the gradient, shape
+        ``(1, 1, 16, 512, 512)``).
+    """
+    side, depth = 512, 16
+
+    # Broadcasting a column of row indices against a row of column indices
+    # yields the same grid as ``np.mgrid``.
+    row_idx = np.arange(side).reshape(side, 1)
+    col_idx = np.arange(side).reshape(1, side)
+    gradient = ((row_idx * 3 + col_idx * 5) % 4096).astype(np.uint16)
+
+    # Fixed seed keeps the noisy dataset reproducible between runs.
+    generator = np.random.default_rng(42)
+    noise = generator.integers(0, 65535, size=(side, side), dtype=np.uint16)
+
+    planes = []
+    for z in range(depth):
+        shifted = (gradient + z * 17) % 4096
+        planes.append(shifted.astype(np.uint16))
+    stack = np.stack(planes, axis=0)
+
+    # Prepend singleton axes so every array is five-dimensional.
+    return {
+        "2d-smooth": gradient[None, None, None],
+        "2d-noisy": noise[None, None, None],
+        "3d-smooth": stack[None, None],
+    }
+
+
+def _cases() -> list[tuple[str, str | None, int | None, str | None]]:
+    """Return the compression cases to benchmark.
+
+    Returns:
+        A list of ``(label, leaf_codec, leaf_level, parquet_compression)``
+        tuples. ``label`` has the form ``leaf-<tag>/parquet-<codec>`` and the
+        remaining items are the values ``run`` forwards to the writer.
+    """
+    # (leaf tag used in the label, leaf codec, leaf level, Parquet codec)
+    spec = (
+        ("none", None, None, None),
+        ("none", None, None, "zstd"),
+        ("auto", "auto", None, None),
+        ("auto", "auto", None, "zstd"),
+        ("fast", "fast", None, None),
+        ("small", "small", None, "zstd"),
+        ("lz4", "lz4", None, None),
+        ("zstd1", "zstd", 1, None),
+        ("zstd3", "zstd", 3, None),
+        ("zstd1", "zstd", 1, "zstd"),
+        ("brotli3", "brotli", 3, None),
+    )
+    return [
+        (f"leaf-{leaf_tag}/parquet-{parquet or 'none'}", codec, level, parquet)
+        for leaf_tag, codec, level, parquet in spec
+    ]
+
+
+def run(*, repeats: int, warmup: int) -> list[Result]:
+    """Benchmark every compression case against every synthetic dataset.
+
+    For each array from ``_make_arrays`` an OME-Arrow record is built once.
+    Each case from ``_cases`` is then written to a temporary Parquet file
+    with inline byte chunks, read back, compared with the source array, and
+    timed with ``_time``.
+
+    Args:
+        repeats: Number of timed calls per write and per read measurement.
+        warmup: Number of untimed calls made before each measurement.
+
+    Returns:
+        One ``Result`` per (dataset, case) pair, in iteration order.
+
+    Raises:
+        AssertionError: If a decoded array differs from its source array in
+            shape or content.
+    """
+    collected: list[Result] = []
+    with tempfile.TemporaryDirectory(prefix="ome_arrow_inline_compression_") as scratch:
+        scratch_dir = Path(scratch)
+        for dataset_name, pixels in _make_arrays().items():
+            record = from_numpy(
+                pixels,
+                dim_order="TCZYX",
+                chunk_shape=(1, 256, 256),
+                build_chunks=True,
+            )
+            for case_label, leaf_codec, leaf_level, parquet_codec in _cases():
+                # "/" in the label would otherwise create a subdirectory.
+                safe_label = case_label.replace("/", "_")
+                target = scratch_dir / f"{dataset_name}.{safe_label}.ome.parquet"
+
+                # Both closures are only invoked within this iteration, so
+                # capturing the loop variables is safe here.
+                def write_case() -> None:
+                    to_ome_parquet(
+                        record,
+                        str(target),
+                        column_name="ome_arrow",
+                        compression=parquet_codec,
+                        inline_chunk_encoding="bytes",
+                        inline_chunk_compression=leaf_codec,
+                        inline_chunk_compression_level=leaf_level,
+                    )
+
+                def read_case() -> np.ndarray:
+                    loaded = from_ome_parquet(target, column_name="ome_arrow")
+                    restored = to_numpy(loaded, dtype=pixels.dtype)
+                    if restored.shape == pixels.shape:
+                        return restored
+                    raise AssertionError(
+                        f"decoded shape {restored.shape} != {pixels.shape}"
+                    )
+
+                write_median_ms = _time(write_case, repeats=repeats, warmup=warmup)
+
+                # Check the round trip once before spending time on reads.
+                np.testing.assert_array_equal(read_case(), pixels)
+                read_median_ms = _time(read_case, repeats=repeats, warmup=warmup)
+
+                file_size_mb = target.stat().st_size / (1024 * 1024)
+                collected.append(
+                    Result(
+                        dataset=dataset_name,
+                        codec=case_label,
+                        parquet_compression=parquet_codec,
+                        write_ms=write_median_ms,
+                        read_ms=read_median_ms,
+                        size_mb=file_size_mb,
+                    )
+                )
+    return collected
+
+
+def main() -> None:
+    """Parse command-line options, run the benchmark, and report results.
+
+    Prints a fixed-width table to standard output. When ``--json-out`` is
+    given, the same results are also written to that path as JSON.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--repeats", type=int, default=3)
+    cli.add_argument("--warmup", type=int, default=1)
+    cli.add_argument("--json-out", type=Path, default=None)
+    options = cli.parse_args()
+
+    rows = run(repeats=options.repeats, warmup=options.warmup)
+
+    header = (
+        f"{'dataset':12} {'codec':26} {'write ms':>10} {'read ms':>10} {'size MB':>10}"
+    )
+    print("")
+    print("Inline byte compression benchmark")
+    print(header)
+    print("-" * 72)
+    for row in rows:
+        line = (
+            f"{row.dataset:12} {row.codec:26} "
+            f"{row.write_ms:10.2f} {row.read_ms:10.2f} "
+            f"{row.size_mb:10.2f}"
+        )
+        print(line)
+
+    destination = options.json_out
+    if destination is None:
+        return
+    payload = {"results": [asdict(row) for row in rows]}
+    destination.write_text(json.dumps(payload, indent=2))
+
+
+# Run the benchmark only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()