From bcf9133d125a892e202a447a0a53fba203ba08e7 Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 16:19:58 +0000
Subject: [PATCH 01/13] perf(np_utils): hybrid np_volume (cc3d for few labels,
 bincount for many)

cc3d statistics also computes centroids+bboxes that np_volume discards.
For many-label arrays (e.g. connected-component maps) np.bincount is far
cheaper: ~37x faster on a 64k-component map and ~3x on ~400 labels, while
cc3d stays faster for the few-label anatomical-segmentation case. Switch on
a cheap arr.max()>256 check to get best-of-both. Verified equal across
dtypes / label counts / include_zero; speedtest_volume.py added.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/np_utils.py                     |  8 ++-
 TPTBox/tests/speedtests/speedtest_volume.py | 68 +++++++++++++++++++++
 2 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_volume.py

diff --git a/TPTBox/core/np_utils.py b/TPTBox/core/np_utils.py
index 2b3ebbd..ef359ca 100755
--- a/TPTBox/core/np_utils.py
+++ b/TPTBox/core/np_utils.py
@@ -125,10 +125,12 @@ def np_volume(arr: UINTARRAY, include_zero: bool = False) -> dict[int, int]:
     Returns:
         dict[int, int]: Mapping from label value to number of voxels with that label.
     """
+    # np.bincount wins decisively when there are many labels (e.g. connected-component maps);
+    # cc3d statistics is faster for the few-label case typical of anatomical segmentations.
+    counts = np.bincount(arr.ravel()) if int(arr.max()) > 256 else cc3dstatistics(arr, use_crop=not include_zero)["voxel_counts"]
     if include_zero:
-        return {idx: i for idx, i in dict(enumerate(cc3dstatistics(arr, use_crop=False)["voxel_counts"])).items() if i > 0}
-    else:
-        return {idx: i for idx, i in dict(enumerate(cc3dstatistics(arr)["voxel_counts"])).items() if i > 0 and idx != 0}
+        return {idx: i for idx, i in enumerate(counts) if i > 0}
+    return {idx: i for idx, i in enumerate(counts) if i > 0 and idx != 0}
 
 
 def np_is_empty(arr: UINTARRAY | INTARRAY) -> bool:
diff --git a/TPTBox/tests/speedtests/speedtest_volume.py b/TPTBox/tests/speedtests/speedtest_volume.py
new file mode 100644
index 0000000..d044525
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_volume.py
@@ -0,0 +1,68 @@
+if __name__ == "__main__":
+    # speed test np_volume: cc3d statistics vs np.bincount across label-count regimes.
+    # Finding: cc3d wins for FEW labels (typical seg); bincount wins for MANY labels (CC maps).
+    import random
+
+    import numpy as np
+
+    from TPTBox.core.np_utils import cc3dstatistics, np_bbox_binary, np_connected_components
+    from TPTBox.tests.speedtests.speedtest import speed_test
+    from TPTBox.tests.test_utils import get_nii
+
+    # ---- candidate implementations -------------------------------------------------
+
+    def np_volume_cc3d(arr: np.ndarray):
+        # current implementation (frozen baseline)
+        return {idx: i for idx, i in dict(enumerate(cc3dstatistics(arr)["voxel_counts"])).items() if i > 0 and idx != 0}
+
+    def np_volume_bincount_full(arr: np.ndarray):
+        counts = np.bincount(arr.ravel())
+        return {idx: c for idx, c in enumerate(counts) if c > 0 and idx != 0}
+
+    def np_volume_bincount_crop(arr: np.ndarray):
+        crop = np_bbox_binary(arr, raise_error=False, px_dist=2)
+        counts = np.bincount(arr[crop].ravel())
+        return {idx: c for idx, c in enumerate(counts) if c > 0 and idx != 0}
+
+    def np_volume_hybrid(arr: np.ndarray):
+        # cheap max() pass decides: many labels -> bincount, few labels -> cc3d(+crop)
+        if int(arr.max()) > 64:
+            counts = np.bincount(arr.ravel())
+            return {idx: c for idx, c in enumerate(counts) if c > 0 and idx != 0}
+        return {idx: i for idx, i in dict(enumerate(cc3dstatistics(arr)["voxel_counts"])).items() if i > 0 and idx != 0}
+
+    functions = [np_volume_cc3d, np_volume_bincount_full, np_volume_bincount_crop, np_volume_hybrid]
+    assert_equal = lambda x, y: x == y  # noqa: E731
+
+    # ---- regime 1: few labels (typical segmentation) -------------------------------
+    def get_few_uint8():
+        nii, *_ = get_nii(x=(256, 256, 256), num_point=random.randint(5, 25))
+        return nii.get_seg_array().astype(np.uint8)
+
+    def get_few_uint16():
+        nii, *_ = get_nii(x=(256, 256, 256), num_point=random.randint(5, 25))
+        return nii.get_seg_array().astype(np.uint16)
+
+    print("\n=== regime 1a: few labels (256^3, uint8) ===")
+    speed_test(repeats=20, get_input_func=get_few_uint8, functions=functions, assert_equal_function=assert_equal)
+
+    print("\n=== regime 1b: few labels (256^3, uint16) ===")
+    speed_test(repeats=20, get_input_func=get_few_uint16, functions=functions, assert_equal_function=assert_equal)
+
+    # ---- regime 2: many connected components (grid of isolated voxels) -------------
+    def get_many_cc():
+        base = np.zeros((160, 160, 160), dtype=np.uint8)
+        base[::4, ::4, ::4] = 1  # ~64k isolated single-voxel components
+        cc, n = np_connected_components(base)
+        return cc.astype(np.uint32)
+
+    print("\n=== regime 2: many CC labels (160^3 grid, ~64k labels, uint32) ===")
+    speed_test(repeats=15, get_input_func=get_many_cc, functions=functions, assert_equal_function=assert_equal)
+
+    # ---- regime 3: moderate label count (realistic filter_connected_components) -----
+    def get_moderate_labels():
+        arr = np.random.randint(0, 400, size=(160, 160, 160)).astype(np.uint16)
+        return arr
+
+    print("\n=== regime 3: moderate labels (160^3 randint<400, uint16) ===")
+    speed_test(repeats=15, get_input_func=get_moderate_labels, functions=functions, assert_equal_function=assert_equal)

From 310c8a0d42f320e656afd56eb1139bef98e0790a Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 16:22:56 +0000
Subject: [PATCH 02/13] perf(np_utils): np_bbox_binary 3D path uses 2 full
 passes instead of 3

For 3D arrays, two of the three axis extents are derived from a single
shared 2D projection (np.any over the contiguous last axis), so only one
extra full reduction is needed. ~13-18% faster across 256^3/512^3 and
px_dist values; identical slices (verified vs old impl on 2D+3D, incl.
empty handling). Generic n-D path unchanged. speedtest_bbox_binary.py added.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/np_utils.py                       | 13 +++-
 .../tests/speedtests/speedtest_bbox_binary.py | 69 +++++++++++++++++++
 2 files changed, 79 insertions(+), 3 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_bbox_binary.py

diff --git a/TPTBox/core/np_utils.py b/TPTBox/core/np_utils.py
index ef359ca..31cca82 100755
--- a/TPTBox/core/np_utils.py
+++ b/TPTBox/core/np_utils.py
@@ -705,9 +705,16 @@ def np_bbox_binary(img: np.ndarray, px_dist: int | Sequence[int] | np.ndarray =
     assert len(px_dist) == n, f"dimension mismatch, got img shape {shp} and px_dist {px_dist}"
 
     bbox: list[float] = []
-    for ax in itertools.combinations(reversed(range(n)), n - 1):
-        nonzero = np.any(a=img, axis=ax)
-        bbox.extend(np.where(nonzero)[0][[0, -1]])  # type: ignore
+    if n == 3:
+        # 2 full passes instead of 3: two axis extents come from a shared 2D projection (cheap),
+        # only the third axis needs a second full reduction.
+        p = np.any(img, axis=2)
+        for nonzero in (np.any(p, axis=1), np.any(p, axis=0), np.any(img, axis=(0, 1))):
+            bbox.extend(np.where(nonzero)[0][[0, -1]])  # type: ignore
+    else:
+        for ax in itertools.combinations(reversed(range(n)), n - 1):
+            nonzero = np.any(a=img, axis=ax)
+            bbox.extend(np.where(nonzero)[0][[0, -1]])  # type: ignore
     out: tuple[slice, ...] = tuple(
         slice(
             max(bbox[i] - px_dist[i // 2], 0),
diff --git a/TPTBox/tests/speedtests/speedtest_bbox_binary.py b/TPTBox/tests/speedtests/speedtest_bbox_binary.py
new file mode 100644
index 0000000..b8ffc86
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_bbox_binary.py
@@ -0,0 +1,69 @@
+if __name__ == "__main__":
+    # speed test np_bbox_binary: 3 full np.any passes vs 2 full passes (3D specialization)
+    import itertools
+    import random
+
+    import numpy as np
+
+    from TPTBox.core.np_utils import np_is_empty
+    from TPTBox.tests.speedtests.speedtest import speed_test
+    from TPTBox.tests.test_utils import get_nii
+
+    def _finalize(bbox, shp, px_dist):
+        return tuple(
+            slice(max(bbox[i] - px_dist[i // 2], 0), min(bbox[i + 1] + px_dist[i // 2], shp[i // 2]) + 1) for i in range(0, len(bbox), 2)
+        )
+
+    def bbox_3pass(img: np.ndarray, px_dist=0, raise_error=True):
+        # current implementation (frozen baseline)
+        if np_is_empty(img):
+            if raise_error:
+                raise ValueError("empty")
+            return tuple([slice(None)] * img.ndim)
+        n = img.ndim
+        shp = img.shape
+        if isinstance(px_dist, int):
+            px_dist = np.ones(n, dtype=np.uint8) * px_dist
+        bbox: list = []
+        for ax in itertools.combinations(reversed(range(n)), n - 1):
+            nonzero = np.any(a=img, axis=ax)
+            bbox.extend(np.where(nonzero)[0][[0, -1]])
+        return _finalize(bbox, shp, px_dist)
+
+    def bbox_2pass(img: np.ndarray, px_dist=0, raise_error=True):
+        if np_is_empty(img):
+            if raise_error:
+                raise ValueError("empty")
+            return tuple([slice(None)] * img.ndim)
+        n = img.ndim
+        shp = img.shape
+        if isinstance(px_dist, int):
+            px_dist = np.ones(n, dtype=np.uint8) * px_dist
+        if n == 3:
+            p = np.any(img, axis=2)  # full pass -> 2D (d0, d1)
+            projections = (np.any(p, axis=1), np.any(p, axis=0), np.any(img, axis=(0, 1)))
+            bbox: list = []
+            for nz in projections:
+                w = np.where(nz)[0]
+                bbox.extend((w[0], w[-1]))
+        else:
+            bbox = []
+            for ax in itertools.combinations(reversed(range(n)), n - 1):
+                nonzero = np.any(a=img, axis=ax)
+                bbox.extend(np.where(nonzero)[0][[0, -1]])
+        return _finalize(bbox, shp, px_dist)
+
+    functions = [bbox_3pass, bbox_2pass]
+    assert_equal = lambda x, y: x == y  # noqa: E731  (tuples of slices compare elementwise)
+
+    def make_input(size, n_points, px_dist):
+        def _f():
+            num_points = random.randint(1, n_points)
+            nii, *_ = get_nii(x=(size, size, size), num_point=num_points)
+            return (nii.get_seg_array().astype(np.uint8), px_dist)
+
+        return _f
+
+    for size, npts, px in [(512, 20, 0), (512, 20, 2), (256, 8, 0)]:
+        print(f"\n=== np_bbox_binary | size {size}^3, up to {npts} blocks, px_dist={px} ===")
+        speed_test(repeats=30, get_input_func=make_input(size, npts, px), functions=functions, assert_equal_function=assert_equal)

From 01b83d11d7fd51f83b6505ac2d2f6eb932b1b468 Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 16:25:45 +0000
Subject: [PATCH 03/13] perf(np_utils): drop quadratic membership test in
 center_of_mass/bounding_boxes

np_center_of_mass and np_bounding_boxes built a `unique` list then filtered
with `idx in unique` (O(max_label x n_unique)). Check voxel_counts[idx]
directly instead. ~2.2x faster at ~4k labels, ~1.3x at ~2k, unchanged for
few labels; identical output (use_crop preserved). speedtest_center_of_mass.py added.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/np_utils.py                       |  8 +--
 .../speedtests/speedtest_center_of_mass.py    | 52 +++++++++++++++++++
 2 files changed, 56 insertions(+), 4 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_center_of_mass.py

diff --git a/TPTBox/core/np_utils.py b/TPTBox/core/np_utils.py
index 31cca82..24590b1 100755
--- a/TPTBox/core/np_utils.py
+++ b/TPTBox/core/np_utils.py
@@ -255,8 +255,8 @@ def np_center_of_mass(arr: UINTARRAY) -> dict[int, COORDINATE]:
     """
     stats = cc3dstatistics(arr, use_crop=False)
     # Does not use the other calls for speed reasons
-    unique = [idx for idx, i in enumerate(stats["voxel_counts"]) if i > 0 and idx != 0]
-    return {idx: v for idx, v in enumerate(stats["centroids"]) if idx in unique}
+    vc = stats["voxel_counts"]
+    return {idx: v for idx, v in enumerate(stats["centroids"]) if idx != 0 and vc[idx] > 0}
 
 
 def np_bounding_boxes(arr: UINTARRAY) -> dict[int, tuple[slice, slice, slice]]:
@@ -272,8 +272,8 @@ def np_bounding_boxes(arr: UINTARRAY) -> dict[int, tuple[slice, slice, slice]]:
     """
     stats = cc3dstatistics(arr)
     # Does not use the other calls for speed reasons
-    unique = [idx for idx, i in enumerate(stats["voxel_counts"]) if i > 0 and idx != 0]
-    return {idx: v for idx, v in enumerate(stats["bounding_boxes"]) if idx in unique}
+    vc = stats["voxel_counts"]
+    return {idx: v for idx, v in enumerate(stats["bounding_boxes"]) if idx != 0 and vc[idx] > 0}
 
 
 def np_contacts(arr: UINTARRAY, connectivity: int) -> dict[tuple[int, int], int]:
diff --git a/TPTBox/tests/speedtests/speedtest_center_of_mass.py b/TPTBox/tests/speedtests/speedtest_center_of_mass.py
new file mode 100644
index 0000000..feea7bf
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_center_of_mass.py
@@ -0,0 +1,52 @@
+if __name__ == "__main__":
+    # speed test np_center_of_mass / np_bounding_boxes:
+    # drop the O(max_label x n_unique) `idx in unique` list-membership post-processing.
+    import numpy as np
+
+    from TPTBox.core.np_utils import cc3dstatistics, np_connected_components
+    from TPTBox.tests.speedtests.speedtest import speed_test
+
+    # ---- center of mass ------------------------------------------------------------
+    def com_current(arr: np.ndarray):
+        stats = cc3dstatistics(arr, use_crop=False)
+        unique = [idx for idx, i in enumerate(stats["voxel_counts"]) if i > 0 and idx != 0]
+        return {idx: v for idx, v in enumerate(stats["centroids"]) if idx in unique}
+
+    def com_direct(arr: np.ndarray):
+        stats = cc3dstatistics(arr, use_crop=False)
+        vc = stats["voxel_counts"]
+        return {idx: v for idx, v in enumerate(stats["centroids"]) if idx != 0 and vc[idx] > 0}
+
+    # ---- bounding boxes ------------------------------------------------------------
+    def bbox_current(arr: np.ndarray):
+        stats = cc3dstatistics(arr)
+        unique = [idx for idx, i in enumerate(stats["voxel_counts"]) if i > 0 and idx != 0]
+        return {idx: v for idx, v in enumerate(stats["bounding_boxes"]) if idx in unique}
+
+    def bbox_direct(arr: np.ndarray):
+        stats = cc3dstatistics(arr)
+        vc = stats["voxel_counts"]
+        return {idx: v for idx, v in enumerate(stats["bounding_boxes"]) if idx != 0 and vc[idx] > 0}
+
+    def com_equal(x, y):
+        return x.keys() == y.keys() and all(np.allclose(x[k], y[k]) for k in x)
+
+    def bbox_equal(x, y):
+        return x == y
+
+    def get_many_cc():
+        # grid of isolated single voxels -> ~4k connected components (high max label).
+        # exercises the O(max_label x n_unique) post-processing the optimization removes.
+        base = np.zeros((64, 64, 64), dtype=np.uint8)
+        base[::4, ::4, ::4] = 1
+        cc, _ = np_connected_components(base)
+        return cc.astype(np.uint32)
+
+    def get_moderate_labels():
+        return np.random.randint(0, 2000, size=(120, 120, 120)).astype(np.uint16)
+
+    for name, gen in [("~4k CC labels (64^3 grid)", get_many_cc), ("~2000 labels (120^3 randint)", get_moderate_labels)]:
+        print(f"\n=== np_center_of_mass | {name} ===")
+        speed_test(repeats=12, get_input_func=gen, functions=[com_current, com_direct], assert_equal_function=com_equal)
+        print(f"\n=== np_bounding_boxes | {name} ===")
+        speed_test(repeats=12, get_input_func=gen, functions=[bbox_current, bbox_direct], assert_equal_function=bbox_equal)

From 3b8dcf7dee70a5527e05bbea22667a77e679e965 Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 16:50:21 +0000
Subject: [PATCH 04/13] perf(np_utils): np_dilate_msk uses boolean (out == i)
 instead of copy+mask

The per-label per-iteration 'data = out.copy(); data[i != data] = 0' was a full
array copy plus a masked write. _binary_dilation casts its input to bool anyway,
so 'data = out == i' is bit-exact (verified across 6480 configs) and skips the
copy. ~11-18% faster across n_pixel/connectivity on few-label 150^3 segs. The
n_pixel loop is kept (it encodes iterative inter-label competition that a single
larger-kernel pass would not reproduce). speedtest_dilate_vectorized.py added.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/np_utils.py                       |   3 +-
 .../speedtests/speedtest_dilate_vectorized.py | 102 ++++++++++++++++++
 2 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_dilate_vectorized.py

diff --git a/TPTBox/core/np_utils.py b/TPTBox/core/np_utils.py
index 24590b1..0bd5e41 100755
--- a/TPTBox/core/np_utils.py
+++ b/TPTBox/core/np_utils.py
@@ -523,8 +523,7 @@ def np_dilate_msk(
     out = arrc
     for _ in range(n_pixel):
         for i in labels:
-            data = out.copy()
-            data[i != data] = 0
+            data = out == i  # boolean mask; _binary_dilation casts to bool anyway, so this is exact and avoids a full copy
             if use_crop:
                 lcrop = np_bbox_binary(data, px_dist=2 + n_pixel, raise_error=False)
                 data = data[lcrop]
diff --git a/TPTBox/tests/speedtests/speedtest_dilate_vectorized.py b/TPTBox/tests/speedtests/speedtest_dilate_vectorized.py
new file mode 100644
index 0000000..d9d0bc1
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_dilate_vectorized.py
@@ -0,0 +1,102 @@
+if __name__ == "__main__":
+    # speed test np_dilate_msk: per-iteration full copy+mask vs boolean (out == i) mask.
+    # Dilation has iterative inter-label competition, so the n_pixel loop is preserved;
+    # only the per-label per-iteration `out.copy(); data[i != data] = 0` is replaced by `out == i`.
+    import random
+
+    import numpy as np
+    from scipy.ndimage import generate_binary_structure
+
+    from TPTBox.core.np_utils import _binary_dilation, _to_labels, np_bbox_binary
+    from TPTBox.tests.speedtests.speedtest import speed_test
+    from TPTBox.tests.test_utils import get_nii
+
+    def dilate_loop_baseline(arr, label_ref=None, n_pixel=5, connectivity=3, use_crop=True, ignore_axis=None):
+        labels = _to_labels(arr, label_ref)
+        if use_crop:
+            arr_bin = arr.copy()
+            arr_bin[np.isin(arr_bin, labels, invert=True)] = 0
+            crop = np_bbox_binary(arr_bin, px_dist=1 + n_pixel, raise_error=False)
+            arrc = arr[crop]
+        else:
+            arrc = arr
+        if ignore_axis is None:
+            struct = generate_binary_structure(arr.ndim, connectivity)
+        else:
+            struct = generate_binary_structure(arr.ndim - 1, connectivity)
+            struct = np.expand_dims(struct, ignore_axis)
+        labels = [l for l in labels if l != 0]
+        out = arrc
+        for _ in range(n_pixel):
+            for i in labels:
+                data = out.copy()
+                data[i != data] = 0
+                if use_crop:
+                    lcrop = np_bbox_binary(data, px_dist=2 + n_pixel, raise_error=False)
+                    data = data[lcrop]
+                msk_ibe_data = _binary_dilation(data, struct=struct)
+                if use_crop:
+                    oc = out[lcrop] == 0
+                    out[lcrop][oc] = msk_ibe_data[oc] * i
+                else:
+                    out[out == 0] = msk_ibe_data[out == 0] * i
+        if use_crop:
+            arr[crop] = out
+            return arr
+        return out
+
+    def dilate_m2a(arr, label_ref=None, n_pixel=5, connectivity=3, use_crop=True, ignore_axis=None):
+        labels = _to_labels(arr, label_ref)
+        if use_crop:
+            arr_bin = arr.copy()
+            arr_bin[np.isin(arr_bin, labels, invert=True)] = 0
+            crop = np_bbox_binary(arr_bin, px_dist=1 + n_pixel, raise_error=False)
+            arrc = arr[crop]
+        else:
+            arrc = arr
+        if ignore_axis is None:
+            struct = generate_binary_structure(arr.ndim, connectivity)
+        else:
+            struct = generate_binary_structure(arr.ndim - 1, connectivity)
+            struct = np.expand_dims(struct, ignore_axis)
+        labels = [l for l in labels if l != 0]
+        out = arrc
+        for _ in range(n_pixel):
+            for i in labels:
+                data = out == i  # boolean mask instead of full copy + masked write
+                if use_crop:
+                    lcrop = np_bbox_binary(data, px_dist=2 + n_pixel, raise_error=False)
+                    data = data[lcrop]
+                msk_ibe_data = _binary_dilation(data, struct=struct)
+                if use_crop:
+                    oc = out[lcrop] == 0
+                    out[lcrop][oc] = msk_ibe_data[oc] * i
+                else:
+                    out[out == 0] = msk_ibe_data[out == 0] * i
+        if use_crop:
+            arr[crop] = out
+            return arr
+        return out
+
+    functions = [dilate_loop_baseline, dilate_m2a]
+    eq = lambda x, y: np.array_equal(x, y)  # noqa: E731
+
+    def get_few():
+        nii, *_ = get_nii(x=(150, 150, 150), num_point=random.randint(5, 12))
+        return nii.get_seg_array().astype(np.uint8)
+
+    # NOTE: np_dilate_msk mutates in place; the harness runs each function 5x via timeit on the
+    # same deep-copied input, so absolute numbers compound across runs. Both candidates mutate
+    # identically, so the *relative* comparison stays fair. Dilation is only ever run on
+    # few-label anatomical segmentations, so we benchmark that regime only.
+    for n_pixel in (1, 3):
+        for connectivity in (1, 3):
+            print(f"\n=== np_dilate_msk few labels (150^3) | n_pixel={n_pixel} conn={connectivity} ===")
+            speed_test(
+                repeats=12,
+                get_input_func=get_few,
+                functions=functions,
+                assert_equal_function=eq,
+                n_pixel=n_pixel,
+                connectivity=connectivity,
+            )

From 03a0e647bd2964f0c818ac3c2cb72a036b04069b Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 17:00:05 +0000
Subject: [PATCH 05/13] perf(np_utils): add np_isin LUT helper, use it for
 multi-label masking

np.isin is 3-6x slower than a boolean lookup table for multi-label membership
on uint segmentation masks. New np_isin() builds lut[labels]=True and gathers
lut[arr] for unsigned arrays with a small label range; it special-cases the
single-label (arr==label) case and falls back to np.isin for signed/negative/
huge-range inputs. Verified equal to np.isin across 132 dtype/label/invert
cases. Applied at the 9 multi-label np.isin sites (extract_label, erode/dilate
(+euclid), connected_components, filter_connected_components). numpy's own
kind='table' did not help. speedtest_isin_lut.py added.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/np_utils.py                       | 52 +++++++++++++++----
 TPTBox/tests/speedtests/speedtest_isin_lut.py | 46 ++++++++++++++++
 2 files changed, 89 insertions(+), 9 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_isin_lut.py

diff --git a/TPTBox/core/np_utils.py b/TPTBox/core/np_utils.py
index 0bd5e41..d37c84e 100755
--- a/TPTBox/core/np_utils.py
+++ b/TPTBox/core/np_utils.py
@@ -36,6 +36,40 @@
 INTARRAY = Union[UINTARRAY, NDArray[INT]]
 
 
+def np_isin(arr: np.ndarray, labels, invert: bool = False) -> np.ndarray:
+    """Fast ``np.isin`` for non-negative integer label arrays via a boolean lookup table.
+
+    For unsigned-integer segmentation masks this is ~3-6x faster than ``np.isin`` when testing
+    membership in more than one label, because it replaces the general algorithm with a single
+    ``lut[arr]`` gather. Falls back to ``np.isin`` for non-unsigned dtypes, negative labels, or
+    very large label ranges; uses ``arr == label`` for the single-label case.
+
+    Args:
+        arr (np.ndarray): Input array.
+        labels: A label or iterable of labels to test membership against.
+        invert (bool, optional): If True, return the complement (equivalent to
+            ``np.isin(arr, labels, invert=True)``). Defaults to False.
+
+    Returns:
+        np.ndarray: Boolean mask, same shape as ``arr``.
+    """
+    if not isinstance(labels, (list, tuple, np.ndarray)):
+        labels = [labels]
+    if len(labels) == 0:
+        return np.ones(arr.shape, dtype=bool) if invert else np.zeros(arr.shape, dtype=bool)
+    if len(labels) == 1:
+        res = arr == labels[0]
+        return ~res if invert else res
+    if np.issubdtype(arr.dtype, np.unsignedinteger) and min(int(x) for x in labels) >= 0:
+        m = max(int(arr.max()), int(max(labels))) + 1
+        if m < 2**20:  # keep the lookup table small (same threshold as np_unique's bincount path)
+            lut = np.zeros(m, dtype=bool)
+            lut[np.asarray(labels)] = True
+            res = lut[arr]
+            return ~res if invert else res
+    return np.isin(arr, labels, invert=invert)
+
+
 def np_extract_label(
     arr: np.ndarray,
     label: int | list[int],
@@ -69,7 +103,7 @@ def np_extract_label(
 
     if isinstance(label, list):
         assert 0 not in label, "label 0 is not supported in list mode"
-        arr_msk = np.isin(arr, label)
+        arr_msk = np_isin(arr, label)
         arr[arr_msk] = to_label
         arr[~arr_msk] = 0
         return arr
@@ -385,14 +419,14 @@ def np_erode_msk_euclid(arr: np.ndarray, n_pixel: int = 3, use_crop=True, labels
     if use_crop:
         arr_bin = arr.copy()
         if labels is not None:
-            arr_bin[np.isin(arr_bin, labels, invert=True)] = 0
+            arr_bin[np_isin(arr_bin, labels, invert=True)] = 0
         crop = np_bbox_binary(arr_bin, px_dist=1 + n_pixel, raise_error=False)
         arrc = arr[crop]
     else:
         arrc = arr
         if labels is not None:
             arrc = arrc.copy()
-            arrc[np.isin(arrc, labels, invert=True)] = 0
+            arrc[np_isin(arrc, labels, invert=True)] = 0
 
     if mask is not None:
         mask = mask.copy()
@@ -431,14 +465,14 @@ def np_dilate_msk_euclid(arr: np.ndarray, n_pixel: int = 3, use_crop=True, label
     if use_crop:
         arr_bin = arr.copy()
         if labels is not None:
-            arr_bin[np.isin(arr_bin, labels, invert=True)] = 0
+            arr_bin[np_isin(arr_bin, labels, invert=True)] = 0
         crop = np_bbox_binary(arr_bin, px_dist=1 + n_pixel, raise_error=False)
         arrc = arr[crop]
     else:
         arrc = arr
         if labels is not None:
             arrc = arrc.copy()
-            arrc[np.isin(arr_bin, labels, invert=True)] = 0
+            arrc[np_isin(arr_bin, labels, invert=True)] = 0
     if mask is not None:
         mask[mask != 0] = 1
         if use_crop:
@@ -502,7 +536,7 @@ def np_dilate_msk(
     if use_crop:
         # try:
         arr_bin = arr.copy()
-        arr_bin[np.isin(arr_bin, labels, invert=True)] = 0
+        arr_bin[np_isin(arr_bin, labels, invert=True)] = 0
         crop = np_bbox_binary(arr_bin, px_dist=1 + n_pixel, raise_error=False)
         arrc = arr[crop]
     else:
@@ -576,7 +610,7 @@ def np_erode_msk(
     labels: list[int] = _to_labels(arr, label_ref)
 
     if use_crop:
-        crop = np_bbox_binary(np.isin(arr, labels, invert=False), px_dist=1 + n_pixel, raise_error=False)
+        crop = np_bbox_binary(np_isin(arr, labels, invert=False), px_dist=1 + n_pixel, raise_error=False)
         arrc = arr[crop]
     else:
         arrc = arr
@@ -875,7 +909,7 @@ def np_connected_components(
     labels: Sequence[int] = _to_labels(arr, label_ref)
     if include_zero:
         arr[arr == 0] = arr.max() + 1
-    arr[np.isin(arr, labels, invert=True)] = 0
+    arr[np_isin(arr, labels, invert=True)] = 0
     cc_map, n = _connected_components(arr, connectivity=connectivity, return_N=True)
     return cc_map, n
 
@@ -960,7 +994,7 @@ def np_filter_connected_components(
 
     arr2 = arr.copy()
     labels: Sequence[int] = _to_labels(arr, label_ref)
-    arr2[np.isin(arr2, labels, invert=True)] = 0  # type:ignore
+    arr2[np_isin(arr2, labels, invert=True)] = 0  # type:ignore
 
     labels_out, n = _connected_components(arr2, connectivity=connectivity, return_N=True)
     largest_k_components_org = largest_k_components
diff --git a/TPTBox/tests/speedtests/speedtest_isin_lut.py b/TPTBox/tests/speedtests/speedtest_isin_lut.py
new file mode 100644
index 0000000..89743e3
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_isin_lut.py
@@ -0,0 +1,46 @@
+if __name__ == "__main__":
+    # speed test np.isin(arr, labels) for uint segmentation masks:
+    #   default vs kind='table' vs explicit boolean lookup-table (LUT).
+    # numpy auto-selects kind='table' for small integer ranges, so the explicit LUT may only
+    # help for wide-range uint16 (labels near dtype max) where numpy declines the table.
+    import numpy as np
+
+    from TPTBox.tests.speedtests.speedtest import speed_test
+
+    def isin_default(arr, labels):
+        return np.isin(arr, labels)
+
+    def isin_table(arr, labels):
+        return np.isin(arr, labels, kind="table")
+
+    def lut_explicit(arr, labels):
+        if len(labels) == 0:
+            return np.zeros(arr.shape, dtype=bool)
+        m = max(int(arr.max()), int(max(labels))) + 1
+        lut = np.zeros(m, dtype=bool)
+        lut[labels] = True
+        return lut[arr]
+
+    functions = [isin_default, isin_table, lut_explicit]
+    eq = lambda x, y: np.array_equal(x, y)  # noqa: E731
+
+    def make(dtype, maxval, n_labels, shape=(256, 256, 256)):
+        def _f():
+            arr = np.random.randint(0, maxval, size=shape).astype(dtype)
+            labels = list(np.random.choice(np.arange(1, maxval), size=min(n_labels, maxval - 1), replace=False))
+            labels = [int(x) for x in labels]
+            return (arr, labels)
+
+        return _f
+
+    regimes = [
+        ("uint8  range<=30   n_labels=5", np.uint8, 30, 5),
+        ("uint8  range<=200  n_labels=50", np.uint8, 200, 50),
+        ("uint16 range<=30   n_labels=5", np.uint16, 30, 5),
+        ("uint16 wide<=60000 n_labels=1", np.uint16, 60000, 1),
+        ("uint16 wide<=60000 n_labels=5", np.uint16, 60000, 5),
+        ("uint16 wide<=60000 n_labels=50", np.uint16, 60000, 50),
+    ]
+    for name, dtype, maxval, n_labels in regimes:
+        print(f"\n=== np.isin | {name} (256^3) ===")
+        speed_test(repeats=12, get_input_func=make(dtype, maxval, n_labels), functions=functions, assert_equal_function=eq)

From caecb8e4e3e12bb9714632d6fa7b2c5eb763f00d Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 19:23:01 +0000
Subject: [PATCH 06/13] perf(nii): extract_label(keep_label=True) uses one copy
 + np_isin mask

The keep_label path called get_seg_array() twice (two full array copies) plus
np_extract_label and a multiply. Now it takes a single copy and zeros voxels
not in the label set via np_isin, keeping original label values. ~1.74x faster
on a 300^3 mask; output identical (verified scalar+list, keep+binary paths).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/nii_wrapper.py                    | 14 +++++---
 .../speedtest_nii_extract_label_keep.py       | 34 +++++++++++++++++++
 2 files changed, 43 insertions(+), 5 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_nii_extract_label_keep.py

diff --git a/TPTBox/core/nii_wrapper.py b/TPTBox/core/nii_wrapper.py
index c5aa88c..93423e2 100755
--- a/TPTBox/core/nii_wrapper.py
+++ b/TPTBox/core/nii_wrapper.py
@@ -43,6 +43,7 @@
     np_filter_connected_components,
     np_get_connected_components_center_of_mass,
     np_is_empty,
+    np_isin,
     np_map_labels,
     np_map_labels_based_on_majority_label_mask_overlap,
     np_point_coordinates,
@@ -2685,9 +2686,8 @@ def extract_label(self,label:int|Enum|Sequence[int]|Sequence[Enum]|None, keep_la
         seg_arr = self.get_seg_array()
 
         if isinstance(label, Sequence):
-            label_int:list[int] = [idx.value if isinstance(idx,Enum) else idx for idx in label]
-            assert 0 not in label_int, 'Zero label does not make sense. This is the background'
-            seg_arr = np_extract_label(seg_arr, label_int, to_label=1, inplace=True)
+            labels:int|list[int] = [idx.value if isinstance(idx,Enum) else idx for idx in label]
+            assert 0 not in labels, 'Zero label does not make sense. This is the background'
         else:
             if isinstance(label,Enum):
                 label = label.value
@@ -2695,9 +2695,13 @@ def extract_label(self,label:int|Enum|Sequence[int]|Sequence[Enum]|None, keep_la
                 label = int(label)
 
             assert label != 0, 'Zero label does not make sense. This is the background'
-            seg_arr = np_extract_label(seg_arr, label, to_label=1, inplace=True)
+            labels = label
         if keep_label:
-            seg_arr = seg_arr * self.get_seg_array()
+            # keep the original label values where in `labels`, zero everywhere else.
+            # single get_seg_array() copy + one np_isin mask (faster than extract + a second copy/multiply)
+            seg_arr[~np_isin(seg_arr, labels)] = 0
+        else:
+            seg_arr = np_extract_label(seg_arr, labels, to_label=1, inplace=True)
         return self.set_array(seg_arr,inplace=inplace)
     def ravel(self,order:Literal["K", "A", "C", "F"] | None="C")->np.ndarray:
         """Return a contiguous flattened array.
diff --git a/TPTBox/tests/speedtests/speedtest_nii_extract_label_keep.py b/TPTBox/tests/speedtests/speedtest_nii_extract_label_keep.py
new file mode 100644
index 0000000..3740b9a
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_nii_extract_label_keep.py
@@ -0,0 +1,34 @@
+if __name__ == "__main__":
+    # speed test NII.extract_label(keep_label=True):
+    # current calls get_seg_array() twice (two full copies) + np_extract_label + multiply;
+    # optimized uses one copy + np_isin mask + masked write.
+    import random
+
+    import numpy as np
+
+    from TPTBox.core.np_utils import np_extract_label, np_isin
+    from TPTBox.tests.speedtests.speedtest import speed_test
+    from TPTBox.tests.test_utils import get_nii
+
+    extract_label = [2, 3, 4, 5]
+
+    def old_keep(nii):
+        # frozen baseline: two get_seg_array() copies + np_extract_label + multiply
+        seg_arr = nii.get_seg_array()
+        seg_arr = np_extract_label(seg_arr, extract_label, to_label=1, inplace=True)
+        seg_arr = seg_arr * nii.get_seg_array()
+        return seg_arr
+
+    def new_keep(nii):
+        seg_arr = nii.get_seg_array()
+        seg_arr[~np_isin(seg_arr, extract_label)] = 0
+        return seg_arr
+
+    def get_input():
+        nii, *_ = get_nii(x=(300, 300, 300), num_point=random.randint(5, 10))
+        return nii
+
+    print("\n=== NII.extract_label keep_label=True (300^3, labels [2,3,4,5]) ===")
+    speed_test(
+        repeats=50, get_input_func=get_input, functions=[old_keep, new_keep], assert_equal_function=lambda x, y: np.array_equal(x, y)
+    )

From 2f91110d25e36c9b259c31855a2a5f5c6cb9d4d5 Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 19:39:55 +0000
Subject: [PATCH 07/13] perf(nii): remove_labels uses a single np_map_labels
 gather

The per-label 'seg_arr[seg_arr == l] = 0' loop costs one full pass per removed
label (linear in label count). A single np_map_labels gather ({label: fill})
is constant-time: tied with the loop for a few labels, ~2.2x faster at 20
labels (sparse) and ~6x on dense masks. Enums are now resolved to .value like
extract_label does (the int path is unchanged). Verified equal across
scalar/list/nested labels and removed_to_label values.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/nii_wrapper.py                    | 10 ++--
 .../speedtests/speedtest_nii_remove_labels.py | 49 +++++++++++++++++++
 2 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_nii_remove_labels.py

diff --git a/TPTBox/core/nii_wrapper.py b/TPTBox/core/nii_wrapper.py
index 93423e2..cbe4f86 100755
--- a/TPTBox/core/nii_wrapper.py
+++ b/TPTBox/core/nii_wrapper.py
@@ -2723,15 +2723,17 @@ def extract_label_(self, label: int | Enum | Sequence[int] | Sequence[Enum], kee
     def remove_labels(self,label:int|Enum|Sequence[int]|Sequence[Enum], inplace=False, verbose:logging=True, removed_to_label=0) -> Self:
         """If this NII is a segmentation you can single out one label."""
         assert label != 0, 'Zero label does not make sens.  This is the background'
-        seg_arr = self.get_seg_array()
         if not isinstance(label,Sequence):
             label = [label] # type: ignore
+        flat: list[int] = []
         for l in label:
             if isinstance(l, list):
-                for g in l:
-                    seg_arr[seg_arr == g] = removed_to_label
+                flat.extend(g.value if isinstance(g, Enum) else g for g in l)
             else:
-                seg_arr[seg_arr == l] = removed_to_label
+                flat.append(l.value if isinstance(l, Enum) else l)
+        # one np_map_labels gather is constant-time in the number of labels (a per-label
+        # `seg_arr == l` loop costs one full pass per label).
+        seg_arr = np_map_labels(self.get_seg_array(), dict.fromkeys(flat, removed_to_label))
         return self.set_array(seg_arr,inplace=inplace, verbose=verbose)
     def remove_labels_(self, label: int | Enum | Sequence[int] | Sequence[Enum], removed_to_label=0, verbose: logging = True) -> Self:
         """In-place variant of `remove_labels`."""
diff --git a/TPTBox/tests/speedtests/speedtest_nii_remove_labels.py b/TPTBox/tests/speedtests/speedtest_nii_remove_labels.py
new file mode 100644
index 0000000..22226f0
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_nii_remove_labels.py
@@ -0,0 +1,49 @@
+if __name__ == "__main__":
+    # speed test NII.remove_labels: per-label `seg_arr[seg_arr == l] = 0` loop (N masked passes)
+    # vs a single np_map_labels gather ({label: removed_to_label}).
+    import random
+
+    import numpy as np
+
+    from TPTBox.core.np_utils import np_isin, np_map_labels
+    from TPTBox.tests.speedtests.speedtest import speed_test
+    from TPTBox.tests.test_utils import get_nii
+
+    def old_remove(arr, remove):
+        arr = arr.copy()
+        for l in remove:
+            arr[arr == l] = 0
+        return arr
+
+    def new_remove_map(arr, remove):
+        return np_map_labels(arr, dict.fromkeys(remove, 0))
+
+    def new_remove_isin(arr, remove):
+        arr = arr.copy()
+        arr[np_isin(arr, remove)] = 0
+        return arr
+
+    functions = [old_remove, new_remove_map, new_remove_isin]
+    eq = lambda x, y: np.array_equal(x, y)  # noqa: E731
+
+    # realistic: get_nii blocks (sparse foreground)
+    def get_sparse(n_remove):
+        def _f():
+            nii, *_ = get_nii(x=(300, 300, 300), num_point=random.randint(n_remove + 2, n_remove + 14))
+            return (nii.get_seg_array().astype(np.uint16), list(range(2, 2 + n_remove)))
+
+        return _f
+
+    for n_remove in (6, 20):
+        print(f"\n=== NII.remove_labels (300^3 get_nii blocks, remove {n_remove} labels, uint16) ===")
+        speed_test(repeats=40, get_input_func=get_sparse(n_remove), functions=functions, assert_equal_function=eq)
+
+    # dense: many abundant labels
+    def get_dense(n_remove):
+        def _f():
+            return (np.random.randint(0, 60, size=(200, 200, 200)).astype(np.uint16), list(range(2, 2 + n_remove)))
+
+        return _f
+
+    print("\n=== NII.remove_labels (200^3 dense randint<60, remove 20 labels, uint16) ===")
+    speed_test(repeats=40, get_input_func=get_dense(20), functions=functions, assert_equal_function=eq)

From 7331e99bbf5fe09985c60e25d96a39b3711d18a0 Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 19:44:39 +0000
Subject: [PATCH 08/13] perf(nii): map_labels skips the before/after np_unique
 scans when not verbose

The two np_unique full-array scans only feed the verbose log line. Guarding
them behind 'verbose' makes the common in-loop verbose=False path ~5x faster
on a 300^3 mask. The verbose=True output is unchanged; the returned data is
identical either way.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/nii_wrapper.py                    | 22 ++++++------
 .../speedtests/speedtest_nii_map_labels.py    | 36 +++++++++++++++++++
 2 files changed, 48 insertions(+), 10 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_nii_map_labels.py

diff --git a/TPTBox/core/nii_wrapper.py b/TPTBox/core/nii_wrapper.py
index cbe4f86..420f814 100755
--- a/TPTBox/core/nii_wrapper.py
+++ b/TPTBox/core/nii_wrapper.py
@@ -2254,7 +2254,8 @@ def map_labels(self, label_map:LABEL_MAP , verbose:logging=True, inplace=False)
             If inplace is True, returns the current NIfTI image object with mapped labels. Otherwise, returns a new NIfTI image object with mapped labels.
         """
         data_orig = self.get_seg_array()
-        labels_before = [v for v in np_unique(data_orig) if v > 0]
+        # the before/after np_unique scans are only used for the verbose log line; skip them otherwise
+        labels_before = [v for v in np_unique(data_orig) if v > 0] if verbose else None
         # enforce keys to be str to support both str and int
         label_map_ = {
             (v_name2idx[k] if k in v_name2idx else int(k)): (
@@ -2264,15 +2265,16 @@ def map_labels(self, label_map:LABEL_MAP , verbose:logging=True, inplace=False)
         }
         log.print("label_map_ =", label_map_, verbose=verbose)
         data = np_map_labels(data_orig, label_map_)
-        labels_after = [v for v in np_unique(data) if v > 0]
-        log.print(
-                "N =",
-                len(label_map_),
-                "labels reassigned, before labels: ",
-                labels_before,
-                " after: ",
-                labels_after,verbose=verbose
-            )
+        if verbose:
+            labels_after = [v for v in np_unique(data) if v > 0]
+            log.print(
+                    "N =",
+                    len(label_map_),
+                    "labels reassigned, before labels: ",
+                    labels_before,
+                    " after: ",
+                    labels_after,verbose=verbose
+                )
         nii = data.astype(np.uint16), self.affine, self.header
         if inplace:
             self.nii = nii
diff --git a/TPTBox/tests/speedtests/speedtest_nii_map_labels.py b/TPTBox/tests/speedtests/speedtest_nii_map_labels.py
new file mode 100644
index 0000000..508a5f5
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_nii_map_labels.py
@@ -0,0 +1,36 @@
+if __name__ == "__main__":
+    # speed test NII.map_labels with verbose=False:
+    # current computes np_unique BEFORE and AFTER mapping purely to feed log.print(verbose=verbose),
+    # i.e. two wasted full-array scans when verbose=False. Optimized skips them.
+    import random
+
+    import numpy as np
+
+    from TPTBox.core.np_utils import np_map_labels, np_unique
+    from TPTBox.tests.speedtests.speedtest import speed_test
+    from TPTBox.tests.test_utils import get_nii
+
+    label_map_ = {1: 10, 2: 11, 3: 12, 4: 13, 5: 14}
+
+    def old_map_verbose_false(arr):
+        # mirrors current map_labels internals with verbose=False (still scans!)
+        labels_before = [v for v in np_unique(arr) if v > 0]  # noqa: F841
+        data = np_map_labels(arr, label_map_)
+        labels_after = [v for v in np_unique(data) if v > 0]  # noqa: F841
+        return data.astype(np.uint16)
+
+    def new_map_verbose_false(arr):
+        data = np_map_labels(arr, label_map_)
+        return data.astype(np.uint16)
+
+    def get_input():
+        nii, *_ = get_nii(x=(300, 300, 300), num_point=random.randint(5, 10))
+        return nii.get_seg_array().astype(np.uint8)
+
+    print("\n=== NII.map_labels verbose=False (300^3): skip the two logging np_unique scans ===")
+    speed_test(
+        repeats=50,
+        get_input_func=get_input,
+        functions=[old_map_verbose_false, new_map_verbose_false],
+        assert_equal_function=lambda x, y: np.array_equal(x, y),
+    )

From c2129e969f0b5690b9b95693719a54161de87c9c Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 19:51:49 +0000
Subject: [PATCH 09/13] perf(nii): truncate_labels_beyond_reference_ builds
 masks via np_isin

It called extract_label(...).get_seg_array() twice (each a full round-trip:
copy + np_extract_label + NII construction + copy) just to get two binary
masks. Both masks now come directly from the single get_array() via np_isin.
~2.15x faster on a 300^3 mask; output identical (verified across
idx/not_beyond/axis/inclusion).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/nii_wrapper.py                    |  7 ++--
 .../speedtest_nii_truncate_masks.py           | 38 +++++++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_nii_truncate_masks.py

diff --git a/TPTBox/core/nii_wrapper.py b/TPTBox/core/nii_wrapper.py
index 420f814..de809e8 100755
--- a/TPTBox/core/nii_wrapper.py
+++ b/TPTBox/core/nii_wrapper.py
@@ -2093,10 +2093,11 @@ def truncate_labels_beyond_reference_(
         flip = self.orientation[axis_] != axis  # Check orientation for flipping
         # Get the array data
         np_array = self.get_array()
-        np_array_cond = self.extract_label(idx).get_seg_array()
+        # both masks come directly from np_array via np_isin (avoids two extract_label round-trips)
+        np_array_cond = np_isin(np_array, idx)
 
         # Find the lowest point (smallest index) along the axis where `not_above` exists
-        threshold = np.where(self.extract_label(not_beyond).get_seg_array() == 1)
+        threshold = np.where(np_isin(np_array, not_beyond))
         if len(threshold[axis_]) == 0:
             return self if inplace else self.copy()
         flip_up = flip
@@ -2116,7 +2117,7 @@ def truncate_labels_beyond_reference_(
         mask = np.broadcast_to(mask, self.shape)
 
         # Replace values of `idx` with `fill` in the masked region
-        np_array = np.where((np_array_cond == 1) & mask, fill, np_array)
+        np_array = np.where(np_array_cond & mask, fill, np_array)
 
         # Update the NIfTI object with the modified array
         return self.set_array(np_array, inplace=inplace)
diff --git a/TPTBox/tests/speedtests/speedtest_nii_truncate_masks.py b/TPTBox/tests/speedtests/speedtest_nii_truncate_masks.py
new file mode 100644
index 0000000..c03129c
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_nii_truncate_masks.py
@@ -0,0 +1,38 @@
+if __name__ == "__main__":
+    # speed test the mask extraction inside truncate_labels_beyond_reference_:
+    # current does two full extract_label(...).get_seg_array() round-trips; the two binary masks
+    # can be obtained directly from one get_array() via np_isin.
+    import random
+
+    import numpy as np
+
+    from TPTBox.core.np_utils import np_isin
+    from TPTBox.tests.speedtests.speedtest import speed_test
+    from TPTBox.tests.test_utils import get_nii
+
+    idx = 1
+    not_beyond = 2
+
+    def old_masks(nii):
+        nii.get_array()
+        cond = nii.extract_label(idx).get_seg_array() == 1
+        nb = nii.extract_label(not_beyond).get_seg_array() == 1
+        return (cond, nb)
+
+    def new_masks(nii):
+        np_array = nii.get_array()
+        cond = np_isin(np_array, idx)
+        nb = np_isin(np_array, not_beyond)
+        return (cond, nb)
+
+    def get_input():
+        nii, *_ = get_nii(x=(300, 300, 300), num_point=random.randint(5, 10))
+        return nii
+
+    print("\n=== truncate_labels_beyond_reference_ mask extraction (300^3) ===")
+    speed_test(
+        repeats=50,
+        get_input_func=get_input,
+        functions=[old_masks, new_masks],
+        assert_equal_function=lambda x, y: np.array_equal(x[0], y[0]) and np.array_equal(x[1], y[1]),
+    )

From 6c8a1bdb43025f3aedb6cb1cdb49453c5d010366 Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 20:25:05 +0000
Subject: [PATCH 10/13] perf(poi): calc_centroids computes all centroids in one
 np_center_of_mass pass

The default _crop path looped over every label doing extract_label(i) +
compute_crop + scipy center_of_mass. np_center_of_mass (cc3d) returns every
label's centroid in a single pass. ~5x faster at 8-16 labels and ~9x at
20-40 labels; output bit-identical (verified 379/379 points exact to the
rounded decimal). The non-_crop fallback is unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/poi.py                            | 32 ++++++++--------
 .../speedtest_poi_calc_centroids.py           | 38 +++++++++++++++++++
 2 files changed, 53 insertions(+), 17 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_poi_calc_centroids.py

diff --git a/TPTBox/core/poi.py b/TPTBox/core/poi.py
index 6a3d53a..f30b7c9 100755
--- a/TPTBox/core/poi.py
+++ b/TPTBox/core/poi.py
@@ -21,6 +21,7 @@
 from TPTBox.core.compat import zip_strict
 from TPTBox.core.nii_poi_abstract import Has_Grid
 from TPTBox.core.nii_wrapper import NII, Image_Reference, to_nii, to_nii_optional
+from TPTBox.core.np_utils import np_center_of_mass
 from TPTBox.core.poi_fun import save_load
 from TPTBox.core.poi_fun.poi_abstract import Abstract_POI, POI_Descriptor
 from TPTBox.core.vert_constants import (
@@ -1272,30 +1273,27 @@ def calc_centroids(
             extend_to = extend_to.copy()
         ctd_list = extend_to.centroids
         extend_to.assert_affine(msk_nii, shape_tolerance=1, origin_tolerance=1)
-    u = msk_nii.unique()
-    if bar:
-        from tqdm import tqdm
-
-        u = tqdm(u)
-    for i in u:
-        if _crop:
-            # TODO test implementation and remove old
-            m = msk_nii.extract_label(i)
-            crop = m.compute_crop()
-            m2: NII = m[crop]
-            ctr_mass: Sequence[float] = center_of_mass(m2.get_seg_array())  # type: ignore
-            out_coord = tuple(round(x + crop.start, decimals) for x, crop in zip(ctr_mass, crop))
-        else:
+    if _crop:
+        # all per-label centroids in a single cc3d pass (bit-identical to the per-label
+        # extract_label + crop + scipy center_of_mass loop, but ~5-9x faster)
+        coords = [(int(i), tuple(round(float(x), decimals) for x in c)) for i, c in np_center_of_mass(msk_data).items()]
+    else:
+        coords = []
+        for i in msk_nii.unique():
             # OLD
             msk_temp = np.zeros(msk_data.shape, dtype=bool)
             msk_temp[msk_data == i] = True
             ctr_mass: Sequence[float] = center_of_mass(msk_temp)  # type: ignore
-            out_coord = tuple(round(x, decimals) for x in ctr_mass)
+            coords.append((int(i), tuple(round(x, decimals) for x in ctr_mass)))
+    if bar:
+        from tqdm import tqdm
 
+        coords = tqdm(coords)
+    for i, out_coord in coords:
         if second_stage == -1:
-            ctd_list[first_stage, int(i)] = out_coord
+            ctd_list[first_stage, i] = out_coord
         else:
-            ctd_list[int(i), second_stage] = out_coord
+            ctd_list[i, second_stage] = out_coord
     return POI(ctd_list, **msk_nii._extract_affine(), **args)
 
 
diff --git a/TPTBox/tests/speedtests/speedtest_poi_calc_centroids.py b/TPTBox/tests/speedtests/speedtest_poi_calc_centroids.py
new file mode 100644
index 0000000..1929508
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_poi_calc_centroids.py
@@ -0,0 +1,38 @@
+if __name__ == "__main__":
+    # speed test calc_centroids core: per-label extract_label + crop + scipy center_of_mass loop
+    # vs a single np_center_of_mass (cc3d) call that returns every label's centroid at once.
+    import random
+
+    import numpy as np
+    from scipy.ndimage import center_of_mass
+
+    from TPTBox.core.np_utils import np_center_of_mass
+    from TPTBox.tests.speedtests.speedtest import speed_test
+    from TPTBox.tests.test_utils import get_nii
+
+    def old_centroids(nii):
+        out = {}
+        for i in nii.unique():
+            m = nii.extract_label(i)
+            crop = m.compute_crop()
+            m2 = m[crop]
+            cm = center_of_mass(m2.get_seg_array())
+            out[int(i)] = tuple(round(x + c.start, 3) for x, c in zip(cm, crop))
+        return out
+
+    def new_centroids(nii):
+        coms = np_center_of_mass(nii.get_seg_array())
+        return {int(i): tuple(round(float(x), 3) for x in c) for i, c in coms.items()}
+
+    eq = lambda x, y: x == y  # noqa: E731
+
+    def make(size, npts):
+        def _f():
+            nii, *_ = get_nii(x=(size, size, size), num_point=random.randint(npts[0], npts[1]))
+            return nii
+
+        return _f
+
+    for size, npts in [(200, (8, 16)), (150, (20, 40))]:
+        print(f"\n=== calc_centroids ({size}^3, {npts[0]}-{npts[1]} labels) ===")
+        speed_test(repeats=30, get_input_func=make(size, npts), functions=[old_centroids, new_centroids], assert_equal_function=eq)

From 5f4a819f082385ac11b0ce73502179c73f97cc34 Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Wed, 10 Jun 2026 20:31:02 +0000
Subject: [PATCH 11/13] perf(poi): batch the global<->local affine conversions

POI_Global construction (to_global) and to_other applied the affine transform
one point at a time in a Python loop. Added vectorized local_to_global_arr
(POI) and global_to_local_arr (Has_Grid) that transform an (N,3) array in a
single matmul, and use them in those loops. ~7-8x faster (100-400 points);
output bit-identical (verified vs per-point, with/without itk_coords). to_other
keeps the per-point path when verbose=True to preserve its logging.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/nii_poi_abstract.py               |  9 +++++
 TPTBox/core/poi.py                            | 13 +++++++
 TPTBox/core/poi_fun/poi_global.py             | 36 +++++++++++++-----
 .../speedtests/speedtest_poi_to_global.py     | 37 +++++++++++++++++++
 4 files changed, 86 insertions(+), 9 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_poi_to_global.py

diff --git a/TPTBox/core/nii_poi_abstract.py b/TPTBox/core/nii_poi_abstract.py
index 0205dcd..2c3ca6b 100755
--- a/TPTBox/core/nii_poi_abstract.py
+++ b/TPTBox/core/nii_poi_abstract.py
@@ -482,6 +482,15 @@ def global_to_local(self, x: COORDINATE) -> tuple:
         a = self.rotation.T @ (np.array(x) - self.origin) / np.array(self.zoom)
         return tuple(round(float(v), 7) for v in a)
 
+    def global_to_local_arr(self, coords: np.ndarray) -> np.ndarray:
+        """Vectorized :meth:`global_to_local` for an ``(N, 3)`` array of world coordinates.
+
+        Equivalent to applying ``global_to_local`` to each row but in a single batched
+        inverse-affine matmul.
+        """
+        a = (np.asarray(coords, dtype=float) - np.asarray(self.origin)) @ np.asarray(self.rotation) / np.asarray(self.zoom)
+        return np.round(a, 7)
+
     def local_to_global(self, x: COORDINATE) -> tuple:
         """Convert voxel (local) coordinates to world (RAS/LPS) coordinates.
 
diff --git a/TPTBox/core/poi.py b/TPTBox/core/poi.py
index f30b7c9..8126175 100755
--- a/TPTBox/core/poi.py
+++ b/TPTBox/core/poi.py
@@ -260,6 +260,19 @@ def local_to_global(self, x: COORDINATE, itk_coords=False) -> COORDINATE:
         # return tuple(a.tolist())
         return tuple(round(float(v), ROUNDING_LVL) for v in a)
 
+    def local_to_global_arr(self, coords: np.ndarray, itk_coords=False) -> np.ndarray:
+        """Vectorized :meth:`local_to_global` for an ``(N, 3)`` array of coordinates.
+
+        Equivalent to applying ``local_to_global`` to each row but in a single batched
+        affine matmul (much faster for many points).
+        """
+        assert self.zoom is not None and self.rotation is not None and self.origin is not None
+        a = (np.asarray(coords, dtype=float) * np.asarray(self.zoom)) @ np.asarray(self.rotation).T + np.asarray(self.origin)
+        if itk_coords:
+            a[:, 0] *= -1
+            a[:, 1] *= -1
+        return np.round(a, ROUNDING_LVL)
+
     def global_to_local(self, x: COORDINATE) -> COORDINATE:
         """Converts global coordinates to local coordinates using zoom, rotation, and origin.
 
diff --git a/TPTBox/core/poi_fun/poi_global.py b/TPTBox/core/poi_fun/poi_global.py
index 4c1ba8d..0ff8304 100755
--- a/TPTBox/core/poi_fun/poi_global.py
+++ b/TPTBox/core/poi_fun/poi_global.py
@@ -3,6 +3,8 @@
 from copy import deepcopy
 from pathlib import Path
 
+import numpy as np
+
 ###### GLOBAL POI #####
 from typing_extensions import Self
 
@@ -50,8 +52,12 @@ def __init__(
         elif isinstance(input_poi, poi.POI):
             local_poi = input_poi.copy()
             global_points = poi.POI_Descriptor(definition=local_poi.centroids.definition)
-            for k1, k2, v in local_poi.items():
-                global_points[k1:k2] = local_poi.local_to_global(v, itk_coords)
+            items = list(local_poi.items())
+            if items:
+                # one batched affine matmul instead of a per-point local_to_global loop
+                arr = local_poi.local_to_global_arr(np.asarray([v for _, _, v in items]), itk_coords)
+                for (k1, k2, _), row in zip(items, arr.tolist()):
+                    global_points[k1:k2] = tuple(row)
             info = input_poi.info.copy()
             _format = input_poi.format
         else:
@@ -178,14 +184,26 @@ def to_other(self, msk: Has_Grid, verbose=False) -> poi.POI:
             poi.POI: The converted POI.
         """
         out = poi.POI_Descriptor(definition=self._get_centroids().definition)
-        for k1, k2, v in self.items():
+        items = list(self.items())
+        if items and not verbose:
+            # one batched inverse-affine matmul instead of a per-point global_to_local loop
+            arr = np.asarray([v for _, _, v in items], dtype=float)
             if self.itk_coords:
-                assert len(v) == 3, "n-d vec not implemented for n != 3"
-                v = (-v[0], -v[1], v[2])  # noqa: PLW2901
-            v_out = msk.global_to_local(v)
-            if verbose:
-                log.print(v, "-->", v_out)
-            out[k1, k2] = tuple(v_out)
+                assert arr.shape[1] == 3, "n-d vec not implemented for n != 3"
+                arr[:, 0] *= -1
+                arr[:, 1] *= -1
+            arr = msk.global_to_local_arr(arr)
+            for (k1, k2, _), row in zip(items, arr.tolist()):
+                out[k1, k2] = tuple(row)
+        else:
+            for k1, k2, v in items:
+                if self.itk_coords:
+                    assert len(v) == 3, "n-d vec not implemented for n != 3"
+                    v = (-v[0], -v[1], v[2])  # noqa: PLW2901
+                v_out = msk.global_to_local(v)
+                if verbose:
+                    log.print(v, "-->", v_out)
+                out[k1, k2] = tuple(v_out)
 
         return poi.POI(centroids=out, **msk._extract_affine(), info=self.info, format=self.format)
 
diff --git a/TPTBox/tests/speedtests/speedtest_poi_to_global.py b/TPTBox/tests/speedtests/speedtest_poi_to_global.py
new file mode 100644
index 0000000..c93b8f6
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_poi_to_global.py
@@ -0,0 +1,37 @@
+if __name__ == "__main__":
+    # speed test POI global coordinate conversion: per-point local_to_global / global_to_local
+    # loops vs a single batched (N,3) affine matmul.
+    import nibabel as nib
+    import numpy as np
+
+    from TPTBox.core.nii_wrapper import NII
+    from TPTBox.core.poi import calc_centroids
+    from TPTBox.core.vert_constants import ROUNDING_LVL
+    from TPTBox.tests.speedtests.speedtest import speed_test
+
+    def make_poi(n_labels):
+        def _f():
+            arr = np.random.randint(0, n_labels, size=(110, 110, 110)).astype(np.uint16)
+            nii = NII(nib.Nifti1Image(arr, affine=np.diag([1.5, 1.5, 2.0, 1.0])), seg=True)
+            return calc_centroids(nii, second_stage=50)
+
+        return _f
+
+    def old_to_global(poi):
+        out = {}
+        for k1, k2, v in poi.items():
+            out[(k1, k2)] = poi.local_to_global(v, False)
+        return out
+
+    def new_to_global(poi):
+        items = list(poi.items())
+        arr = np.asarray([v for _, _, v in items])
+        a = (arr * np.asarray(poi.zoom)) @ np.asarray(poi.rotation).T + np.asarray(poi.origin)
+        a = np.round(a, ROUNDING_LVL)
+        return {(k1, k2): tuple(row) for (k1, k2, _), row in zip(items, a.tolist())}
+
+    eq = lambda x, y: x == y  # noqa: E731
+
+    for n_labels in (100, 400):
+        print(f"\n=== POI.to_global affine ({n_labels} points) ===")
+        speed_test(repeats=30, get_input_func=make_poi(n_labels), functions=[old_to_global, new_to_global], assert_equal_function=eq)

From 00357c17cf3f6ea682696aff12957ccc51ab4448 Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Thu, 11 Jun 2026 15:27:53 +0000
Subject: [PATCH 12/13] perf(bids): Searchquery filter/filter_non_existence are
 O(n) not O(n^2)

flatten-mode filtering did candidates.copy() then list.remove() per dropped
file (each remove is O(n) -> O(n^2) overall). Replaced with a single list
comprehension; ~48x faster filtering 2000 candidates. The dict-mode branches
likewise drop the throwaway dict copy()+pop() for a dict comprehension. Output
identical (verified across flatten/dict x keys for both filter methods). The
comprehension also removes by identity, avoiding list.remove's first-equal
removal quirk.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/core/bids_files.py                     | 28 ++++++------
 .../tests/speedtests/speedtest_bids_filter.py | 44 +++++++++++++++++++
 2 files changed, 58 insertions(+), 14 deletions(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_bids_filter.py

diff --git a/TPTBox/core/bids_files.py b/TPTBox/core/bids_files.py
index f459b95..f384991 100755
--- a/TPTBox/core/bids_files.py
+++ b/TPTBox/core/bids_files.py
@@ -1750,15 +1750,15 @@ def filter(
         """
         if self._flatten:
             assert isinstance(self.candidates, list)
-            for bids_file in self.candidates.copy():
-                if not bids_file.do_filter(key, filter_fun, required=required):
-                    self.candidates.remove(bids_file)
+            # list comprehension is O(n); the old copy()+list.remove() loop was O(n^2)
+            self.candidates = [f for f in self.candidates if f.do_filter(key, filter_fun, required=required)]
         else:
             assert isinstance(self.candidates, dict)
-            for sequences, bids_files in self.candidates.copy().items():
-                # print(sequences, list(bids_file.do_filter(key, filter_fun, required=required) for bids_file in bids_files))
-                if not any(bids_file.do_filter(key, filter_fun, required=required) for bids_file in bids_files):
-                    self.candidates.pop(sequences)
+            self.candidates = {
+                seq: bids_files
+                for seq, bids_files in self.candidates.items()
+                if any(f.do_filter(key, filter_fun, required=required) for f in bids_files)
+            }
 
     def filter_format(self, filter_fun: list[str] | str | typing.Callable[[str | object], bool]) -> None:
         """Keep only files whose format label satisfies *filter_fun*.
@@ -1807,15 +1807,15 @@ def filter_non_existence(
         """
         if self._flatten:
             assert isinstance(self.candidates, list)
-            for bids_file in self.candidates.copy():
-                if bids_file.do_filter(key, filter_fun, required=required):
-                    self.candidates.remove(bids_file)
+            # list comprehension is O(n); the old copy()+list.remove() loop was O(n^2)
+            self.candidates = [f for f in self.candidates if not f.do_filter(key, filter_fun, required=required)]
         else:
             assert isinstance(self.candidates, dict)
-            for sequences, bids_files in self.candidates.copy().items():
-                # print(sequences, list(bids_file.do_filter(key, filter_fun, required=required) for bids_file in bids_files))
-                if any(bids_file.do_filter(key, filter_fun, required=required) for bids_file in bids_files):
-                    self.candidates.pop(sequences)
+            self.candidates = {
+                seq: bids_files
+                for seq, bids_files in self.candidates.items()
+                if not any(f.do_filter(key, filter_fun, required=required) for f in bids_files)
+            }
 
     def filter_dixon_only_inphase(self) -> None:
         """Remove Dixon files that are fat, water, out-of-phase, or difference images.
diff --git a/TPTBox/tests/speedtests/speedtest_bids_filter.py b/TPTBox/tests/speedtests/speedtest_bids_filter.py
new file mode 100644
index 0000000..224cb00
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_bids_filter.py
@@ -0,0 +1,44 @@
+if __name__ == "__main__":
+    # speed test Searchquery.filter (flatten mode): candidates.copy() + list.remove (O(n^2))
+    # vs a single list comprehension (O(n)). Built on real BIDS_FILE candidates.
+    import tempfile
+    from pathlib import Path
+
+    from TPTBox.core.bids_files import BIDS_FILE
+    from TPTBox.tests.speedtests.speedtest import speed_test
+
+    # build N real BIDS_FILE candidates once (touches a tempdir; not part of the timed comparison)
+    tmp = Path(tempfile.mkdtemp())
+    formats = ["T1w", "T2w", "ct", "msk", "dixon"]
+    N = 2000
+    CANDIDATES = []
+    for i in range(N):
+        fmt = formats[i % len(formats)]
+        sub = f"sub-{i:04d}"
+        p = tmp / sub / f"{sub}_ses-01_sequ-{i}_{fmt}.nii.gz"
+        p.parent.mkdir(parents=True, exist_ok=True)
+        p.touch()
+        CANDIDATES.append(BIDS_FILE(p, tmp, verbose=False))
+
+    keep = ["T1w"]  # keep 1/5, so the old loop removes ~80% of candidates (worst case for list.remove)
+
+    def old_filter(cands):
+        cands = list(cands)
+        for bids_file in cands.copy():
+            if not bids_file.do_filter("format", keep, required=True):
+                cands.remove(bids_file)
+        return cands
+
+    def new_filter(cands):
+        return [bids_file for bids_file in cands if bids_file.do_filter("format", keep, required=True)]
+
+    def get_input():
+        return (CANDIDATES,)  # 1-tuple so the harness passes the list as a single argument
+
+    print(f"\n=== Searchquery.filter flatten mode ({N} candidates, keep {keep}) ===")
+    speed_test(
+        repeats=12,
+        get_input_func=get_input,
+        functions=[old_filter, new_filter],
+        assert_equal_function=lambda x, y: [f.BIDS_key for f in x] == [f.BIDS_key for f in y],
+    )

From 302e21636822ba49481bea3cafa9c91f3482309e Mon Sep 17 00:00:00 2001
From: iback <hendrik.moeller@tum.de>
Date: Thu, 11 Jun 2026 17:27:06 +0000
Subject: [PATCH 13/13] perf(mesh): html_preview reorients/rescales once
 instead of per label

_get_mesh called from_segmentation_nii(extract_label(u)) for every label, and
that reorients + rescales (resamples) the image each time. Reorient/rescale
commute with extract_label for nearest-neighbour segmentation resampling, so
the image is now transformed once before the loop. ~5x (12 labels) to ~7x (25
labels) faster on the transform; the per-label marching-cubes meshes are
bit-identical (verified arrays and mesh vertices for rescale_to_iso True/False).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TPTBox/mesh3D/html_preview.py                 |  8 +++-
 .../speedtest_mesh_preview_hoist.py           | 39 +++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 TPTBox/tests/speedtests/speedtest_mesh_preview_hoist.py

diff --git a/TPTBox/mesh3D/html_preview.py b/TPTBox/mesh3D/html_preview.py
index 195dcaa..0012334 100644
--- a/TPTBox/mesh3D/html_preview.py
+++ b/TPTBox/mesh3D/html_preview.py
@@ -78,9 +78,15 @@ def _get_mesh(
                 color = default_poi_nii if is_poi else default_color_nii
             yield mesh, color
         elif isinstance(img, NII):
+            # reorient/rescale the whole image ONCE instead of per label. They commute with
+            # extract_label for nearest-neighbour seg resampling, so each label's array (and
+            # therefore its marching-cubes mesh) is identical to the per-label version.
+            img = img.reorient()
+            if rescale_to_iso:
+                img = img.rescale()
             for u in img.unique():
                 color = get_color_by_label(u)
-                mesh = SegmentationMesh.from_segmentation_nii(img.extract_label(u), rescale_to_iso=rescale_to_iso)
+                mesh = SegmentationMesh(img.extract_label(u).get_seg_array())
                 if self.offset is not None:
                     mesh = mesh.get_mesh_with_offset(self.offset)
                 yield mesh, color
diff --git a/TPTBox/tests/speedtests/speedtest_mesh_preview_hoist.py b/TPTBox/tests/speedtests/speedtest_mesh_preview_hoist.py
new file mode 100644
index 0000000..86792ac
--- /dev/null
+++ b/TPTBox/tests/speedtests/speedtest_mesh_preview_hoist.py
@@ -0,0 +1,39 @@
+if __name__ == "__main__":
+    # speed test html_preview._get_mesh: reorient_()+rescale_() were done per label (inside
+    # from_segmentation_nii(extract_label(u))). Hoisting them out of the loop reorients/rescales
+    # the image once. reorient/rescale commute with extract_label for nearest-neighbour seg
+    # resampling, so each label's array (and thus its marching-cubes mesh) is unchanged.
+    import nibabel as nib
+    import numpy as np
+
+    from TPTBox.core.nii_wrapper import NII
+    from TPTBox.tests.speedtests.speedtest import speed_test
+
+    def old_per_label(nii):
+        out = {}
+        for u in nii.unique():
+            m = nii.extract_label(u)  # NII
+            m.reorient_()
+            m.rescale_()
+            out[u] = m.get_seg_array()
+        return out
+
+    def new_hoisted(nii):
+        base = nii.reorient()
+        base = base.rescale()
+        return {u: base.extract_label(u).get_seg_array() for u in base.unique()}
+
+    def eq(x, y):
+        return set(x) == set(y) and all(np.array_equal(x[k], y[k]) for k in x)
+
+    def make(n_labels, spacing_z):
+        def _f():
+            aff = np.diag([1.0, 1.0, float(spacing_z), 1.0])
+            arr = np.random.randint(0, n_labels, size=(96, 98, 44)).astype(np.uint16)
+            return NII(nib.Nifti1Image(arr, aff), seg=True)
+
+        return _f
+
+    for n_labels, sz in [(12, 3.0), (25, 2.0)]:
+        print(f"\n=== html_preview per-label reorient+rescale ({n_labels} labels, z-spacing {sz}) ===")
+        speed_test(repeats=10, get_input_func=make(n_labels, sz), functions=[old_per_label, new_hoisted], assert_equal_function=eq)