openproblems-bio · dariarom94 · Jun 22, 2026 · Jun 22, 2026
diff --git a/src/datasets/loaders/bruker_cosmx/config.vsh.yaml b/src/datasets/loaders/bruker_cosmx/config.vsh.yaml
@@ -4,11 +4,11 @@ namespace: datasets/loaders
 argument_groups:
   - name: Inputs
     arguments:
-      - type: string
+      - type: file
         name: --input_raw
         example: "https://smi-public.objects.liquidweb.services/HalfBrain.zip"
         description: "Download file url for the raw data"
-      - type: string
+      - type: file
         name: --input_flat_files
         example: "https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip"
         description: "Download file url for the flat files"
@@ -67,9 +67,6 @@ engines:
       - type: python 
         pypi:
           - sopa
-      #NOTE: Changeed to sopa releases since work of https://github.com/gustaveroussy/sopa/issues/285 is merged and released
-      #- type: python
-      #  github: [gustaveroussy/sopa@cosmx_labels]
   - type: native
 
 runners:

diff --git a/src/datasets/loaders/bruker_cosmx/script.py b/src/datasets/loaders/bruker_cosmx/script.py
@@ -88,8 +88,10 @@
 
 ## VIASH START
 par = {
-    "input_raw": "https://smi-public.objects.liquidweb.services/HalfBrain.zip",
-    "input_flat_files": "https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip",
+    # downloaded from https://smi-public.objects.liquidweb.services/HalfBrain.zip
+    "input_raw": "temp/datasets/bruker_cosmx/HalfBrain.zip",
+    # downloaded from https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip
+    "input_flat_files": "temp/datasets/bruker_cosmx/Half Brain simple files .zip",
     "segmentation_id": ["cell"],
     "output": "output.zarr",
     "dataset_id": "bruker_cosmx/bruker_mouse_brain_cosmx/rep1",
@@ -101,109 +103,104 @@
     "dataset_organism": "human",
 }
 meta = {
-    #"temp_dir": "./temp/datasets/bruker_cosmx",
-    "temp_dir": "/Volumes/Sandisk2TB/G3_temp/bruker_cosmx/test_folder"
+    "temp_dir": "./temp/datasets/bruker_cosmx/temp",
 }
-
 ## VIASH END
 
 assert ("cell" in par["segmentation_id"]) and (len(par["segmentation_id"]) == 1), "Currently cell labels are definitely assigned in this script. And cosmx does not provide other segmentations."
 
 t0 = datetime.now()
 
+def log(msg):
+    print(datetime.now() - t0, msg, flush=True)
+
 # Define temp dir and file names
 TMP_DIR = Path(meta["temp_dir"] or "/tmp")
 TMP_DIR.mkdir(parents=True, exist_ok=True)
-FILE_NAME_RAW = TMP_DIR / par["input_raw"].split("/")[-1]
-
-if par["input_flat_files"] is not None:
-    FILE_NAME_FLAT = TMP_DIR / par["input_flat_files"].split("/")[-1].replace("%20", " ")
 
-# Download raw files
-print(datetime.now() - t0, "Download raw files", flush=True)
-os.system(f"wget {par['input_raw']} -O '{FILE_NAME_RAW}'")
+def extract_zip(input_zip: Path, output_dir: Path, strip_root: bool = False):
+    """
+    Extracts the input zip file to the specified output directory.
 
-# Set DATA_DIR
-extracted_dir_name = zipfile.ZipFile(FILE_NAME_RAW).infolist()[0].filename.split("/")[0]
-DATA_DIR = TMP_DIR / extracted_dir_name / "CellStatsDir"
-
-# Extract zip files
-print(datetime.now() - t0, "Extract zip of raw files", flush=True)
-with zipfile.ZipFile(FILE_NAME_RAW, 'r') as zip_ref:
-    zip_ref.extractall(TMP_DIR)
-
-# Print files and folders in TMP_DIR
-print(datetime.now() - t0, f"Files and folders in TMP_DIR ({TMP_DIR})", flush=True)
-print(os.listdir(TMP_DIR))
-if DATA_DIR.parent.exists():
-    print(datetime.now() - t0, f"Files and folders in {DATA_DIR.parent}", flush=True)
-    print(os.listdir(DATA_DIR.parent))
-else:
-    print(datetime.now() - t0, f"{DATA_DIR.parent} does not exist", flush=True)
-if DATA_DIR.exists():
-    print(datetime.now() - t0, f"Files and folders in {DATA_DIR}", flush=True)
-    print(os.listdir(DATA_DIR))
-else:
-    print(datetime.now() - t0, f"{DATA_DIR} does not exist", flush=True)
-
-# Download and extract flat files if they are not already present
-FLAT_FILES_ENDINGS = ["_exprMat_file.csv", "_fov_positions_file.csv", "_metadata_file.csv", "_tx_file.csv"] #, "polygons.csv"]
-flat_files_count = 0
-for ending in FLAT_FILES_ENDINGS:
-    if any(f.endswith(ending) for f in os.listdir(DATA_DIR)):
-        print(f"Flat file with ending {ending} already present in extracted raw files", flush=True)
-        flat_files_count += 1
-
-if flat_files_count == len(FLAT_FILES_ENDINGS):
-    print(datetime.now() - t0, "All flat files already present in extracted raw files", flush=True)
-else:
-    print(datetime.now() - t0, "Download and extract flat files", flush=True)
-    os.system(f"wget {par['input_flat_files']} -O '{FILE_NAME_FLAT}'")
-
-    with zipfile.ZipFile(FILE_NAME_FLAT, 'r') as zip_ref:
-        zip_ref.extractall(TMP_DIR)
-
-    print(datetime.now() - t0, f"Move flat files to {DATA_DIR}", flush=True)
-    source_dir = FILE_NAME_FLAT.parent / FILE_NAME_FLAT.stem
+    Arguments:
 
-    file_names = os.listdir(source_dir)
-    for file_name in file_names:
-        if not (DATA_DIR / file_name).exists():
-            shutil.move(source_dir / file_name, DATA_DIR)
-        else:
-            print(datetime.now() - t0, f"File {file_name} already present in {DATA_DIR}", flush=True)
+    - input_zip: Path to the input zip file.
+    - output_dir: Path to the directory where the contents of the zip file will be extracted
+    - strip_root: If True, and if all entries in the zip file share exactly one top-level directory, then this top-level directory will be stripped when extracting. 
+    """
+    output_dir = Path(output_dir)
+    with zipfile.ZipFile(input_zip, 'r') as zip_ref:
+        members = zip_ref.infolist()
+
+        # only strip when all entries share exactly one top-level directory
+        roots = {Path(m.filename).parts[0] for m in members if m.filename.strip("/") and not m.filename.startswith("__MACOSX/")}
+        if not (strip_root and len(roots) == 1):
+            zip_ref.extractall(output_dir)
+            return
+
+        for member in members:
+            if member.filename.startswith("__MACOSX/"):
+                continue  # skip macOS resource-fork entries, don't recreate them
+            parts = Path(member.filename).parts[1:]
+            if not parts:
+                continue  # the wrapper directory entry itself
+            target = output_dir.joinpath(*parts)
+            if member.is_dir():
+                target.mkdir(parents=True, exist_ok=True)
+            else:
+                target.parent.mkdir(parents=True, exist_ok=True)
+                with zip_ref.open(member) as src, open(target, "wb") as dst:
+                    shutil.copyfileobj(src, dst)
 
+# Extract zip files
+log("Extract zip of raw files")
+INPUT_RAW_EXTRACTED = TMP_DIR / "input_raw"
+extract_zip(par["input_raw"], INPUT_RAW_EXTRACTED, strip_root=True)
+
+log(f"Files and folders in input_raw_extracted ({INPUT_RAW_EXTRACTED})")
+print(os.listdir(INPUT_RAW_EXTRACTED))
+
+log("Detect CellStatsDir directory inside extracted raw files")
+DATA_DIR = INPUT_RAW_EXTRACTED / "CellStatsDir"
+assert DATA_DIR.is_dir(), (
+    f"Expected CellStatsDir at {DATA_DIR}, but it does not exist. "
+    f"Contents of {INPUT_RAW_EXTRACTED}: {os.listdir(INPUT_RAW_EXTRACTED)}"
+)
 
-# Move CellLabels_FXXX.tif files to CellLabels folder if they are not already present
+log("Extract zip of flat files")
+INPUT_FLAT_FILES_EXTRACTED = TMP_DIR / "input_flat_files"
+extract_zip(par["input_flat_files"], INPUT_FLAT_FILES_EXTRACTED, strip_root=True)
+
+log("Symlink csvs from flat files to data dir")
+for path in INPUT_FLAT_FILES_EXTRACTED.glob("*.csv"):
+    target = DATA_DIR / path.name
+    if not target.exists():
+        log(f"Symlink file {path.name} to {DATA_DIR}")
+        os.symlink(path.resolve(), target)
+    else:
+        log(f"File {path.name} already present in {DATA_DIR}")
+
+# sopa expects a CellLabels/ folder, but some CosMx exports only ship the per-FOV
+# label tifs (inside FOV* folders). When that's the case, gather them into a
+# CellLabels/ folder. See https://github.com/gustaveroussy/sopa/issues/285
 labels_dir = DATA_DIR / "CellLabels"
+fov_label_tifs = sorted(DATA_DIR.glob("FOV*/CellLabels_F*.tif"))
 
-if not labels_dir.exists():
-    print(datetime.now() - t0, "Create CellLabels folder with CellLabels tif", flush=True)
-    # Create CellLabels folder with CellLabels tif (somehow this folder name is expected and this is not always present)
-    # see e.g. late discussion in https://github.com/gustaveroussy/sopa/issues/285
-
+if fov_label_tifs:
+    log(f"Symlink {len(fov_label_tifs)} per-FOV CellLabels tifs into {labels_dir}")
     labels_dir.mkdir(parents=True, exist_ok=True)
-
-    # Get all folders in data_dir that start with "FOV" and move the CellLabels_FXXX.tif file to the CellLabels folder
-    print(datetime.now() - t0, "Move CellLabels_FXXX.tif files to CellLabels folder", flush=True)
-    for fov_dir in DATA_DIR.glob("FOV*"):
-        fov_id = str(fov_dir)[-3:]
-        shutil.copy(fov_dir / f"CellLabels_F{fov_id}.tif", labels_dir / f"CellLabels_F{fov_id}.tif")
+    for tif in fov_label_tifs:
+        os.symlink(tif.resolve(), labels_dir / tif.name)
+elif labels_dir.is_dir():
+    log("CellLabels folder already present in raw data")
 else:
-    print(datetime.now() - t0, "CellLabels folder already present", flush=True)
-
-
+    raise AssertionError(f"No CellLabels folder and no per-FOV CellLabels tifs found in {DATA_DIR}")
 
 #########################################
 # Convert raw files to spatialdata zarr #
 #########################################
 
-
-#from pathlib import Path
-#import sopa
-#data_dir = Path("/Volumes/Sandisk2TB/G3_temp/bruker_cosmx/HalfBrain/CellStatsDir")
-
-print(datetime.now() - t0, "Convert raw files to spatialdata zarr", flush=True)
+log("Convert raw files to spatialdata zarr")
 
 sdata = sopa.io.cosmx(
     DATA_DIR, 
@@ -220,8 +217,7 @@
 ###############
 # Rename keys #
 ###############
-print(datetime.now() - t0, "Rename keys", flush=True)
-
+log("Rename keys")
 elements_renaming_map = {
     "stitched_image"     : "morphology_mip", 
     "stitched_labels"    : "cell_labels",
@@ -234,15 +230,14 @@
     sdata[new_key] = sdata[old_key]
     del sdata[old_key]
 
-# Rename transcript column (somehow overwriting the 'target' column leads to an error, so instead we add a duplicate with the right name)
-#sdata['transcripts'] = sdata['transcripts'].rename(columns={"global_cell_id":"cell_id", "target":"feature_name"})
-sdata['transcripts'] = sdata['transcripts'].rename(columns={"global_cell_id":"cell_id"})
+# Add transcript columns with the names expected downstream.
+sdata['transcripts']["cell_id"] = sdata['transcripts']["global_cell_id"]
 sdata['transcripts']["feature_name"] = sdata['transcripts']["target"]
 
 #########################################
 # Throw out all channels except of DAPI #
 ######################################### NOTE: We assume the "DNA" stain is comparable to DAPI.
-print(datetime.now() - t0, "Throw out all channels except of 'DNA' (DAPI?)", flush=True)
+log("Throw out all channels except of 'DNA' (DAPI?)")
 
 # TODO: in the future we want to keep PolyT and Cellbound1/2/3 stains. Note however, that somehow saving or plotting the sdata fails when
 #       these channels aren't excluded, not sure why...
@@ -252,17 +247,15 @@
 ##############################
 # Add info to metadata table #
 ##############################
-print(datetime.now() - t0, "Add metadata to table", flush=True)
-
-#TODO: values as input variables
+log("Add metadata to table")
 for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism", "segmentation_id"]:
     sdata["table"].uns[key] = par[key]
 
 #########
 # Write #
 #########
-print(datetime.now() - t0, f"Writing to {par['output']}", flush=True)
+log(f"Writing to {par['output']}")
 
 sdata.write(par["output"])
 
-print(datetime.now() - t0, "Done", flush=True)
+log(f"Done in {datetime.now() - t0}")
diff --git a/src/datasets/workflows/process_bruker_cosmx/config.vsh.yaml b/src/datasets/workflows/process_bruker_cosmx/config.vsh.yaml
@@ -4,11 +4,11 @@ namespace: datasets/workflows
 argument_groups:
   - name: Inputs
     arguments:
-      - type: string
+      - type: file
         name: --input_raw
         example: "https://smi-public.objects.liquidweb.services/HalfBrain.zip"
         description: "Download file url for the raw data"
-      - type: string
+      - type: file
         name: --input_flat_files
         example: "https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip"
         description: "Download file url for the flat files"