diff --git a/Dockerfile b/Dockerfile index 0f72d953..35707167 100644 --- a/Dockerfile +++ b/Dockerfile @@ -104,9 +104,12 @@ RUN make -j4 pyopenms WORKDIR /openms-build/pyOpenMS RUN pip install dist/*.whl -# Install other dependencies (excluding pyopenms) -COPY requirements.txt ./requirements.txt -RUN grep -Ev '^pyopenms([=<>!~].*)?$' requirements.txt > requirements_cleaned.txt && mv requirements_cleaned.txt requirements.txt +# Install other dependencies (excluding pyopenms and openms-insight). +# openms-insight is installed from source in the run-app stage instead: the +# pinned PyPI release does not yet carry the new FLASHApp visualization +# components, so we build our branch (with its Vue bundle) below. +COPY requirements.txt ./requirements.txt +RUN grep -Ev '^(pyopenms|openms-insight)([=<>!~].*)?$' requirements.txt > requirements_cleaned.txt && mv requirements_cleaned.txt requirements.txt RUN pip install -r requirements.txt WORKDIR / @@ -144,6 +147,31 @@ WORKDIR /openms-streamlit-vue-component RUN npm install RUN npm run build +# Build the OpenMS-Insight Vue bundle and stage its source tree so the Python +# package can be installed (with the bundle baked in) in the run-app stage. +# Like js-build, kept after the slow OpenMS compile so component changes don't +# invalidate that cache. +FROM node:21 AS oi-build + +# OpenMS-Insight repo/branch providing the new visualization components. +# Defaults to the migration branch because those components are not yet in the +# published PyPI release. Override via --build-arg once it is merged/released. +ARG OPENMS_INSIGHT_REPO=https://github.com/t0mdavid-m/OpenMS-Insight.git +ARG OPENMS_INSIGHT_BRANCH=claude/flashapp-openms-visualizations-LVv66 + +# Bust the build cache whenever the branch head moves. 
+ADD https://api.github.com/repos/t0mdavid-m/OpenMS-Insight/git/refs/heads/$OPENMS_INSIGHT_BRANCH oi-version.json + +RUN git clone -b ${OPENMS_INSIGHT_BRANCH} --single-branch ${OPENMS_INSIGHT_REPO} /openms-insight +WORKDIR /openms-insight/js-component +RUN npm install +RUN npm run build +# The runtime bridge and the wheel packaging both expect the built bundle at +# openms_insight/js-component/dist; place it there so the `pip install` in the +# run-app stage bundles it into site-packages. node_modules is removed last so the run-app COPY of /openms-insight does not carry build-only npm packages into the final image. +RUN mkdir -p /openms-insight/openms_insight/js-component \ + && cp -r /openms-insight/js-component/dist /openms-insight/openms_insight/js-component/dist && rm -rf /openms-insight/js-component/node_modules + # Prepare and run streamlit app. FROM compile-openms AS run-app @@ -190,6 +218,14 @@ COPY presets.json /app/presets.json # Copy the pre-built Vue/JS component (built in the js-build stage above). COPY --from=js-build openms-streamlit-vue-component/dist /app/js-component/dist +# Install OpenMS-Insight (our branch build, with its freshly built Vue bundle) +# into the streamlit env. requirements.txt has openms-insight stripped, so this +# is the authoritative install and provides the new visualization components. +# The package ships its Vue bundle at openms_insight/js-component/dist (staged +# in the oi-build stage), which the runtime bridge loads in production mode. 
+COPY --from=oi-build /openms-insight /opt/openms-insight +RUN mamba run -n streamlit-env pip install --no-cache-dir /opt/openms-insight + # add cron job to the crontab RUN echo "0 3 * * * /root/miniforge3/envs/streamlit-env/bin/python /app/clean-up-workspaces.py >> /app/clean-up-workspaces.log 2>&1" | crontab - diff --git a/Dockerfile.arm b/Dockerfile.arm index 9fe055ec..a9e2f010 100644 --- a/Dockerfile.arm +++ b/Dockerfile.arm @@ -99,9 +99,12 @@ RUN make -j4 pyopenms WORKDIR /openms-build/pyOpenMS RUN pip install dist/*.whl -# Install other dependencies (excluding pyopenms) -COPY requirements.txt ./requirements.txt -RUN grep -Ev '^pyopenms([=<>!~].*)?$' requirements.txt > requirements_cleaned.txt && mv requirements_cleaned.txt requirements.txt +# Install other dependencies (excluding pyopenms and openms-insight). openms-insight +# is installed from source in the run-app stage: the pinned PyPI release does not +# yet carry the new FLASHApp visualization components, so we build our branch +# (with its Vue bundle) below. +COPY requirements.txt ./requirements.txt +RUN grep -Ev '^(pyopenms|openms-insight)([=<>!~].*)?$' requirements.txt > requirements_cleaned.txt && mv requirements_cleaned.txt requirements.txt RUN pip install -r requirements.txt WORKDIR / @@ -139,6 +142,28 @@ WORKDIR /openms-streamlit-vue-component RUN npm install RUN npm run build +# Build the OpenMS-Insight Vue bundle and stage its source tree so the Python +# package can be installed (with the bundle baked in) in the run-app stage. +FROM node:21 AS oi-build + +# OpenMS-Insight repo/branch providing the new visualization components. Defaults +# to the migration branch (its components are not yet in the PyPI release). +# Override via --build-arg once merged/released. +ARG OPENMS_INSIGHT_REPO=https://github.com/t0mdavid-m/OpenMS-Insight.git +ARG OPENMS_INSIGHT_BRANCH=claude/flashapp-openms-visualizations-LVv66 + +# Bust the build cache whenever the branch head moves. 
+ADD https://api.github.com/repos/t0mdavid-m/OpenMS-Insight/git/refs/heads/$OPENMS_INSIGHT_BRANCH oi-version.json + +RUN git clone -b ${OPENMS_INSIGHT_BRANCH} --single-branch ${OPENMS_INSIGHT_REPO} /openms-insight +WORKDIR /openms-insight/js-component +RUN npm install +RUN npm run build +# The runtime bridge and wheel packaging expect the bundle at +# openms_insight/js-component/dist; place it there so pip install bundles it. node_modules is removed last so the run-app COPY of /openms-insight does not carry build-only npm packages into the final image. +RUN mkdir -p /openms-insight/openms_insight/js-component \ + && cp -r /openms-insight/js-component/dist /openms-insight/openms_insight/js-component/dist && rm -rf /openms-insight/js-component/node_modules + # Prepare and run streamlit app. FROM compile-openms AS run-app @@ -171,6 +196,14 @@ COPY presets.json /app/presets.json # Copy the pre-built Vue/JS component (built in the js-build stage above). COPY --from=js-build openms-streamlit-vue-component/dist /app/js-component/dist +# Install OpenMS-Insight (our branch build, with its freshly built Vue bundle) +# into the streamlit env. requirements.txt has openms-insight stripped, so this +# is the authoritative install providing the new visualization components. The +# package ships its Vue bundle at openms_insight/js-component/dist (staged in the +# oi-build stage), which the runtime bridge loads in production mode. 
+COPY --from=oi-build /openms-insight /opt/openms-insight +RUN mamba run -n streamlit-env pip install --no-cache-dir /opt/openms-insight + # add cron job to the crontab RUN echo "0 3 * * * /root/miniforge3/envs/streamlit-env/bin/python /app/clean-up-workspaces.py >> /app/clean-up-workspaces.log 2>&1" | crontab - diff --git a/content/FLASHDeconv/FLASHDeconvLayoutManager.py b/content/FLASHDeconv/FLASHDeconvLayoutManager.py index a2094d2b..0b1d1cc8 100644 --- a/content/FLASHDeconv/FLASHDeconvLayoutManager.py +++ b/content/FLASHDeconv/FLASHDeconvLayoutManager.py @@ -219,11 +219,13 @@ def handleSettingButtons(): def setSequenceView(): if get_sequence() is not None: + # Parity with the TnT layout: `internal_fragment_map` was dropped because + # neither the legacy grid nor the OI viewer renders it (it produces + # nothing). Only the sequence view is added on sequence submission. global COMPONENT_OPTIONS - COMPONENT_OPTIONS = COMPONENT_OPTIONS + ['Sequence view (Mass table needed)', - 'Internal fragment map (Mass table needed)'] + COMPONENT_OPTIONS = COMPONENT_OPTIONS + ['Sequence view (Mass table needed)'] global COMPONENT_NAMES - COMPONENT_NAMES = COMPONENT_NAMES + ['sequence_view', 'internal_fragment_map'] + COMPONENT_NAMES = COMPONENT_NAMES + ['sequence_view'] # page initialization diff --git a/content/FLASHDeconv/FLASHDeconvViewer.py b/content/FLASHDeconv/FLASHDeconvViewer.py index 4097e32d..55b4276c 100644 --- a/content/FLASHDeconv/FLASHDeconvViewer.py +++ b/content/FLASHDeconv/FLASHDeconvViewer.py @@ -4,7 +4,45 @@ from src.common.common import page_setup, save_params from src.workflow.FileManager import FileManager +# Legacy bespoke-grid render path (kept importable until OI integration is verified). from src.render.render import render_grid +# The OpenMS-Insight viewer (Stage B) is imported lazily inside render_panel (see +# below) so an import failure (e.g. a missing openms-insight install) falls back +# to the legacy grid instead of breaking the whole page. 
+ + +def _use_oi_viewer(): + return st.session_state.get("settings", {}).get( + "use_openms_insight_viewer", True + ) + + +def render_panel(experiment_id, layout_info_per_exp, file_manager, identifier, + grid_key, panel_index): + """Render one experiment panel via the configured viewer. + + Routes to the new OpenMS-Insight viewer when enabled, else the legacy grid. + The OI viewer is imported lazily and guarded so an import failure falls back + to the legacy grid rather than breaking the page. + """ + if _use_oi_viewer(): + try: + from content.FLASHDeconv.FLASHDeconvViewerOI import ( + render_experiment_panel, + ) + except Exception as exc: # noqa: BLE001 - OI viewer unavailable + st.warning( + f"OpenMS-Insight viewer unavailable ({exc}); using legacy grid." + ) + else: + render_experiment_panel( + experiment_id, layout_info_per_exp, file_manager, panel_index + ) + return + render_grid( + experiment_id, layout_info_per_exp, file_manager, + 'flashdeconv', identifier, grid_key + ) DEFAULT_LAYOUT = [['ms1_deconv_heat_map'], ['scan_table', 'mass_table'], ['anno_spectrum', 'deconv_spectrum'], ['3D_SN_plot']] @@ -84,9 +122,9 @@ def get_sequence(): on_change=select_experiment ) if 'selected_experiment0' in st.session_state: - render_grid( - st.session_state.selected_experiment0, layout[0], file_manager, - 'flashdeconv', "selected_experiment0", 'flash_viewer_grid_0' + render_panel( + st.session_state.selected_experiment0, layout[0], file_manager, + "selected_experiment0", 'flash_viewer_grid_0', panel_index=0 ) with c2: st.selectbox( @@ -97,10 +135,10 @@ def get_sequence(): ) if f"selected_experiment1" in st.session_state: with st.spinner('Loading component...'): - render_grid( - st.session_state["selected_experiment1"], layout[1], - file_manager, 'flashdeconv', 'selected_experiment1', - 'flash_viewer_grid_1' + render_panel( + st.session_state["selected_experiment1"], layout[1], + file_manager, 'selected_experiment1', + 'flash_viewer_grid_1', panel_index=1 ) else: @@ -114,9 
+152,9 @@ def get_sequence(): if 'selected_experiment0' in st.session_state: - render_grid( - st.session_state.selected_experiment0, layout[0], file_manager, - 'flashdeconv', 'selected_experiment0' + render_panel( + st.session_state.selected_experiment0, layout[0], file_manager, + 'selected_experiment0', 'flash_viewer_grid', panel_index=0 ) ### for multiple experiments on one view @@ -135,11 +173,11 @@ def get_sequence(): ) # if #experiment input files are less than #layouts, all the pre-selection will be the first experiment if f"selected_experiment{exp_index}" in st.session_state: - render_grid( - st.session_state["selected_experiment%d" % exp_index], - layout[exp_index], file_manager, 'flashdeconv', - "selected_experiment%d" % exp_index, - 'flash_viewer_grid_%d' % exp_index + render_panel( + st.session_state["selected_experiment%d" % exp_index], + layout[exp_index], file_manager, + "selected_experiment%d" % exp_index, + 'flash_viewer_grid_%d' % exp_index, panel_index=exp_index ) save_params(params) diff --git a/content/FLASHDeconv/FLASHDeconvViewerOI.py b/content/FLASHDeconv/FLASHDeconvViewerOI.py new file mode 100644 index 00000000..a9c0ecc1 --- /dev/null +++ b/content/FLASHDeconv/FLASHDeconvViewerOI.py @@ -0,0 +1,546 @@ +"""FLASHDeconv viewer rendered entirely with OpenMS-Insight components (Stage B). + +This is the NEW viewer for the FLASHApp -> OpenMS-Insight visualization migration. +It renders the FLASHDeconv workflow using the reusable ``openms_insight`` component +library (``Table``, ``LinePlot``, ``Heatmap``, ``Scatter3D``, ``DensityPlot``, +``SequenceView``) instead of the bespoke ``flash_viewer_grid`` Vue grid in +``src/render/*``. + +Design goals (see ``/home/user/parity/STRATEGY.md`` §4/§5): + +* ONE shared ``StateManager`` per rendered experiment panel, keyed by a DISTINCT + ``session_key`` (``svc_state_deconv_{experiment_id}_{panel_index}``) so that selections never + leak between side-by-side experiment panels (HARD edge #6). 
+* Layout parity: the ``[experiment][row][col]`` nested grid is reproduced with + ``st.columns`` per row (<=3 cols), rows stacked; multi-experiment side-by-side + uses a top-level ``st.columns`` (<=5 panels). +* The component->frame->filters/interactivity wiring exactly mirrors the schema + from the long-format parse producers in ``src/parse/deconv.py``. + +The OLD render path (``src/render/render.py`` / ``flash_viewer_grid``) is left +intact and importable; the page chooses which path to use. + +NOTE ON CACHES: every OpenMS-Insight component persists a preprocessed cache under +``{cache_path}/{cache_id}/``. We derive a per-experiment cache directory inside the +workspace so the caches live next to the FLASHApp parquet cache and are stable +across reruns. ``cache_id`` is suffixed with the experiment id to keep experiments +isolated on disk as well as in session state. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import List, Optional + +import polars as pl +import streamlit as st + +from openms_insight import ( + DensityPlot, + Heatmap, + LinePlot, + Scatter3D, + SequenceView, + StateManager, + Table, +) + +from content.FLASHDeconv.deconv_sequence import bake_fixed_modifications + +# Map the layout COMPONENT_NAMES (FLASHDeconvLayoutManager) to a builder. Every +# builder returns a *callable* OpenMS-Insight component already wired with the +# shared filters/interactivity identifiers. The identifiers below are the FLASHApp +# StateTracker keys (scanIndex / massIndex / heatmap zoom ids) so that state flows +# across components exactly like the legacy grid. + +SCAN_KEY = "scanIndex" +MASS_KEY = "massIndex" +# Receives the user-entered sequence from the SequenceView "Change sequence" +# dialog (Vue `sequence_out` interactivity sentinel). Mirrors the legacy +# `sequenceOut` selection consumed by src/render/update.py:get_sequence. 
+SEQ_OUT_KEY = "sequenceOut" + +# Curated column definitions mirroring the LEGACY Vue tables (titles / order / +# field selection). The OI Table's ``_get_columns_to_select`` projects to ONLY the +# fields named here (plus index / interactivity / filter columns), so any internal +# frame column not listed is hidden -- the visual-parity goal. + +# TabulatorScanTable.vue columns -> scan_table fields. Legacy "Index" (id) maps to +# the frame's `index` (row position == scan index). +_SCAN_COLUMN_DEFINITIONS = [ + {"title": "Index", "field": "index", "sorter": "number"}, + {"title": "Scan Number", "field": "Scan", "sorter": "number"}, + {"title": "MS Level", "field": "MSLevel", "sorter": "number"}, + {"title": "Retention time", "field": "RT", "sorter": "number", + "formatter": "fixed", "formatterParams": {"precision": 4}}, + {"title": "Precursor Mass", "field": "PrecursorMass", "sorter": "number", + "formatter": "fixed", "formatterParams": {"precision": 4}}, + {"title": "#Masses", "field": "#Masses", "sorter": "number"}, +] + +# TabulatorMassTable.vue columns -> mass_table_long fields. Legacy "Index" (id) maps +# to the long frame's `mass_id` (0-based mass position within the scan). 
+_MASS_COLUMN_DEFINITIONS = [ + {"title": "Index", "field": "mass_id", "sorter": "number"}, + {"title": "Monoisotopic mass", "field": "MonoMass", "sorter": "number", + "formatter": "fixed", "formatterParams": {"precision": 4}}, + {"title": "Sum intensity", "field": "SumIntensity", "sorter": "number", + "formatter": "fixed", "formatterParams": {"precision": 4}}, + {"title": "Min charge", "field": "MinCharges", "sorter": "number"}, + {"title": "Max charge", "field": "MaxCharges", "sorter": "number"}, + {"title": "Min isotope", "field": "MinIsotopes", "sorter": "number"}, + {"title": "Max isotope", "field": "MaxIsotopes", "sorter": "number"}, + {"title": "Cosine score", "field": "CosineScore", "sorter": "number", + "formatter": "fixed", "formatterParams": {"precision": 4}}, + {"title": "SNR", "field": "SNR", "sorter": "number", + "formatter": "fixed", "formatterParams": {"precision": 4}}, + {"title": "QScore", "field": "QScore", "sorter": "number", + "formatter": "fixed", "formatterParams": {"precision": 4}}, +] + + +def _component_cache_dir(file_manager, experiment_id: str) -> str: + """Directory under the workspace cache where OI component caches are written.""" + cache_root = Path(file_manager.cache_path, "oi_components", str(experiment_id)) + cache_root.mkdir(parents=True, exist_ok=True) + return str(cache_root) + + +def _lazy(file_manager, experiment_id: str, name_tag: str) -> Optional[pl.LazyFrame]: + """Load a stored frame as a polars LazyFrame, or None if absent.""" + if not file_manager.result_exists(experiment_id, name_tag): + return None + return file_manager.get_results( + experiment_id, [name_tag], use_polars=True + )[name_tag] + + +# --------------------------------------------------------------------------- +# Per-component builders. Each returns an OpenMS-Insight component instance, or +# None when the underlying data frame is missing (component is silently skipped). 
+# --------------------------------------------------------------------------- + +def _build_heatmap( + file_manager, experiment_id: str, cache_dir: str, frame_tag: str, + zoom_id: str, title: str, +): + data = _lazy(file_manager, experiment_id, frame_tag) + if data is None: + return None + # Long heatmap frames carry columns: mass, rt, intensity, scan_idx, mass_idx. + # Axes per Heatmap.md: x = Retention Time (rt), y = Mass (mass). + return Heatmap( + cache_id=f"{frame_tag}_{experiment_id}", + data=data, + x_column="rt", + y_column="mass", + intensity_column="intensity", + zoom_identifier=zoom_id, + title=title, + x_label="Retention Time", + y_label="Mass", + cache_path=cache_dir, + ) + + +def _build_scan_table(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "scan_table") + if data is None: + return None + # Scan table: clicking a row sets scanIndex to the row's `index`. + return Table( + cache_id=f"scan_table_{experiment_id}", + data=data, + interactivity={SCAN_KEY: "index"}, + index_field="index", + column_definitions=_SCAN_COLUMN_DEFINITIONS, + go_to_fields=["index", "Scan"], + title="Scan Table", + cache_path=cache_dir, + ) + + +def _build_mass_table(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "mass_table_long") + if data is None: + return None + # Mass table (long): filtered to the selected scan via `index`; clicking a row + # sets massIndex to the row's `mass_id`. 
+ return Table( + cache_id=f"mass_table_{experiment_id}", + data=data, + filters={SCAN_KEY: "index"}, + interactivity={MASS_KEY: "mass_id"}, + index_field="mass_id", + column_definitions=_MASS_COLUMN_DEFINITIONS, + go_to_fields=["mass_id"], + title="Mass Table", + cache_path=cache_dir, + ) + + +def _build_deconv_spectrum(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "deconv_spectrum_long") + if data is None: + return None + # Deconvolved spectrum: filtered by scan; clicking a peak sets massIndex. + # The per-row signal_* list columns (emitted on deconv_spectrum_long by + # src/parse/deconv.py) drive the per-mass charge-state drill-down sub-view. + return LinePlot( + cache_id=f"deconv_spectrum_{experiment_id}", + data=data, + filters={SCAN_KEY: "index"}, + interactivity={MASS_KEY: "peak_id"}, + x_column="MonoMass", + y_column="SumIntensity", + signal_mz_column="signal_mzs", + signal_charge_column="signal_charges", + signal_intensity_column="signal_intensities", + title="Deconvolved Spectrum", + x_label="Monoisotopic Mass", + y_label="Intensity", + cache_path=cache_dir, + ) + + +def _build_anno_spectrum(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "anno_spectrum_long") + if data is None: + return None + # Annotated/raw spectrum: filtered by scan; consumer only (no interactivity). 
+ return LinePlot( + cache_id=f"anno_spectrum_{experiment_id}", + data=data, + filters={SCAN_KEY: "index"}, + x_column="MonoMass_Anno", + y_column="SumIntensity_Anno", + title="Annotated Spectrum", + x_label="m/z", + y_label="Intensity", + cache_path=cache_dir, + ) + + +def _build_combined_spectrum(file_manager, experiment_id: str, cache_dir: str): + # DEAD CODE in the OI Deconv viewer: the FLASHDeconv layout (see + # FLASHDeconvLayoutManager.COMPONENT_NAMES) exposes no "combined_spectrum" / + # augmented panel, and this builder is not registered in COMPONENT_BUILDERS, + # so it is never invoked. The "Augmented Deconvolved Spectrum" only exists in + # the LEGACY grid path (src/render/initialize.py). The real Deconv spectrum the + # OI layout renders is `deconv_spectrum` (_build_deconv_spectrum), which now + # carries the signal_* charge drill-down columns. Kept here for reference until + # an augmented panel is (if ever) added to the Deconv layout. + primary = _lazy(file_manager, experiment_id, "combined_spectrum_long") + if primary is None: + return None + anno = _lazy(file_manager, experiment_id, "anno_spectrum_long") + # Augmented/combined: primary deconv series + signal-peak markers, with the + # annotated overlay supplied as the second series. The LinePlot Vue reads the + # x2/y2 columns as INDEPENDENT column arrays (their own length), NOT row-aligned + # with the primary series. Because the deconv peak axis and the anno peak axis + # have different per-scan lengths, we must VERTICALLY STACK the two long frames + # (diagonal concat) rather than relationally join them (a join would multiply + # rows cartesian-style). After the scanIndex value-filter on `index`, the + # primary columns are populated on the deconv rows and the anno columns on the + # anno rows; each column array is then the correct length for its series. 
+ if anno is not None: + primary = pl.concat([primary, anno], how="diagonal") + x2, y2 = "MonoMass_Anno", "SumIntensity_Anno" + else: + x2 = y2 = None + return LinePlot( + cache_id=f"combined_spectrum_{experiment_id}", + data=primary, + filters={SCAN_KEY: "index"}, + interactivity={MASS_KEY: "peak_id"}, + x_column="MonoMass", + y_column="SumIntensity", + signal_peak_column="is_signal", + x2_column=x2, + y2_column=y2, + title="Augmented Deconvolved Spectrum", + x_label="Monoisotopic Mass", + y_label="Intensity", + cache_path=cache_dir, + ) + + +def _build_scatter3d(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "threedim_SN_plot") + if data is None: + return None + # MS2 precursor-signal lookup: locate the precursor scan's row + # (Scan == PrecursorScan) and the index into its MonoMass array whose value + # matches PrecursorMass. Fresh parses (src/parse/deconv.py) emit all four + # columns (7-col frame), but STALE/OLD ``threedim_SN_plot.pq`` caches only + # carry the 4 legacy columns (index, PrecursorScan, SignalPeaks, NoisyPeaks). + # Scatter3D._validate_mappings raises ValueError if any precursor column is + # configured-but-missing, and this builder runs OUTSIDE the page try/except, + # so we MUST schema-gate: pass the precursor params ONLY when ALL FOUR + # columns are present; otherwise fall back to the legacy per-scan behavior. + schema_names = data.collect_schema().names() + precursor_cols = ("Scan", "PrecursorScan", "PrecursorMass", "MonoMass") + precursor_kwargs = {} + if all(col in schema_names for col in precursor_cols): + precursor_kwargs = { + "scan_column": "Scan", + "precursor_scan_column": "PrecursorScan", + "precursor_mass_column": "PrecursorMass", + "mono_mass_column": "MonoMass", + } + # 3D S/N plot: scanIndex value-filters on `index`; massIndex handled internally + # as an array subscript (NOT a value filter). 
+ return Scatter3D( + cache_id=f"threedim_SN_plot_{experiment_id}", + data=data, + scan_filter="index", + signal_column="SignalPeaks", + noisy_column="NoisyPeaks", + title="Precursor Signals", + cache_path=cache_dir, + **precursor_kwargs, + ) + + +def _build_fdr_plot(file_manager, experiment_id: str, cache_dir: str): + # Precomputed {x,y} density frames stored by deconv.py. The TnT/Deconv literals + # (axis "QScore", series "Target/Decoy QScores") are the DensityPlot defaults. + target = _lazy(file_manager, experiment_id, "density_target") + decoy = _lazy(file_manager, experiment_id, "density_decoy") + if target is None and decoy is None: + return None + return DensityPlot( + cache_id=f"fdr_plot_{experiment_id}", + density_target=target, + density_decoy=decoy, + title="Score Distribution", + cache_path=cache_dir, + ) + + +def _selected_precursor_mass(file_manager, experiment_id: str, state_manager): + """Observed PrecursorMass of the currently-selected scan, or None. + + Mirrors the legacy "Precursor" mass header (src/render/update.py get_sequence + / per-scan data), which reads ``PrecursorMass`` from the selected scan. The + selected scan is the ``scanIndex`` selection (== the scan_table ``index``); + we look its ``PrecursorMass`` up in the scan_table frame. Returns None when no + scan is selected, the table/column is absent, or the value is 0.0 (the legacy + sentinel for "scan not eligible for this view", which renders an empty header). 
+ """ + if state_manager is None: + return None + selected_index = state_manager.get_selection(SCAN_KEY) + if selected_index is None: + return None + scan_table = _lazy(file_manager, experiment_id, "scan_table") + if scan_table is None: + return None + names = scan_table.collect_schema().names() + if "index" not in names or "PrecursorMass" not in names: + return None + try: + row = ( + scan_table.filter(pl.col("index") == selected_index) + .select("PrecursorMass") + .collect() + ) + except Exception: + return None + if row.height == 0: + return None + value = row["PrecursorMass"][0] + if value is None: + return None + try: + value = float(value) + except (TypeError, ValueError): + return None + if value == 0.0: + return None + return value + + +def _get_sequence(file_manager): + """Return the submitted (sequence, fix_C, fix_M) tuple, or None.""" + if not file_manager.result_exists("sequence", "sequence"): + return None + sequence = file_manager.get_results("sequence", "sequence")["sequence"] + return ( + sequence["input_sequence"], + sequence["fixed_mod_cysteine"], + sequence["fixed_mod_methionine"], + ) + + +def _build_sequence_view( + file_manager, experiment_id: str, cache_dir: str, state_manager=None +): + seq = _get_sequence(file_manager) + if seq is None: + return None + submitted_sequence, fix_c, fix_m = seq + + # Prefer a sequence the user entered via the SequenceView "Change sequence" + # dialog (Vue emits it into the `sequenceOut` selection through the + # `sequence_out` interactivity sentinel). Mirrors legacy + # src/render/update.py:get_sequence which prefers `sequenceOut`. The + # user-entered sequence is taken verbatim (no fixed-mod baking, matching the + # legacy path which returns it with no C/M mods). 
+ user_sequence = None + if state_manager is not None: + candidate = state_manager.get_selection(SEQ_OUT_KEY) + if isinstance(candidate, str) and len(candidate) > 0: + user_sequence = candidate + + if user_sequence is not None: + sequence_string = user_sequence + else: + # Bake the selected C/M fixed modifications into the sequence string so + # the theoretical fragment masses (computed by SequenceView via pyOpenMS + # from the literal string) reflect them -- parity with the legacy + # setFixedModification, which applied the mods BEFORE fragment-mass + # calculation. (compute_fixed_mods only marks residue types; it does NOT + # shift masses, so baking is required.) + sequence_string = bake_fixed_modifications(submitted_sequence, fix_c, fix_m) + + # Deconv peaks are neutral masses (deconvolved=True). Wire the deconv long + # spectrum as the peaks_data (renamed to the SequenceView schema: peak_id, + # mass, intensity), filtered by the selected scan. + peaks = _lazy(file_manager, experiment_id, "deconv_spectrum_long") + if peaks is None: + return None + peaks = peaks.select( + pl.col("index"), + pl.col("peak_id"), + pl.col("MonoMass").alias("mass"), + pl.col("SumIntensity").alias("intensity"), + ) + + # Mass header parity (legacy Deconv shows the "Precursor" header: + # Theoretical / Observed / Δ). We MUST NOT emit `computed_mass` here: in the + # OI SequenceView Vue, `displayTnT = (computedMass !== undefined)`, which would + # (a) mislabel the header "Proteoform" instead of "Precursor", and (b) force + # `disableVariableModifications = true`, silently disabling the variable/custom + # modification context menu that this path explicitly enables via + # `disable_variable_modifications=False`. So `computed_mass` stays dropped. + # + # Instead emit `precursor_mass` = the OBSERVED precursor mass of the selected + # scan (legacy reads `PrecursorMass` from the selected scan; see + # src/render/update.py get_sequence). 
When a scan is selected and its + # PrecursorMass is reachable in the scan_table, wire it so the "Precursor" + # header renders; otherwise omit it (header observed/Δ rows render empty) -- + # either way the variable-mod menu stays enabled. + precursor_mass = _selected_precursor_mass( + file_manager, experiment_id, state_manager + ) + if precursor_mass is not None: + sequence_data = pl.LazyFrame( + { + "sequence": [sequence_string], + "precursor_charge": [1], + "precursor_mass": [precursor_mass], + } + ) + else: + sequence_data = sequence_string + + return SequenceView( + cache_id=f"sequence_view_{experiment_id}", + sequence_data=sequence_data, + peaks_data=peaks, + filters={SCAN_KEY: "index"}, + # Click a fragment-table row -> set massIndex to the matched peak_id. + # The "Change sequence" dialog -> set sequenceOut to the entered sequence. + interactivity={MASS_KEY: "peak_id", SEQ_OUT_KEY: "sequence_out"}, + deconvolved=True, + compute_fixed_mods=True, + # Enable the variable / custom modification context menu on this + # submitted-sequence path (TnT path keeps it disabled). + disable_variable_modifications=False, + title="Sequence View", + cache_path=cache_dir, + ) + + +# COMPONENT_NAMES (layout) -> builder. Mirrors FLASHDeconvLayoutManager COMPONENT_NAMES. 
+COMPONENT_BUILDERS = { + "ms1_raw_heatmap": lambda fm, eid, cd: _build_heatmap( + fm, eid, cd, "ms1_raw_heatmap", "heatmap_raw", "Raw MS1 Heatmap"), + "ms2_raw_heatmap": lambda fm, eid, cd: _build_heatmap( + fm, eid, cd, "ms2_raw_heatmap", "heatmap_raw2", "Raw MS2 Heatmap"), + "ms1_deconv_heat_map": lambda fm, eid, cd: _build_heatmap( + fm, eid, cd, "ms1_deconv_heatmap", "heatmap_deconv", "Deconvolved MS1 Heatmap"), + "ms2_deconv_heat_map": lambda fm, eid, cd: _build_heatmap( + fm, eid, cd, "ms2_deconv_heatmap", "heatmap_deconv2", "Deconvolved MS2 Heatmap"), + "scan_table": _build_scan_table, + "deconv_spectrum": _build_deconv_spectrum, + "anno_spectrum": _build_anno_spectrum, + "mass_table": _build_mass_table, + "3D_SN_plot": _build_scatter3d, + "fdr_plot": _build_fdr_plot, + # sequence_view is built separately (needs the panel StateManager to consume + # the `sequenceOut` selection); see build_component. + # internal_fragment_map: deferred (component disabled in the legacy path too). +} + + +def build_component( + file_manager, experiment_id: str, cache_dir: str, comp_name: str, + state_manager=None, +): + """Instantiate the OpenMS-Insight component for a layout cell, or None.""" + if comp_name == "sequence_view": + # The SequenceView builder consumes the user-entered sequence from the + # panel StateManager (`sequenceOut`); the other builders are stateless. + return _build_sequence_view( + file_manager, experiment_id, cache_dir, state_manager=state_manager + ) + builder = COMPONENT_BUILDERS.get(comp_name) + if builder is None: + return None + return builder(file_manager, experiment_id, cache_dir) + + +def render_experiment_panel( + experiment_id: str, + layout_info_per_exp: List[List[str]], + file_manager, + panel_index: int, +): + """Render one experiment's [row][col] grid with its OWN isolated StateManager. + + The StateManager uses a DISTINCT session_key per experiment so selections made + in this panel do not leak into other side-by-side panels. 
+ """ + session_key = f"svc_state_deconv_{experiment_id}_{panel_index}" + state_manager = StateManager(session_key=session_key) + cache_dir = _component_cache_dir(file_manager, experiment_id) + + # When the selected scan changes, clear the mass selection so the mass table / + # 3D plot / spectrum highlight do not keep a stale mass from the prior scan + # (parity with TabulatorScanTable.vue:85-95, which clears the mass selection on + # a fresh scan-row click). We track the last-seen scanIndex per panel via a + # dedicated session_state key so the reset triggers once per change. + scan_seen_key = f"{session_key}__last_scan_index" + current_scan = state_manager.get_selection(SCAN_KEY) + last_scan = st.session_state.get(scan_seen_key) + if current_scan != last_scan: + state_manager.clear_selection(MASS_KEY) + st.session_state[scan_seen_key] = current_scan + + for row_index, row in enumerate(layout_info_per_exp): + columns = st.columns(len(row)) + for col, (col_index, comp_name) in zip(columns, enumerate(row)): + with col: + component = build_component( + file_manager, experiment_id, cache_dir, comp_name, + state_manager=state_manager, + ) + # A builder returns None when its optional backing frame is + # absent (e.g. no sequence submitted, or *_long not yet cached); + # skip silently rather than warning on every rerun. + if component is None: + continue + key = f"deconv_oi_{panel_index}_{row_index}_{col_index}_{comp_name}" + component(key=key, state_manager=state_manager) diff --git a/content/FLASHDeconv/deconv_sequence.py b/content/FLASHDeconv/deconv_sequence.py new file mode 100644 index 00000000..87c2fb77 --- /dev/null +++ b/content/FLASHDeconv/deconv_sequence.py @@ -0,0 +1,118 @@ +"""Helpers for the FLASHDeconv OpenMS-Insight SequenceView path. + +The FLASHDeconv "submitted sequence" path lets the user pick a fixed +modification on cysteine and/or methionine (``fixed_mod_cysteine`` / +``fixed_mod_methionine`` in ``src/render/sequence.py``). 
The legacy renderer +applied those via ``setFixedModification`` *before* computing theoretical +fragment masses, so the masses reflected the mods. + +The OpenMS-Insight ``SequenceView`` computes theoretical fragment masses from +the literal sequence string (``calculate_fragment_masses_pyopenms``), and its +``compute_fixed_mods`` flag only *marks* which residue types carry a mod (for +display) -- it does NOT shift the fragment masses. To get parity we therefore +BAKE the selected fixed mods into the sequence string (e.g. +``C(Carbamidomethyl)``) so pyOpenMS includes the mass shift in every fragment. + +Mapping the FLASHApp option label (e.g. ``'Carbamidomethyl (+57)'``) to an +OpenMS modification name is done by mass, mirroring ``setFixedModification``'s +``ModificationsDB().getBestModificationByDiffMonoMass`` lookup, so the baked +name is one ``AASequence.fromString`` accepts. +""" + +from __future__ import annotations + +from typing import Optional + +# Mass shifts for the selectable fixed modifications, mirroring +# ``src/render/sequence.py`` (``fixed_mod_cysteine`` / ``fixed_mod_methionine``). +# Duplicated here (rather than imported) so this helper does not pull in +# ``src/render/sequence.py``'s top-level ``pyopenms`` import at module load: that +# keeps the helper importable/testable when pyOpenMS is absent (the mass-based +# name resolution and theoretical-mass calc degrade gracefully below). +fixed_mod_cysteine = { + "No modification": 0, + "Carbamidomethyl (+57)": 57.021464, + "Carboxymethyl (+58)": 58.005479, + "Xlink:Disulfide (-1 per C)": -1.007825, +} +fixed_mod_methionine = { + "No modification": 0, + "L-methionine sulfoxide (+16)": 15.994915, + "L-methionine sulfone (+32)": 31.989829, +} + + +def _resolve_mod_name(diff_mass: float, residue: str) -> Optional[str]: + """Resolve an OpenMS modification id for a mass shift on ``residue``. + + Mirrors ``setFixedModification`` (``getBestModificationByDiffMonoMass``). 
+ Returns None if pyOpenMS is unavailable or no modification matches. + """ + if diff_mass == 0: + return None + try: + from pyopenms import ModificationsDB + except Exception: + return None + try: + mod = ModificationsDB().getBestModificationByDiffMonoMass( + diff_mass, 0.001, residue, 0 + ) + except Exception: + return None + if mod is None: + return None + try: + name = mod.getId() + except Exception: + return None + return name or None + + +def bake_fixed_modifications( + sequence: str, fix_c: Optional[str], fix_m: Optional[str] +) -> str: + """Return ``sequence`` with the chosen C/M fixed mods baked in as OpenMS mods. + + ``fix_c`` / ``fix_m`` are FLASHApp option labels (keys of + ``fixed_mod_cysteine`` / ``fixed_mod_methionine``); falsy / 'No modification' + leave that residue untouched. Unknown labels or a missing pyOpenMS leave the + sequence unchanged (graceful degradation; the static string still renders). + """ + if not sequence: + return sequence + + c_name = None + if fix_c and fix_c in fixed_mod_cysteine: + c_name = _resolve_mod_name(fixed_mod_cysteine[fix_c], "C") + m_name = None + if fix_m and fix_m in fixed_mod_methionine: + m_name = _resolve_mod_name(fixed_mod_methionine[fix_m], "M") + + if c_name is None and m_name is None: + return sequence + + out = [] + for aa in sequence: + out.append(aa) + if aa == "C" and c_name is not None: + out.append(f"({c_name})") + elif aa == "M" and m_name is not None: + out.append(f"({m_name})") + return "".join(out) + + +def theoretical_mass(sequence: str) -> Optional[float]: + """Monoisotopic mass of the (possibly modified) sequence, or None. + + Used to populate the SequenceView mass header (``computed_mass``). Returns + None when pyOpenMS is unavailable so the caller simply omits the column. 
+ """ + if not sequence: + return None + try: + from pyopenms import AASequence + + return AASequence.fromString(sequence).getMonoWeight() + except Exception: + return None diff --git a/content/FLASHQuant/FLASHQuantViewer.py b/content/FLASHQuant/FLASHQuantViewer.py index 05077e9f..0226ed34 100644 --- a/content/FLASHQuant/FLASHQuantViewer.py +++ b/content/FLASHQuant/FLASHQuantViewer.py @@ -4,17 +4,24 @@ from src.workflow.FileManager import FileManager from src.common.common import page_setup, save_params -# from src.render.components import flash_viewer_grid_component, FlashViewerComponent, FLASHQuant -from src.render.render import render_grid + +# NOTE (Stage D rewiring): FLASHQuant now renders through the reusable +# `openms_insight.FeatureView` component instead of the bespoke +# `flash_viewer_grid` / `FLASHQuantView` path. The old render path is left +# importable on purpose (do NOT delete) so it can be restored or compared. +# from src.render.components import flash_viewer_grid_component, FlashViewerComponent, FLASHQuant +# from src.render.render import render_grid +from openms_insight import FeatureView # page initialization params = page_setup() # Get available results +workspace = st.session_state["workspace"] file_manager = FileManager( - st.session_state["workspace"], - Path(st.session_state['workspace'], 'flashquant', 'cache') + workspace, + Path(workspace, 'flashquant', 'cache') ) results = file_manager.get_results_list( ['quant_dfs'] @@ -26,22 +33,29 @@ st.stop() # Map names to index -name_to_index = {n : i for i, n in enumerate(results)} +name_to_index = {n: i for i, n in enumerate(results)} -# for only single experiment on one view +# FLASHQuant is a single-experiment, single-component page (no cross-linking, +# no configurable grid). Pick one experiment and render one FeatureView for it. 
st.selectbox("choose experiment", results, key="selected_experiment0_quant") selected_exp0 = st.session_state.selected_experiment0_quant -render_grid( - st.session_state.selected_experiment0_quant, [['quant_visualization']], - file_manager, 'flashquant', 'selected_experiment0_quant' +# Load the parsed feature-group frame produced by src/parse/flashquant.py +# (`connectTraceWithResult`): the 12 scalar columns plus the per-feature-group +# array columns Charges / IsotopeIndices / CentroidMzs / RTs / MZs / Intensities, +# where RTs/MZs/Intensities elements are comma-joined point strings. FeatureView +# consumes this frame directly (no transformation needed). +quant_df = file_manager.get_results(selected_exp0, ['quant_dfs'])['quant_dfs'] + +# Cache id is per-experiment so switching experiments yields an independent, +# correctly-scoped cache and selection. The cache lives under the FLASHQuant +# workspace cache directory. +feature_view = FeatureView( + cache_id=f'flashquant_{selected_exp0}', + data=quant_df, + cache_path=str(Path(workspace, 'flashquant', 'cache', 'featureview')), ) - -# # Get data -# quant_df = file_manager.get_results(selected_exp0, 'quant_dfs')['quant_dfs'] - -# component = [[FlashViewerComponent(FLASHQuant())]] -# flash_viewer_grid_component(components=component, data={'quant_data': quant_df, 'dataset': selected_exp0}, component_key='flash_viewer_grid') +feature_view(key=f'flashquant_featureview_{selected_exp0}') save_params(params) diff --git a/content/FLASHTnT/FLASHTnTViewer.py b/content/FLASHTnT/FLASHTnTViewer.py index e94392f3..0d3d7079 100644 --- a/content/FLASHTnT/FLASHTnTViewer.py +++ b/content/FLASHTnT/FLASHTnTViewer.py @@ -4,12 +4,50 @@ from src.common.common import page_setup, save_params from src.workflow.FileManager import FileManager +# Legacy bespoke-grid render path (kept importable until OI integration is verified). 
from src.render.render import render_grid +# The OpenMS-Insight viewer (Stage C) is imported lazily inside render_panel (see +# below) so an import failure (e.g. a missing openms-insight install) falls back +# to the legacy grid instead of breaking the whole page. + + +def _use_oi_viewer(): + return st.session_state.get("settings", {}).get( + "use_openms_insight_viewer", True + ) + + +def render_panel(experiment_id, layout_info_per_exp, file_manager, identifier, + grid_key, panel_index): + """Render one experiment panel via the configured viewer. + + Routes to the new OpenMS-Insight viewer when enabled, else the legacy grid. + The OI viewer is imported lazily and guarded so an import failure falls back + to the legacy grid rather than breaking the page. + """ + if _use_oi_viewer(): + try: + from content.FLASHTnT.FLASHTnTViewerOI import ( + render_experiment_panel, + ) + except Exception as exc: # noqa: BLE001 - OI viewer unavailable + st.warning( + f"OpenMS-Insight viewer unavailable ({exc}); using legacy grid." 
+ ) + else: + render_experiment_panel( + experiment_id, layout_info_per_exp, file_manager, panel_index + ) + return + render_grid( + experiment_id, layout_info_per_exp, file_manager, + 'flashtnt', identifier, grid_key + ) DEFAULT_LAYOUT = [ - ['protein_table'], - ['sequence_view'], + ['protein_table'], + ['sequence_view'], ['tag_table'], ['combined_spectrum'] ] @@ -81,7 +119,11 @@ def validate_selected_index(file_manager, selected_experiment): on_change=select_experiment ) if 'selected_experiment0_tagger' in st.session_state: - render_grid(st.session_state.selected_experiment0_tagger, layout[0], file_manager, 'flashtnt', 'selected_experiment0_tagger') + render_panel( + st.session_state.selected_experiment0_tagger, layout[0], + file_manager, 'selected_experiment0_tagger', + 'flash_viewer_grid_0', panel_index=0 + ) with c2: st.selectbox( "choose experiment", display_names, @@ -90,7 +132,12 @@ def validate_selected_index(file_manager, selected_experiment): on_change=select_experiment ) if f"selected_experiment1_tagger" in st.session_state: - render_grid(st.session_state.selected_experiment1_tagger, layout[1], file_manager, 'flashtnt', 'selected_experiment1_tagger', 'flash_viewer_grid_1') + with st.spinner('Loading component...'): + render_panel( + st.session_state.selected_experiment1_tagger, layout[1], + file_manager, 'selected_experiment1_tagger', + 'flash_viewer_grid_1', panel_index=1 + ) else: @@ -103,7 +150,11 @@ def validate_selected_index(file_manager, selected_experiment): ) if 'selected_experiment0_tagger' in st.session_state: - render_grid(st.session_state.selected_experiment0_tagger, layout[0], file_manager, 'flashtnt', 'selected_experiment0_tagger') + render_panel( + st.session_state.selected_experiment0_tagger, layout[0], + file_manager, 'selected_experiment0_tagger', + 'flash_viewer_grid', panel_index=0 + ) ### for multiple experiments on one view if len(layout) > 1: @@ -122,6 +173,11 @@ def validate_selected_index(file_manager, selected_experiment): # 
if #experiment input files are less than #layouts, all the pre-selection will be the first experiment if f"selected_experiment{exp_index}_tagger" in st.session_state: - render_grid(st.session_state["selected_experiment%d_tagger" % exp_index], layout[exp_index], file_manager, 'flashtnt', f"selected_experiment{exp_index}_tagger", 'flash_viewer_grid_%d' % exp_index) + render_panel( + st.session_state["selected_experiment%d_tagger" % exp_index], + layout[exp_index], file_manager, + f"selected_experiment{exp_index}_tagger", + 'flash_viewer_grid_%d' % exp_index, panel_index=exp_index + ) save_params(params) \ No newline at end of file diff --git a/content/FLASHTnT/FLASHTnTViewerOI.py b/content/FLASHTnT/FLASHTnTViewerOI.py new file mode 100644 index 00000000..52f96360 --- /dev/null +++ b/content/FLASHTnT/FLASHTnTViewerOI.py @@ -0,0 +1,844 @@ +"""FLASHTnT viewer rendered entirely with OpenMS-Insight components (Stage C). + +This is the NEW viewer for the FLASHApp -> OpenMS-Insight visualization migration, +mirroring ``content/FLASHDeconv/FLASHDeconvViewerOI.py`` (Stage B). It renders the +FLASHTnT (tagger / top-down identification) workflow using the reusable +``openms_insight`` component library (``Table``, ``LinePlot``, ``SequenceView``, +``DensityPlot``, ``Heatmap``) instead of the bespoke ``flash_viewer_grid`` Vue grid +in ``src/render/*``. + +Design goals (see ``/home/user/parity/STRATEGY.md`` §4/§5 and Stage C edges): + +* ONE shared ``StateManager`` per rendered experiment panel, keyed by a DISTINCT + ``session_key`` (``svc_state_tnt__``) so selections never + leak between side-by-side experiment panels (HARD edge #6). +* Layout parity: the ``[experiment][row][col]`` nested grid is reproduced with + ``st.columns`` per row (<=3 cols), rows stacked; multi-experiment side-by-side + uses a top-level ``st.columns``. +* TnT-specific wiring (STRATEGY §2/§3): + - ``protein_dfs`` is row-per-proteoform with ``index``; the protein Table sets + ``proteinIndex`` on click. 
+ - ``tag_dfs`` is row-per-tag with ``Scan``/``ProteinIndex``/``StartPos``/``EndPos``/``mzs``. + - The per-proteoform ``sequence_data`` store (``sequence_data_store.py``) carries + ``coverage``/``maxCoverage`` keyed by ``proteoform_index``. + - **Scan resolution (HARD edge #3):** a proteoform selection must resolve to the + correct deconv scan. ``build_proteoform_scan_frame`` (additive helper in + ``src/render/scan_resolution.py``, reproducing the legacy + ``build_proteoform_scan_map`` PyArrow pushdown) surfaces ``proteoform_index -> + (scan, deconv_index)`` as COLUMNS. We stamp a ``proteoform_index`` column onto + the combined-spectrum / sequence-peak frames by joining on the deconv ``index``, + so the OpenMS-Insight components value-filter + (``filters={'proteinIndex': 'proteoform_index'}``) exactly like Deconv filters by + scan. + - **Tagger overlay (HARD edge #1):** the Tag Table sets ``tagData`` to the clicked + tag's list of masses; the combined-spectrum ``LinePlot`` highlights peaks whose + ``MonoMass`` matches a selected tag mass within ``abs(Δ) < 1e-5``. + +NOTE: FLASHTnT runs BOTH ``parseDeconv`` and ``parseTnT`` on the same dataset +(``src/Workflow.py``), so the Deconv long-format frames (``combined_spectrum_long``, +``scan_table``, heatmaps) are present alongside the TnT frames (``protein_dfs``, +``tag_dfs``, ``sequence_data``, ``settings``, ``density_id_target``/``density_id_decoy``). + +The OLD render path (``src/render/render.py`` / ``flash_viewer_grid``) is left intact +and importable; the page chooses which path to use via ``use_openms_insight_viewer``. 
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, List, Optional + +import polars as pl +import streamlit as st + +from openms_insight import ( + DensityPlot, + Heatmap, + LinePlot, + SequenceView, + StateManager, + Table, +) + +from src.render.scan_resolution import build_proteoform_scan_frame + +# FLASHApp StateTracker keys reused as OpenMS-Insight identifiers so state flows +# across components exactly like the legacy grid. +PROTEIN_KEY = "proteinIndex" +# Tag selection: the Tag Table sets a SCALAR `tagData` to the clicked tag's +# `TagIndex` (a scalar — list-valued interactivity columns are not supported by +# the OpenMS-Insight Table, which calls `.item()` on the cell). The viewer then +# resolves that TagIndex to the tag's list of masses and publishes it under +# `TAG_MASSES_KEY`, which the combined-spectrum LinePlot consumes for the tagger +# overlay (`tag_filters={'tagMasses': 'MonoMass'}`). +TAG_KEY = "tagData" +TAG_MASSES_KEY = "tagMasses" +MASS_KEY = "massIndex" +# Residue -> Tag-Table cross-link (legacy `selectionStore.selectedAApos`). +# Clicking a covered residue in the SequenceView sets this to the residue's +# PROTEIN-ABSOLUTE 0-based position; the Tag Table range-filters its rows to tags +# whose [StartPos, EndPos] span contains that position (StartPos <= pos <= EndPos), +# clearing on re-click (toggle). The SequenceView now renders the FULL protein +# sequence, so the residue grid index IS the protein-absolute position +# (`sequence_offset` == 0) and the emitted coordinate matches tag StartPos/EndPos +# for ALL proteoforms directly. +AA_KEY = "selectedAApos" +# Tag-span highlight on the SequenceView (legacy `selectedTag.{startPos,endPos}`). +# Published as {"start": StartPos, "end": EndPos, "nTerminal": bool} (protein- +# absolute indices) and consumed by the SequenceView `"tag_span"` interactivity +# sentinel, which reads this selection value to bracket-highlight the tag span. 
+TAG_SPAN_KEY = "tagSpan" + + +def _component_cache_dir(file_manager, experiment_id: str) -> str: + """Directory under the workspace cache where OI component caches are written.""" + cache_root = Path(file_manager.cache_path, "oi_components_tnt", str(experiment_id)) + cache_root.mkdir(parents=True, exist_ok=True) + return str(cache_root) + + +def _lazy(file_manager, experiment_id: str, name_tag: str) -> Optional[pl.LazyFrame]: + """Load a stored frame as a polars LazyFrame, or None if absent.""" + if not file_manager.result_exists(experiment_id, name_tag): + return None + return file_manager.get_results( + experiment_id, [name_tag], use_polars=True + )[name_tag] + + +def _pandas(file_manager, experiment_id: str, name_tag: str): + """Load a stored frame as pandas (for the precomputed density frames), or None.""" + if not file_manager.result_exists(experiment_id, name_tag): + return None + return file_manager.get_results(experiment_id, [name_tag])[name_tag] + + +# --------------------------------------------------------------------------- +# Scan resolution: proteoform_index -> deconv index, exposed as a frame so the +# spectrum / sequence components can value-filter by proteoform. +# --------------------------------------------------------------------------- + +def _proteoform_scan_frame(file_manager, experiment_id: str) -> Optional[pl.DataFrame]: + """proteoform_index / scan / deconv_index frame for this experiment, or None. + + Reproduces the legacy ``build_proteoform_scan_map`` (PyArrow pushdown in + ``src/render/update.py``) by reading the already-stored ``protein_dfs`` and + ``scan_table`` frames. Cached in session state per experiment to avoid + recomputing on every rerun. 
+ """ + protein = _lazy(file_manager, experiment_id, "protein_dfs") + scan_table = _lazy(file_manager, experiment_id, "scan_table") + if protein is None or scan_table is None: + return None + protein_df = protein.select(["index", "Scan"]).collect() + scan_df = scan_table.select(["index", "Scan"]).collect() + return build_proteoform_scan_frame(protein_df, scan_df) + + +def _stamp_proteoform_index( + spectrum_lf: pl.LazyFrame, scan_frame: pl.DataFrame +) -> pl.LazyFrame: + """Join a deconv-``index``-keyed long spectrum frame with the proteoform/scan + frame so each peak row carries the ``proteoform_index`` whose scan it belongs + to. This converts the proteoform selection into a plain value filter on the + spectrum (``filters={'proteinIndex': 'proteoform_index'}``). + + A scan may map to multiple proteoforms; the inner join replicates the peak + rows per proteoform so each proteoform selection sees its scan's peaks (the + legacy path resolves proteoform->scan then pushes that single scan down, which + is equivalent for the selected proteoform).""" + map_lf = scan_frame.lazy().select( + pl.col("deconv_index").alias("index"), + pl.col("proteoform_index"), + ) + return spectrum_lf.join(map_lf, on="index", how="inner") + + +# --------------------------------------------------------------------------- +# Per-component builders. Each returns an OpenMS-Insight component instance, or +# None when the underlying data frame is missing (component is silently skipped). +# --------------------------------------------------------------------------- + +# Curated column definitions mirroring the LEGACY Vue tables (titles / order / +# field selection). The OI Table's ``_get_columns_to_select`` projects to ONLY the +# fields named here (plus the index / interactivity / filter columns), so any +# internal frame column not listed is hidden -- exactly the parity goal. + +# TabulatorProteinTable.vue columns -> protein_dfs fields. 
+_PROTEIN_COLUMN_DEFINITIONS = [ + {"title": "Scan No.", "field": "Scan", "sorter": "number"}, + {"title": "Accession", "field": "accession", "sorter": "string"}, + {"title": "Description", "field": "description", "sorter": "string"}, + {"title": "Length", "field": "length", "sorter": "number"}, + # Legacy TabulatorProteinTable.vue renders the `-1` sentinel as "-" and the + # RAW unrounded value otherwise (TabulatorProteinTable.vue:78-81). The OI + # `dashNegativeOne` formatter reproduces the sentinel rule and renders the raw + # value when `precision` is omitted -- so we drop `formatterParams` to avoid + # rounding (critical for tiny Q-values: 0.00012 must NOT become 0.0001). + {"title": "Mass", "field": "ProteoformMass", "sorter": "number", + "formatter": "dashNegativeOne"}, + {"title": "No. of Matched Fragments", "field": "MatchingFragments", "sorter": "number"}, + {"title": "No. of Modifications", "field": "ModCount", "sorter": "number"}, + {"title": "No. of Tags", "field": "TagCount", "sorter": "number"}, + {"title": "Score", "field": "Score", "sorter": "number"}, + # Q-Value also uses the `-1 -> "-"` sentinel rule with the RAW unrounded value + # otherwise (TabulatorProteinTable.vue:105-108). No `formatterParams` so the + # raw value is shown -- rounding would corrupt tiny Q-values (e.g. 0.00012). + {"title": "Q-Value (Proteoform Level)", "field": "ProteoformLevelQvalue", "sorter": "number", + "formatter": "dashNegativeOne"}, +] + +# TabulatorTagTable.vue columns -> tag_dfs fields. 
+_TAG_COLUMN_DEFINITIONS = [ + {"title": "Scan Number", "field": "Scan", "sorter": "number"}, + {"title": "Start Position", "field": "StartPos", "sorter": "number"}, + {"title": "End Position", "field": "EndPos", "sorter": "number"}, + {"title": "Sequence", "field": "TagSequence", "sorter": "string"}, + {"title": "Length", "field": "Length", "sorter": "number"}, + {"title": "Tag Score", "field": "Score", "sorter": "number"}, + # N/C mass use the legacy `-1 -> "-"` sentinel rule and render the RAW + # unrounded value otherwise (TabulatorTagTable.vue:72-83). No `formatterParams` + # so the raw value is shown (rounding would lose precision on the mass offset). + {"title": "N mass", "field": "Nmass", "sorter": "number", + "formatter": "dashNegativeOne"}, + {"title": "C mass", "field": "Cmass", "sorter": "number", + "formatter": "dashNegativeOne"}, + {"title": "Δ mass", "field": "DeltaMass", "sorter": "number"}, +] + + +def _filter_best_per_spectrum(protein_lf: pl.LazyFrame) -> pl.LazyFrame: + """Collapse the protein frame to the highest-``Score`` proteoform per ``Scan``. + + Reproduces the legacy default-ON "Best per spectrum" toggle + (TabulatorProteinTable.vue ~57-58, 116-198): keep, per ``Scan``, only the row + with the maximum ``Score``; ties keep the first-seen row (lowest ``index``). + Rows without a numeric ``Scan`` pass through unchanged (legacy passthrough). + """ + # Rank within each Scan by descending Score, tie-broken by ascending index so a + # deterministic single survivor is kept (mirrors the legacy first-seen tie rule + # once the frame is read in index order). + ranked = protein_lf.with_columns( + pl.col("Score") + .rank(method="ordinal", descending=True) + .over("Scan") + .alias("_score_rank") + ) + # Keep the best row per Scan; rows with a null Scan are passed through (their + # rank within the null group is irrelevant — keep them all, matching legacy). 
+ kept = ranked.filter( + (pl.col("_score_rank") == 1) | pl.col("Scan").is_null() + ).drop("_score_rank") + return kept + + +def _build_protein_table( + file_manager, experiment_id: str, cache_dir: str, + best_per_spectrum: bool = True, +): + data = _lazy(file_manager, experiment_id, "protein_dfs") + if data is None: + return None + # Best-per-spectrum (default ON): pre-filter to the max-Score proteoform per + # Scan BEFORE building the Table so the displayed rows / default-selected best + # hit / pagination all operate on the collapsed set (legacy parity). The + # checkbox in render_experiment_panel toggles this off to show every hit. + if best_per_spectrum: + data = _filter_best_per_spectrum(data) + # The cache_id encodes the toggle so the ON / OFF frames get distinct caches + # (the Table caches its preprocessed parquet by cache_id). + suffix = "best" if best_per_spectrum else "all" + # Protein table: clicking a row sets proteinIndex to the row's `index`. + # Curated columns/titles match TabulatorProteinTable.vue (the `index` + # interactivity column is auto-included by the Table but stays hidden). + return Table( + cache_id=f"protein_table_{experiment_id}_{suffix}", + data=data, + interactivity={PROTEIN_KEY: "index"}, + index_field="index", + column_definitions=_PROTEIN_COLUMN_DEFINITIONS, + initial_sort=[{"column": "Score", "dir": "desc"}], + go_to_fields=["Scan", "accession"], + title="Protein Table", + cache_path=cache_dir, + ) + + +def _build_tag_table(file_manager, experiment_id: str, cache_dir: str): + data = _lazy(file_manager, experiment_id, "tag_dfs") + if data is None: + return None + scan_frame = _proteoform_scan_frame(file_manager, experiment_id) + if scan_frame is None: + return None + # Tags are scan (spectrum) data. 
To filter by the SELECTED PROTEOFORM we need + # a proteoform_index column on each tag row; resolve via the proteoform's scan + # (Scan column on the tag) so a proteoform selection shows its scan's tags + # (parity with the legacy filter_data Tag-Table path stamping ProteinIndex). + map_lf = scan_frame.lazy().select( + pl.col("scan").alias("Scan"), + pl.col("proteoform_index"), + ) + tag_lf = data.join(map_lf, on="Scan", how="inner") + # Clicking a tag row sets the SCALAR `tagData` to the row's `TagIndex`. The + # viewer resolves that index to the tag's masses (see _resolve_tag_masses) and + # publishes them for the combined-spectrum tagger overlay. A list-valued + # interactivity column cannot be used here because the OI Table calls + # `.item()` on the clicked cell. + return Table( + cache_id=f"tag_table_{experiment_id}", + data=tag_lf, + filters={PROTEIN_KEY: "proteoform_index"}, + # Residue -> Tag-Table cross-link (legacy StartPos<=selectedAApos<=EndPos): + # when a covered residue is clicked in the SequenceView, AA_KEY holds its + # protein-absolute position and the tags are narrowed to those whose span + # contains it; cleared (no-op) when no residue is selected. + range_filters={AA_KEY: ("StartPos", "EndPos")}, + interactivity={TAG_KEY: "TagIndex"}, + index_field="TagIndex", + column_definitions=_TAG_COLUMN_DEFINITIONS, + initial_sort=[{"column": "Score", "dir": "desc"}], + go_to_fields=["Scan", "StartPos", "EndPos", "TagSequence"], + title="Tag Table", + cache_path=cache_dir, + ) + + +def _resolve_tag_masses(file_manager, experiment_id: str, state_manager) -> None: + """Resolve the selected ``tagData`` (a ``TagIndex``) to its masses + residue + walk and publish under ``tagMasses`` so the combined-spectrum LinePlot tagger + overlay renders the tag walk (residue letters between consecutive masses, with + the x-axis auto-zoomed to the tag span). Clears ``tagMasses`` when no tag is + selected. 
+ + Only the selected tag's row is collected (filtered by ``TagIndex``). The tag + ``mzs`` are a comma-joined string (trailing comma); parse and drop non-numeric + AND zero entries (legacy ``number !== 0``), keeping the STORED order (ascending + for C-term tags, descending for N-term tags). ``TagSequence`` gives the residue + letters; the legacy walks consecutive stored masses labelling gap ``i`` with + ``sequence[len-1-i]`` — i.e. the REVERSED sequence aligns to the stored-order + gaps regardless of anchoring (verified against both an ascending C-term and a + descending N-term tag). Do NOT sort the masses: sorting breaks the alignment + for descending (N-term) tags. The published value is a dict + ``{"masses": [...], "residues": [...], "nTerminal": bool}`` consumed by the OI + LinePlot tag walk; when no residues are available it carries only masses + (highlight-only). When a residue within the selected tag's span is also + selected (``selectedAApos``), a tag-relative ``selectedAA`` index is added so + the walk gold-highlights that residue (legacy ``selectedAApos - StartPos``).""" + def _clear_all() -> None: + state_manager.clear_selection(TAG_MASSES_KEY) + state_manager.clear_selection(TAG_SPAN_KEY) + + tag_index = state_manager.get_selection(TAG_KEY) + if tag_index is None: + _clear_all() + return + + tags = _lazy(file_manager, experiment_id, "tag_dfs") + if tags is None: + _clear_all() + return + + selected = ( + tags.filter(pl.col("TagIndex") == int(tag_index)) + .select( + pl.col("mzs") + .str.strip_chars(",") + .str.split(",") + .list.eval(pl.element().cast(pl.Float64, strict=False)) + .alias("tag_masses"), + pl.col("TagSequence").alias("tag_sequence"), + # Tag span for the selected-residue (selectedAA) highlight (legacy + # TabulatorTagTable.vue:142-173). 
+ pl.col("StartPos").alias("start_pos"), + pl.col("EndPos").alias("end_pos"), + pl.col("Nmass").alias("n_mass"), + ) + .collect() + ) + if not selected.height: + _clear_all() + return + + raw = selected["tag_masses"][0] + # Keep STORED order (do not sort) so the reversed-sequence walk aligns for + # both ascending (C-term) and descending (N-term) tags. Drop null AND zero + # masses (legacy `number !== 0`, TabulatorTagTable.vue:140): a literal 0 mass + # would misalign the reversed-residue walk. + masses = ( + [m for m in raw if m is not None and m != 0] if raw is not None else [] + ) + if not masses: + _clear_all() + return + + # Residue letter per consecutive-mass gap (len(masses) - 1 gaps): the legacy + # labels gap i with sequence[len-1-i], i.e. reversed(sequence) over the + # stored-order gaps. Trim to the number of gaps. + seq = selected["tag_sequence"][0] or "" + residues = list(reversed(str(seq)))[: max(len(masses) - 1, 0)] + + # Selected-residue highlight direction. The residue list above is already + # REVERSED to align with the stored mass order, so a tag-relative `selectedAA` + # (an N->C index) always maps to the MIRRORED walk gap (gaps-1-selectedAA). + # Publish `nTerminal=False` so the LinePlot mirrors it: this is geometrically + # correct for BOTH N- and C-terminal-anchored tags AND matches the legacy + # behavior, whose `nTerminal` was effectively always false (it read a + # non-existent `row["N mass"]` key, so the legacy walk always mirrored). Driving + # the direction off `Nmass == -1` here instead would misplace the gold highlight + # for C-anchored tags relative to the (already reversed) residue letters. + tag_masses = { + "masses": list(masses), + "residues": residues, + "nTerminal": False, + } + + # Selected-residue gold (#F3A712) highlight (legacy + # `selectedTag.selectedAA = selectedAApos - StartPos`, TabulatorTagTable.vue: + # 151,169). 
When a residue is selected (AA_KEY holds its protein-absolute + # position) AND it falls within the selected tag's [StartPos, EndPos] span, + # publish the tag-relative residue index so the LinePlot tag walk highlights + # that residue; omit otherwise (no highlight). + start_pos = selected["start_pos"][0] + end_pos = selected["end_pos"][0] + selected_aa_pos = state_manager.get_selection(AA_KEY) + if ( + selected_aa_pos is not None + and start_pos is not None + and end_pos is not None + and int(start_pos) <= int(selected_aa_pos) <= int(end_pos) + ): + tag_masses["selectedAA"] = int(int(selected_aa_pos) - int(start_pos)) + + state_manager.set_selection(TAG_MASSES_KEY, tag_masses) + + # Tag-span highlight on the SequenceView. StartPos/EndPos are protein-absolute + # (matching the full-protein residue grid), so they bracket the tag directly. + if start_pos is not None and end_pos is not None: + # Real terminal anchoring (legacy `selectedTag.nTerminal = (N mass == -1)`) + # for the SequenceView tag-span orientation -- distinct from the LinePlot + # walk above, which always mirrors because its residues are pre-reversed. + n_mass = selected["n_mass"][0] + n_terminal = (n_mass is not None) and (float(n_mass) == -1.0) + state_manager.set_selection( + TAG_SPAN_KEY, + {"start": int(start_pos), "end": int(end_pos), "nTerminal": n_terminal}, + ) + else: + state_manager.clear_selection(TAG_SPAN_KEY) + + +def _normalize_mod_ranges(raw) -> list: + """Normalize the cache ``modifications`` field into the SequenceView + ``mod_ranges`` shape: a list of ``{start, end, mass_diff, labels}`` dicts. + + The ``modifications`` field is a ``list[struct{start,end,mass_diff,labels}]`` + (sequence_data_store.SCHEMA) carrying ambiguous/spanning modification ranges + (DISTINCT from per-residue fixed mods). 
Entries missing start/end are skipped; + indices are protein-absolute (the SequenceView renders the full protein).""" + if raw is None: + return [] + out = [] + for item in raw: + if not isinstance(item, dict): + continue + if item.get("start") is None or item.get("end") is None: + continue + md = item.get("mass_diff") + labels = item.get("labels") + out.append({ + "start": int(item["start"]), + "end": int(item["end"]), + "mass_diff": float(md) if md is not None else 0.0, + "labels": "" if labels is None else str(labels), + }) + return out + + +def _precursor_mass_lookup(file_manager, experiment_id: str) -> dict: + """``proteoform_index -> PrecursorMass`` from ``protein_dfs`` (or ``{}``). + + The observed precursor mass is not stored in ``sequence_data``; the protein + frame carries it per proteoform (``index`` == proteoform_index). Used for the + SequenceView mass header ``precursor_mass`` column.""" + protein = _lazy(file_manager, experiment_id, "protein_dfs") + if protein is None: + return {} + schema = protein.collect_schema().names() + if "PrecursorMass" not in schema or "index" not in schema: + return {} + df = protein.select(["index", "PrecursorMass"]).collect() + return { + int(i): float(m) + for i, m in zip(df["index"].to_list(), df["PrecursorMass"].to_list()) + if i is not None and m is not None + } + + +def _build_sequence_frame( + file_manager, experiment_id: str +) -> Optional[pl.LazyFrame]: + """Build the SequenceView-ready per-proteoform sequence frame. + + Source: the per-proteoform ``sequence_data`` store (keyed by + ``proteoform_index``). It carries the FULL protein ``sequence`` list, the + matching full-length per-residue ``coverage`` / ``maxCoverage``, the + determined-region bounds ``proteoform_start`` / ``proteoform_end`` (with the + ``-2`` sentinel = that terminus undetermined / open), the observed + ``computed_mass`` and the ambiguous ``modifications`` ranges. 
+ + We emit the FULL protein sequence (NOT pre-sliced) so the OpenMS-Insight + SequenceView Vue renders the truncated N-/C-flanks and undetermined termini + itself from ``proteoform_start`` / ``proteoform_end``; the full-length + ``coverage`` stays aligned to the full sequence. Because the full protein is + rendered, the residue grid index IS the protein-absolute 0-based position, so + ``sequence_offset`` is always 0 (the residue-click cross-link then emits + positions that already match tag ``StartPos`` / ``EndPos`` directly). + + Columns emitted: ``proteoform_index`` (filter key), ``sequence`` (str, FULL + protein), ``precursor_charge`` (=1, neutral/deconvolved peaks), ``coverage`` + (full-length list[f64]), ``maxCoverage`` (f64), ``fixed_modifications`` + (list[str]), ``sequence_offset`` (=0), ``proteoform_start`` / ``proteoform_end`` + (int, sentinel ``-2`` carried through unchanged), ``computed_mass`` (f64), + ``precursor_mass`` (f64, from protein_dfs), and ``mod_ranges`` + (list[struct{start,end,mass_diff,labels}]). + + ``sequence_data`` is loaded with ``use_polars=True`` and arrives in EITHER of + two formats handled identically here: + + * **parquet (current ``parseTnT``):** one row per proteoform (schema in + ``src/render/sequence_data_store.py``) returned as a polars ``LazyFrame``. + * **pickle dict (legacy ``.pkl.gz`` example caches):** a dict keyed by the + proteoform index; each value a dict with the same keys.""" + if not file_manager.result_exists(experiment_id, "sequence_data"): + return None + store = file_manager.get_results( + experiment_id, ["sequence_data"], use_polars=True + )["sequence_data"] + + # Normalise either format into an iterable of per-proteoform row dicts. 
+ if isinstance(store, pl.LazyFrame): + rows = store.collect().iter_rows(named=True) + elif isinstance(store, pl.DataFrame): + rows = store.iter_rows(named=True) + elif isinstance(store, dict): + if not store: + return None + rows = ( + {"proteoform_index": pid, **(store[pid] or {})} + for pid in sorted(store.keys()) + ) + else: + return None + + precursor_masses = _precursor_mass_lookup(file_manager, experiment_id) + + proteoform_indices: List[int] = [] + sequences: List[str] = [] + coverages: List[list] = [] + max_coverages: List[float] = [] + fixed_mods: List[list] = [] + sequence_offsets: List[int] = [] + proteoform_starts: List[int] = [] + proteoform_ends: List[int] = [] + computed_masses: List[float] = [] + precursor_mass_col: List[float] = [] + mod_ranges_col: List[list] = [] + for entry in rows: + pid = entry.get("proteoform_index") + if pid is None: + continue + # Emit the FULL protein sequence + full-length coverage (no slicing): the + # Vue side derives truncation / undetermined termini from + # proteoform_start/end, so the residue grid index already equals the + # protein-absolute position (sequence_offset = 0). + full = list(entry.get("sequence") or []) + cov = list(entry.get("coverage") or []) + start = entry.get("proteoform_start") + end = entry.get("proteoform_end") + proteoform_indices.append(int(pid)) + sequences.append("".join(str(a) for a in full)) + coverages.append([float(c) for c in cov]) + mc = entry.get("maxCoverage") + max_coverages.append(float(mc) if mc is not None else 0.0) + fm = entry.get("fixed_modifications") or [] + fixed_mods.append([str(m) for m in fm]) + # Full protein rendered => residue grid index IS protein-absolute position. + sequence_offsets.append(0) + # Carry the determined-region bounds through UNCHANGED, including the + # `-2` (UNDETERMINED_TERMINUS) sentinel. Absent bound => 0 / last residue + # default on the Vue side (no truncation); we default to 0 / len-1 here. 
+ proteoform_starts.append(int(start) if start is not None else 0) + proteoform_ends.append( + int(end) if end is not None else (len(full) - 1 if full else 0) + ) + cm = entry.get("computed_mass") + computed_masses.append(float(cm) if cm is not None else -1.0) + precursor_mass_col.append(float(precursor_masses.get(int(pid), 0.0))) + mod_ranges_col.append(_normalize_mod_ranges(entry.get("modifications"))) + + if not proteoform_indices: + return None + + out = pl.DataFrame({ + "proteoform_index": proteoform_indices, + "sequence": sequences, + "precursor_charge": [1] * len(proteoform_indices), + "coverage": coverages, + "maxCoverage": max_coverages, + "fixed_modifications": fixed_mods, + "sequence_offset": sequence_offsets, + "proteoform_start": proteoform_starts, + "proteoform_end": proteoform_ends, + "computed_mass": computed_masses, + "precursor_mass": precursor_mass_col, + "mod_ranges": mod_ranges_col, + }) + return out.lazy() + + +def _build_sequence_view(file_manager, experiment_id: str, cache_dir: str): + seq_frame = _build_sequence_frame(file_manager, experiment_id) + if seq_frame is None: + return None + scan_frame = _proteoform_scan_frame(file_manager, experiment_id) + combined = _lazy(file_manager, experiment_id, "combined_spectrum_long") + if combined is None: + combined = _lazy(file_manager, experiment_id, "deconv_spectrum_long") + peaks = None + if combined is not None and scan_frame is not None: + # Deconv peaks are neutral masses; filter by the proteoform's scan and + # rename to the SequenceView peaks schema (peak_id, mass, intensity). 
+ peaks = ( + _stamp_proteoform_index(combined, scan_frame) + .select( + pl.col("proteoform_index"), + pl.col("peak_id"), + pl.col("MonoMass").alias("mass"), + pl.col("SumIntensity").alias("intensity"), + ) + ) + + settings = _pandas(file_manager, experiment_id, "settings") + settings = dict(settings) if isinstance(settings, dict) else None + + return SequenceView( + cache_id=f"sequence_view_{experiment_id}", + sequence_data=seq_frame, + peaks_data=peaks, + filters={PROTEIN_KEY: "proteoform_index"}, + # Click / span sources (all routed through the interactivity mapping): + # - MASS_KEY: a fragment-table row click sets it to the matched peak's + # peak_id (combined-spectrum cross-link). + # - AA_KEY: a RESIDUE click sets it to the clicked residue's protein- + # absolute position via the "residue_position" sentinel (Tag-Table + # range filter). Now that the full protein is rendered, the grid index + # already IS the protein-absolute position (sequence_offset == 0). + # - TAG_SPAN_KEY: the "tag_span" sentinel does NOT set state on click; + # Vue READS this selection value ({start,end,nTerminal}, protein- + # absolute) to bracket-highlight the selected tag's span on the + # sequence. Published by _resolve_tag_masses. + interactivity={ + MASS_KEY: "peak_id", + AA_KEY: "residue_position", + TAG_SPAN_KEY: "tag_span", + }, + deconvolved=True, + compute_fixed_mods=True, + # TnT path: keep the variable/custom-mod context menu disabled (default). 
def _build_combined_spectrum(file_manager, experiment_id: str, cache_dir: str):
    """Build the combined-spectrum ``LinePlot`` for one experiment, or ``None``.

    Returns ``None`` when either the ``combined_spectrum_long`` frame or the
    proteoform->scan frame is unavailable. When ``anno_spectrum_long`` exists it
    is stamped the same way and diagonally concatenated onto the primary frame
    so the plot can read it as the second series.
    """
    deconv_peaks = _lazy(file_manager, experiment_id, "combined_spectrum_long")
    if deconv_peaks is None:
        return None
    scans = _proteoform_scan_frame(file_manager, experiment_id)
    if scans is None:
        return None
    series = _stamp_proteoform_index(deconv_peaks, scans)

    # Annotated overlay (2nd series): only wired up when the frame exists;
    # otherwise both overlay column names stay None.
    overlay_x = overlay_y = None
    overlay = _lazy(file_manager, experiment_id, "anno_spectrum_long")
    if overlay is not None:
        series = pl.concat(
            [series, _stamp_proteoform_index(overlay, scans)], how="diagonal"
        )
        overlay_x, overlay_y = "MonoMass_Anno", "SumIntensity_Anno"

    # Wiring summary:
    # - rows are filtered by the selected proteoform (PROTEIN_KEY against the
    #   stamped ``proteoform_index`` column);
    # - clicking a peak publishes its ``peak_id`` under MASS_KEY;
    # - ``is_signal`` drives the signal-peak markers and the three ``signal_*``
    #   list columns feed the charge-state drill-down;
    # - the tag overlay matches ``MonoMass`` against the TAG_MASSES_KEY selection
    #   with tolerance 1e-5. That selection is published by
    #   ``_resolve_tag_masses`` as a dict (``masses`` / ``residues`` /
    #   ``nTerminal``), resolved from the scalar tag index the Tag Table sets;
    #   it is not written by the Tag Table itself.
    return LinePlot(
        cache_id=f"combined_spectrum_{experiment_id}",
        data=series,
        filters={PROTEIN_KEY: "proteoform_index"},
        interactivity={MASS_KEY: "peak_id"},
        x_column="MonoMass",
        y_column="SumIntensity",
        signal_peak_column="is_signal",
        signal_mz_column="signal_mzs",
        signal_charge_column="signal_charges",
        signal_intensity_column="signal_intensities",
        x2_column=overlay_x,
        y2_column=overlay_y,
        tag_filters={TAG_MASSES_KEY: "MonoMass"},
        tag_mass_column="MonoMass",
        tag_tolerance=1e-5,
        title="Augmented Deconvolved Spectrum",
        x_label="Monoisotopic Mass",
        y_label="Intensity",
        cache_path=cache_dir,
    )
def render_experiment_panel(
    experiment_id: str,
    layout_info_per_exp: List[List[str]],
    file_manager,
    panel_index: int,
):
    """Render one experiment's [row][col] grid with its OWN isolated StateManager.

    The StateManager uses a DISTINCT session_key per experiment and panel so
    selections made in this panel do not leak into other side-by-side panels.

    Args:
        experiment_id: experiment whose cached frames back the components.
        layout_info_per_exp: layout as rows of component names. Empty rows are
            skipped; names without an available component render nothing.
        file_manager: result store handed through to the component builders.
        panel_index: position of this panel; part of the state session key and
            of every widget/component key so panels stay independent.
    """
    session_key = f"svc_state_tnt_{experiment_id}_{panel_index}"
    state_manager = StateManager(session_key=session_key)
    cache_dir = _component_cache_dir(file_manager, experiment_id)

    # Selection clearing on proteoform change: when the selected protein index
    # differs from the last-seen one for THIS panel, clear the downstream
    # per-proteoform selections BEFORE building components so no stale
    # selection leaks across proteoforms. The "__unset__" default makes the
    # very first render count as a change.
    last_seen_key = f"{session_key}__last_protein_index"
    current_protein = state_manager.get_selection(PROTEIN_KEY)
    if st.session_state.get(last_seen_key, "__unset__") != current_protein:
        _clear_proteoform_dependent_selections(state_manager)
        st.session_state[last_seen_key] = current_protein

    # Best-per-spectrum toggle (default ON). Per-panel widget key so
    # side-by-side panels toggle independently. Shown only when a protein table
    # is in the layout.
    has_protein_table = any(
        "protein_table" in row for row in layout_info_per_exp
    )
    best_per_spectrum = True
    if has_protein_table:
        best_per_spectrum = st.checkbox(
            "Best per spectrum",
            value=True,
            key=f"tnt_oi_best_per_spectrum_{panel_index}",
            help="Show only the highest-scoring proteoform per spectrum (scan).",
        )

    # Resolve the selected tag into its published tag-mass selection BEFORE
    # rendering so the combined-spectrum overlay sees it this rerun.
    _resolve_tag_masses(file_manager, experiment_id, state_manager)

    for row_index, row in enumerate(layout_info_per_exp):
        if not row:
            # st.columns() rejects a zero column count; an empty layout row has
            # nothing to draw, so skip it instead of aborting the whole panel.
            # row_index still comes from enumerate, so keys of later rows are
            # unaffected.
            continue
        columns = st.columns(len(row))
        for col, (col_index, comp_name) in zip(columns, enumerate(row)):
            with col:
                component = build_component(
                    file_manager, experiment_id, cache_dir, comp_name,
                    best_per_spectrum=best_per_spectrum,
                )
                if component is None:
                    # Silently skip an absent component (data frame missing) to
                    # avoid noisy warnings on stale / partial caches.
                    continue
                key = f"tnt_oi_{panel_index}_{row_index}_{col_index}_{comp_name}"
                component(key=key, state_manager=state_manager)
def _explode_long_by_position(indexed_lf, id_col, value_exprs):
    """Explode parallel per-scan list columns into one row per position.

    This reproduces the legacy FLASHApp Vue per-column expansion
    (``TabulatorMassTable.vue`` ``tableData`` and the lineplot/3D consumers):
    each per-scan list column is laid out by POSITION ``0..L-1`` independently;
    the number of rows for a scan is the MAXIMUM list length across the supplied
    columns, and any column shorter than that maximum yields ``null`` for the
    missing trailing positions (the JS ``undefined``). Columns are therefore
    ALIGNED BY POSITION and never truncated to the shortest list — important
    because in real FLASHDeconv output ``mz_array``/``intensity_array`` (the
    full spectrum) can be LONGER than the per-mass ``MinCharges``/``SignalPeaks``
    axis, and the legacy UI pads the short columns with blanks rather than
    truncating.

    Args:
        indexed_lf: a polars LazyFrame carrying an integer ``index`` column.
        id_col: name of the per-scan position id column to emit (``peak_id`` or
            ``mass_id``) — the 0-based row position within the scan.
        value_exprs: NON-EMPTY list of ``(out_name, list_expr)`` (the first
            entry is indexed unconditionally). ``list_expr`` is a polars
            expression evaluating to a per-scan list; its element at position
            ``id_col`` becomes the scalar ``out_name`` (null when out of range).

    Returns:
        LazyFrame with columns ``index``, ``id_col``, then each ``out_name``,
        sorted by ``index`` then ``id_col``. Scans whose columns are all empty
        contribute 0 rows (matching the old ``iloc[k]`` empty-array slice).
    """
    out_names = [name for name, _ in value_exprs]

    # Per-scan max length across all contributing list columns → number of
    # positions to emit. (max of list.len() over the columns.)
    max_len = value_exprs[0][1].list.len()
    for _, expr in value_exprs[1:]:
        max_len = pl.max_horizontal(max_len, expr.list.len())

    # Pad every value list to the per-scan ``max_len`` with nulls — gathering by
    # the position range with ``null_on_oob`` reproduces the legacy blank-tail
    # cell — then explode the id column and all (now equal-length) value lists
    # together so each output row is exactly one position.
    #
    # This stays O(total output rows). The earlier approach exploded only the id
    # column while every row still carried the full per-scan value lists, then
    # gathered the scalar — i.e. O(rows × max_len), which materialises the lists
    # `max_len` times and OOMs on real spectra (e.g. an 865k-row annotated frame
    # with multi-thousand-length lists ≈ tens of GB). Exploding the padded lists
    # together avoids the duplication entirely.
    positions = pl.int_ranges(0, max_len)
    lf = (
        indexed_lf
        .select(
            [pl.col("index")]
            + [expr.list.gather(positions, null_on_oob=True).alias(name)
               for name, expr in value_exprs]
            + [positions.alias(id_col)]
        )
        .explode([id_col] + out_names)
        # Empty scans explode to a single null-id row; drop so they contribute 0 rows.
        .filter(pl.col(id_col).is_not_null())
        .sort(["index", id_col])
    )
    # Final select only fixes the column order: index, id, then the values.
    return lf.select(["index", id_col] + out_names)
def _fill_empty_signal_lists(lf):
    """Turn ``null`` cells of the three ``signal_*`` list columns into ``[]``.

    Applies ``fill_null([])`` to ``signal_mzs``, ``signal_charges`` and
    ``signal_intensities`` so consumers always read a list. Null cells are
    expected for positions past the end of the per-mass ``SignalPeaks`` axis
    (a scan whose ``mz_array`` is longer than ``SignalPeaks``).

    Args:
        lf: frame carrying the three ``signal_*`` list columns.

    Returns:
        The frame with those three columns null-filled; other columns untouched.
    """
    signal_columns = ("signal_mzs", "signal_charges", "signal_intensities")
    return lf.with_columns(
        [pl.col(column).fill_null([]).alias(column) for column in signal_columns]
    )
def anno_spectrum_long(pl_anno_indexed):
    """Long-format annotated/raw spectrum: one row per peak.

    Output columns: ``index``, ``peak_id``, ``MonoMass_Anno`` (from
    ``mz_array``) and ``SumIntensity_Anno`` (from ``intensity_array``).

    Long-format counterpart of the array-valued ``anno_spectrum`` frame, meant
    to be value-filtered on ``index`` by the OpenMS-Insight LinePlot.

    Args:
        pl_anno_indexed: frame with ``index``, ``mz_array`` and
            ``intensity_array`` columns.
    """
    peak_columns = [
        ("MonoMass_Anno", pl.col("mz_array")),
        ("SumIntensity_Anno", pl.col("intensity_array")),
    ]
    return _explode_long_by_position(pl_anno_indexed, "peak_id", peak_columns)
def mass_table_long(pl_deconv_indexed):
    """Long-format mass table: one row per mass position within a scan.

    Output columns: ``index``, ``mass_id`` (0-based position within the scan),
    then the scalar fields ``MonoMass``, ``SumIntensity``, ``MinCharges``,
    ``MaxCharges``, ``MinIsotopes``, ``MaxIsotopes``, ``CosineScore``, ``SNR``
    and ``QScore``.

    Long-format counterpart of the array-valued ``mass_table`` frame, meant for
    a table that filters on ``index`` and publishes ``mass_id`` on row click.

    Columns are aligned by position: a scan yields as many rows as its longest
    source list, and shorter lists contribute ``null`` in the trailing rows
    (e.g. when ``mz_array`` is longer than the per-mass charge/score arrays).

    Args:
        pl_deconv_indexed: frame with ``index`` plus the source list columns
            named below.
    """
    # Output column name -> source list column, in output order.
    source_columns = {
        "MonoMass": "mz_array",
        "SumIntensity": "intensity_array",
        "MinCharges": "MinCharges",
        "MaxCharges": "MaxCharges",
        "MinIsotopes": "MinIsotopes",
        "MaxIsotopes": "MaxIsotopes",
        "CosineScore": "cos",
        "SNR": "snr",
        "QScore": "qscore",
    }
    per_mass = [
        (out_name, pl.col(source)) for out_name, source in source_columns.items()
    ]
    return _explode_long_by_position(pl_deconv_indexed, "mass_id", per_mass)
+ """ + return ( + pl_deconv_indexed + .select([ + pl.col('index'), + pl.col('Scan'), + pl.col('PrecursorScan'), + pl.col('PrecursorMass'), + pl.col('mz_array').alias('MonoMass'), + pl.col('SignalPeaks'), + pl.col('NoisyPeaks') + ]) + ) + + def parseDeconv( file_manager, dataset_id, out_deconv_mzML, anno_annotated_mzML, spec1_tsv=None, spec2_tsv=None, logger=None @@ -111,6 +402,13 @@ def parseDeconv( ) file_manager.store_data(dataset_id, 'anno_spectrum', anno_spectrum_lazy, row_group_size=SPECTRA_ROW_GROUP_SIZE) + # anno_spectrum_long - long-format (one row per peak) for OpenMS-Insight LinePlot + file_manager.store_data( + dataset_id, 'anno_spectrum_long', + anno_spectrum_long(pl_anno_indexed), + row_group_size=LONG_ROW_GROUP_SIZE, + ) + logger.log("40.0 %", level=2) # mass_table - using native polars LazyFrame operations @@ -132,6 +430,13 @@ def parseDeconv( ) file_manager.store_data(dataset_id, 'mass_table', mass_table_lazy, row_group_size=SPECTRA_ROW_GROUP_SIZE) + # mass_table_long - long-format (one row per mass) for OpenMS-Insight Table + file_manager.store_data( + dataset_id, 'mass_table_long', + mass_table_long(pl_deconv_indexed), + row_group_size=LONG_ROW_GROUP_SIZE, + ) + logger.log("50.0 %", level=2) # sequence_view - using native polars LazyFrame operations @@ -160,6 +465,13 @@ def parseDeconv( ) file_manager.store_data(dataset_id, 'deconv_spectrum', deconv_spectrum_lazy, row_group_size=SPECTRA_ROW_GROUP_SIZE) + # deconv_spectrum_long - long-format (one row per peak) for OpenMS-Insight LinePlot + file_manager.store_data( + dataset_id, 'deconv_spectrum_long', + deconv_spectrum_long(pl_deconv_indexed), + row_group_size=LONG_ROW_GROUP_SIZE, + ) + logger.log("70.0 %", level=2) # anno & deconv spectrum (combined_spectrum) - using native polars LazyFrame join @@ -184,18 +496,19 @@ def parseDeconv( ) file_manager.store_data(dataset_id, 'combined_spectrum', combined_spectrum_lazy, row_group_size=SPECTRA_ROW_GROUP_SIZE) + # combined_spectrum_long - long-format 
def build_proteoform_scan_frame(protein_df, scan_table_df):
    """Expose the proteoform->scan resolution as a value-filterable polars frame.

    Additive helper for the OpenMS-Insight TnT viewer (Stage C). It wraps
    ``build_proteoform_scan_map`` and returns its result as COLUMNS, so
    OpenMS-Insight components can value-filter
    (``filters={'proteinIndex': 'proteoform_index'}``) instead of performing an
    ``iloc`` / per-scan pushdown by hand.

    Args:
        protein_df: pandas/polars frame with columns ``index`` (proteoform
            index) and ``Scan``.
        scan_table_df: pandas/polars frame with columns ``index`` (deconv row
            index) and ``Scan``.

    Returns:
        Polars DataFrame with int64 columns ``proteoform_index``, ``scan`` and
        ``deconv_index``, sorted by ``proteoform_index``. Only proteoforms
        present in the map returned by ``build_proteoform_scan_map`` appear;
        an empty map yields an empty frame with the same schema.
    """
    resolved = build_proteoform_scan_map(
        _to_pandas(protein_df), _to_pandas(scan_table_df)
    )

    # Column-wise construction with a pinned schema: the empty and non-empty
    # cases share one code path and always come out as Int64 columns.
    proteoform_ids, scans, deconv_rows = [], [], []
    for proteoform_id, hit in resolved.items():
        proteoform_ids.append(int(proteoform_id))
        scans.append(int(hit["scan"]))
        deconv_rows.append(int(hit["deconv_index"]))

    frame = pl.DataFrame(
        {
            "proteoform_index": proteoform_ids,
            "scan": scans,
            "deconv_index": deconv_rows,
        },
        schema={
            "proteoform_index": pl.Int64,
            "scan": pl.Int64,
            "deconv_index": pl.Int64,
        },
    )
    return frame.sort("proteoform_index")


def _to_pandas(df):
    """Return ``df`` as pandas: polars eager/lazy frames are converted, anything
    else is passed through untouched (the map builder uses pandas indexing)."""
    if isinstance(df, pl.LazyFrame):
        # Materialize first; the eager branch below then does the conversion.
        df = df.collect()
    return df.to_pandas() if isinstance(df, pl.DataFrame) else df
# Stub pyopenms so importing src.parse.deconv works without the native dep
# (the long-format producers do not use it).
#
# The stub is installed ONLY when the real package is genuinely unavailable.
# Checking ``sys.modules`` alone is not enough: an installed-but-not-yet-imported
# pyopenms would be shadowed by the stub for the rest of the interpreter
# session, breaking any other test module that needs the real library.
# ``find_spec`` locates a top-level package without importing it, so the slow
# native import is still avoided here.
import importlib.util

if "pyopenms" not in sys.modules and importlib.util.find_spec("pyopenms") is None:
    _m = types.ModuleType("pyopenms")
    for _a in ("MSExperiment", "MzMLFile", "SpectrumLookup", "Constants"):
        # Each placeholder is an empty class exposing the two constants that are
        # read as class attributes.
        setattr(_m, _a, type(_a, (), {"PROTON_MASS_U": 1.0, "C13C12_MASSDIFF_U": 1.0}))
    sys.modules["pyopenms"] = _m
def _max_len_expansion(row, cols):
    """Expand one array-valued row into per-position dicts, padding with None.

    Mirrors the legacy per-column max-length expansion: one output dict per
    position up to the LONGEST of the selected arrays, with ``None`` wherever a
    shorter array has no value at that position.

    Args:
        row: mapping-like object indexable by column name (e.g. a dict or a
            pandas row) whose selected values are sequences or ``None``.
        cols: iterable of column names to expand.

    Returns:
        List of dicts keyed by the names in ``cols``; empty when ``cols`` is
        empty or every selected array is empty/null.
    """
    # A null list cell arrives as None; treat it as an empty array instead of
    # letting list(None) raise TypeError.
    arrays = {c: ([] if row[c] is None else list(row[c])) for c in cols}
    n = max(map(len, arrays.values()), default=0)
    return [
        {c: (values[i] if i < len(values) else None) for c, values in arrays.items()}
        for i in range(n)
    ]
def test_combined_spectrum_long_is_signal():
    """``is_signal`` is True exactly where the per-mass SignalPeaks entry at the
    row's ``peak_id`` exists and is non-empty."""
    source = _deconv()
    long_df = combined_spectrum_long(source.lazy()).collect()

    expected_columns = [
        "index", "peak_id", "MonoMass", "SumIntensity", "is_signal",
        "signal_mzs", "signal_charges", "signal_intensities",
    ]
    assert long_df.columns == expected_columns

    source_pd = source.to_pandas()
    for record in long_df.iter_rows(named=True):
        # Legacy-style lookup: the scan's row, then its nested SignalPeaks cell.
        scan_rows = source_pd[source_pd["index"] == record["index"]]
        signal_peaks = scan_rows.iloc[0]["SignalPeaks"]
        position = record["peak_id"]
        if position < len(signal_peaks):
            expected = len(signal_peaks[position]) > 0
        else:
            # Past the end of the SignalPeaks axis: never a signal.
            expected = False
        assert bool(record["is_signal"]) == expected

    # Ragged past-end position (scan 0, peak_id 3) must be is_signal False.
    past_end = long_df.filter(
        (pl.col("index") == 0) & (pl.col("peak_id") == 3)
    ).row(0, named=True)
    assert past_end["is_signal"] is False
def _check_signal_arrays(df):
    """Shared assertions for the per-mass signal_* list columns on a long frame.

    Compares every row of ``df`` against the nested ``SignalPeaks`` cell of the
    ``_deconv()`` fixture at the same (``index``, ``peak_id``) position.
    """
    # Column dtypes: lists of f64 / i64 / f64.
    expected_dtypes = (
        ("signal_mzs", pl.List(pl.Float64)),
        ("signal_intensities", pl.List(pl.Float64)),
        ("signal_charges", pl.List(pl.Int64)),
    )
    for column, dtype in expected_dtypes:
        assert df.schema[column] == dtype

    reference = _deconv().to_pandas()
    for record in df.iter_rows(named=True):
        scan_rows = reference[reference["index"] == record["index"]]
        signal_peaks = scan_rows.iloc[0]["SignalPeaks"]
        position = record["peak_id"]
        # Positions past the SignalPeaks axis have no signal peaks at all.
        peaks = list(signal_peaks[position]) if position < len(signal_peaks) else []

        mzs = record["signal_mzs"]
        charges = record["signal_charges"]
        intensities = record["signal_intensities"]

        # Never null — past-end / no-signal positions are empty lists.
        assert mzs is not None and charges is not None and intensities is not None
        # The three arrays are mutually aligned (one entry per signal peak).
        assert len(mzs) == len(charges) == len(intensities) == len(peaks)
        # Contents match the per-mass signal peaks at this position.
        assert mzs == [peak[_SP_MZ] for peak in peaks]
        assert intensities == [peak[_SP_INT] for peak in peaks]
        assert charges == [int(peak[_SP_CH]) for peak in peaks]

        # Alignment with is_signal (combined frame only) / non-emptiness.
        if "is_signal" in record:
            assert bool(record["is_signal"]) == (len(mzs) > 0)
def test_combined_spectrum_long_signal_arrays():
    """On the combined frame the signal_* lists agree with ``is_signal``."""
    long_df = combined_spectrum_long(_deconv().lazy()).collect()
    _check_signal_arrays(long_df)

    # Signal-flagged peaks have non-empty, equal-length signal_* lists; non-signal
    # peaks have empty lists across all three.
    for record in long_df.iter_rows(named=True):
        mzs = record["signal_mzs"]
        charges = record["signal_charges"]
        intensities = record["signal_intensities"]
        if not record["is_signal"]:
            assert mzs == []
            assert charges == []
            assert intensities == []
            continue
        assert len(mzs) > 0
        assert len(mzs) == len(charges) == len(intensities)
def test_peak_id_and_mass_id_share_mass_axis():
    """``peak_id`` (deconv spectrum) and ``mass_id`` (mass table) index the same
    per-scan mass axis: joining on them pairs identical ``MonoMass`` values."""
    deconv = _deconv()
    ds = deconv_spectrum_long(deconv.lazy()).collect()
    mt = mass_table_long(deconv.lazy()).collect()
    join = ds.join(
        mt.select(["index", "mass_id", "MonoMass"]),
        left_on=["index", "peak_id"],
        right_on=["index", "mass_id"],
        how="inner",
        suffix="_mt",
    )
    # Guard against a vacuous pass: ``.all()`` on an empty join is True, so the
    # inner join must be non-empty and must keep every peak of the spectrum.
    assert join.height > 0
    assert join.height == ds.height
    assert (join["MonoMass"] == join["MonoMass_mt"]).all()