From 906539867b661ba52a09992922d4de92d6245e20 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 27 May 2026 17:55:16 -0700 Subject: [PATCH 01/23] =?UTF-8?q?explorer:=20heatmap=20SQL=20pre-aggregati?= =?UTF-8?q?on=20=E2=80=94=20phase=201.5=20(#233)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the LIMIT 100000 raw-row scan + JS per-pixel binning with a single DuckDB GROUP BY query that does the binning in SQL (inside the in-browser DuckDB-WASM engine). Removes the arbitrary cap honestly: every sample in the bbox is counted into its true pixel cell, regardless of total sample count. Why the LIMIT was bad: `LIMIT 100000` returned the first 100k rows in parquet storage order — not random, not geographic. At world view, the heatmap silently showed whichever source happened to be physically first in the file (likely SESAR, the largest source by row count). The "(capped)" status warning disclosed the problem but didn't fix it. This change was prompted by RY feedback 2026-05-27 on PR #240 ("wondering whether we can do better geographic random sampling"). How the SQL pushdown works: compute `(x, y)` pixel coordinates from `latitude`/`longitude` in the query using FLOOR / LEAST / GREATEST, then GROUP BY (x, y) returning one row per non-empty pixel with COUNT(*) as the sample count. Result cardinality is bounded by canvas pixels (≤ 512² = 262k), independent of bbox sample count. JS just iterates the aggregated rows and applies the same log(1+n) scaling for heatmap.js. 
Verified counts vs `samples table` summary line (= true sample count for the current view): view | heatmap | table | match ------------------|---------|---------|------ PKAP (100km) | 77,840 | 77,840 | ✅ Cyprus medium | 100,970 | 100,970 | ✅ (was capped at 100k) Cyprus regional | 682,029 | 682,029 | ✅ (was capped at 100k) World view | 5.98M | 5.98M | ✅ (was capped at 100k) Render time at world view (~6M samples → 35k cells): ~7s on localhost, similar to or faster than the LIMIT 100k version at smaller zooms. Removes the "(capped)" status branch and the `HEATMAP_LIMIT` constant becomes unused (left in place for now in case Phase 2 progressive refinement reintroduces a safety cap on cell count). Side effect of removing the cap: the per-pixel max-bias is now even more extreme at high-density views, but the log(1+n) scaling from PR #240 handles it. Verified: 5/5 heatmap-overlay.spec.js still pass on localhost. (The spec asserts `lastPointCount > 0`, which is still true; one spec change worth a follow-up: the spec used to expect capped behavior for large views, but no test currently asserts that, so no spec changes needed here.) Co-Authored-By: Claude Opus 4.7 (1M context) --- explorer.qmd | 145 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 93 insertions(+), 52 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 3efccf74..c476408f 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -2890,13 +2890,35 @@ zoomWatcher = { function getHeatmapInstance() { if (heatmapInstance) return heatmapInstance; if (!window.h337) throw new Error('heatmap.js did not load'); + // maxOpacity caps the rendered alpha so dense areas don't fully + // wash out the satellite imagery underneath. Without this, world + // view (35k+ pixel cells with overlapping blur radii) saturates + // to solid red. RY feedback 2026-05-27 on PR #240 follow-up. 
heatmapInstance = window.h337.create({ container: ensureHeatmapContainer(), radius: 25, + maxOpacity: 0.6, }); return heatmapInstance; } + // Adaptive per-point radius. heatmap.js applies a Gaussian blur of + // size `radius` around each data point; overlapping blurs add + // linearly, so at high cell density (world view: 35k cells on 512² + // canvas, each cell's default 25-pixel blur covering ~1% of canvas) + // the sum exceeds 1.0 across most of the canvas and everything + // saturates to full red regardless of underlying density. + // + // Empirical scaling: at world view (35k cells) want ~6 px; at small + // viewports (~300 cells) want ~30 px to fill space smoothly. + // sqrt(canvas_pixels / cell_count) gives ~3 at world, ~30 at small — + // double it and clamp to [6, 30]. + function heatmapRadiusFor(cellCount) { + const canvasPx = HEATMAP_CANVAS_SIZE * HEATMAP_CANVAS_SIZE; + const raw = Math.sqrt(canvasPx / Math.max(1, cellCount)) * 2; + return Math.max(6, Math.min(30, Math.round(raw))); + } + function heatmapFilterHash() { return JSON.stringify({ sources: getActiveSources().slice().sort(), @@ -2950,57 +2972,79 @@ zoomWatcher = { if (!heatmapEnabled()) return; setHeatmapStatus('Rendering heatmap...'); try { - const rows = await db.query(` - SELECT latitude, longitude - FROM read_parquet('${lite_url}') - WHERE ${heatmapBboxPredicate(bounds, 'latitude', 'longitude')} - ${sourceFilterSQL('source')} - ${facetFilterSQL()} - LIMIT ${HEATMAP_LIMIT} - `); - if (myReq !== heatmapReqId || !heatmapEnabled()) return; - + // SQL pre-aggregation at pixel resolution (issue #233 phase 1.5). + // + // Previous approach: SELECT latitude, longitude LIMIT 100000 then + // bin per pixel in JS. Two problems: + // (1) LIMIT 100000 picks an arbitrary first 100k rows in parquet + // storage order — NOT geographic random. At world view, the + // heatmap silently showed whichever source happened to be + // physically first in the file (likely SESAR). 
+ // (2) For sample sets above the cap, the density was unfaithful. + // + // This approach: push the binning into DuckDB. The SQL groups by + // pixel-cell coordinates derived from the bbox + canvas size, so + // each row returned is one (x, y, count) tuple. Result cardinality + // is bounded by canvas pixels (≤ 512² = 262k), independent of how + // many samples the bbox contains. No LIMIT needed — every sample + // counted into its true pixel bucket. + // + // Antimeridian handling: when bbox wraps (west > east), the SQL + // shifts longitudes < west by +360 so the pixel arithmetic works + // in a continuous coordinate space, matching what the old JS loop + // did at line 2976. Same `eastForRectangle` adjustment downstream. const width = HEATMAP_CANVAS_SIZE; const height = HEATMAP_CANVAS_SIZE; const west = bounds.west; const eastNorm = bounds.west > bounds.east ? bounds.east + 360 : bounds.east; const lngSpan = Math.max(1e-9, eastNorm - west); const latSpan = Math.max(1e-9, bounds.north - bounds.south); - const bins = new Map(); - let max = 1; - - for (const row of rows) { - let lng = Number(row.longitude); - const lat = Number(row.latitude); - if (!Number.isFinite(lat) || !Number.isFinite(lng)) continue; - if (bounds.west > bounds.east && lng < west) lng += 360; - const x = Math.max(0, Math.min(width - 1, Math.floor(((lng - west) / lngSpan) * width))); - const y = Math.max(0, Math.min(height - 1, Math.floor(((bounds.north - lat) / latSpan) * height))); - const binKey = `${x},${y}`; - const next = (bins.get(binKey) || 0) + 1; - bins.set(binKey, next); - if (next > max) max = next; - } + const wraps = bounds.west > bounds.east; + // SQL-side pixel coordinate computation. CAST(... AS INTEGER) is + // explicit so DuckDB groups by integer keys, not floats. + const lngExprBase = `(longitude ${wraps ? 
`+ CASE WHEN longitude < ${west} THEN 360 ELSE 0 END` : ``})`; + const xExpr = `CAST(LEAST(${width - 1}, GREATEST(0, FLOOR((${lngExprBase} - ${west}) / ${lngSpan} * ${width}))) AS INTEGER)`; + const yExpr = `CAST(LEAST(${height - 1}, GREATEST(0, FLOOR((${bounds.north} - latitude) / ${latSpan} * ${height}))) AS INTEGER)`; + const aggregated = await db.query(` + SELECT + ${xExpr} AS x, + ${yExpr} AS y, + COUNT(*) AS n + FROM read_parquet('${lite_url}') + WHERE ${heatmapBboxPredicate(bounds, 'latitude', 'longitude')} + ${sourceFilterSQL('source')} + ${facetFilterSQL()} + GROUP BY x, y + `); + if (myReq !== heatmapReqId || !heatmapEnabled()) return; - // Log-scale bin weights to defeat supersite max-bias. - // iSamples data has extreme power-law spatial distribution: at - // Cyprus medium zoom, one position carries 52,252 co-located - // samples (likely a museum aggregation) while the median - // position has 2 — a 26,000× ratio. Linear heatmap.js - // max-normalization makes the supersite bin full red and - // everything else essentially invisible (2/52252 = 0.004% - // intensity). log(1+n) compresses the supersite (log(52253) ≈ - // 10.86) and lifts the median (log(3) ≈ 1.10), bringing the - // ratio to ~10× and revealing the actual density distribution - // the user expects to see. RY feedback 2026-05-27 on PR #240. - const points = []; + // SQL did the binning. Convert each row to a heatmap.js point. + // Log-scale bin weights to defeat supersite max-bias. iSamples + // data has extreme power-law spatial distribution: at Cyprus + // medium zoom, one position carries 52,252 co-located samples + // (likely a museum aggregation) while the median position has + // 2 — a 26,000× ratio. Linear heatmap.js max-normalization + // makes the supersite bin full red and everything else + // essentially invisible (2/52252 = 0.004% intensity). 
log(1+n) + // compresses the supersite (log(52253) ≈ 10.86) and lifts the + // median (log(3) ≈ 1.10), bringing the ratio to ~10× and + // revealing the actual density distribution the user expects + // to see. RY feedback 2026-05-27 on PR #240. + const pointsRaw = []; let logMax = 0; - for (const [binKey, value] of bins) { - const [x, y] = binKey.split(',').map(Number); - const logVal = Math.log1p(value); + let totalSamples = 0; + for (const row of aggregated) { + const n = Number(row.n); + totalSamples += n; + const logVal = Math.log1p(n); if (logVal > logMax) logMax = logVal; - points.push({ x, y, value: logVal }); + pointsRaw.push({ x: Number(row.x), y: Number(row.y), value: logVal }); } + // Adaptive radius: tight at high cell counts (world view) to + // avoid blur-overlap saturation; wide at low cell counts to + // fill space smoothly. + const radius = heatmapRadiusFor(pointsRaw.length); + const points = pointsRaw.map(p => ({ ...p, radius })); const hm = getHeatmapInstance(); hm.setData({ min: 0, max: logMax, data: points }); @@ -3022,25 +3066,22 @@ zoomWatcher = { heatmapImageryLayer = nextLayer; heatmapLastKey = key; // success-only — see refreshHeatmap() const refreshedAt = Date.now(); - const capped = rows.length >= HEATMAP_LIMIT; + // With SQL pre-aggregation, every sample in the bbox is counted + // into its pixel cell — no more arbitrary LIMIT cap. `capped` is + // kept on the state shape (for spec back-compat) but always + // false. `lastPointCount` is now the true sample total, not the + // capped raw-row count. 
viewer._heatmapOverlay = { enabled: true, layer: heatmapImageryLayer, lastRefreshAt: refreshedAt, - lastPointCount: rows.length, + lastPointCount: totalSamples, lastBinnedPointCount: points.length, lastImageHash: heatmapStringHash(url), lastKey: key, - capped, + capped: false, }; - // Codex round-1 review of #240: silent cap is misleading on - // global views (lite parquet has ~6M rows; LIMIT 100k shows an - // arbitrary first 100k, not honest density). Phase 2 progressive - // refinement removes the cap; for phase 1, warn explicitly so - // the user knows the heatmap is a sample, not the full density. - setHeatmapStatus(capped - ? `Heatmap rendered from first ${HEATMAP_LIMIT.toLocaleString()} samples (capped — zoom or filter for full density).` - : `Heatmap rendered from ${rows.length.toLocaleString()} samples.`); + setHeatmapStatus(`Heatmap rendered from ${totalSamples.toLocaleString()} samples.`); } catch (err) { if (myReq !== heatmapReqId) return; console.warn('Heatmap refresh failed:', err); From 204d2dfca88e920a3915b70cecd813922cca7ff0 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 29 May 2026 16:12:12 -0700 Subject: [PATCH 02/23] =?UTF-8?q?WIP=20A1=20(#234=20Step=204):=20search-as?= =?UTF-8?q?-global-filter=20=E2=80=94=20pid-set=20+=20table=20surface?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strategy B: materialize search_pids (one ILIKE scan over facets_url) on a committed search, then constrain surfaces with a cheap pid semi-join. This increment (table surface, verified): - buildSearchFilter/clearSearchFilter: non-temp search_pids table (DISTINCT, NOT NULL), token-versioned _next→swap, captures match total. Published on window.__searchFilter {active,term,token,total} + window.searchFilterSQL(). - doSearch builds the filter (shows "Building search filter…") then refreshes the table; clears it on empty/short submit. 
- loadCount/loadPage semi-join on search_pids; summaryText → "N of M \"term\" matches in this map view" (replaces #250 interim copy). - Dev probe cell (a1PersistenceProbe) — REMOVE before PR. Verified on local build: bucchero → table shows only OpenContext Poggio Civitate matches (2,693), no GEOME mollusks; non-temp table persists across db.query() calls. Probe (#249 data): no coord-less matches, no dup pids, broad-term max ~82k. TODO (still in PR #1, NOT YET DONE): points loader, facet counts + cube gating, stats, and C3 auto-point-mode so the globe isn't left unfiltered. Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 182 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 148 insertions(+), 34 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 1bfc7c7b..dc73a4f0 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -1448,6 +1448,38 @@ db = { //| echo: false //| output: false +// === A1 dev probe (#234 Step 4) — TEMPORARY, remove before PR === +// Confirms the linchpin of the search-as-global-filter design: a NON-TEMP +// table created in one db.query() call is visible to a LATER db.query() +// call. Observable's DuckDBClient opens a fresh connection per query(), so a +// connection-local TEMP table would NOT survive — but a regular table in the +// shared in-memory database does. Also checks TEMP for contrast. 
+a1PersistenceProbe = { + try { + await db.query(`CREATE OR REPLACE TABLE __a1_probe AS SELECT 42 AS x`); + const r = Array.from(await db.query(`SELECT x FROM __a1_probe`)); + const nonTempOK = r.length === 1 && Number(r[0].x) === 42; + console.log(`[A1probe] NON-TEMP table persists across db.query():`, nonTempOK); + let tempOK = null; + try { + await db.query(`CREATE OR REPLACE TEMP TABLE __a1_probe_t AS SELECT 7 AS x`); + const rt = Array.from(await db.query(`SELECT x FROM __a1_probe_t`)); + tempOK = rt.length === 1 && Number(rt[0].x) === 7; + } catch (e) { tempOK = `threw: ${e.message}`; } + console.log(`[A1probe] TEMP table persists across db.query():`, tempOK); + await db.query(`DROP TABLE IF EXISTS __a1_probe`); + return { nonTempOK, tempOK }; + } catch (e) { + console.log(`[A1probe] FAILED:`, e.message); + return { error: e.message }; + } +} +``` + +```{ojs} +//| echo: false +//| output: false + // === Cesium Viewer (created once, never re-created) === viewer = { performance.mark('viewer-init-start'); @@ -1834,6 +1866,15 @@ tableView = { let pageGen = 0; // bumped on any new load; in-flight callbacks check this let lastPageFailed = false; // surfaces a sentinel table state when loadPage errors + // A1 (#234 Step 4): semi-join predicate against the materialized + // search_pids set, published by the search cell on window. Empty string + // when no search is committed, so non-search queries are unchanged. Read + // at query-build time (not cell-definition time), so it picks up a search + // committed after this cell first ran. + const searchFilterSQL = (col = 'pid') => + (typeof window !== 'undefined' && window.searchFilterSQL) + ? 
window.searchFilterSQL(col) : ''; + const prevBtn = document.getElementById('tablePrev'); const nextBtn = document.getElementById('tableNext'); const metaEl = document.getElementById('tableMeta'); @@ -2101,6 +2142,7 @@ tableView = { ${sourceFilterSQL('source')} ${facetFilterSQL()} ${bboxSQL} + ${searchFilterSQL('pid')} `); if (genAtStart !== pageGen) return true; // DuckDB-WASM returns BigInt for COUNT(*); coerce safely. @@ -2132,6 +2174,7 @@ tableView = { ${sourceFilterSQL('source')} ${facetFilterSQL()} ${bboxSQL} + ${searchFilterSQL('pid')} ORDER BY pid LIMIT ${TABLE_PAGE_SIZE} OFFSET ${offset} `); @@ -2156,25 +2199,25 @@ tableView = { function summaryText() { if (totalRows == null) return 'Counting samples...'; - // Honesty fix (#247): when a free-text search is committed, the - // table still reflects ONLY the viewport + source/facet filters — - // the search term is NOT a table predicate (pre-A1; see #234 Step - // 4 / axis A1). Disclose that explicitly and point to the side - // panel, instead of claiming these rows "match the current filters" - // (which after a search-fly can be e.g. 43,803 GEOME mollusks for a - // "bucchero" search). `__explorerActiveSearch` is maintained by - // doSearch; null when no search is committed. setMeta uses - // textContent, so the term needs no HTML-escaping here. - const activeSearch = (typeof window !== 'undefined') ? window.__explorerActiveSearch : null; + // A1 (#234 Step 4): when a search is committed, the table IS now + // filtered by it (semi-join on search_pids), so totalRows already + // counts only `searchTerm ∩ viewport ∩ source/facet`. Report that + // honestly as "N of M matches in this map view", where M is the + // global match total stashed on window.__searchFilter. (The probe + // confirmed every searchable sample has coordinates, so M is also the + // count of mappable matches — no coordinate-less caveat needed.) + const sf = (typeof window !== 'undefined') ? 
window.__searchFilter : null; + const activeSearch = sf && sf.active ? sf.term : null; if (totalRows === 0) { return activeSearch - ? `No samples in this map view and current non-search filters. Search results for "${activeSearch}" are shown in the panel →` + ? `No "${activeSearch}" matches in this map view (of ${(sf.total || 0).toLocaleString()} total — pan or zoom out).` : 'No samples match the current filters.'; } const total = totalRows.toLocaleString(); const plural = totalRows === 1 ? '' : 's'; if (activeSearch) { - return `${total} sample${plural} in this map view and current non-search filters. Search results for "${activeSearch}" are shown in the panel →`; + const matchTotal = (sf.total || totalRows).toLocaleString(); + return `${total} of ${matchTotal} "${activeSearch}" match${sf.total === 1 ? '' : 'es'} in this map view.`; } return `${total} sample${plural} match the current filters.`; } @@ -3846,6 +3889,68 @@ zoomWatcher = { } } + // === A1: search-as-global-filter pid-set (#234 Step 4) === + // + // Strategy B: materialize the set of pids matching the committed search + // term ONCE (one ILIKE scan over facets_url), then let every count + // surface (table, points, facet legend, stats) constrain itself with a + // cheap `pid IN (SELECT pid FROM search_pids)` semi-join — instead of + // re-running the expensive ILIKE per surface per camera move. + // + // `search_pids` is a NON-TEMP table: Observable's DuckDBClient opens a + // fresh connection per db.query(), so a connection-local TEMP table would + // not survive to the next query(); a regular table in the shared + // in-memory database persists across those connections (verified by the + // a1PersistenceProbe cell during development). + // + // Build into `search_pids_next` then atomically swap, so a surface + // refresh can never read a half-built set. `_searchFilterToken` versions + // each build so a superseded one (newer search fired mid-build) discards + // its swap. 
State is published on `window.__searchFilter` so the + // separate `tableView` OJS cell (and others) can read it; the SQL + // predicate is exposed as `window.searchFilterSQL(pidCol)`. + let _searchFilterToken = 0; + if (typeof window !== 'undefined' && !window.__searchFilter) { + window.__searchFilter = { active: false, term: null, token: 0, total: 0 }; + // Surfaces append this to their WHERE. Empty string when no search is + // committed, so non-search queries are byte-for-byte unchanged. + window.searchFilterSQL = (pidCol = 'pid') => + (window.__searchFilter && window.__searchFilter.active) + ? ` AND ${pidCol} IN (SELECT pid FROM search_pids)` + : ''; + } + + async function buildSearchFilter(terms, term) { + const token = ++_searchFilterToken; + const searchWhere = textSearchWhere(terms, [ + 'label', + 'description', + 'CAST(place_name AS VARCHAR)', + ]); + // DISTINCT + NOT NULL: pid is unique in facets_url today, but DISTINCT + // is cheap insurance against a future facet-shaped projection. + await db.query(` + CREATE OR REPLACE TABLE search_pids_next AS + SELECT DISTINCT pid + FROM read_parquet('${facets_url}') + WHERE pid IS NOT NULL AND ${searchWhere} + `); + if (token !== _searchFilterToken) return false; // superseded mid-build + await db.query(`CREATE OR REPLACE TABLE search_pids AS SELECT pid FROM search_pids_next`); + if (token !== _searchFilterToken) return false; + const cnt = Array.from(await db.query(`SELECT COUNT(*) AS n FROM search_pids`)); + const total = cnt.length ? 
Number(cnt[0].n) : 0; + if (token !== _searchFilterToken) return false; + window.__searchFilter = { active: true, term, token, total }; + return true; + } + + async function clearSearchFilter() { + _searchFilterToken++; + window.__searchFilter = { active: false, term: null, token: _searchFilterToken, total: 0 }; + try { await db.query(`DROP TABLE IF EXISTS search_pids`); } catch (e) { /* best effort */ } + } + async function doSearch(scope) { if (scope === 'area' || scope === 'world') _searchScope = scope; const effectiveScope = _searchScope; @@ -3864,34 +3969,20 @@ zoomWatcher = { const term = searchInput.value.trim(); if (!term || term.length < 2) { searchResults.textContent = 'Type at least 2 characters'; - // Honesty fix (#247): the samples-table meta line keys off this - // flag. An empty / too-short submit means no committed search, so - // clear it and refresh the table to revert to the plain "match - // the current filters" copy. - if (typeof window !== 'undefined') { - window.__explorerActiveSearch = null; - window.refreshSamplesTable?.(); - } + // A1 (#234 Step 4): an empty / too-short submit means no committed + // search — drop the pid-set filter and refresh surfaces so they + // revert to the unfiltered "match the current filters" state. + await clearSearchFilter(); + if (typeof window !== 'undefined') window.refreshSamplesTable?.(); writeQueryState(); persistSearchScope(effectiveScope); return; } writeQueryState(); persistSearchScope(effectiveScope); - // Honesty fix (#247): record the committed search term so the - // samples-table meta line can disclose that the table reflects the - // map view + source/facet filters only — NOT this search. Free-text - // search is not yet a table/globe predicate (pre-A1; see #234 Step - // 4). Set at fire-time (independent of result success) so the meta is - // honest the moment the search commits. 
- if (typeof window !== 'undefined') window.__explorerActiveSearch = term; - // Refresh the table now so the meta copy updates immediately. World- - // scope searches fly the camera (moveEnd → refreshAll covers it), but - // area-scope searches do NOT move the camera, so without this nudge - // the meta would stay stale until the next interaction. Triggers the - // SAME refreshAll that moveEnd/filter-change already use (no new query - // shape); pageGen dedups against the fly-triggered refresh. - if (typeof window !== 'undefined') window.refreshSamplesTable?.(); + // A1 (#234 Step 4): the pid-set filter is built below (after `terms` + // is parsed) and the dependent surfaces are refreshed against it; the + // table/legend/points then reflect this search, not just the viewport. // Shared, prominent + sticky search-results heading, reused by the // success / zero-result / error paths so the side panel ALWAYS // reflects the committed search the table meta points at ("…shown in @@ -3912,6 +4003,29 @@ zoomWatcher = { performance.mark(markStart); const tStart = performance.now(); const terms = searchTerms(term); + + // A1 (#234 Step 4): materialize the search pid-set ONCE, then refresh + // the table (and, in later increments, points / facet counts / stats) + // so every count surface constrains to this search via a cheap + // semi-join. One ILIKE scan here warms the text columns in the DuckDB + // buffer, so the LIMIT-50 side-panel query below reads them warm. + // (Follow-up: derive the side-panel list from search_pids to drop the + // second scan entirely.) Failure leaves surfaces unfiltered, not broken. + const buildingMsg = effectiveScope === 'area' + ? 
'Building search filter for selected areas…' + : 'Building search filter…'; + searchResults.textContent = buildingMsg; + try { + await buildSearchFilter(terms, term); + } catch (e) { + console.warn('A1 search-filter build failed; surfaces stay unfiltered:', e); + await clearSearchFilter(); + } + // Superseded by a newer search while building? Bail before mutating UI. + if (searchId === _searchSeq && typeof window !== 'undefined') { + window.refreshSamplesTable?.(); + } + // Snapshot the filter-state telemetry booleans here, BEFORE the // try block, so they remain in scope through `finally`. They // reflect the DOM state at search-fire time — independent of From 936f1f3bae03e979f40c12ff8650fb221a0b2586 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 29 May 2026 16:56:15 -0700 Subject: [PATCH 03/23] =?UTF-8?q?WIP=20A1=20(#234=20Step=204):=20wire=20po?= =?UTF-8?q?ints,=20facet=20counts=20(+cube=20gate),=20C3=20=E2=80=94=20glo?= =?UTF-8?q?be=20path=20BUGGY?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds to the working table surface: - searchIsActive()/searchFilterSQL() cell-local helpers in the viewer cell. - loadViewportSamples: semi-join on search_pids. - updateCrossFilteredCounts: semi-join on both paths; gate off the cube fast-path AND the global baseline early-return when a search is active. - applySearchFilterChange(): C3 orchestrator — force point mode on search, revert to altitude-appropriate mode on clear; refresh table+facets. - camera-changed handler: latch point mode while a search is active. - doSearch calls applySearchFilterChange after build / on clear. KNOWN BUG (needs debugging): the GLOBE points render the UNFILTERED viewport count (e.g. "5000 of 1,591,051") even though search is active and the table correctly shows 2,693. C3 does not enter point mode at high altitude on boot either (globe stays unfiltered clusters). 
Likely an async race between the boot point-load / mode entry and the post-build applySearchFilterChange (filter built ~40-90s into boot, after the camera has already settled). The table surface (loadCount/loadPage) IS correctly filtered. Probe cell still present (remove before PR). Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 112 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 94 insertions(+), 18 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index dc73a4f0..450cf3ae 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -2368,6 +2368,16 @@ zoomWatcher = { const EXIT_POINT_ALT = 180000; // 180 km → exit point mode const POINT_BUDGET = DEFAULT_POINT_BUDGET; + // A1 (#234 Step 4): accessors for the search-as-global-filter state the + // search machinery publishes on window.__searchFilter. searchFilterSQL() + // returns the semi-join predicate (or '' when no search is committed) so + // the point loader and facet-count queries scope to `searchTerm ∩ …` the + // same way the samples table does. + const searchIsActive = () => + (typeof window !== 'undefined' && window.__searchFilter && window.__searchFilter.active); + const searchFilterSQL = (col = 'pid') => + searchIsActive() ? ` AND ${col} IN (SELECT pid FROM search_pids)` : ''; + // No viewport cache: the samples table (PR #219) re-queries on every // `moveEnd` against the current padded bbox, so reusing a cached // `cachedTotalCount` here would have point-mode show a stale count @@ -2529,6 +2539,7 @@ zoomWatcher = { AND longitude BETWEEN ${padded.west} AND ${padded.east} ${sourceFilterSQL('source')} ${facetFilterSQL()} + ${searchFilterSQL('pid')} `; const query = ` SELECT pid, label, source, latitude, longitude, @@ -2841,7 +2852,9 @@ zoomWatcher = { // spatial constraint. In a non-global view with no facet filter, B1 // still wants per-value counts scoped to what's visible — fall // through to the slow path with `where = '1=1'`. 
- if (!sourceImpossible && activeDims.length === 0 && bboxSQL === null) { + // A1 (#234 Step 4): an active search is itself a constraint, so even a + // global view with no facet filter must take the slow (filtered) path. + if (!sourceImpossible && activeDims.length === 0 && bboxSQL === null && !searchIsActive()) { for (const d of dims) applyFacetCounts(d.key, null); return; } @@ -2849,11 +2862,14 @@ zoomWatcher = { markFacetCountsRecomputing(); // Cube fast-path: pre-aggregated globally, so it's valid only when - // the camera is at (or close to) the global view. + // the camera is at (or close to) the global view — AND no search is + // active (A1 #234 Step 4): the cube is pre-aggregated over the whole + // corpus and cannot be constrained to a free-text pid-set, so a + // committed search must fall through to the on-the-fly slow path. const singleActiveDim = !sourceImpossible && activeDims.length === 1 && activeDims[0].values.length === 1 ? activeDims[0] : null; - if (singleActiveDim && totalActiveValues === 1 && bboxSQL === null) { + if (singleActiveDim && totalActiveValues === 1 && bboxSQL === null && !searchIsActive()) { try { const filterCols = ['filter_source', 'filter_material', 'filter_context', 'filter_object_type']; const filterColForKey = { @@ -2902,7 +2918,7 @@ zoomWatcher = { SELECT f.${d.col} AS value, COUNT(*) AS count FROM read_parquet('${facets_url}') f JOIN read_parquet('${lite_url}') l ON l.pid = f.pid - WHERE ${where} AND f.${d.col} IS NOT NULL${bboxSQL} + WHERE ${where} AND f.${d.col} IS NOT NULL${bboxSQL}${searchFilterSQL('f.pid')} GROUP BY f.${d.col} `); } else { @@ -2910,7 +2926,7 @@ zoomWatcher = { rows = await db.query(` SELECT ${d.col} AS value, COUNT(*) AS count FROM read_parquet('${facets_url}') - WHERE ${where} AND ${d.col} IS NOT NULL + WHERE ${where} AND ${d.col} IS NOT NULL${searchFilterSQL('pid')} GROUP BY ${d.col} `); } @@ -3485,6 +3501,51 @@ zoomWatcher = { busyRelease(); } } + + // A1 (#234 Step 4): push the just-built (or 
just-cleared) search pid-set + // through every dynamic surface. Mirrors handleFacetFilterChange, plus + // the C3 globe-mode rule: + // - search active → FORCE point mode. Clusters are pre-aggregated H3 + // summaries that cannot be text-filtered, so showing them while the + // table/legend ARE filtered is the incoherent half-state we must + // avoid. Point mode renders the actual filtered samples (capped at + // POINT_BUDGET with an honest "showing N — zoom in" note for very + // broad terms — that cap is the density-fallback, in lieu of + // reverting to dishonest clusters). + // - search cleared → restore the mode the current altitude implies. + // The camera-changed handler keeps point mode latched while a search is + // active (its targetMode is overridden below), so panning/zooming during + // a search won't bounce back to clusters. + async function applySearchFilterChange() { + busyAcquire(); + try { + syncFacetNote(); + refreshHeatmap(); + if (searchIsActive()) { + if (getMode() !== 'point') { + enterPointMode(false); // forces point; loads filtered viewport samples + } else { + await loadViewportSamples(); + } + } else { + // Search cleared: revert to the altitude-appropriate mode. + const h = viewer.camera.positionCartographic.height; + if (getMode() === 'point' && h >= EXIT_POINT_ALT) { + exitPointMode(false); + const target = h > 3000000 ? 4 : h > 300000 ? 
6 : 8; + if (target !== currentRes) { + await loadRes(target, { 4: h3_res4_url, 6: h3_res6_url, 8: h3_res8_url }[target]); + } + } else if (getMode() === 'point') { + await loadViewportSamples(); // still point altitude — reload now-unfiltered points + } + } + refreshFacetCounts(); + if (typeof window !== 'undefined') window.refreshSamplesTable?.(); + } finally { + busyRelease(); + } + } document.getElementById('materialFilterBody').addEventListener('change', handleFacetFilterChange); document.getElementById('contextFilterBody').addEventListener('change', handleFacetFilterChange); document.getElementById('objectTypeFilterBody').addEventListener('change', handleFacetFilterChange); @@ -3508,18 +3569,29 @@ zoomWatcher = { history.replaceState(null, '', buildHash(viewer)); } - // Determine target mode with hysteresis - const targetMode = h < ENTER_POINT_ALT ? 'point' + // Determine target mode with hysteresis. + // A1 (#234 Step 4) / C3: while a search is active, latch point + // mode regardless of altitude — clusters can't be text-filtered, + // so we keep showing the filtered sample dots even when zoomed out. + const targetMode = searchIsActive() ? 'point' + : h < ENTER_POINT_ALT ? 'point' : h > EXIT_POINT_ALT ? 'cluster' : getMode(); if (targetMode === 'point' && getMode() !== 'point') { - // Cold-cache deep-link: the res8 + samples_map_lite fetches - // can take 60-90s (DuckDB-WASM 1.24.0 falls back to a full - // HTTP read; see issue #190). Delegate to the shared helper - // so the source-filter handler can call the same path on - // supersession recovery. - await tryEnterPointModeIfNeeded(); + if (searchIsActive()) { + // Search forces point mode even above ENTER_POINT_ALT, + // where tryEnterPointModeIfNeeded() would refuse; enter + // directly so the filtered dots render at any zoom. 
+ enterPointMode(false); + } else { + // Cold-cache deep-link: the res8 + samples_map_lite fetches + // can take 60-90s (DuckDB-WASM 1.24.0 falls back to a full + // HTTP read; see issue #190). Delegate to the shared helper + // so the source-filter handler can call the same path on + // supersession recovery. + await tryEnterPointModeIfNeeded(); + } } else if (targetMode === 'cluster' && getMode() !== 'cluster') { exitPointMode(); // Reload appropriate resolution @@ -3970,10 +4042,11 @@ zoomWatcher = { if (!term || term.length < 2) { searchResults.textContent = 'Type at least 2 characters'; // A1 (#234 Step 4): an empty / too-short submit means no committed - // search — drop the pid-set filter and refresh surfaces so they - // revert to the unfiltered "match the current filters" state. + // search — drop the pid-set filter and refresh every surface + // (table, points, facet counts) plus revert the globe mode so the + // page returns to its unfiltered state. await clearSearchFilter(); - if (typeof window !== 'undefined') window.refreshSamplesTable?.(); + await applySearchFilterChange(); writeQueryState(); persistSearchScope(effectiveScope); return; @@ -4022,8 +4095,11 @@ zoomWatcher = { await clearSearchFilter(); } // Superseded by a newer search while building? Bail before mutating UI. - if (searchId === _searchSeq && typeof window !== 'undefined') { - window.refreshSamplesTable?.(); + // Otherwise push the freshly-built filter through every surface: table, + // points, facet counts, and the globe mode (C3 — switch to filtered + // point dots, since clusters can't be text-filtered). 
+ if (searchId === _searchSeq) { + await applySearchFilterChange(); } // Snapshot the filter-state telemetry booleans here, BEFORE the From 4e79830a5c314aa0b5914a9d26806063260a0fb9 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 29 May 2026 17:28:39 -0700 Subject: [PATCH 04/23] =?UTF-8?q?WIP=20A1:=20Codex=20C3=20fixes=20(moveEnd?= =?UTF-8?q?=20latch,=20awaitable=20enterPointMode,=20search-token=20stalen?= =?UTF-8?q?ess)=20+=20[A1dbg]=20logging;=20globe=20still=20not=20entering?= =?UTF-8?q?=20point=20mode=20=E2=80=94=20next:=20Codex=20rec=20#4=20one-re?= =?UTF-8?q?conciler=20refactor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 64 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 450cf3ae..55294728 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -2509,6 +2509,15 @@ zoomWatcher = { // --- Load individual samples for current viewport --- async function loadViewportSamples() { const myReqId = ++requestId; + // A1 (#234 Step 4): snapshot the search generation too. `requestId` + // only orders point loads against each other; it does NOT know that a + // load was built under a given search filter. Without this, an + // unfiltered boot load (search not yet built), a stale filtered load, + // or a post-clear load can render after the search generation changed. + // Every post-await resume below re-checks BOTH tokens. + const mySearchToken = (typeof window !== 'undefined' && window.__searchFilter) ? window.__searchFilter.token : 0; + const isStaleLoad = () => myReqId !== requestId + || mySearchToken !== ((typeof window !== 'undefined' && window.__searchFilter) ? 
window.__searchFilter.token : 0); const bounds = getViewportBounds(); if (!bounds) return; @@ -2554,8 +2563,9 @@ zoomWatcher = { performance.measure('sp', 'sp-s', 'sp-e'); const elapsed = performance.getEntriesByName('sp').pop().duration; - // Stale guard: discard if a newer request was issued - if (myReqId !== requestId) { + // Stale guard: discard if a newer point load OR a newer search + // generation superseded this one (A1 #234 Step 4). + if (isStaleLoad()) { console.log(`Discarding stale sample response (req ${myReqId}, current ${requestId})`); return; } @@ -2575,14 +2585,14 @@ zoomWatcher = { FROM read_parquet('${lite_url}') ${whereClause} `); - if (myReqId !== requestId) return; // stale guard + if (isStaleLoad()) return; // stale guard (req + search gen) totalCount = Number(countRow[0]?.n ?? data.length); capReached = totalCount > data.length; } catch(err) { // Stale guard before any state mutation/logging: // a newer request may have started while count was in // flight (Codex review of PR #210). - if (myReqId !== requestId) return; + if (isStaleLoad()) return; // Don't fail the whole load if the count query fails; // just fall back to the displayed-count behavior. console.warn("Real-count query failed; falling back to rendered count:", err); @@ -2605,7 +2615,7 @@ zoomWatcher = { console.log(`Point mode: rendered ${samples.length} of ${totalCount} samples in ${elapsed.toFixed(0)}ms${capReached ? 
' (cap reached)' : ''}`); } catch(err) { - if (myReqId !== requestId) return; + if (isStaleLoad()) return; console.error("Viewport sample query failed:", err); updatePhaseMsg('Sample query failed — try again.', 'loading'); } @@ -2662,14 +2672,19 @@ zoomWatcher = { } // --- Mode transitions --- - function enterPointMode(pushHistory) { + // async + awaits loadViewportSamples so callers (notably + // applySearchFilterChange under A1 #234 Step 4) have one real completion + // point — otherwise the point load is fire-and-forget and can lose a + // requestId race to a stale unfiltered load. Fire-and-forget callers can + // still call it without awaiting; the load just resolves later. + async function enterPointMode(pushHistory) { setExplorerMode('point'); applyLayerVisibility(); if (pushHistory !== false) history.pushState(null, '', buildHash(viewer)); // #facetNote is only meaningful in cluster mode (#234 step 1). syncFacetNote(); - loadViewportSamples(); console.log('Entered point mode'); + await loadViewportSamples(); } function exitPointMode(pushHistory) { @@ -2699,7 +2714,7 @@ zoomWatcher = { updateStats(`H3 Res${currentRes}`, viewer.h3Points.length, '—', '—', 'Clusters Loaded', 'Samples Loaded'); } updatePhaseMsg(`${inView.clusters.toLocaleString()} clusters in view. Zoom closer for individual samples.`, 'done'); - console.log('Exited point mode'); + console.log('[A1dbg] Exited point mode (searchActive=' + searchIsActive() + ')', new Error().stack?.split('\n').slice(1,4).join(' | ')); } // --- Boot→point-mode transition (issue #190 fix 2) --- @@ -2774,7 +2789,7 @@ zoomWatcher = { if (res8Ready && getMode() !== 'point' && hNow < ENTER_POINT_ALT) { // Propagate `pushHistory` so boot/hash hydration callers can // avoid growing the browser history stack (issue #207 item 3). 
- enterPointMode(opts && opts.pushHistory); + await enterPointMode(opts && opts.pushHistory); } } @@ -3517,13 +3532,14 @@ zoomWatcher = { // active (its targetMode is overridden below), so panning/zooming during // a search won't bounce back to clusters. async function applySearchFilterChange() { + console.log('[A1dbg] applySearchFilterChange entry: active=', searchIsActive(), 'mode=', getMode()); busyAcquire(); try { syncFacetNote(); refreshHeatmap(); if (searchIsActive()) { if (getMode() !== 'point') { - enterPointMode(false); // forces point; loads filtered viewport samples + await enterPointMode(false); // forces point; awaits filtered viewport load } else { await loadViewportSamples(); } @@ -3583,7 +3599,7 @@ zoomWatcher = { // Search forces point mode even above ENTER_POINT_ALT, // where tryEnterPointModeIfNeeded() would refuse; enter // directly so the filtered dots render at any zoom. - enterPointMode(false); + await enterPointMode(false); } else { // Cold-cache deep-link: the res8 + samples_map_lite fetches // can take 60-90s (DuckDB-WASM 1.24.0 falls back to a full @@ -3718,7 +3734,12 @@ zoomWatcher = { refreshHeatmap(); if (getMode() !== 'point') return; const h = viewer.camera.positionCartographic.height; - if (h > EXIT_POINT_ALT) { + // A1 (#234 Step 4) / C3: while a search is active, point mode is + // LATCHED (clusters can't be text-filtered), so a high-altitude + // moveEnd must NOT exit to clusters — otherwise the post-search + // flyTo (200 km, above EXIT_POINT_ALT) would immediately undo the + // forced point mode and the globe would show unfiltered clusters. + if (h > EXIT_POINT_ALT && !searchIsActive()) { // Sub-10% zoom-out from point mode (e.g. 
175 km → 181 km) won't // fire `camera.changed`, so without driving the exit here we'd // be stuck in point mode above `EXIT_POINT_ALT` until a larger @@ -3852,11 +3873,14 @@ zoomWatcher = { // (issue #207 item 4 → Codex follow-up): without this, back/forward // through a `#alt=8000` URL with no `mode=point` would exit point // mode here even though boot would have entered it. - viewer._suppressTimer = setTimeout(() => { + viewer._suppressTimer = setTimeout(async () => { viewer._suppressHashWrite = false; const s = readHash(); - const wantsPoint = s.mode === 'point' || (s.alt != null && s.alt < ENTER_POINT_ALT); - if (wantsPoint && getMode() !== 'point') enterPointMode(false); + // A1 (#234 Step 4): an active search forces point mode regardless + // of the restored altitude, so the back/forward globe state stays + // coherent with the (still-filtered) table/legend. + const wantsPoint = searchIsActive() || s.mode === 'point' || (s.alt != null && s.alt < ENTER_POINT_ALT); + if (wantsPoint && getMode() !== 'point') await enterPointMode(false); else if (!wantsPoint && getMode() === 'point') exitPointMode(false); }, 2000); @@ -3982,8 +4006,13 @@ zoomWatcher = { // separate `tableView` OJS cell (and others) can read it; the SQL // predicate is exposed as `window.searchFilterSQL(pidCol)`. let _searchFilterToken = 0; - if (typeof window !== 'undefined' && !window.__searchFilter) { - window.__searchFilter = { active: false, term: null, token: 0, total: 0 }; + if (typeof window !== 'undefined') { + // Preserve existing state across an OJS hot re-run of this cell, but + // ALWAYS (re)install the helper so it can't go missing while + // __searchFilter persists (Codex review). + if (!window.__searchFilter) { + window.__searchFilter = { active: false, term: null, token: 0, total: 0 }; + } // Surfaces append this to their WHERE. Empty string when no search is // committed, so non-search queries are byte-for-byte unchanged. 
window.searchFilterSQL = (pidCol = 'pid') => @@ -4098,6 +4127,7 @@ zoomWatcher = { // Otherwise push the freshly-built filter through every surface: table, // points, facet counts, and the globe mode (C3 — switch to filtered // point dots, since clusters can't be text-filtered). + console.log('[A1dbg] post-build: searchId=', searchId, '_searchSeq=', _searchSeq, 'active=', window.__searchFilter?.active); if (searchId === _searchSeq) { await applySearchFilterChange(); } From 62d5500099aa50f8bc4c13a8d39b53e0a8772ecd Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 29 May 2026 22:26:31 -0700 Subject: [PATCH 05/23] =?UTF-8?q?dev:=20fast=20verify-loop=20infra=20?= =?UTF-8?q?=E2=80=94=20local=20parquet=20mirror=20support=20+=20range=20se?= =?UTF-8?q?rver=20+=20deterministic=20A1=20observability?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - R2_BASE honors ?data_base= / localStorage ISAMPLES_DATA_BASE (default prod), so the explorer can read a local parquet mirror instead of 40-90s remote range-fetches. - dev_server.py: range-capable (206) static server; stock python http.server returns 200 and breaks DuckDB-WASM partial reads. - window.__a1log/__a1state + a1dbg() + on-page panel (?debug=a1) replace flaky console capture; window.__a1globe() exposes mode/point state for a Playwright harness. - Converted [A1dbg] console.logs to a1dbg events at build/mode/point-load/discard points. NOTE: cold cost is init-dominated (DuckDB-WASM+Cesium+OJS ~40s) — mirror helps the DATA phase only; the real lever is load-once + in-page iteration. Mirror range verified (curl -r => 206) but a full end-to-end speedup run hung in init (shakedown tomorrow; check 0-byte current/wide.parquet). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- dev_server.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++ explorer.qmd | 64 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 dev_server.py diff --git a/dev_server.py b/dev_server.py new file mode 100644 index 00000000..bd36a633 --- /dev/null +++ b/dev_server.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +"""Range-capable static dev server for the explorer verify loop. + +Stock `python3 -m http.server` (3.13) answers Range requests with 200 + the +full body, which makes DuckDB-WASM fall back to whole-file reads — the slow +path this repo has hit before. This server returns proper 206 Partial Content +so DuckDB-WASM can do real partial reads against a LOCAL parquet mirror, making +the cold verify loop seconds instead of 40-90s. + +Usage: + python3 dev_server.py # serves ./docs on :8099 + python3 dev_server.py --dir docs --port 8099 + +Then load the explorer against the local mirror under docs/data: + http://localhost:8099/explorer.html?data_base=/data&debug=a1#v=1&lat=... + +Verify Range actually works (must be 206, not 200): + curl -r 0-99 -i http://localhost:8099/data/isamples_202601_samples_map_lite.parquet +""" +import argparse +import http.server +import os +import re + + +class RangeHandler(http.server.SimpleHTTPRequestHandler): + def end_headers(self): + # CORS + always-Accept-Ranges so a cross-origin data_base also works. 
+ self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Access-Control-Allow-Headers", "Range") + self.send_header("Access-Control-Expose-Headers", + "Content-Length, Content-Range, Accept-Ranges, ETag") + self.send_header("Accept-Ranges", "bytes") + self.send_header("Cache-Control", "no-cache") + super().end_headers() + + def do_GET(self): + rng = self.headers.get("Range") + path = self.translate_path(self.path) + if rng and os.path.isfile(path): + m = re.match(r"bytes=(\d*)-(\d*)\s*$", rng) + if m: + size = os.path.getsize(path) + start = int(m.group(1)) if m.group(1) else 0 + end = int(m.group(2)) if m.group(2) else size - 1 + end = min(end, size - 1) + if start > end: + self.send_error(416, "Requested Range Not Satisfiable") + return + length = end - start + 1 + self.send_response(206) + self.send_header("Content-Type", self.guess_type(path)) + self.send_header("Content-Range", f"bytes {start}-{end}/{size}") + self.send_header("Content-Length", str(length)) + self.end_headers() + with open(path, "rb") as f: + f.seek(start) + self.wfile.write(f.read(length)) + return + super().do_GET() + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--dir", default="docs") + ap.add_argument("--port", type=int, default=8099) + args = ap.parse_args() + os.chdir(args.dir) + httpd = http.server.ThreadingHTTPServer(("", args.port), RangeHandler) + print(f"Range-capable dev server: http://localhost:{args.port} (serving ./{args.dir})") + print(" Range check: curl -r 0-99 -i " + f"http://localhost:{args.port}/data/isamples_202601_samples_map_lite.parquet") + httpd.serve_forever() + + +if __name__ == "__main__": + main() diff --git a/explorer.qmd b/explorer.qmd index 55294728..60a95f0e 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -680,7 +680,17 @@ Cesium.Ion.defaultAccessToken = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOi //| output: false // === Constants === -R2_BASE = "https://data.isamples.org" +// DEV: override the data origin to a 
fast local parquet mirror so the +// verify loop doesn't pay 40-90s of remote range-fetch per cold load. +// ?data_base=/data (same-origin mirror under docs/data, served +// by a RANGE-capable server — stock python +// http.server returns 200 not 206, breaking +// DuckDB-WASM partial reads; use dev_server.py) +// localStorage ISAMPLES_DATA_BASE (sticky per-browser override) +// Defaults to the production R2 origin, so shipped builds are unchanged. +R2_BASE = new URLSearchParams(location.search).get('data_base') + || (typeof localStorage !== 'undefined' && localStorage.getItem('ISAMPLES_DATA_BASE')) + || "https://data.isamples.org" h3_res4_url = `${R2_BASE}/isamples_202601_h3_summary_res4.parquet` h3_res6_url = `${R2_BASE}/isamples_202601_h3_summary_res6.parquet` h3_res8_url = `${R2_BASE}/isamples_202601_h3_summary_res8.parquet` @@ -2566,6 +2576,7 @@ zoomWatcher = { // Stale guard: discard if a newer point load OR a newer search // generation superseded this one (A1 #234 Step 4). if (isStaleLoad()) { + window.a1dbg?.('point-load-discard', { myReqId, requestId, mySearchToken, curToken: window.__searchFilter?.token }); console.log(`Discarding stale sample response (req ${myReqId}, current ${requestId})`); return; } @@ -2612,6 +2623,7 @@ zoomWatcher = { ? `${totalCount.toLocaleString()} samples in view (showing ${samples.length.toLocaleString()} — zoom in for more). Click one for details.` : `${samples.length.toLocaleString()} individual samples. Click one for details.`; updatePhaseMsg(phaseMsg, 'done'); + window.a1dbg?.('point-load-render', { rendered: samples.length, totalCount, searchActive: searchIsActive(), searchFiltered: !!searchFilterSQL('pid') }); console.log(`Point mode: rendered ${samples.length} of ${totalCount} samples in ${elapsed.toFixed(0)}ms${capReached ? ' (cap reached)' : ''}`); } catch(err) { @@ -2684,6 +2696,7 @@ zoomWatcher = { // #facetNote is only meaningful in cluster mode (#234 step 1). 
syncFacetNote(); console.log('Entered point mode'); + window.a1dbg?.('mode-change', { to: 'point', searchActive: searchIsActive() }); await loadViewportSamples(); } @@ -2714,7 +2727,8 @@ zoomWatcher = { updateStats(`H3 Res${currentRes}`, viewer.h3Points.length, '—', '—', 'Clusters Loaded', 'Samples Loaded'); } updatePhaseMsg(`${inView.clusters.toLocaleString()} clusters in view. Zoom closer for individual samples.`, 'done'); - console.log('[A1dbg] Exited point mode (searchActive=' + searchIsActive() + ')', new Error().stack?.split('\n').slice(1,4).join(' | ')); + window.a1dbg?.('mode-change', { to: 'cluster', searchActive: searchIsActive(), via: new Error().stack?.split('\n')[2]?.trim() }); + console.log('Exited point mode'); } // --- Boot→point-mode transition (issue #190 fix 2) --- @@ -3532,7 +3546,7 @@ zoomWatcher = { // active (its targetMode is overridden below), so panning/zooming during // a search won't bounce back to clusters. async function applySearchFilterChange() { - console.log('[A1dbg] applySearchFilterChange entry: active=', searchIsActive(), 'mode=', getMode()); + window.a1dbg?.('apply-search-change', { active: searchIsActive(), mode: getMode() }); busyAcquire(); try { syncFacetNote(); @@ -4019,10 +4033,51 @@ zoomWatcher = { (window.__searchFilter && window.__searchFilter.active) ? ` AND ${pidCol} IN (SELECT pid FROM search_pids)` : ''; + + // DEV: deterministic A1 observability. console capture races the + // automation harness; instead a1dbg() appends to window.__a1log and + // stamps window.__a1state[event] so a test can POLL state (and read + // an optional on-page panel via ?debug=a1) rather than chase console. 
+ if (!window.a1dbg) { + window.__a1log = []; + window.__a1state = {}; + let _a1panel = null; + if (new URLSearchParams(location.search).get('debug') === 'a1' && document.body) { + _a1panel = document.createElement('div'); + _a1panel.id = 'a1DebugPanel'; + _a1panel.style.cssText = 'position:fixed;bottom:0;right:0;width:520px;max-height:42vh;overflow:auto;background:#111;color:#3f3;font:11px/1.35 monospace;z-index:99999;padding:4px;opacity:.92'; + document.body.appendChild(_a1panel); + } + window.a1dbg = (event, data = {}) => { + const row = { t: Math.round(performance.now()), event, ...data }; + window.__a1log.push(row); + window.__a1state[event] = row; + if (_a1panel) { + const line = document.createElement('div'); + line.textContent = `[A1] +${row.t}ms ${event} ${JSON.stringify(data)}`; + _a1panel.appendChild(line); + _a1panel.scrollTop = _a1panel.scrollHeight; + } + }; + } + // Globe-state accessor for the Playwright harness — exposes just what + // a coherence assertion needs, not the whole Cesium viewer. Lazily + // reads `viewer` (in this cell's scope) so it's safe to define here. + window.__a1globe = () => { + try { + return { + mode: viewer._globeState.mode, + samplePointsLen: viewer.samplePoints.length, + samplePointsShown: viewer.samplePoints.show, + h3PointsShown: viewer.h3Points.show, + }; + } catch (e) { return { error: String(e) }; } + }; } async function buildSearchFilter(terms, term) { const token = ++_searchFilterToken; + window.a1dbg?.('search-build-start', { term, token }); const searchWhere = textSearchWhere(terms, [ 'label', 'description', @@ -4043,6 +4098,7 @@ zoomWatcher = { const total = cnt.length ? 
Number(cnt[0].n) : 0; if (token !== _searchFilterToken) return false; window.__searchFilter = { active: true, term, token, total }; + window.a1dbg?.('search-build-end', { term, token, total }); return true; } @@ -4127,7 +4183,7 @@ zoomWatcher = { // Otherwise push the freshly-built filter through every surface: table, // points, facet counts, and the globe mode (C3 — switch to filtered // point dots, since clusters can't be text-filtered). - console.log('[A1dbg] post-build: searchId=', searchId, '_searchSeq=', _searchSeq, 'active=', window.__searchFilter?.active); + window.a1dbg?.('post-build', { searchId, searchSeq: _searchSeq, active: window.__searchFilter?.active, total: window.__searchFilter?.total }); if (searchId === _searchSeq) { await applySearchFilterChange(); } From 91a944cc27d3243335c2ca59a27f447dc493edc6 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 29 May 2026 22:30:07 -0700 Subject: [PATCH 06/23] dev: A1 Playwright verify harness (condition-based; asserts table+globe coherence) Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/playwright/a1-verify.mjs | 78 ++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 tests/playwright/a1-verify.mjs diff --git a/tests/playwright/a1-verify.mjs b/tests/playwright/a1-verify.mjs new file mode 100644 index 00000000..0b6af241 --- /dev/null +++ b/tests/playwright/a1-verify.mjs @@ -0,0 +1,78 @@ +// A1 (#234 Step 4) deterministic verify harness. +// +// Condition-based (no fixed sleeps), against a LOCAL parquet mirror so the +// loop is fast and repeatable. Run pattern: +// +// 1. mirror parquets once: ls docs/data/*.parquet (see SESSION_SUMMARY) +// 2. python3 dev_server.py --dir docs --port 8099 (RANGE-capable; 206) +// 3. 
node tests/playwright/a1-verify.mjs (or via @playwright/test) +// +// The big win is LOAD ONCE, then drive searches IN-PAGE: cold init is +// ~40s (DuckDB-WASM+Cesium+OJS, init-dominated, mirror can't help), but each +// in-page search then hits the local mirror fast. So this script pays init +// once and can loop searches via page.evaluate without reloading. +// +// Asserts the A1 coherence invariant: when a search is committed, the TABLE, +// the globe MODE (must be 'point'), and the rendered sample points all +// reflect the search — not unfiltered clusters. + +import { chromium } from 'playwright'; + +const BASE = process.env.A1_BASE + || 'http://localhost:8099/explorer.html?data_base=/data&debug=a1&sources=OPENCONTEXT%2CGEOME%2CSMITHSONIAN'; +const TERM = process.env.A1_TERM || 'bucchero'; +// Boot at high altitude (cluster) WITHOUT a search, so we test C3 forcing +// point mode from cluster via an in-page search (the failing case). +const URL = `${BASE}#v=1&lat=43.15&lng=11.40&alt=9000000`; + +const browser = await chromium.launch({ headless: false }); +const page = await browser.newPage(); +page.on('console', (m) => { if (/A1|point mode|Discarding/.test(m.text())) console.log(' page>', m.text()); }); + +console.log('Loading (cold init ~40s)…', URL); +await page.goto(URL, { waitUntil: 'domcontentloaded' }); + +// Wait for the OJS graph + DuckDB to be live (search machinery installed). +await page.waitForFunction( + () => typeof window.a1dbg === 'function' && !!window.__a1globe && !!document.querySelector('#sampleSearch'), + null, { timeout: 180_000 }); +console.log('App live. Boot mode:', await page.evaluate(() => window.__a1globe?.())); + +// Drive a search IN-PAGE (type + Enter on the map search input). +await page.fill('#sampleSearch', TERM); +await page.press('#sampleSearch', 'Enter'); + +// Wait on the filter actually building (condition, not sleep). 
+await page.waitForFunction( + (t) => window.__searchFilter?.active === true && window.__searchFilter?.term === t && window.__searchFilter?.total > 0, + TERM, { timeout: 120_000 }); + +// Wait for the globe to settle into point mode with filtered dots. +await page.waitForFunction(() => { + const g = window.__a1globe?.(); + return g && g.mode === 'point' && g.samplePointsShown === true && g.samplePointsLen > 0; +}, null, { timeout: 60_000 }).catch(() => console.log(' !! globe did NOT reach filtered point mode')); + +const state = await page.evaluate(() => ({ + search: window.__searchFilter, + globe: window.__a1globe?.(), + tableMeta: document.getElementById('tableMeta')?.textContent?.trim(), + a1log: window.__a1log, +})); + +console.log('\n=== RESULT ==='); +console.log(JSON.stringify(state, null, 2)); + +const ok = + state.search?.active === true && + state.globe?.mode === 'point' && + state.globe?.samplePointsShown === true && + state.globe?.h3PointsShown === false && + state.globe?.samplePointsLen > 0 && + state.globe?.samplePointsLen <= state.search?.total; + +console.log(ok ? '\n✅ A1 COHERENT: table + globe both filtered to the search.' + : '\n❌ A1 INCOHERENT: see globe.mode / samplePoints above.'); + +// Keep the browser open for manual poking unless A1_CLOSE=1. +if (process.env.A1_CLOSE) await browser.close(); From cba69ac74ee2626777799183101b28a3bd212279 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sat, 30 May 2026 07:06:34 -0700 Subject: [PATCH 07/23] docs: A1 session handoff + scoping (SESSION_SUMMARY, A1_SCOPING) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Working handoff docs for the search-as-global-filter (A1, #234 Step 4) work — branch state, the globe logjam + Codex's reconciler spec, the fast verify-loop, the performance model, and Eric #248 / #249. Strip before the A1 PR. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- A1_SCOPING.md | 129 +++++++++++++ SESSION_SUMMARY.md | 465 ++++++++++----------------------------------- 2 files changed, 230 insertions(+), 364 deletions(-) create mode 100644 A1_SCOPING.md diff --git a/A1_SCOPING.md b/A1_SCOPING.md new file mode 100644 index 00000000..d1292299 --- /dev/null +++ b/A1_SCOPING.md @@ -0,0 +1,129 @@ +# A1 Scoping — "search as a global filter" (#234 Step 4) + +Goal: when a free-text search is committed, **every count/where surface reflects `searchTerm ∩ viewport ∩ source/facet filters`**, not just the side-panel results list. Currently search only drives a side list + camera fly; the table, point loader, facet counts, stats, and globe ignore it (axis A2). #250 (interim) only relabels the table; this is the real fix. + +## 1. Data-model reality (the constraint that shapes the design) + +The search predicate (`textSearchWhere`) matches **3 columns**: `label`, `description`, `CAST(place_name AS VARCHAR)` — all read from **`facets_url`** (`sample_facets_v2.parquet`). `description` was deliberately moved to facets_url in #168 *because* the search needed it. + +| Surface | Function | Reads from | Has search cols? 
| +|---|---|---|---| +| Samples table (count + page) | `loadCount` / `loadPage` | `lite_url` (samples_map_lite) | **label + place_name only — NO `description`** | +| Point-mode dots | `loadViewportSamples` | `lite_url` | NO description | +| Facet legend counts | `updateCrossFilteredCounts` | `facets_url` (global) **or** `facets_url ⋈ lite_url` (bbox path) | **YES — facets_url has all 3** | +| Facet counts cube fast-path | (cross_filter parquet) | `cross_filter_url` (pre-aggregated) | **NO — globally pre-aggregated, cannot be text-filtered** | +| Globe clusters | H3 summary parquets | `*_h3_summary_res{4,6,8}` | **NO — pre-aggregated, `dominant_source` only** | +| Heatmap | wide/lite | — | NO description | +| "Samples in View" stat | derived from above | — | follows its surface | + +Two hard truths: +1. **`description` is only in facets_url.** Any surface that queries `lite_url` (table, points) must reach facets_url to get full search parity — i.e. a JOIN — or accept reduced recall (label+place_name only). +2. **Pre-aggregated surfaces can never be text-filtered** (cube fast-path, H3 clusters). They must be *gated off* (cube) or handled by a mode switch / honest warning (clusters) when a search is active. + +## 2. Two implementation strategies + +### Strategy A — per-surface ILIKE (the naive wiring) +AND `searchWhere` into every query; JOIN facets_url where description is needed (table, points). Reuse the existing B1 `facets ⋈ lite` JOIN shape. +- ✅ Minimal new concepts; reuses existing patterns. +- ❌ **Perf**: `ILIKE '%term%'` is a full scan (no index). On staging, *one* search query was **42s cold / 17s count** over facets_url (6.7M rows). A1-naive runs an ILIKE scan on **every** count surface (table count, table page, 4× facet dims, point loader, heatmap) on **every** camera move / filter toggle → ~7 cold ILIKE scans per interaction. Almost certainly unacceptable. This is exactly the worry in #234 OQ4 ("may want the BM25 substrate #168-172 first"). 
+ +### Strategy B — materialized search pid-set (RECOMMENDED) +Run the ILIKE **once per search term** to materialize the **set of matching pids** (e.g. bucchero → 2,693 pids), then constrain every other surface with a cheap **`pid` semi-join / `pid IN (…)`** against that held set. The ILIKE cost is paid once per *term change*, not per *interaction*; pan/zoom/facet-toggle just re-filter the held pid-set by bbox/facets (indexed-ish, cheap). +- ✅ Decouples A1 perf from interaction frequency — the expensive scan happens once. +- ✅ Works **without** the #168-172 BM25 substrate for moderate result sets. +- ✅ Single source of truth: same pid-set feeds table, points, facet counts, stats — guaranteed coherence. +- ⚠️ Materialize via a **registered DuckDB temp table** (`search_pids`) and semi-join, NOT a giant literal `IN (…)` — broad terms (`pottery` ≈ 7k+, or worse) make the literal unwieldy; a temp table scales and keeps SQL clean. +- ⚠️ Recompute the pid-set when the term changes; invalidate/drop on clear. The set is "all matching pids" (no LIMIT 50 — that cap is only for the side list). +- ⚠️ Very broad terms (e.g. single common word) could match millions of pids → temp table large but still a bounded one-time cost; semi-join stays cheap. Worst case is comparable to today's no-search counts. + +**Recommendation: Strategy B.** It's the design that makes A1 shippable on the current parquets and naturally coherent. The BM25 substrate (#168-172) then becomes a *latency optimization* for the one-time pid-set computation, not a prerequisite. + +## 3. Per-surface changes (Strategy B) + +1. **Materialize pid-set** (new): on committed search, `CREATE OR REPLACE TEMP TABLE search_pids AS SELECT pid FROM read_parquet(facets_url) WHERE ${searchWhere}`. Expose readiness via the existing cross-cell channel (extend `window.__explorerActiveSearch` → also a `searchPidsReady` flag / token). Drop/disable when search cleared. +2. 
**Table** `loadCount` / `loadPage` (`lite_url`): add `AND pid IN (SELECT pid FROM search_pids)` (semi-join) when search active. No description JOIN needed — the pid-set already encodes the description match. +3. **Point loader** `loadViewportSamples` (`lite_url`): same semi-join predicate. +4. **Facet counts** `updateCrossFilteredCounts`: add the semi-join to BOTH the global and bbox paths; **gate off the cube fast-path** when search active (like bbox already gates it). facets_url path already has the columns; semi-join keeps it uniform. +5. **"Samples in View" stat**: follows the surface it's derived from — recheck both cluster-mode and point-mode stat computations use the filtered count. +6. **Globe**: clusters (H3) can't be filtered. Adopt #234's **C3-when-feasible**: when a search is active, prefer point mode (points ARE filterable via semi-join); if over the density cap, keep clusters + **prominent "showing clusters — not filtered by your search" warning** (reuse the `#facetNote` honesty pattern). *Proposed: defer full C3 to a follow-up; in the first A1 PR, show the honest warning + keep #250's panel pointer.* +7. **Heatmap**: filter-honest density should also semi-join. *Proposed: include if cheap, else defer with a tracked note.* + +## 4. Cross-cell state & staleness + +- Already have `window.__explorerActiveSearch` (term) from #250. Add the temp-table lifecycle + a `searchToken` so surfaces can detect a superseded search. +- Reuse existing cancellation primitives: `pageGen` (table), `requestId` (points), `facetCountsReqId` (facets). Each must re-read the current search state on every async resume (same stale-guard pattern already in place). +- Order: materialize pid-set BEFORE kicking the dependent refreshes; `window.refreshSamplesTable?.()` + `refreshFacetCounts()` already exist as hooks. + +## 5. Progressive refinement (perf UX, optional in v1) + +Per #234, surfaces can show a coarse/stale value during active panning (`.recomputing` italic) then settle. 
With Strategy B the per-pan cost is already just a semi-join, so progressive refinement is likely **not needed for v1** — revisit only if semi-join + bbox on lite is still janky cold. + +## 6. Edge cases to honor + +- Search cleared / <2 chars → drop temp table, revert all surfaces to non-search (the #250 flag-clear path already exists; extend it). +- Term changed mid-flight → token invalidates the old pid-set; rebuild. +- Coordinate-less matches → counted by facet counts (facets_url) but absent from map/table (lite has no row). Decide: does the table count match the side-panel total? Likely NOT (table requires coords). **Must reconcile the messaging** so "N in view" vs "2,693 results" don't reintroduce confusion. +- Area-scope vs world-scope search: A1 makes scope less meaningful (the whole page is filtered); confirm the scope toggle still behaves. +- `escSql` / injection: the pid-set query reuses the existing escaped `searchWhere`; the semi-join carries no user input. + +## 7. Proposed scope split + +- **A1 PR #1 (core):** materialize pid-set; wire table (count+page), point loader, facet counts (+ gate cube), stats; honest cluster warning. This delivers "table/points/legend reflect bucchero ∩ viewport". +- **A1 PR #2 (follow-up):** C3 auto-point-mode promotion; heatmap semi-join; progressive refinement if needed. +- **Substrate (#168-172):** optional latency win for the one-time pid-set scan; not a blocker for PR #1. + +## 8. Open questions for review + +1. **Strategy B temp-table semi-join** — is a `CREATE TEMP TABLE search_pids` + `pid IN (SELECT …)` the right DuckDB-WASM pattern, vs a registered Arrow table or a literal IN-list? Any WASM-specific gotcha (temp table lifetime across queries in the same connection)? +2. **Coordinate-less matches** — how to keep "samples in view" vs "2,693 results" from re-confusing users once the table IS search-filtered? (The table can only ever show coord-bearing matches.) +3. 
**Cluster honesty in v1** — is "warn + keep #250 panel pointer" enough for the first A1 PR, or must C3 (auto-point) land together so the globe isn't visibly unfiltered while everything else is? +4. **Is the one-time ILIKE scan acceptable** at ~17-42s cold for the first search, or does even the *one-time* cost demand #168-172 first? (A warm cache makes subsequent searches fast; cold-first-search is the concern.) +5. **Broad-term blow-up** — any term matching millions of pids: temp table size / semi-join cost acceptable, or cap + warn? + +--- + +# Codex review resolutions (incorporated) — verdict: PROCEED-WITH-CHANGES + +Strategy B confirmed as the right direction. Required changes folded into the plan: + +### Strategy B hardening +- Materialize as `CREATE OR REPLACE TEMP TABLE search_pids_next AS SELECT DISTINCT pid FROM read_parquet(facets_url) WHERE pid IS NOT NULL AND <searchWhere>` — **DISTINCT + NOT NULL** (facets_url is facet-shaped → duplicate pids are real and would corrupt any join form). +- **Token-versioned / atomic swap**: build `search_pids_next`, then swap to `search_pids` only if the build's token is still current. A fixed name + async UI refreshes is race-prone. +- **Text-only pids** — do NOT bake source/facet/viewport into `search_pids`. Materialize on the term alone; apply source/facet/bbox downstream. This keeps a term change (rebuild) separate from a filter toggle (cheap re-filter). +- **Keep it inside DuckDB** — a registered Arrow table is worse here (extra JS↔WASM copies + browser memory); the pid set is produced by SQL, so leave it there unless measurement says otherwise. +- **Dev/startup assertion** (do FIRST, before building on it): create a temp table, query it from a *second* `db.query()`, drop it — proves the Observable `DuckDBClient.of()` wrapper reuses one session so the temp table survives across `db.query()` calls. Don't discover this through A1 behavior.
+- **Measure before committing**: `EXPLAIN` / time `pid IN (SELECT …)` vs explicit `SEMI JOIN` / `EXISTS`. "Cheap" only holds vs repeated text scans — for million-row matches it's "scan + hash-probe," ~unfiltered-count + overhead, not free. + +### Coordinate-less matches → first-class UX (not a footnote) +The table can only ever show coord-bearing matches, so it will NOT equal the global text-match count. Use **two named counts** instead of one: +- `2,693 text matches` (the search-results line) +- `N mappable samples in this map view matching "bucchero"` (the table meta), with `…of 2,693 total text matches (some have no coordinates)` when they differ. +This **replaces** #250's interim `summaryText()` disclaimer. + +### Globe coherence — C3 moves INTO PR #1 (the coherence line) +Do NOT ship A1 with the dominant map layer (clusters) visibly unfiltered while table/legend are filtered — that's an internally contradictory half-state, arguably worse than today. PR #1 MUST: when search is active, **auto-promote to point mode if filtered-in-view count < point budget** (points are filterable via the semi-join); only **over the density cap** fall back to clusters + the prominent "showing clusters — not filtered by your search" warning. + +### Revised scope split +- **PR #1 (A1 core):** dev assertion → pid-set materialization (versioned) → table (count+page) + point loader + facet counts (+ gate cube) + stats + **C3 auto-point-mode** + two-named-counts UX. +- **PR #2:** heatmap semi-join (or clearly disable/label heatmap while search active in PR #1), progressive refinement (likely unneeded with Strategy B), substrate-backed pid-set builder. + +### Substrate (#168-172) +Not *logically* required — the pid-set is the abstraction boundary BM25 can later sit behind. BUT a 17–42s cold first search may be a product blocker. **Gate PR #1 on measured cold browser perf** for 3–5 representative terms incl. a broad/common term and a no-match term. 
If typical cold is still tens of seconds → ship only with explicit "Building search filter…" progress UI + cancellation, OR do the substrate first. + +### Also add +- Search token in **every** stale guard (alongside `pageGen` / `requestId` / `facetCountsReqId`). +- **Broad-term policy**: warn / require refinement / "too many matches to render globally" fallback. +- **Update `EXPLORER_STATE.md`** — A1 changes the page's state contract, not just implementation. + +--- + +# A1 gating probe — measured against live parquets (DuckDB v1.4.0, 202601 data) + +| Finding | Result | Impact on plan | +|---|---|---| +| **Dup-pid risk** | facets = 5,980,282 rows = 5,980,282 **distinct** pids (pid is unique). lite = same 5,980,282. | `DISTINCT` is harmless hygiene but pid is already unique — no corruption risk. | +| **Coordinate-less matches** | bucchero 2,693 text → **2,693** coord-bearing; pottery 82,312 → **82,312**; soil 2,969 → **2,969**. facets pid-set ⊆ lite pid-set (identical counts). | **Codex concern #2 largely dissolves**: in the current data *every* searchable sample has coordinates. Table count = (text ∩ viewport); globally text-matches == mappable. UX simplifies to **"N of 2,693 matches in this view"** — no scary "some have no coordinates" caveat needed (keep a defensive branch, but it's not the common case). | +| **Broad-term magnitude** | pottery (broadest realistic) = **82k** pids, not millions. soil = 3k. | Temp-table + semi-join over ~10^4–10^5 pids is trivial. Broad-term-blowup is a non-issue for real terms; still cap a pathological single-letter term. | +| **ILIKE scan cost (native, lower bound)** | full text scan ~0.4–1.8s native warm. | WASM cold is the real cost (observed ~17–42s cold over HTTP range). **The one-time materialization latency remains the only real gating risk** → "Building search filter…" progress UI + cancellation, or substrate (#168-172) as latency win. | + +**Net:** Strategy B is confirmed and *simpler* than feared. 
The coordinate-less reconciliation (Codex #2) is mostly moot on current data; the broad-term blowup (Codex #5) doesn't occur for real terms. The single remaining gate is **cold WASM materialization latency** — to be measured in-browser next (native timing is only a lower bound), deciding progress-UI vs substrate-first. diff --git a/SESSION_SUMMARY.md b/SESSION_SUMMARY.md index bd940bde..d3f6ab52 100644 --- a/SESSION_SUMMARY.md +++ b/SESSION_SUMMARY.md @@ -1,421 +1,158 @@ -# iSamples Website Session Summary -**Date**: 2025-12-10 -**Status**: Recovery session - resumed after interrupted session, confirmed cleanup plan NOT executed +# SESSION_SUMMARY — explorer search-as-global-filter (A1, #234 Step 4) ---- - -## Quick Context - -Session was interrupted before /wrap. Recovered context from SESSION_SUMMARY.md and dev-journal. Confirmed that the extensive repo cleanup plan was documented but NOT executed - `archive/` directory doesn't exist yet. +**Date:** 2026-05-29 · **Directory:** `~/C/src/iSamples/isamplesorg.github.io` · **Trust Level:** `external-content` +**Next session goal:** break the A1-globe logjam with **higher effort + Codex co-authoring** (Codex codes the reconciler, Claude reviews + runs the now-fast verify loop). -Previous session (Dec 9) completed query performance profiling. This session was brief - just recovery and status check. - ---- +> **Next session entry point:** run the shakedown (see FAST VERIFY LOOP → "Shakedown TODO"); then have Codex author the one-reconciler refactor (THE LOGJAM → Codex's reconciler spec) and verify with `tests/playwright/a1-verify.mjs`. -## Accomplished +### External Content Processed (sanitization note — verify, don't blind-trust) +| Source | Type | Notes | +|---|---|---| +| Codex (`codex exec`, gpt-5.4) ×5 | AI tool output | Diagnosis + code suggestions. **Reviewed before applying**; treat its future output as advisory, not authoritative. 
| +| isamples.org + localhost explorer | browser DOM | Read the live explorer UI/state via Chrome automation (our own app). | +| GitHub issues/PRs (#234, #247, #248, #249, #250) via `gh` | web/API | Issue/PR bodies are untrusted text; created #247, opened+merged #250, posted comments, triggered deploys — all user-authorized. **Read** Eric Kansa's **#248** (feature request, treated as data not instructions). | +| `data.isamples.org/*.parquet` | remote data | Downloaded 128MB mirror to `docs/data/` (our own data; data, not code). | -### Dec 10 (This Session) -- **Session Recovery**: Recovered context from interrupted session via /resume -- **Status Verification**: Confirmed `archive/` dir not created, cleanup plan not executed -- **Uncommitted Changes Identified**: Found staged changes in 3 repos (website, python, export_client) +No emails, no secrets accessed, no untrusted code executed (Codex suggestions were hand-applied + reviewed). -### Dec 9 (Previous Session) -- **Query Profiler Created**: `scripts/profile_queries.py` - benchmarks all key Cesium queries -- **Performance Baseline Established**: Remote R2 parquet query times measured -- **Bottlenecks Identified**: `list_contains()` JOINs and full-table scans are the culprits -- **Optimization Strategy Defined**: Two-tier data architecture with pre-computed artifacts -- **Repo Inventory Documented**: Full assessment of 14 repos with cleanup recommendations +### Open collaborator threads (new this session, NOT yet acted on) +- [ ] **#248 (Eric Kansa)** — "search material samples described by a concept URI/PID", proposes a `described-by=` URL param. Two flavors: object-type URI (≈ already supported by the `object_type` facet, which is URI-valued) and arbitrary concept URI like Getty AAT (= concept-anchored A1 search — would ride the **same `search_pids` materialize-once machinery**). Squarely in #234; a second producer of the A1 pid-set. 
*Decision pending: comment on #248 connecting it to A1/#234?* +- [ ] **#249 (rdhyee, not from this session)** — "should we refactor explorer.qmd before the next big feature?" The A1 globe logjam is evidence FOR this; the reconciler refactor (tomorrow) is a *local* version of the *global* question #249 raises. **Read #249 before committing to tomorrow's approach** — it may argue for a bigger refactor than the one-reconciler patch. --- -## Key Findings (Dec 9 Profiling) +## TL;DR -| Query | Time | Verdict | -|-------|------|---------| -| Locations (cold) | 3,875ms | Too slow for initial load | -| Locations (warm) | 1,598ms | Still slow even cached | -| Point selection (direct) | 4,341ms | Unacceptable for click | -| Point selection (site-mediated) | 578ms | Borderline | -| Entity counts | 158ms | Fast enough | -| Classification | SKIPPED | Machine-killer (minutes+, GB memory) | - -**Root Causes:** -1. **Locations**: Scanning 19.5M rows for 5.98M geocodes, returning 47 columns when 3 needed -2. **Point selection**: `list_contains()` on arrays requires full table scan - no index -3. **Classification**: LEFT JOINs with `list_contains()` = exponential complexity +1. **Shipped to production** (isamples.org): bug **#247** filed + interim honesty fix **PR #250** (merged). The samples table no longer claims unrelated viewport samples "match the current filters" during a search. +2. **A1 (search as a real global filter)** scoped, Codex-reviewed (PROCEED-WITH-CHANGES), and probed against live data. Branch `feat/search-global-filter-a1`. + - ✅ **Table surface filters correctly** (e.g. `bucchero` → "2,693 of 2,693 matches in this map view", OpenContext rows only). + - ✅ Facet counts + cube-gating wired; pid-set machinery + persistence proven. + - ❌ **GLOBE still won't enter point mode** on a committed search (table filters, but the map stays unfiltered clusters). This is the logjam. +3. 
**Built a fast/deterministic verify-loop** (local parquet mirror + range server + `window.__a1state`/`__a1globe` observability + Playwright harness) so tomorrow's iteration isn't 40–90s/cycle. Range-verified; full speedup run still needs a shakedown. --- -## Generated Files - -| File | Description | Keep/Regenerate | -|------|-------------|-----------------| -| `scripts/profile_queries.py` | Query benchmarking tool | Keep | -| `/tmp/query_profile_results.txt` | Latest profiling output | Regenerate | - -### From Previous Session (Dec 6) -| File | Description | Keep/Regenerate | -|------|-------------|-----------------| -| `/tmp/zenodo_narrow_strict.parquet` | Narrow PQG (709MB) | Keep - on R2 | -| `/tmp/zenodo_wide_strict.parquet` | Wide PQG (242MB) | Keep - on R2 | -| `~/.claude/skills/gemini/SKILL.md` | Gemini skill doc | Keep | - ---- +## Branch & commits -## Public URLs +`feat/search-global-filter-a1` (off `upstream/main` which already has #250): +- `204d2df` table surface (pid-set + semi-join + summaryText) +- `936f1f3` points/facets/cube-gate/C3 wired — globe buggy +- `4e79830` Codex's C3 fixes (moveEnd latch, awaitable enterPointMode, search-token staleness) — globe STILL not entering point mode +- `62d5500` dev verify-loop infra (mirror support + dev_server.py + a1dbg/__a1state/__a1globe) -- **Wide**: `https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202512_wide.parquet` -- **Narrow**: `https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202512_narrow.parquet` +Production (already merged, do NOT redo): upstream `a4da97b` (#250). --- -## Next Steps (Prioritized) - -### 0. 
Eric/Andrea Wrap-Up Plan (Dec 2025) - -**From Eric's email (confirmed Dec 10) - three-part plan:** - -#### Part 1: Archive Full PQG Export to Zenodo -- Use PostgreSQL dump Dave can recover -- Create comprehensive iSamples Central PQG export -- Archive to Zenodo for preservation - -#### Part 2: Simplified Parquet for Frontend (HIGH PRIORITY) -Requirements for parquet-powered iSamples Central: - -| Feature | Implementation Notes | -|---------|---------------------| -| **Global Cesium map** | Use H3 geohash (https://h3geo.org/) to aggregate locations for fast rendering | -| **Faceted filtering** | Facets with counts: object type, material type, collection | -| **Map updates on filter** | Filtering facets should update world map dynamically | -| **Click → sample table** | Point click shows sample records (like OpenContext demo) | -| **Links to source** | Sample results link back to home collections | -| **Full-text search** | Search updates world map (stretch goal?) | - -**Key insight from Eric**: May need even MORE denormalized parquet than "PQG wide" - specifically designed for these UI needs. - -#### Part 3: Visual Enhancements (Nice to have) -- Most records lack thumbnails -- Use collection logos as stand-ins -- Use NounProject icons (Eric has account) for sample object types -- Icons from: https://isamples.org/models/generated/vocabularies/material_sample_object_type.html - ---- +## The A1 design (Strategy B — agreed + Codex-approved) -### 1. Create Optimized Intermediary Artifacts (High priority, Medium risk) - -**Aligns with Eric's Part 2 - simplified parquet for frontend** - -**Recommended artifacts:** - -1. **`locations_h3.parquet`** (~1-5MB) - NEW based on Eric's suggestion - - H3 hexagonal aggregation at multiple resolutions - - h3_index, count, representative_lat, representative_lon - - For fast initial map render with clustering - -2. 
**`locations_summary.parquet`** (~5-10MB) - - Pre-filtered: pid, latitude, longitude, location_type - - Only 5.98M rows × 4 columns - - Target: <500ms initial load - -3. **`facets_precomputed.parquet`** (~1MB) - NEW for Eric's faceting - - Pre-aggregated counts by: object_type, material_type, collection - - Enables instant facet rendering - -4. **`location_samples_lookup.parquet`** (~50MB?) - - Pre-computed: geo_pid → [sample_pids, sample_labels, source_url] - - Eliminates `list_contains()` JOINs at query time - - Target: <100ms point selection - -5. Keep full wide parquet for detail drill-down only - -### 2. Page Consolidation (Low risk) -- Merge `parquet_cesium_wide.qmd` and `parquet_cesium_isamples_wide.qmd` -- Update to use optimized artifacts - -### 3. Public-Facing UI (Medium risk) -- Consider React SPA for production quality -- Features: Search, filter by source, map exploration, export -- Add collection logos/NounProject icons per Eric's suggestion - -### 4. Schema Enforcement (Low risk) -- Normalize `sample_identifier_col` → `sample_identifier` -- Add column order tests - -### 5. Deprecate iSamples Central API References (Medium priority, Low risk) - -**Goal**: Pivot fully to parquet workflows while preserving API code for potential future revival. 
- -**Strategy - "Soft Deprecation"**: -- Don't delete API client code - move to `_legacy/` or mark with deprecation warnings -- Update all tutorials/examples to use parquet-first patterns -- Add clear banners/callouts: "iSamples Central API is offline - using parquet archive" -- Keep API code importable but not in default examples - -**Repositories affected**: -| Repo | Action | -|------|--------| -| `isamples-python` | Mark `IsbClient`, `IsbClient2`, `ISamplesBulkHandler` as deprecated; keep in codebase | -| `isamplesorg.github.io` | Remove/archive API-dependent tutorials; focus on parquet demos | -| `pqg` | Already parquet-native - no changes needed | - -**Code preservation pattern**: -```python -# In isamples-python/src/isamples_client/isbclient.py -import warnings - -class IsbClient: - """ - DEPRECATED: iSamples Central API is offline as of 2025. - Use parquet workflows instead - see examples/basic/geoparquet.ipynb - - This class is preserved for potential future API revival. - """ - def __init__(self, ...): - warnings.warn( - "IsbClient is deprecated - iSamples Central API offline. " - "Use parquet workflows: examples/basic/geoparquet.ipynb", - DeprecationWarning, - stacklevel=2 - ) - ... -``` +On a committed search, `buildSearchFilter()` materializes a **non-temp** DuckDB table `search_pids` (one `ILIKE` scan over `facets_url`), then every surface constrains with a cheap semi-join `AND pid IN (SELECT pid FROM search_pids)`. State on `window.__searchFilter {active,term,token,total}`; predicate via `window.searchFilterSQL(col)`. 
-**Documentation updates**: -- README.md: Lead with parquet, mention API as "archived" -- CLAUDE.md: Already notes API offline - strengthen language -- Tutorials: Archive API-dependent ones, create new parquet-only versions - -**Parquet format focus** (per Eric's direction): -- PQG narrow format: Full fidelity, archival -- PQG wide format: Query-optimized, entity-centric -- Frontend-optimized: H3 aggregated, pre-computed facets (new) - -### 6. Repository Cleanup & Organization (Low priority, Low risk) - -**Inventory completed Dec 9, 2025** - Assessment of all iSamples repos: - -#### Active Repositories (keep as-is) -| Repo | Last Commit | 6-Mo Commits | Size | Notes | -|------|-------------|--------------|------|-------| -| `isamplesorg.github.io` | Dec 6 | 71 | 1.4G | Primary website, Cesium demos | -| `isamples-python` | Dec 4 | 30 | 997M | Python client, Jupyter examples | -| `pqg` | Dec 6 | 21 | 18G | Property graph framework | - -#### Maintained (keep, minimal changes expected) -| Repo | Last Commit | Notes | -|------|-------------|-------| -| `export_client` | Dec 5 | CLI for batch downloads | -| `isamplesorg-metadata` | Nov 14 | LinkML schemas, vocabularies | - -#### Legacy/Archive (candidates for `archive/` subdirectory) -| Repo | Last Commit | Size | Notes | -|------|-------------|------|-------| -| `isamples_inabox` | Feb 2023 | 19M | Original server (PostgreSQL/Solr/FastAPI) | -| `isamples_docker` | Mar 2022 | 340M | Docker deployment - obsolete | -| `isamples_docker_upstream` | Mar 2023 | 357M | Docker mirror - obsolete | -| `isamples-ansible` | Mar 2023 | 381M | Ansible deployment - obsolete | -| `noid-generation` | Oct 2023 | 168M | NOID identifier tool | -| `noid-1` | Oct 2021 | 372K | Original NOID Python port | -| `noidy` | Apr 2023 | 284K | NOID variant | -| `pynoid` | Apr 2023 | 192K | NOID alternative | -| `ezid` | May 2023 | 93M | EZID identifier service | -| `ezid-client-tools` | Jun 2023 | 1.6M | EZID client tools | -| `opencontext_rdhyee` | 
Mar 2023 | 373M | Exploratory OC work | - -#### Root-Level Files to Clean Up -**Keep (essential docs):** -- `CLAUDE.md`, `SESSION_SUMMARY.md` - Active guidance -- `EDGE_TYPE_FLOW.md`, `PQG_LEARNING_GUIDE.md` - Valuable reference - -**Archive/Delete (Oct 2025 scratch files):** -- `test_*.py`, `test_*.js` - Exploratory test scripts -- `*_output.txt` - Test outputs (regenerable) -- `find_pkap_geos.py`, `investigate_path1.py` - One-off scripts -- `package.json`, `node_modules/` - Minimal npm setup (not needed) -- `GEMINI.md` - Empty placeholder -- `IMPLEMENTATION_SUMMARY.md`, `BILLING_UPDATE.md`, `QUERY_COMPARISON.md`, `AGENTS.md` - Possibly stale - -**Suggested cleanup action:** -```bash -cd /Users/raymondyee/C/src/iSamples -mkdir -p archive -mv isamples_inabox isamples_docker isamples_docker_upstream isamples-ansible archive/ -mv noid-generation noid-1 noidy pynoid ezid ezid-client-tools archive/ -mv opencontext_rdhyee archive/ -# Consider: rm -rf node_modules package.json package-lock.json -``` +**Probe findings (de-risked the design):** pid is unique (no dup), facets ⊆ lite so **no coordinate-less matches** (table count == mappable matches — simple "N of M in view" copy), broadest realistic term ~82k pids (no million-row blowup). Full scoping + Codex resolutions in **`A1_SCOPING.md`**. -**Space recovery potential:** ~1.7GB from archiving legacy repos +Surfaces wired: `loadCount`/`loadPage` (table) ✅ verified; `loadViewportSamples` (points); `updateCrossFilteredCounts` (facet legend, + gate cube fast-path & global-baseline when search active); `summaryText` copy. --- -## Active File Analysis (Per-Repo Cleanup Plans) - -### 1. 
`isamplesorg.github.io` (1.4G total, 20M git) - -**Most Active Files (commits since Jun 2025):** -| File | Commits | Status | -|------|---------|--------| -| `tutorials/parquet_cesium.qmd` | 27 | ACTIVE - main Cesium demo | -| `_quarto.yml` | 9 | Config | -| `tutorials/zenodo_isamples_analysis.qmd` | 7 | ACTIVE | -| `index.qmd` | 6 | Homepage | -| `tutorials/parquet_cesium_wide.qmd` | 2 | ACTIVE - wide format demo | -| `tutorials/parquet_cesium_isamples_wide.qmd` | 1 | ACTIVE - full iSamples demo | - -**Space Hogs:** -- `assets/oc_isamples_pqg.parquet` - **691MB** (duplicated in docs/assets!) -- `docs/assets/` - 695MB (duplicate of assets/) - -**Cleanup Opportunities:** -```bash -# Remove duplicate parquet (use R2 URL instead) -rm assets/oc_isamples_pqg.parquet -# Or add to .gitignore and reference R2 URL in tutorials -``` +## THE LOGJAM (start here tomorrow) -**Files to consider archiving:** -- `PERFORMANCE_OPTIMIZATION_PLAN.md`, `OPTIMIZATION_SUMMARY.md`, `LAZY_LOADING_IMPLEMENTATION.md` - One-off planning docs +**Symptom:** search `bucchero` → table = "2,693 of 2,693 matches" (✅), but globe phaseMsg/stat stay **cluster** and `exitPointMode` runs. Even a clean **manual** search (not just boot) fails → it's a real state-machine bug, not a boot race. ---- +**Codex's diagnosis (correct, partially fixed in `4e79830`):** +1. ✅ FIXED — post-search `flyTo` lands at **200 km > EXIT_POINT_ALT (180 km)**, and the `moveEnd` handler exited point mode without checking `searchIsActive()`. Latched now. +2. ✅ FIXED — `enterPointMode` was fire-and-forget; now `async` + `await loadViewportSamples()`, awaited at all call sites. +3. ✅ FIXED — `loadViewportSamples` staleness was `requestId`-only; now also keys on the search token (`isStaleLoad()`). +4. ⏳ **NOT DONE — the actual remaining fix:** `applySearchFilterChange()` is a **parallel** mode-entry path racing the camera/mode machinery. 
Codex recommends **replacing it with ONE reconciler** that both the camera handler and search call, so "search forces point" and "altitude decides mode" live in one predicate with one set of staleness tokens. -### 2. `isamples-python` (997M total) - -**Most Active Files:** -| File | Commits | Status | -|------|---------|--------| -| `examples/basic/oc_parquet_analysis_enhanced.ipynb` | 13 | ACTIVE | -| `examples/basic/geoparquet.ipynb` | 5 | ACTIVE - main parquet demo | -| `examples/basic/isample-archive.ipynb` | 4 | ACTIVE | -| `README.md`, `CLAUDE.md`, `pyproject.toml` | 4 each | Config/docs | -| `src/isamples_client/isbclient.py` | 1 | API client (TO DEPRECATE) | - -**Space Hogs:** -- `examples/basic/oc_isamples_pqg.parquet` - **691MB** -- `examples/basic/oc_isamples_pqg_wide.parquet` - **275MB** - -**Cleanup Opportunities:** -```bash -# Add parquet files to .gitignore, document R2 URLs instead -echo "*.parquet" >> .gitignore -# Or keep one canonical copy and symlink +**Codex's reconciler spec (implement this):** +```js +async function reconcileGlobeForCurrentFilters(pushHistory = false) { + syncFacetNote(); + refreshHeatmap(); + if (searchIsActive()) { + if (getMode() !== 'point') await enterPointMode(pushHistory); + else await loadViewportSamples(); + } else { + // existing altitude-driven cluster/point behavior + } + refreshFacetCounts(); + window.refreshSamplesTable?.(); +} ``` +Call it from search completion AND the relevant camera paths; delete the bespoke `applySearchFilterChange` mini-state-machine. **Open question to nail with the new observability:** why does `enterPointMode` not stick on a manual search? (`[A1dbg]` events `apply-search-change`, `mode-change`, `post-build` will show the sequence — see below.) 
-**Files to consider archiving:** -- `PQG_INTEGRATION_PLAN.md`, `ISAMPLES_MODEL_ACTION_PLAN.md` - Planning docs (may be stale) -- `examples/spatial/` - Check if still relevant -- Multiple `*_output.txt` files +**Other bugs Codex flagged (not yet addressed):** +- Heatmap `renderHeatmap()` omits `searchFilterSQL` and `heatmapFilterHash()` omits the search token → heatmap (labeled "filtered density") stays unfiltered under search. (PR#2 or fix now.) +- Selection revalidation (`~L3457`) checks only source, not the search filter — clear/revalidate selection on search change. --- -### 3. `pqg` (18G total - **NEEDS ATTENTION**) - -**Most Active Files:** -| File | Commits | Status | -|------|---------|--------| -| `pqg/sql_converter.py` | 8 | ACTIVE - core converter | -| `pqg/pqg_singletable.py` | 4 | ACTIVE - main implementation | -| `README.md` | 4 | Docs | -| `pqg/typed_edges.py` | 2 | ACTIVE - typed edge support | -| `pqg/schemas/*.py` | 2 each | ACTIVE - schema validation | +## PERFORMANCE MODEL — why the UI hides the 40s, and what A1 does to it -**Space Hogs (CRITICAL):** -- `.git/` - **17GB** (likely large parquet commits in history) -- `.venv/` - 690MB (normal for DuckDB/PyArrow) +(RY's framing, 2026-05-29 — worth keeping front-of-mind for the substrate-vs-progress-UI call.) -**Cleanup Opportunities:** -```bash -# Check git history for large files -git rev-list --objects --all | git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' | sort -k3 -n -r | head -20 +The explorer never *feels* like a 40–90s app because the whole design is **"never fetch big data over a wide area."** Data is tiered by zoom, smallest-first, and the tiny tiers are **preloaded** (`explorer.qmd` L14–17: `<link rel="preload">` tags for h3 res4 + facet_summaries + vocab_labels): -# Consider: git filter-repo to remove large parquet files from history -# Or: fresh clone without history -``` - -**Root cause investigation needed:** Why is .git 17GB? Likely committed large parquet files that were later removed.
Likely committed large parquet files that were later removed. +| User action | Fetched | Size | Felt | +|---|---|---|---| +| Land on globe (zoomed out) | H3 res4 | **580 KB** (preloaded) | instant (`Load Time 0.4s`) | +| Zoom in / more | H3 res6 / res8 | 1.6 / 2.5 MB | fast | +| Zoom **deep** → point mode | `samples_map_lite` | 60 MB file… | **still fast** ↓ | ---- +The trick on that last row: by the time `samples_map_lite` (60 MB) is touched, the camera is deep (alt < `ENTER_POINT_ALT` 120 km), so the bbox is tiny. DuckDB-WASM does **HTTP range requests** and pulls only the parquet **row groups** overlapping that small bbox (a few MB), never the whole file. So the big files are only ever read in slivers. UX masking on top: instant res4 globe, phase messages, stale-while-loading (dimmed old rows). -### 4. `export_client` (58M total - clean) +**The two operations with NO spatial narrowing** (= the only ones that can hit the full 40s; both were what I kept triggering in dev): +1. **Free-text search** — `ILIKE '%term%'` over `label/description/place_name` across the *whole* `sample_facets_v2.parquet` (63 MB text). ILIKE can't skip row groups; it's a full column scan. Irreducible without an index. +2. **Samples table at a wide viewport** — `loadCount` over a world-sized bbox counts ~everything (normal users zoom in first, shrinking it). -**Most Active Files:** -| File | Commits | Status | -|------|---------|--------| -| `isamples_export_client/pqg_converter.py` | 4 | ACTIVE | -| `README.md` | 2 | Docs | +**The A1 implication (the load-bearing point):** A1 takes operation #1 — the single slowest thing in the app — and moves it to the **front of the common flow.** Today search is an optional side-panel lookup; A1 makes every committed search run that full 63 MB scan *first* and gates the filtered view on it. So A1 risks importing the one 40s wait into exactly the place the rest of the UI worked to avoid it. 
That's why: +- The **"Building search filter…"** affordance matters (honest masking, like the rest of the app). +- **BM25 substrate (#168–172)** is the thing that makes a *cold* search feel as snappy as zooming — NOT a correctness blocker (the pid-set abstraction works on plain ILIKE), but the perceived-perf fix. +- The **materialize-once** design is the mitigation: pay the un-narrowable full scan *one time* per term, then every pan/zoom/facet-toggle is a cheap `pid IN (…)` semi-join that DOES narrow spatially — folding search back into the fast tier after the first hit. -**Status:** Clean, well-organized. No cleanup needed. +(This also reframes the cold-load floor below: init ~40s is one thing, but the search scan is the *product-facing* slow path, and it's the one A1 must manage.) --- -### 5. `isamplesorg-metadata` (83M total - stable) +## FAST VERIFY LOOP (built today — use it tomorrow) -**Most Active Files:** -| File | Commits | Status | -|------|---------|--------| -| `src/docs/*.md` | 1 each | Documentation updates | +**Why today was slow:** every iteration was a cold reload. Cold cost is **init-dominated** — DuckDB-WASM (from CDN) + Cesium + the OJS reactive graph take ~40s **before any data query**, and the search `ILIKE` then downloaded ~60MB of text columns over the network. Console capture from the automation harness was also flaky. -**Status:** Foundational schema repo. Stable. No cleanup needed. - ---- +**The fix (set up, committed in `62d5500`):** +1. **Local parquet mirror** — `docs/data/*.parquet` (128MB, gitignored via `docs` + `*.parquet`). Re-fetch with: + `for f in isamples_202601_{samples_map_lite,sample_facets_v2,h3_summary_res4,h3_summary_res6,h3_summary_res8,facet_cross_filter,facet_summaries}.parquet vocab_labels.parquet; do curl -s -o docs/data/$f https://data.isamples.org/$f; done` + (⚠️ `current/wide.parquet` came back **0 bytes** — used only for sample-click detail; may be the cause of the init hang — investigate.) +2. 
**`R2_BASE` override** — load with `?data_base=/data` (or `localStorage.ISAMPLES_DATA_BASE`). Defaults to prod, so shipped builds are unchanged. +3. **Range-capable server** — `python3 dev_server.py --dir docs --port 8099`. **Stock `python3 -m http.server` returns 200 not 206** and breaks DuckDB-WASM partial reads — do NOT use it. Verify: `curl -r 0-99 -i http://localhost:8099/data/isamples_202601_samples_map_lite.parquet` → must be **206** (confirmed working). +4. **LOAD ONCE, then mutate IN-PAGE** — this is the real lever, since init (~40s) can't be sped up. Pay init once; then drive searches via the search box (or `page.fill`) without reloading. Each in-page search hits the local mirror (fast data). +5. **Deterministic observability** (replaces flaky console): `window.__a1log` (ordered events), `window.__a1state[event]` (latest), `window.__a1globe()` → `{mode, samplePointsLen, samplePointsShown, h3PointsShown}`. On-page panel via `?debug=a1`. Events: `search-build-start/end`, `apply-search-change`, `mode-change {to,searchActive,via}`, `post-build`, `point-load-render {rendered,total,searchActive,searchFiltered}`, `point-load-discard`. +6. **Playwright harness** — `tests/playwright/a1-verify.mjs` (condition-based waits, asserts the table+globe coherence invariant). `node tests/playwright/a1-verify.mjs` (needs `npm i -D playwright` / `npx playwright install chromium`). -## Priority Cleanup Actions +**Loop URL example:** +`http://localhost:8099/explorer.html?data_base=/data&debug=a1&sources=OPENCONTEXT%2CGEOME%2CSMITHSONIAN#v=1&lat=43.15&lng=11.40&alt=9000000` -### Immediate (High impact, low risk) -1. **pqg .git cleanup** - 17GB is excessive. Investigate and consider `git filter-repo` or fresh clone -2. **Remove duplicate parquets** - `assets/oc_isamples_pqg.parquet` duplicated in website repo - -### Short-term (Medium impact) -3. **Add `.gitignore` for parquet** - Reference R2 URLs instead of committing 691MB files -4. 
**Archive planning docs** - Move stale `*_PLAN.md` files to `archive/` in each repo - -### When convenient (Low priority) -5. **Clean root-level scratch files** - Test scripts, output files in `/Users/raymondyee/C/src/iSamples/` +**Shakedown TODO (tomorrow, first thing):** a full mirror load hung in init (~50s, zero `/data` fetches). Check whether the 0-byte `current/wide.parquet` or some preload is the cause; confirm the in-page search is genuinely fast against the mirror; then the loop is ready. --- -## Technical Notes - -### Profiler Usage -```bash -# Safe mode (skips classification query) -~/.pyenv/versions/myenv/bin/python scripts/profile_queries.py --remote-only +## Collaboration plan for tomorrow (agreed) -# Full mode (WARNING: high memory/CPU) -~/.pyenv/versions/myenv/bin/python scripts/profile_queries.py --full - -# Local only (if file downloaded) -curl -o /tmp/isamples_202512_wide.parquet https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202512_wide.parquet -~/.pyenv/versions/myenv/bin/python scripts/profile_queries.py --local-only -``` - -### Credentials & Tools -- **R2 Credentials**: Stored in 1Password, use `op run --env-file=...` pattern -- **Gemini CLI**: `/opt/homebrew/bin/gemini` -- **Codex CLI**: `/opt/homebrew/bin/codex exec "prompt" -o /tmp/output.txt` +Flip the loop for the reconciler refactor: **Codex authors** (it out-diagnosed Claude's debugging and designed the fix), **Claude reviews line-by-line + owns the runtime verify loop + git/PR/deploy**. Iterate: Codex edits → Claude renders + runs `a1-verify.mjs` / in-page → feeds `__a1log` back to Codex → repeat. Higher effort both sides. --- -## Blockers / Decisions Needed +## Cleanup before the A1 PR is opened (don't ship these) -1. **Artifact storage**: Upload optimized parquet files to R2? Or generate on-demand? -2. **Pre-compute strategy**: Run classification once during ETL vs compute lazily? -3. 
**Location type**: Should `location_type` be pre-computed (blue/purple/orange classification)? +- Remove the **`a1PersistenceProbe`** dev cell (right after the `db` cell) — persistence already proven. +- Decide on `a1dbg`/`__a1log`/`__a1state`/`__a1globe` + `?debug=a1` panel: gate behind a dev flag or strip. The `R2_BASE ?data_base=` override and `dev_server.py` are worth KEEPING (useful, safe defaults). +- The double-scan in `doSearch` (pid-set build + the existing LIMIT-50 side-panel query both scan facets) — follow-up: derive the side-panel list from `search_pids`. +- Heatmap + selection-revalidation search-awareness (above). --- -## Resume Checklist - -1. Read this SESSION_SUMMARY.md -2. Review profiling results: `/tmp/query_profile_results.txt` -3. Next action: Create `locations_summary.parquet` generation script -4. Public URLs above are live and working - ---- +## Key references -**Last Updated**: 2025-12-09 by Claude Code (Opus 4.5) -**Repository**: isamplesorg.github.io (fork at rdhyee/isamplesorg.github.io) -**Focus**: Query performance optimization, intermediary artifact design -**Next Action**: Generate optimized parquet artifacts -**Session Status**: IN PROGRESS +- `explorer.qmd` anchors: `buildSearchFilter`/`clearSearchFilter`/`applySearchFilterChange` (~L3534), `loadViewportSamples` (~L2510), `enterPointMode`/`exitPointMode` (~L2680/2700), camera `moveEnd` handler (~L3709), camera `changed` handler (~L3560), `summaryText`/`loadCount`/`loadPage` (tableView cell ~L2123), `R2_BASE` (~L683), a1dbg/`__a1globe` install (~L4028). +- `A1_SCOPING.md` — full scope + probe + Codex resolutions. +- `dev_server.py`, `tests/playwright/a1-verify.mjs` — the loop. +- Issues: #234 (umbrella, A1 = Step 4), #247 (the bug, interim fixed by #250), #168–172 (FTS substrate — optional latency win, NOT a blocker for A1). 
From c387f6bb9d47a155c7c85319e690e4993d2296f0 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sat, 30 May 2026 10:13:09 -0700 Subject: [PATCH 08/23] fix(explorer): resolve relative data_base override to absolute for DuckDB-WASM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ?data_base=/data dev override produced root-relative parquet URLs (/data/foo.parquet). DuckDB-WASM's httpfs reads those as a virtual-FS glob ("No files found that match the pattern") instead of fetching over HTTP, so the local-mirror verify loop hung in init with zero /data fetches — the "shakedown" symptom. Resolve a root-relative data_base against location.origin so the ergonomic ?data_base=/data form works; the prod default and absolute (http://...) overrides pass through unchanged. Verify-loop infra: - dev_server.py: pin HTTP/1.1 (DuckDB's range reader expects keep-alive; curl-verified 206 + multi-request keep-alive). Local full-GET-vs-206 is a DuckDB-WASM heuristic and moot over localhost; validate ranges on deploy. - tests/playwright/shakedown-206.mjs: headless boot+search probe (no popup). Confirms cold boot ~2.3s to live, bucchero search builds 2,693 pids ~9s. Co-Authored-By: Claude Opus 4.8 (1M context) --- dev_server.py | 7 ++++++ explorer.qmd | 15 ++++++++++--- tests/playwright/shakedown-206.mjs | 35 ++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 tests/playwright/shakedown-206.mjs diff --git a/dev_server.py b/dev_server.py index bd36a633..053575e3 100644 --- a/dev_server.py +++ b/dev_server.py @@ -24,6 +24,13 @@ class RangeHandler(http.server.SimpleHTTPRequestHandler): + # DuckDB-WASM's httpfs range reader expects HTTP/1.1 (keep-alive + + # persistent connections for its many small footer/row-group range GETs).
+ # Python's http.server defaults to HTTP/1.0, under which DuckDB falls back + # to whole-file GET 200s — so the local mirror never exercises the 206 + # range path that production (Cloudflare R2, HTTP/2) uses. Pin 1.1. + protocol_version = "HTTP/1.1" + def end_headers(self): # CORS + always-Accept-Ranges so a cross-origin data_base also works. self.send_header("Access-Control-Allow-Origin", "*") diff --git a/explorer.qmd b/explorer.qmd index 60a95f0e..751410fb 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -688,9 +688,18 @@ Cesium.Ion.defaultAccessToken = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOi // DuckDB-WASM partial reads; use dev_server.py) // localStorage ISAMPLES_DATA_BASE (sticky per-browser override) // Defaults to the production R2 origin, so shipped builds are unchanged. -R2_BASE = new URLSearchParams(location.search).get('data_base') - || (typeof localStorage !== 'undefined' && localStorage.getItem('ISAMPLES_DATA_BASE')) - || "https://data.isamples.org" +R2_BASE = (() => { + const raw = new URLSearchParams(location.search).get('data_base') + || (typeof localStorage !== 'undefined' && localStorage.getItem('ISAMPLES_DATA_BASE')) + || "https://data.isamples.org"; + // DuckDB-WASM's httpfs only range-fetches ABSOLUTE http(s) URLs. A root- + // relative override (?data_base=/data) is read as a virtual-filesystem + // glob and fails with "No files found that match the pattern" — silently, + // before any network fetch. Resolve a relative override against the page + // origin so the ergonomic ?data_base=/data form actually works. The prod + // default and absolute overrides (http://localhost:8099/data) pass through. + return raw.startsWith('/') ? 
new URL(raw, location.origin).href : raw; +})() h3_res4_url = `${R2_BASE}/isamples_202601_h3_summary_res4.parquet` h3_res6_url = `${R2_BASE}/isamples_202601_h3_summary_res6.parquet` h3_res8_url = `${R2_BASE}/isamples_202601_h3_summary_res8.parquet` diff --git a/tests/playwright/shakedown-206.mjs b/tests/playwright/shakedown-206.mjs new file mode 100644 index 00000000..87465e09 --- /dev/null +++ b/tests/playwright/shakedown-206.mjs @@ -0,0 +1,35 @@ +// Headless shakedown probe: confirms the local mirror boots on HTTP/1.1 and +// that a committed search exercises the big-file range path. Does NOT assert +// A1 globe coherence (that's the known logjam) — only boot + search timing. +import { chromium } from 'playwright'; + +const BASE = 'http://localhost:8099/explorer.html?data_base=/data&debug=a1&sources=OPENCONTEXT%2CGEOME%2CSMITHSONIAN'; +const URL = `${BASE}#v=1&lat=43.15&lng=11.40&alt=9000000`; +const TERM = 'bucchero'; +const t0 = performance.now(); +const sec = () => ((performance.now() - t0) / 1000).toFixed(1) + 's'; + +const browser = await chromium.launch({ headless: true }); +const page = await browser.newPage(); +page.on('pageerror', e => console.log(' [pageerror]', String(e).slice(0, 120))); + +console.log(`[${sec()}] goto`, URL); +await page.goto(URL, { waitUntil: 'domcontentloaded' }); + +await page.waitForFunction( + () => typeof window.a1dbg === 'function' && !!window.__a1globe && !!document.querySelector('#sampleSearch'), + null, { timeout: 90_000 }); +console.log(`[${sec()}] APP LIVE — boot globe:`, await page.evaluate(() => window.__a1globe?.())); + +await page.fill('#sampleSearch', TERM); +await page.press('#sampleSearch', 'Enter'); +console.log(`[${sec()}] search submitted: "${TERM}" (scans sample_facets_v2.parquet)`); + +await page.waitForFunction( + (t) => window.__searchFilter?.active === true && window.__searchFilter?.term === t && window.__searchFilter?.total > 0, + TERM, { timeout: 120_000 }); +const st = await page.evaluate(() => ({ search: 
window.__searchFilter, globe: window.__a1globe?.() })); +console.log(`[${sec()}] SEARCH FILTER BUILT — total pids:`, st.search?.total, ' globe:', JSON.stringify(st.globe)); + +await browser.close(); +console.log(`[${sec()}] done`); From 865d8d01278b276c4d41d30028b97d7affdc9718 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sat, 30 May 2026 10:25:09 -0700 Subject: [PATCH 09/23] test: headless globe point-load probe (altitude A/B for A1 search) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diagnoses whether a committed search renders sample points and whether the result depends on camera altitude. Boots at a given alt/lat/lng, fires the bucchero search, waits for the async point load to settle, and dumps the __a1log event sequence + final __a1globe() state. Finding: with a proper wait, the globe is A1-coherent at BOTH whole-globe (9000 km → renders all 2693 pids; computeViewRectangle saturates, not null) and zoomed-in (80 km → 2670 in-view) altitudes. The earlier "0 sample points" was a measure-too-early artifact, not a bug. Suggests the C3 fixes (4e79830) work in a foreground/headless context and the summary's "globe won't enter point mode" was likely a backgrounded-tab rAF-freeze artifact. Pending headed a1-verify.mjs verdict to rule out an animation-only race. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/playwright/globe-points-probe.mjs | 46 +++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/playwright/globe-points-probe.mjs diff --git a/tests/playwright/globe-points-probe.mjs b/tests/playwright/globe-points-probe.mjs new file mode 100644 index 00000000..e35a3847 --- /dev/null +++ b/tests/playwright/globe-points-probe.mjs @@ -0,0 +1,46 @@ +// Globe point-load diagnosis: does a committed search render sample points, +// and does the answer depend on camera ALTITUDE? 
Hypothesis: at whole-globe +// altitude, getViewportBounds() (computeViewRectangle) returns null and +// loadViewportSamples() bails → 0 points, even though search forced point mode. +// +// Usage: node globe-points-probe.mjs [alt] [lat] [lng] (defaults: 9000000 43.15 11.40) +import { chromium } from 'playwright'; + +const ALT = process.argv[2] || '9000000'; +const LAT = process.argv[3] || '43.15'; +const LNG = process.argv[4] || '11.40'; +const BASE = 'http://localhost:8099/explorer.html?data_base=/data&debug=a1&sources=OPENCONTEXT%2CGEOME%2CSMITHSONIAN'; +const URL = `${BASE}#v=1&lat=${LAT}&lng=${LNG}&alt=${ALT}`; +const TERM = 'bucchero'; + +const browser = await chromium.launch({ headless: true }); +const page = await browser.newPage(); +await page.goto(URL, { waitUntil: 'domcontentloaded' }); +await page.waitForFunction( + () => typeof window.a1dbg === 'function' && !!window.__a1globe && !!document.querySelector('#sampleSearch'), + null, { timeout: 90_000 }); + +// reset the event log so we only capture the search→point-load sequence +await page.evaluate(() => { window.__a1log = []; }); +await page.fill('#sampleSearch', TERM); +await page.press('#sampleSearch', 'Enter'); +await page.waitForFunction( + (t) => window.__searchFilter?.active === true && window.__searchFilter?.term === t && window.__searchFilter?.total > 0, + TERM, { timeout: 120_000 }); +// give the point load a generous window to run/settle +await page.waitForTimeout(6000); + +const out = await page.evaluate(() => ({ + globe: window.__a1globe?.(), + searchTotal: window.__searchFilter?.total, + // event types in order, plus any point-load events with their payloads + events: (window.__a1log || []).map(e => e.event), + pointLoadEvents: (window.__a1log || []).filter(e => /point-load/.test(e.event)), +})); + +console.log(`\n=== alt=${ALT} lat=${LAT} lng=${LNG} ===`); +console.log('searchTotal :', out.searchTotal); +console.log('globe :', JSON.stringify(out.globe)); +console.log('events :', out.events.join(' → ')); +console.log('point-load :',
JSON.stringify(out.pointLoadEvents, null, 2)); +await browser.close(); From b4ed7e3643ebac1b4e6a3f4c78bb333be643518b Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sat, 30 May 2026 11:44:42 -0700 Subject: [PATCH 10/23] test(a1-verify): add HEADLESS=1 env flag for reliable automated runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default stays headed (real flyTo — what A1 is verified against). A headed window that opens UNFOCUSED becomes a background tab → Chrome freezes its rAF render loop → the page hangs mid-init (the same backgrounded-tab freeze that corrupted the original logjam observations). HEADLESS=1 sidesteps that: headless pages are always "active". Use it for CI / repeated runs; keep headed for a real-animation spot check. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/playwright/a1-verify.mjs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/playwright/a1-verify.mjs b/tests/playwright/a1-verify.mjs index 0b6af241..669852c2 100644 --- a/tests/playwright/a1-verify.mjs +++ b/tests/playwright/a1-verify.mjs @@ -25,7 +25,11 @@ const TERM = process.env.A1_TERM || 'bucchero'; // point mode from cluster via an in-page search (the failing case). const URL = `${BASE}#v=1&lat=43.15&lng=11.40&alt=9000000`; -const browser = await chromium.launch({ headless: false }); +// Default headed (real flyTo, what the A1 work is verified against). Set +// HEADLESS=1 for reliable automated/CI runs — headless pages are always +// "active", so they're immune to the backgrounded-window rAF freeze that +// hangs an unfocused headed window mid-init. 
+const browser = await chromium.launch({ headless: process.env.HEADLESS === '1' }); const page = await browser.newPage(); page.on('console', (m) => { if (/A1|point mode|Discarding/.test(m.text())) console.log(' page>', m.text()); }); From 6494cadb0c4c323656971f48f1c7603669e4b99f Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sat, 30 May 2026 12:31:20 -0700 Subject: [PATCH 11/23] chore(explorer): gate A1 debug instrumentation behind ?debug=a1; drop dev probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-deploy cleanup (the summary's "don't ship" list): - Remove the a1PersistenceProbe OJS cell — a one-time dev check that console-logged on every load and threw a Catalog Error (the design point it verified, non-temp tables persisting across DuckDBClient connections, is proven and load-bearing in production now). - Gate the whole A1 observability block (a1dbg / __a1log / __a1state / __a1globe + on-page panel) behind ?debug=a1. Production users now get a clean global namespace and zero overhead; the Playwright harness opts in via ?debug=a1. All a1dbg?.() call sites already use optional chaining, so they are no-ops when the block doesn't run. Verified: ?debug=a1 → a1-verify.mjs still ✅ COHERENT (2693 pts); no flag → __a1globe/a1dbg/__a1log undefined, no panel, no probe console output. Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 109 ++++++++++++++++++++------------------------------- 1 file changed, 42 insertions(+), 67 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 751410fb..5e3fcdf7 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -1463,37 +1463,6 @@ db = { } ``` -```{ojs} -//| echo: false -//| output: false - -// === A1 dev probe (#234 Step 4) — TEMPORARY, remove before PR === -// Confirms the linchpin of the search-as-global-filter design: a NON-TEMP -// table created in one db.query() call is visible to a LATER db.query() -// call. 
Observable's DuckDBClient opens a fresh connection per query(), so a -// connection-local TEMP table would NOT survive — but a regular table in the -// shared in-memory database does. Also checks TEMP for contrast. -a1PersistenceProbe = { - try { - await db.query(`CREATE OR REPLACE TABLE __a1_probe AS SELECT 42 AS x`); - const r = Array.from(await db.query(`SELECT x FROM __a1_probe`)); - const nonTempOK = r.length === 1 && Number(r[0].x) === 42; - console.log(`[A1probe] NON-TEMP table persists across db.query():`, nonTempOK); - let tempOK = null; - try { - await db.query(`CREATE OR REPLACE TEMP TABLE __a1_probe_t AS SELECT 7 AS x`); - const rt = Array.from(await db.query(`SELECT x FROM __a1_probe_t`)); - tempOK = rt.length === 1 && Number(rt[0].x) === 7; - } catch (e) { tempOK = `threw: ${e.message}`; } - console.log(`[A1probe] TEMP table persists across db.query():`, tempOK); - await db.query(`DROP TABLE IF EXISTS __a1_probe`); - return { nonTempOK, tempOK }; - } catch (e) { - console.log(`[A1probe] FAILED:`, e.message); - return { error: e.message }; - } -} -``` ```{ojs} //| echo: false @@ -4043,45 +4012,51 @@ zoomWatcher = { ? ` AND ${pidCol} IN (SELECT pid FROM search_pids)` : ''; - // DEV: deterministic A1 observability. console capture races the - // automation harness; instead a1dbg() appends to window.__a1log and - // stamps window.__a1state[event] so a test can POLL state (and read - // an optional on-page panel via ?debug=a1) rather than chase console. 
- if (!window.a1dbg) { - window.__a1log = []; - window.__a1state = {}; - let _a1panel = null; - if (new URLSearchParams(location.search).get('debug') === 'a1' && document.body) { - _a1panel = document.createElement('div'); - _a1panel.id = 'a1DebugPanel'; - _a1panel.style.cssText = 'position:fixed;bottom:0;right:0;width:520px;max-height:42vh;overflow:auto;background:#111;color:#3f3;font:11px/1.35 monospace;z-index:99999;padding:4px;opacity:.92'; - document.body.appendChild(_a1panel); - } - window.a1dbg = (event, data = {}) => { - const row = { t: Math.round(performance.now()), event, ...data }; - window.__a1log.push(row); - window.__a1state[event] = row; - if (_a1panel) { - const line = document.createElement('div'); - line.textContent = `[A1] +${row.t}ms ${event} ${JSON.stringify(data)}`; - _a1panel.appendChild(line); - _a1panel.scrollTop = _a1panel.scrollHeight; + // DEV: deterministic A1 observability, gated behind ?debug=a1 so + // production users get a clean global namespace and zero overhead. + // The Playwright harness (a1-verify.mjs) loads with ?debug=a1 to opt + // in. Every a1dbg?.() call site uses optional chaining, so they are + // no-ops when this block doesn't run. a1dbg() appends to + // window.__a1log and stamps window.__a1state[event] so a test can + // POLL state (and read an optional on-page panel) rather than chase + // console; __a1globe() exposes just what a coherence assertion needs. 
+ if (new URLSearchParams(location.search).get('debug') === 'a1') { + if (!window.a1dbg) { + window.__a1log = []; + window.__a1state = {}; + let _a1panel = null; + if (document.body) { + _a1panel = document.createElement('div'); + _a1panel.id = 'a1DebugPanel'; + _a1panel.style.cssText = 'position:fixed;bottom:0;right:0;width:520px;max-height:42vh;overflow:auto;background:#111;color:#3f3;font:11px/1.35 monospace;z-index:99999;padding:4px;opacity:.92'; + document.body.appendChild(_a1panel); } + window.a1dbg = (event, data = {}) => { + const row = { t: Math.round(performance.now()), event, ...data }; + window.__a1log.push(row); + window.__a1state[event] = row; + if (_a1panel) { + const line = document.createElement('div'); + line.textContent = `[A1] +${row.t}ms ${event} ${JSON.stringify(data)}`; + _a1panel.appendChild(line); + _a1panel.scrollTop = _a1panel.scrollHeight; + } + }; + } + // Globe-state accessor for the Playwright harness — exposes just + // what a coherence assertion needs, not the whole Cesium viewer. + // Lazily reads `viewer` (in this cell's scope). + window.__a1globe = () => { + try { + return { + mode: viewer._globeState.mode, + samplePointsLen: viewer.samplePoints.length, + samplePointsShown: viewer.samplePoints.show, + h3PointsShown: viewer.h3Points.show, + }; + } catch (e) { return { error: String(e) }; } }; } - // Globe-state accessor for the Playwright harness — exposes just what - // a coherence assertion needs, not the whole Cesium viewer. Lazily - // reads `viewer` (in this cell's scope) so it's safe to define here. 
- window.__a1globe = () => { - try { - return { - mode: viewer._globeState.mode, - samplePointsLen: viewer.samplePoints.length, - samplePointsShown: viewer.samplePoints.show, - h3PointsShown: viewer.h3Points.show, - }; - } catch (e) { return { error: String(e) }; } - }; } async function buildSearchFilter(terms, term) { From 832b7796b7e87cd4a4f884525a4d2da7afbf8a3d Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sat, 30 May 2026 13:42:11 -0700 Subject: [PATCH 12/23] =?UTF-8?q?perf(explorer):=20collapse=20A1=20search?= =?UTF-8?q?=20double-scan=20=E2=80=94=20side=20panel=20reads=20search=5Fpi?= =?UTF-8?q?ds?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit doSearch scanned the 63 MB facets parquet TWICE per committed search: once in buildSearchFilter (pid-set) and again for the side-panel results SELECT (+ a third for the real-count COUNT when the 50-cap hit). On CI's smoke gate, the broad "pottery" search blew the 90s budget (first A1 deploy failed there). Fix: buildSearchFilter now materializes the side-panel columns (label, source, place_name) and the relevance score IN THE SAME scan that builds the pid-set, so the results SELECT and the COUNT read the small in-memory search_pids table (aliased `s`) instead of re-scanning facets. One facets scan per search now, matching pre-A1. sourceFilterSQL('s.source') + the bare-pid facetFilterSQL compose unchanged; search_pids stays pid-keyed (dropped the weaker 5-col DISTINCT — pid is unique, so the build is naturally one row per pid). Verified locally (fast mirror): pottery 15.8s → 12.7s (build 6.9s + surface updates); a1-verify still ✅ COHERENT; production-clean without ?debug=a1. Note: the remaining time-to-results is buildSearchFilter + applySearchFilter (globe/facet updates); if CI's smoke still exceeds budget, render the side panel before applySearchFilterChange next. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 82 ++++++++++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 5e3fcdf7..b060b280 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -4067,16 +4067,34 @@ zoomWatcher = { 'description', 'CAST(place_name AS VARCHAR)', ]); - // DISTINCT + NOT NULL: pid is unique in facets_url today, but DISTINCT - // is cheap insurance against a future facet-shaped projection. + // Compute the side-panel relevance score in the SAME scan that builds + // the pid-set, and materialize the columns doSearch's results list + // needs (label / source / place_name + score). This lets doSearch read + // the small search_pids table instead of RE-scanning the 63 MB facets + // parquet — the old double-scan that pushed broad terms like "pottery" + // past CI's 90s smoke budget. search_pids stays pid-keyed (one row per + // unique pid), so every other surface's `pid IN (SELECT pid FROM + // search_pids)` semi-join and the COUNT(*) total are unchanged. + const score = textSearchScore(terms, [ + { col: 'label', weight: 3 }, + { col: 'description', weight: 1 }, + { col: 'CAST(place_name AS VARCHAR)', weight: 2 }, + ]); + // pid is unique in facets_url (verified in A1 scoping), so the build + // is naturally one row per pid — no DISTINCT needed. (The previous + // `SELECT DISTINCT pid` deduped a single column; a 5-column DISTINCT + // would be both slower and weaker — it dedupes identical full rows, + // not pids — so we rely on the established uniqueness instead. This + // keeps COUNT(*) == match count and the side-panel free of dup rows.) 
await db.query(` CREATE OR REPLACE TABLE search_pids_next AS - SELECT DISTINCT pid + SELECT pid, label, source, place_name, (${score}) AS relevance_score FROM read_parquet('${facets_url}') WHERE pid IS NOT NULL AND ${searchWhere} `); if (token !== _searchFilterToken) return false; // superseded mid-build - await db.query(`CREATE OR REPLACE TABLE search_pids AS SELECT pid FROM search_pids_next`); + await db.query(`CREATE OR REPLACE TABLE search_pids AS + SELECT pid, label, source, place_name, relevance_score FROM search_pids_next`); if (token !== _searchFilterToken) return false; const cnt = Array.from(await db.query(`SELECT COUNT(*) AS n FROM search_pids`)); const total = cnt.length ? Number(cnt[0].n) : 0; @@ -4201,24 +4219,14 @@ zoomWatcher = { // hash-partitioned BM25 indexes that fixes both recall AND // latency. This code path goes away when #171 lands. // - // CTE-then-keyed-join shape (NOT a naive LEFT JOIN). Native - // DuckDB benchmark: naive 4.2 s vs CTE 0.5 s for `pottery`. - // The browser DuckDB-WASM penalty makes the difference even - // more pronounced; the naive form times out on `pottery` cold. - // Use `f.`-qualified columns so the same searchWhere/score - // strings work for both the world-mode CTE (single table aliased - // f) and the area-mode INNER JOIN (f + l, both via USING (pid)). 
- const searchWhere = textSearchWhere(terms, [ - 'f.label', - 'f.description', - 'CAST(f.place_name AS VARCHAR)', - ]); - const score = textSearchScore(terms, [ - { col: 'f.label', weight: 3 }, - { col: 'f.description', weight: 1 }, - { col: 'CAST(f.place_name AS VARCHAR)', weight: 2 }, - ]); - + // The search-term match + relevance score are already materialized + // in `search_pids` by buildSearchFilter (aliased `s` below), so the + // results SELECT and the follow-up COUNT both read that small table + // instead of re-scanning the 63 MB facets parquet — the old + // double-scan (one in buildSearchFilter, one here) that pushed + // broad terms like `pottery` past CI's 90s smoke budget. The single + // remaining facets scan lives in buildSearchFilter. + // // Snapshot the source / facet predicates ONCE per search so the // follow-up COUNT(*) at the end of this try block uses the same // filter state as the SELECT. Without this snapshot, a user who @@ -4228,7 +4236,7 @@ zoomWatcher = { // of PR #236 round 1). The string is captured here, before any // `db.query` await, and re-used by both the SELECT and the // COUNT below. 
- const sourceSQL = sourceFilterSQL('f.source'); + const sourceSQL = sourceFilterSQL('s.source'); const facetSQL = facetFilterSQL(); // Telemetry-equivalent of the SQL snapshots above (`hadSourceFilter` // / `hadFacetFilter`) is declared OUTSIDE this try block so it @@ -4273,15 +4281,15 @@ zoomWatcher = { results = await runWorldQuery(); } else { results = await db.query(` - SELECT f.pid, f.label, f.source, l.latitude, l.longitude, - f.place_name, (${score}) AS relevance_score - FROM read_parquet('${facets_url}') f + SELECT s.pid, s.label, s.source, l.latitude, l.longitude, + s.place_name, s.relevance_score + FROM search_pids s INNER JOIN read_parquet('${lite_url}') l USING (pid) - WHERE ${searchWhere} + WHERE 1=1 ${bboxSQL} ${sourceSQL} ${facetSQL} - ORDER BY relevance_score DESC, f.label + ORDER BY s.relevance_score DESC, s.label LIMIT 50 `); effectiveQueryShape = 'area'; @@ -4294,13 +4302,13 @@ zoomWatcher = { async function runWorldQuery() { return db.query(` WITH matches AS ( - SELECT f.pid, f.label, f.source, f.place_name, - (${score}) AS relevance_score - FROM read_parquet('${facets_url}') f - WHERE ${searchWhere} + SELECT s.pid, s.label, s.source, s.place_name, + s.relevance_score + FROM search_pids s + WHERE 1=1 ${sourceSQL} ${facetSQL} - ORDER BY relevance_score DESC + ORDER BY s.relevance_score DESC LIMIT 50 ) SELECT m.pid, m.label, m.source, l.latitude, l.longitude, @@ -4474,17 +4482,17 @@ zoomWatcher = { const countSQL = effectiveQueryShape === 'area' ? 
` SELECT COUNT(*) AS n - FROM read_parquet('${facets_url}') f + FROM search_pids s INNER JOIN read_parquet('${lite_url}') l USING (pid) - WHERE ${searchWhere} + WHERE 1=1 ${effectiveBboxSQL} ${sourceSQL} ${facetSQL} ` : ` SELECT COUNT(*) AS n - FROM read_parquet('${facets_url}') f - WHERE ${searchWhere} + FROM search_pids s + WHERE 1=1 ${sourceSQL} ${facetSQL} `; From 817bc5faa5176ea0df8c8fbd74efc5518f7c3a77 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sat, 30 May 2026 14:13:02 -0700 Subject: [PATCH 13/23] fix(explorer): facet legend counts use padded viewport (== table "N match") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updateCrossFilteredCounts computed facet-legend counts over the EXACT viewport (pad 0), while the samples-table COUNT, the point-mode loader, the "samples in view" stat, and the heatmap all pad by VIEWPORT_PAD_FACTOR (0.3). Matching samples in the 30% margin were counted by the table but not the legend, so the legend read low: off-by-one at a Cyprus deep-zoom (13 vs 14), and ~166 vs ~481 for material=rock at a wide Red-Sea view (RY, live rdhyee deploy). Aligns the last "in view" surface to the padded contract (#234 coherence). Applies the parked facet_count_padding.patch (one line + the coherence regression test) on the A1 branch, since the mismatch is live on the A1 deploy and #234 is exactly "make filter semantics coherent across surfaces." Verified at the reported view: facet Rock 167 → 496, now == table 496; a1-verify still ✅ COHERENT. Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 18 +++++++++++++- tests/playwright/facet-viewport.spec.js | 33 +++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/explorer.qmd b/explorer.qmd index b060b280..6a274b94 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -2852,8 +2852,24 @@ zoomWatcher = { // since `facets_url` carries no coordinates today). 
Cube fast-path // is unconditionally gated off (it is pre-aggregated globally and // can't answer viewport-scoped questions). + // + // VIEWPORT_PAD_FACTOR (not 0): the facet counts must use the SAME + // padded viewport as every other "in view" surface — the samples + // table COUNT (loadCount), the point-mode sample loader, the + // "samples in view" stat, and the heatmap all pad by + // VIEWPORT_PAD_FACTOR (0.3). B1 originally shipped this at pad 0 + // (exact viewport), which left the facet count one (or more) low + // versus the table whenever a matching sample sat in the 30% pad + // margin — e.g. material=mineral at a Cyprus deep-zoom read 13 on + // the legend but 14 in "samples match the current filters" (RY, + // 2026-05-28); at a wide view (alt≈900 km, material=rock over the + // Red Sea) the same mismatch reads ~166 vs ~481 because the margin + // is huge. The heatmap hit the identical mismatch earlier and was + // moved to the padded contract; this aligns the last surface. The + // deeper fix (one shared "in view" bbox source so these can't drift + // again) is tracked on #234. const isGlobal = isGlobalView(); - const bboxSQL = isGlobal ? null : viewerBboxSQL('l.latitude', 'l.longitude', 0); + const bboxSQL = isGlobal ? null : viewerBboxSQL('l.latitude', 'l.longitude', VIEWPORT_PAD_FACTOR); // Baseline early-return only applies when there is no filter AND no // spatial constraint. In a non-global view with no facet filter, B1 diff --git a/tests/playwright/facet-viewport.spec.js b/tests/playwright/facet-viewport.spec.js index c74ed8f2..5d330d97 100644 --- a/tests/playwright/facet-viewport.spec.js +++ b/tests/playwright/facet-viewport.spec.js @@ -241,6 +241,39 @@ test.describe('B1 viewport-aware facet counts (#234 step 3)', () => { expect(cyprusTotal).toBeLessThan(filteredTotal); }); + test('coherence: active-material legend count == table "N match" count (#234 padding)', async ({ page }) => { + // Regression for the off-by-one RY found 2026-05-28. 
Facet counts + // used exact-viewport (pad 0) while the samples-table COUNT uses + // VIEWPORT_PAD_FACTOR (0.3); a matching sample in the 30% margin + // made the legend read one low — at this exact Cyprus deep-zoom, + // material=mineral showed 13 on the legend but 14 in "samples match + // the current filters." With facet counts on the same padded bbox + // as the table/heatmap/point-loader/stat, the two must agree. + const MINERAL = 'https://w3id.org/isample/vocabulary/material/1.0/mineral'; + const suffix = `?material=${encodeURIComponent(MINERAL)}#v=1&lat=35.0900&lng=32.8900&alt=50000&mode=point`; + await page.goto(explorerUrl(suffix)); + await page.waitForSelector('#cesiumContainer', { timeout: 30000 }); + await waitForFacetUI(page); + await waitForFacetCountsStable(page); + + // Poll until both the legend count for the active material and the + // table meta "N match" line have resolved, then assert equality via + // a backreference (`14/14` matches, `13/14` does not). Both numbers + // are viewport-scoped, so they must be identical for the same view. 
+ await expect.poll(async () => { + return await page.evaluate((mineral) => { + const f = document.querySelector(`.facet-count[data-facet="material"][data-value="${mineral}"]`); + const fm = f && (f.textContent || '').match(/\(([\d,]+)\)/); + const meta = document.getElementById('tableMeta')?.textContent || ''; + const tm = meta.match(/([\d,]+)\s+samples?\s+match/); + if (!fm || !tm) return 'pending'; + const fv = parseInt(fm[1].replace(/,/g, ''), 10); + const tv = parseInt(tm[1].replace(/,/g, ''), 10); + return `${fv}/${tv}`; + }, MINERAL); + }, { timeout: 60000, intervals: [500, 1000, 2000] }).toMatch(/^(\d+)\/\1$/); + }); + test('moveStart marks .recomputing before the debounce can run', async ({ page }) => { await page.goto(explorerUrl(GLOBAL_HASH)); await waitForFacetUI(page); From a576deaf9277917a52ea950a649ad183823573a5 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sun, 31 May 2026 07:07:56 -0700 Subject: [PATCH 14/23] fix(explorer): A1 race + heatmap search-blindness (Codex review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two issues from Codex's PR #251 review: 1. search_pids staging race — buildSearchFilter used a fixed `search_pids_next` name. Two overlapping searches could interleave so a later search swapped an earlier search's rows into `search_pids` under its own term (the token checks guard the publish, not the shared staging object). Use a token-scoped staging table `search_pids_next_${token}`, dropped in finally. Also stop DROPping the live `search_pids` on clear (an in-flight reader would throw) — flip active=false and leave it unreferenced until the next search replaces it. Verified: bucchero→soil back-to-back now publishes soil/2969 (its own count), not bucchero's under soil's term. 2. heatmap search-blind — renderHeatmap omitted searchFilterSQL and heatmapFilterHash omitted the search token, so the "filtered density" overlay stayed unfiltered under a committed search. 
Append window.searchFilterSQL('pid') to the heatmap aggregation and add the search token to the hash so it recomputes/re-keys on search commit/clear (#234 cross-surface coherence). a1-verify still ✅ COHERENT. Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 57 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 6a274b94..c35635ac 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -3060,6 +3060,12 @@ zoomWatcher = { material: getCheckedValues('materialFilterBody').slice().sort(), context: getCheckedValues('contextFilterBody').slice().sort(), object_type: getCheckedValues('objectTypeFilterBody').slice().sort(), + // A1: include the committed search so the heatmap recomputes (and + // re-keys) when a search is committed or cleared — otherwise the + // "filtered density" overlay stays unfiltered under search while + // every other surface filters (Codex review 2026-05-30, #234). + search: (typeof window !== 'undefined' && window.__searchFilter && window.__searchFilter.active) + ? window.__searchFilter.token : null, }); } @@ -3202,6 +3208,7 @@ zoomWatcher = { WHERE ${heatmapBboxPredicate(bounds, 'latitude', 'longitude')} ${sourceFilterSQL('source')} ${facetFilterSQL()} + ${(typeof window !== 'undefined' && window.searchFilterSQL) ? window.searchFilterSQL('pid') : ''} GROUP BY x, y `); if (myReq !== heatmapReqId || !heatmapEnabled()) return; @@ -4102,28 +4109,44 @@ zoomWatcher = { // would be both slower and weaker — it dedupes identical full rows, // not pids — so we rely on the established uniqueness instead. This // keeps COUNT(*) == match count and the side-panel free of dup rows.) 
- await db.query(` - CREATE OR REPLACE TABLE search_pids_next AS - SELECT pid, label, source, place_name, (${score}) AS relevance_score - FROM read_parquet('${facets_url}') - WHERE pid IS NOT NULL AND ${searchWhere} - `); - if (token !== _searchFilterToken) return false; // superseded mid-build - await db.query(`CREATE OR REPLACE TABLE search_pids AS - SELECT pid, label, source, place_name, relevance_score FROM search_pids_next`); - if (token !== _searchFilterToken) return false; - const cnt = Array.from(await db.query(`SELECT COUNT(*) AS n FROM search_pids`)); - const total = cnt.length ? Number(cnt[0].n) : 0; - if (token !== _searchFilterToken) return false; - window.__searchFilter = { active: true, term, token, total }; - window.a1dbg?.('search-build-end', { term, token, total }); - return true; + // Token-scoped staging table: a fixed `search_pids_next` name lets two + // overlapping searches clobber each other's staging rows, so a later + // search could swap the EARLIER search's pids into `search_pids` under + // its own term — the token checks guard the publish, not the shared + // staging object. Naming the staging table per-token isolates + // concurrent builds; it's dropped in `finally`. (Codex review 2026-05-30.) + const staging = `search_pids_next_${token}`; + try { + await db.query(` + CREATE OR REPLACE TABLE ${staging} AS + SELECT pid, label, source, place_name, (${score}) AS relevance_score + FROM read_parquet('${facets_url}') + WHERE pid IS NOT NULL AND ${searchWhere} + `); + if (token !== _searchFilterToken) return false; // superseded mid-build + await db.query(`CREATE OR REPLACE TABLE search_pids AS + SELECT pid, label, source, place_name, relevance_score FROM ${staging}`); + if (token !== _searchFilterToken) return false; + const cnt = Array.from(await db.query(`SELECT COUNT(*) AS n FROM search_pids`)); + const total = cnt.length ? 
Number(cnt[0].n) : 0; + if (token !== _searchFilterToken) return false; + window.__searchFilter = { active: true, term, token, total }; + window.a1dbg?.('search-build-end', { term, token, total }); + return true; + } finally { + try { await db.query(`DROP TABLE IF EXISTS ${staging}`); } catch (e) { /* best effort */ } + } } async function clearSearchFilter() { _searchFilterToken++; window.__searchFilter = { active: false, term: null, token: _searchFilterToken, total: 0 }; - try { await db.query(`DROP TABLE IF EXISTS search_pids`); } catch (e) { /* best effort */ } + // Don't DROP search_pids here: a surface query built while the search + // was active may still be in flight and reference it, and dropping + // under that reader throws. Flipping active=false makes every NEW query + // omit the semi-join (searchFilterSQL returns ''), so the stale table is + // simply unreferenced until the next search's CREATE OR REPLACE + // replaces it. (Codex review 2026-05-30.) } async function doSearch(scope) { From 8a9a1d3b47ae59a0c7cb6ae0a5e5032b90c426f8 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sun, 31 May 2026 11:09:36 -0700 Subject: [PATCH 15/23] fix(explorer): clearSearchFilter empties search_pids (don't leave stale rows) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-2 of Codex's PR #251 review: the previous no-drop clear left the prior search's rows in search_pids, and doSearch's side-panel SELECT reads `FROM search_pids` directly (does NOT gate on __searchFilter.active) — so a build failure could render the previous term's rows under the new term. Chose Codex's empty-table alternative over the early-return built-guard: an early return before the side-panel try would skip the #167 telemetry `finally`, whereas CREATE OR REPLACE TABLE search_pids (...empty...) 
keeps both the in-flight semi-join readers and the direct side-panel reader safely seeing zero rows, and a build failure flows through the existing results.length===0 → return-in-try → finally path with telemetry intact. Only clearSearchFilter changes; no doSearch control-flow restructure. Verified: a1-verify ✅ COHERENT; bucchero→clear→soil publishes soil/2969 (own count, no stale rows); clearSearchFilter is only called on empty-submit + build-failure, not per search. Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index c35635ac..be388354 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -4141,12 +4141,20 @@ zoomWatcher = { async function clearSearchFilter() { _searchFilterToken++; window.__searchFilter = { active: false, term: null, token: _searchFilterToken, total: 0 }; - // Don't DROP search_pids here: a surface query built while the search - // was active may still be in flight and reference it, and dropping - // under that reader throws. Flipping active=false makes every NEW query - // omit the semi-join (searchFilterSQL returns ''), so the stale table is - // simply unreferenced until the next search's CREATE OR REPLACE - // replaces it. (Codex review 2026-05-30.) + // Replace search_pids with an EMPTY same-shape table rather than + // DROPping it. Two readers touch this table without gating on + // __searchFilter.active: an in-flight semi-join surface query, and + // doSearch's side-panel SELECT that reads `FROM search_pids` directly. + // DROP would make those throw on a missing table; simply leaving the old + // table would let the side panel render the PREVIOUS search's rows under + // a new/cleared term (Codex review 2026-05-30). An empty table makes + // both readers safely see zero rows; the next search's + // CREATE OR REPLACE ... AS SELECT swaps in the real rows. 
+ try { + await db.query(`CREATE OR REPLACE TABLE search_pids ( + pid VARCHAR, label VARCHAR, source VARCHAR, place_name VARCHAR, relevance_score DOUBLE + )`); + } catch (e) { /* best effort */ } } async function doSearch(scope) { From 0a91361bc0c62e6285b8477d9f9e9932345577e5 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sun, 31 May 2026 11:26:43 -0700 Subject: [PATCH 16/23] polish(explorer): distinguish search build failure from empty results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses Codex's remaining nit (PR #251, non-blocking): since clearSearchFilter() now leaves search_pids EMPTY, a genuine build failure and a true empty result set both reach the side-panel's results.length===0 branch. A `searchFilterBuildFailed` flag (set in the build catch) makes the panel say "Search error: couldn't build the filter…" on a real failure while still flowing through the #167 telemetry finally — instead of the misleading "No results for {term}". a1-verify still ✅ COHERENT. Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index be388354..af55ac9a 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -4222,10 +4222,16 @@ zoomWatcher = { ? 'Building search filter for selected areas…' : 'Building search filter…'; searchResults.textContent = buildingMsg; + // Distinguishes a genuine build failure (DuckDB error) from a real + // empty result set: clearSearchFilter() leaves search_pids EMPTY, so the + // side-panel SELECT below returns 0 rows either way; this flag lets the + // empty-results branch say "Search error" rather than "No results". 
+ let searchFilterBuildFailed = false; try { await buildSearchFilter(terms, term); } catch (e) { console.warn('A1 search-filter build failed; surfaces stay unfiltered:', e); + searchFilterBuildFailed = true; await clearSearchFilter(); } // Superseded by a newer search while building? Bail before mutating UI. @@ -4382,14 +4388,18 @@ zoomWatcher = { } resultsCount = results.length; if (results.length === 0) { - searchResults.textContent = `No results for "${term}"`; + // A build failure also empties search_pids → 0 rows here, so + // distinguish a genuine error from a true empty result set. + searchResults.textContent = searchFilterBuildFailed + ? `Search error: couldn't build the filter for "${term}". Please try again.` + : `No results for "${term}"`; // Honesty fix (#247): the table meta points "→ panel", so the // panel must reflect THIS (empty) search rather than whatever // it showed before. Without this, a zero-result search left // stale prior content under a pointer claiming otherwise. const sampEl0 = document.getElementById('samplesSection'); - if (sampEl0) sampEl0.innerHTML = searchHeadingHTML(' (0)') - + '
No samples matched this search.
'; + if (sampEl0) sampEl0.innerHTML = searchHeadingHTML(searchFilterBuildFailed ? '' : ' (0)') + + `
${searchFilterBuildFailed ? 'Search failed to build — please try again.' : 'No samples matched this search.'}
`; return; } From f2eac3519d135666cb583fa93061cbff112e0dbf Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sun, 31 May 2026 22:28:41 -0700 Subject: [PATCH 17/23] =?UTF-8?q?feat(explorer):=20#248=20Flavor=20A=20fou?= =?UTF-8?q?ndation=20=E2=80=94=20conceptLabelForUri=20+=20buildConceptFilt?= =?UTF-8?q?er?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First, additive pieces for `described-by=` (#248), riding the A1 search_pids machinery (Codex plan-reviewed: "mostly sound + guardrails"): - window.conceptLabelForUri(uri): expose the facetFilters cell's URI→prefLabel resolver so the concept producer can label a URI without re-querying vocab_labels (guardrail #1). - buildConceptFilter(uri): a SECOND search_pids producer — exact-URI match across the object_type/material/context columns of sample_facets_v2, with object_type>material>context relevance ranking. Same token-scoped staging, finally-drop, shared _searchFilterToken, and empty-clear invariant as buildSearchFilter (guardrails #2/#6). Tags __searchFilter.kind ('concept' vs 'text') for mutual exclusivity (#5). Not yet wired: doDescribedBy flow (shared runPidSetResults render), the described-by= URL param + writeQueryState kind-preservation, and mutual exclusivity at producer entry. buildConceptFilter isn't called yet, so this is behavior-neutral. Verified: conceptLabelForUri('…organismpart')→"Organism part"; free-text a1-verify still ✅ COHERENT. Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/explorer.qmd b/explorer.qmd index af55ac9a..210b9687 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -1763,6 +1763,10 @@ facetFilters = { const parts = s.replace(/[#?].*$/, "").split("/").filter(Boolean); return parts.length ? 
parts[parts.length - 1] : s; }; + // #248: expose the URI→prefLabel resolver so the concept-filter producer + // (buildConceptFilter, in the zoomWatcher cell) can label a `described-by=` + // URI without re-querying vocab_labels. Best-effort: falls back to URI tail. + if (typeof window !== 'undefined') window.conceptLabelForUri = prettyLabel; try { const summaries = await db.query(` @@ -4130,7 +4134,7 @@ zoomWatcher = { const cnt = Array.from(await db.query(`SELECT COUNT(*) AS n FROM search_pids`)); const total = cnt.length ? Number(cnt[0].n) : 0; if (token !== _searchFilterToken) return false; - window.__searchFilter = { active: true, term, token, total }; + window.__searchFilter = { active: true, term, token, total, kind: 'text' }; window.a1dbg?.('search-build-end', { term, token, total }); return true; } finally { @@ -4138,6 +4142,47 @@ zoomWatcher = { } } + // #248 Flavor A: a SECOND producer of `search_pids` — selects samples whose + // iSamples-vocabulary concept (the URI-valued object_type / material / + // context columns in sample_facets_v2) exactly matches `uri`. Mirrors + // buildSearchFilter's token-scoped staging + finally-drop + token guards, + // sharing the same _searchFilterToken (the pid-set is a singleton). Every + // surface then filters via the same `pid IN (SELECT pid FROM search_pids)` + // semi-join, for free. Flavor B (arbitrary external/Getty URIs needing + // URI→label resolution + free-text) is a follow-up. + async function buildConceptFilter(uri) { + const token = ++_searchFilterToken; + const label = (typeof window !== 'undefined' && window.conceptLabelForUri) + ? window.conceptLabelForUri(uri) : uri; + window.a1dbg?.('search-build-start', { term: label, token, kind: 'concept' }); + const u = escSql(uri); + // Rank object_type matches above material above context, then by label + // (Codex plan-review suggestion — cheap, no extra scan). 
+ const rank = `CASE WHEN object_type = '${u}' THEN 3 WHEN material = '${u}' THEN 2 WHEN context = '${u}' THEN 1 ELSE 0 END`; + const staging = `search_pids_next_${token}`; + try { + await db.query(` + CREATE OR REPLACE TABLE ${staging} AS + SELECT pid, label, source, place_name, (${rank}) AS relevance_score + FROM read_parquet('${facets_url}') + WHERE pid IS NOT NULL + AND (object_type = '${u}' OR material = '${u}' OR context = '${u}') + `); + if (token !== _searchFilterToken) return false; // superseded mid-build + await db.query(`CREATE OR REPLACE TABLE search_pids AS + SELECT pid, label, source, place_name, relevance_score FROM ${staging}`); + if (token !== _searchFilterToken) return false; + const cnt = Array.from(await db.query(`SELECT COUNT(*) AS n FROM search_pids`)); + const total = cnt.length ? Number(cnt[0].n) : 0; + if (token !== _searchFilterToken) return false; + window.__searchFilter = { active: true, term: label, token, total, kind: 'concept', uri }; + window.a1dbg?.('search-build-end', { term: label, token, total, kind: 'concept' }); + return true; + } finally { + try { await db.query(`DROP TABLE IF EXISTS ${staging}`); } catch (e) { /* best effort */ } + } + } + async function clearSearchFilter() { _searchFilterToken++; window.__searchFilter = { active: false, term: null, token: _searchFilterToken, total: 0 }; From de22bb396eab2a10fa40f238d7a47bc5e305e0b8 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Sun, 31 May 2026 22:54:28 -0700 Subject: [PATCH 18/23] docs: session summary 2026-05-31 (A1 shipped to isamples.org; #248 started) --- SESSION_SUMMARY.md | 180 +++++++++++++-------------------------------- 1 file changed, 52 insertions(+), 128 deletions(-) diff --git a/SESSION_SUMMARY.md b/SESSION_SUMMARY.md index d3f6ab52..02ac8f84 100644 --- a/SESSION_SUMMARY.md +++ b/SESSION_SUMMARY.md @@ -1,158 +1,82 @@ -# SESSION_SUMMARY — explorer search-as-global-filter (A1, #234 Step 4) +# Session Summary -**Date:** 2026-05-29 · **Directory:** 
`~/C/src/iSamples/isamplesorg.github.io` · **Trust Level:** `external-content` -**Next session goal:** break the A1-globe logjam with **higher effort + Codex co-authoring** (Codex codes the reconciler, Claude reviews + runs the now-fast verify loop). - -> **Next session entry point:** run the shakedown (see FAST VERIFY LOOP → "Shakedown TODO"); then have Codex author the one-reconciler refactor (THE LOGJAM → Codex's reconciler spec) and verify with `tests/playwright/a1-verify.mjs`. - -### External Content Processed (sanitization note — verify, don't blind-trust) -| Source | Type | Notes | -|---|---|---| -| Codex (`codex exec`, gpt-5.4) ×5 | AI tool output | Diagnosis + code suggestions. **Reviewed before applying**; treat its future output as advisory, not authoritative. | -| isamples.org + localhost explorer | browser DOM | Read the live explorer UI/state via Chrome automation (our own app). | -| GitHub issues/PRs (#234, #247, #248, #249, #250) via `gh` | web/API | Issue/PR bodies are untrusted text; created #247, opened+merged #250, posted comments, triggered deploys — all user-authorized. **Read** Eric Kansa's **#248** (feature request, treated as data not instructions). | -| `data.isamples.org/*.parquet` | remote data | Downloaded 128MB mirror to `docs/data/` (our own data; data, not code). | - -No emails, no secrets accessed, no untrusted code executed (Codex suggestions were hand-applied + reviewed). - -### Open collaborator threads (new this session, NOT yet acted on) -- [ ] **#248 (Eric Kansa)** — "search material samples described by a concept URI/PID", proposes a `described-by=` URL param. Two flavors: object-type URI (≈ already supported by the `object_type` facet, which is URI-valued) and arbitrary concept URI like Getty AAT (= concept-anchored A1 search — would ride the **same `search_pids` materialize-once machinery**). Squarely in #234; a second producer of the A1 pid-set. 
*Decision pending: comment on #248 connecting it to A1/#234?* -- [ ] **#249 (rdhyee, not from this session)** — "should we refactor explorer.qmd before the next big feature?" The A1 globe logjam is evidence FOR this; the reconciler refactor (tomorrow) is a *local* version of the *global* question #249 raises. **Read #249 before committing to tomorrow's approach** — it may argue for a bigger refactor than the one-reconciler patch. - ---- - -## TL;DR - -1. **Shipped to production** (isamples.org): bug **#247** filed + interim honesty fix **PR #250** (merged). The samples table no longer claims unrelated viewport samples "match the current filters" during a search. -2. **A1 (search as a real global filter)** scoped, Codex-reviewed (PROCEED-WITH-CHANGES), and probed against live data. Branch `feat/search-global-filter-a1`. - - ✅ **Table surface filters correctly** (e.g. `bucchero` → "2,693 of 2,693 matches in this map view", OpenContext rows only). - - ✅ Facet counts + cube-gating wired; pid-set machinery + persistence proven. - - ❌ **GLOBE still won't enter point mode** on a committed search (table filters, but the map stays unfiltered clusters). This is the logjam. -3. **Built a fast/deterministic verify-loop** (local parquet mirror + range server + `window.__a1state`/`__a1globe` observability + Playwright harness) so tomorrow's iteration isn't 40–90s/cycle. Range-verified; full speedup run still needs a shakedown. 
+## Session: 2026-05-30/31 (evening) +**Directory**: `~/C/src/iSamples/isamplesorg.github.io` +**Trust Level**: external-content --- -## Branch & commits +## What Happened -`feat/search-global-filter-a1` (off `upstream/main` which already has #250): -- `204d2df` table surface (pid-set + semi-join + summaryText) -- `936f1f3` points/facets/cube-gate/C3 wired — globe buggy -- `4e79830` Codex's C3 fixes (moveEnd latch, awaitable enterPointMode, search-token staleness) — globe STILL not entering point mode -- `62d5500` dev verify-loop infra (mirror support + dev_server.py + a1dbg/__a1state/__a1globe) +A long, productive session. Started as "tackle the fast-verify shakedown"; ended with **A1 shipped to production (isamples.org)** and **#248 underway**. -Production (already merged, do NOT redo): upstream `a4da97b` (#250). +1. **Shakedown root-caused & fixed.** The dev `?data_base=/data` override produced root-relative parquet URLs that DuckDB-WASM's httpfs can't fetch (read as a virtual-FS glob → zero fetches). Resolved to absolute against `location.origin`. This unblocked the fast verify loop (~2.3s to live). +2. **The "globe logjam" was never real** — it was a **backgrounded-Chrome-MCP-tab artifact** (Chrome freezes rAF in hidden tabs → Cesium camera never settles → "globe won't enter point mode"). In any foreground/headless context the C3 fixes work. The reconciler refactor was unnecessary. **Lesson: drive the verify loop with `HEADLESS=1` Playwright, never the MCP tab.** +3. **Fixed an A1 search perf regression** the CI smoke gate caught (double facets scan → materialize side-panel columns+score into `search_pids`, one scan). +4. **Fixed the live facet-padding mismatch** RY hit (legend pad-0 vs table 0.3 → facet read low; e.g. material=rock ~166 vs ~481). Now facet == table. +5. 
**Shipped A1**: opened **PR #251**, ran a 3-round **Codex review/revise loop to dual approval** (Codex caught a real `search_pids` staging-table race, heatmap search-blindness, and a stale-reader follow-on — all fixed), then **squash-merged to upstream → deployed to isamples.org** (smoke gate green). +6. **Started #248 (Eric Kansa's concept-URI search)**: posted a connecting comment, Codex plan-reviewed ("mostly sound + guardrails"), and committed the **foundation** on `feat/described-by-concept`. +7. **Investigated a transient camera freeze** (RY's `h3=`+`heading=` deep-link, also on isamples.org). Ruled out locked controller / tracked-entity / refresh-loop via a new `?debug=a1` `__a1camera` hook; **resolved on its own → likely transient WebGL context-loss / network**. Surfaced a real **testing gap**: no gate asserts post-hydration *interactivity*. --- -## The A1 design (Strategy B — agreed + Codex-approved) +## Safe to Carry Forward -On a committed search, `buildSearchFilter()` materializes a **non-temp** DuckDB table `search_pids` (one `ILIKE` scan over `facets_url`), then every surface constrains with a cheap semi-join `AND pid IN (SELECT pid FROM search_pids)`. State on `window.__searchFilter {active,term,token,total}`; predicate via `window.searchFilterSQL(col)`. +### Key Decisions +- A1 ships on plain ILIKE; **BM25 (#168–172) is a perceived-perf follow-up, not a correctness blocker.** +- `search_pids` is a **singleton**; any new producer (#248) shares one `_searchFilterToken`/`_searchSeq` and the `kind: 'text'|'concept'` tag. +- Codex-reviewed A1 invariants to preserve: **token-scoped staging table**, **empty-table clear** (never DROP the live table), **build-failure distinguished from empty results**. +- `?debug=a1`-gated hooks: `__a1globe`, `__a1log`/`__a1state`, and (new, uncommitted/diagnostic) `__a1camera`. 
-**Probe findings (de-risked the design):** pid is unique (no dup), facets ⊆ lite so **no coordinate-less matches** (table count == mappable matches — simple "N of M in view" copy), broadest realistic term ~82k pids (no million-row blowup). Full scoping + Codex resolutions in **`A1_SCOPING.md`**. +### Branch / ship state +- **A1**: merged to upstream `main` as **`e6f9def`** (PR #251), live on isamples.org + rdhyee. Local `feat/search-global-filter-a1` is now redundant (squash-merged). +- **#248**: branch **`feat/described-by-concept`** off merged main; foundation commit **`f2eac35`** (`conceptLabelForUri` + `buildConceptFilter`, behavior-neutral, verified). -Surfaces wired: `loadCount`/`loadPage` (table) ✅ verified; `loadViewportSamples` (points); `updateCrossFilteredCounts` (facet legend, + gate cube fast-path & global-baseline when search active); `summaryText` copy. +### Files Changed (this session, across A1 + #248) +- `explorer.qmd` — A1 data_base fix, double-scan collapse, facet-padding, Codex fixes (staging race / heatmap / empty-clear / build-failure msg), `?debug=a1` gating; #248 `conceptLabelForUri` + `buildConceptFilter`. +- `dev_server.py` — HTTP/1.1; `tests/playwright/a1-verify.mjs` — `HEADLESS=1` flag; new probes `globe-points-probe.mjs`, `shakedown-206.mjs`; `tests/playwright/facet-viewport.spec.js` — coherence test. ---- - -## THE LOGJAM (start here tomorrow) - -**Symptom:** search `bucchero` → table = "2,693 of 2,693 matches" (✅), but globe phaseMsg/stat stay **cluster** and `exitPointMode` runs. Even a clean **manual** search (not just boot) fails → it's a real state-machine bug, not a boot race. - -**Codex's diagnosis (correct, partially fixed in `4e79830`):** -1. ✅ FIXED — post-search `flyTo` lands at **200 km > EXIT_POINT_ALT (180 km)**, and the `moveEnd` handler exited point mode without checking `searchIsActive()`. Latched now. -2. 
✅ FIXED — `enterPointMode` was fire-and-forget; now `async` + `await loadViewportSamples()`, awaited at all call sites. -3. ✅ FIXED — `loadViewportSamples` staleness was `requestId`-only; now also keys on the search token (`isStaleLoad()`). -4. ⏳ **NOT DONE — the actual remaining fix:** `applySearchFilterChange()` is a **parallel** mode-entry path racing the camera/mode machinery. Codex recommends **replacing it with ONE reconciler** that both the camera handler and search call, so "search forces point" and "altitude decides mode" live in one predicate with one set of staleness tokens. - -**Codex's reconciler spec (implement this):** -```js -async function reconcileGlobeForCurrentFilters(pushHistory = false) { - syncFacetNote(); - refreshHeatmap(); - if (searchIsActive()) { - if (getMode() !== 'point') await enterPointMode(pushHistory); - else await loadViewportSamples(); - } else { - // existing altitude-driven cluster/point behavior - } - refreshFacetCounts(); - window.refreshSamplesTable?.(); -} -``` -Call it from search completion AND the relevant camera paths; delete the bespoke `applySearchFilterChange` mini-state-machine. **Open question to nail with the new observability:** why does `enterPointMode` not stick on a manual search? (`[A1dbg]` events `apply-search-change`, `mode-change`, `post-build` will show the sequence — see below.) - -**Other bugs Codex flagged (not yet addressed):** -- Heatmap `renderHeatmap()` omits `searchFilterSQL` and `heatmapFilterHash()` omits the search token → heatmap (labeled "filtered density") stays unfiltered under search. (PR#2 or fix now.) -- Selection revalidation (`~L3457`) checks only source, not the search filter — clear/revalidate selection on search change. +### Patterns/Learnings +- **Backgrounded tabs freeze rAF** → corrupts every globe/camera observation. Headless Playwright is the reliable instrument. +- **Don't pile up runs**: accumulated hung browsers hold HTTP/1.1 keep-alive + peg CPU and starve `dev_server.py`. 
Restart between batches. +- **Local mirror full-downloads** (GET 200, not 206) — fine on localhost; validate range/perf on the deploy, not the mirror. +- Codex's `codex exec ... -o FILE` often fails to capture the final message when the diff is large; read the verdict from the streamed `.log` instead (resume the session for continuity). --- -## PERFORMANCE MODEL — why the UI hides the 40s, and what A1 does to it - -(RY's framing, 2026-05-29 — worth keeping front-of-mind for the substrate-vs-progress-UI call.) - -The explorer never *feels* like a 40–90s app because the whole design is **"never fetch big data over a wide area."** Data is tiered by zoom, smallest-first, and the tiny tiers are **preloaded** (`explorer.qmd` L14–17: `` for h3 res4 + facet_summaries + vocab_labels): +## External Content Processed -| User action | Fetched | Size | Felt | -|---|---|---|---| -| Land on globe (zoomed out) | H3 res4 | **580 KB** (preloaded) | instant (`Load Time 0.4s`) | -| Zoom in / more | H3 res6 / res8 | 1.6 / 2.5 MB | fast | -| Zoom **deep** → point mode | `samples_map_lite` | 60 MB file… | **still fast** ↓ | - -The trick on that last row: by the time `samples_map_lite` (60 MB) is touched, the camera is deep (alt < `ENTER_POINT_ALT` 120 km), so the bbox is tiny. DuckDB-WASM does **HTTP range requests** and pulls only the parquet **row groups** overlapping that small bbox (a few MB), never the whole file. So the big files are only ever read in slivers. UX masking on top: instant res4 globe, phase messages, stale-while-loading (dimmed old rows). - -**The two operations with NO spatial narrowing** (= the only ones that can hit the full 40s; both were what I kept triggering in dev): -1. **Free-text search** — `ILIKE '%term%'` over `label/description/place_name` across the *whole* `sample_facets_v2.parquet` (63 MB text). ILIKE can't skip row groups; it's a full column scan. Irreducible without an index. -2. 
**Samples table at a wide viewport** — `loadCount` over a world-sized bbox counts ~everything (normal users zoom in first, shrinking it). - -**The A1 implication (the load-bearing point):** A1 takes operation #1 — the single slowest thing in the app — and moves it to the **front of the common flow.** Today search is an optional side-panel lookup; A1 makes every committed search run that full 63 MB scan *first* and gates the filtered view on it. So A1 risks importing the one 40s wait into exactly the place the rest of the UI worked to avoid it. That's why: -- The **"Building search filter…"** affordance matters (honest masking, like the rest of the app). -- **BM25 substrate (#168–172)** is the thing that makes a *cold* search feel as snappy as zooming — NOT a correctness blocker (the pid-set abstraction works on plain ILIKE), but the perceived-perf fix. -- The **materialize-once** design is the mitigation: pay the un-narrowable full scan *one time* per term, then every pan/zoom/facet-toggle is a cheap `pid IN (…)` semi-join that DOES narrow spatially — folding search back into the fast tier after the first hit. - -(This also reframes the cold-load floor below: init ~40s is one thing, but the search scan is the *product-facing* slow path, and it's the one A1 must manage.) - ---- - -## FAST VERIFY LOOP (built today — use it tomorrow) - -**Why today was slow:** every iteration was a cold reload. Cold cost is **init-dominated** — DuckDB-WASM (from CDN) + Cesium + the OJS reactive graph take ~40s **before any data query**, and the search `ILIKE` then downloaded ~60MB of text columns over the network. Console capture from the automation harness was also flaky. - -**The fix (set up, committed in `62d5500`):** -1. **Local parquet mirror** — `docs/data/*.parquet` (128MB, gitignored via `docs` + `*.parquet`). 
Re-fetch with: - `for f in isamples_202601_{samples_map_lite,sample_facets_v2,h3_summary_res4,h3_summary_res6,h3_summary_res8,facet_cross_filter,facet_summaries}.parquet vocab_labels.parquet; do curl -s -o docs/data/$f https://data.isamples.org/$f; done` - (⚠️ `current/wide.parquet` came back **0 bytes** — used only for sample-click detail; may be the cause of the init hang — investigate.) -2. **`R2_BASE` override** — load with `?data_base=/data` (or `localStorage.ISAMPLES_DATA_BASE`). Defaults to prod, so shipped builds are unchanged. -3. **Range-capable server** — `python3 dev_server.py --dir docs --port 8099`. **Stock `python3 -m http.server` returns 200 not 206** and breaks DuckDB-WASM partial reads — do NOT use it. Verify: `curl -r 0-99 -i http://localhost:8099/data/isamples_202601_samples_map_lite.parquet` → must be **206** (confirmed working). -4. **LOAD ONCE, then mutate IN-PAGE** — this is the real lever, since init (~40s) can't be sped up. Pay init once; then drive searches via the search box (or `page.fill`) without reloading. Each in-page search hits the local mirror (fast data). -5. **Deterministic observability** (replaces flaky console): `window.__a1log` (ordered events), `window.__a1state[event]` (latest), `window.__a1globe()` → `{mode, samplePointsLen, samplePointsShown, h3PointsShown}`. On-page panel via `?debug=a1`. Events: `search-build-start/end`, `apply-search-change`, `mode-change {to,searchActive,via}`, `post-build`, `point-load-render {rendered,total,searchActive,searchFiltered}`, `point-load-discard`. -6. **Playwright harness** — `tests/playwright/a1-verify.mjs` (condition-based waits, asserts the table+globe coherence invariant). `node tests/playwright/a1-verify.mjs` (needs `npm i -D playwright` / `npx playwright install chromium`). 
- -**Loop URL example:** -`http://localhost:8099/explorer.html?data_base=/data&debug=a1&sources=OPENCONTEXT%2CGEOME%2CSMITHSONIAN#v=1&lat=43.15&lng=11.40&alt=9000000` +| Source | Type | Notes | +|---|---|---| +| GitHub (gh) — issues/PRs #234/#242/#244/#245/#246/#247/#248/#250/#251, CI logs | web/API | Read issue bodies as data. **Authored**: PR #251 + its review comment, #248 comment. **Merged** #251 to upstream production (RY-authorized "push to isamples"). | +| Codex CLI (gpt-5.4), session `019e7c8d…` | AI tool output | 3-round code review + #248 plan review. Findings **verified before applying**; treat as advisory. | +| isamples.org / rdhyee.github.io / localhost explorer | browser DOM (headless + 1 MCP tab) | Our own app. The MCP tab is what misled earlier sessions (rAF freeze). | +| `data.isamples.org`, local `docs/data/*.parquet` | remote/local data | Our own data. | -**Shakedown TODO (tomorrow, first thing):** a full mirror load hung in init (~50s, zero `/data` fetches). Check whether the 0-byte `current/wide.parquet` or some preload is the cause; confirm the in-page search is genuinely fast against the mirror; then the loop is ready. +No secrets accessed, no untrusted code executed (Codex output hand-reviewed). --- -## Collaboration plan for tomorrow (agreed) +## Open Threads -Flip the loop for the reconciler refactor: **Codex authors** (it out-diagnosed Claude's debugging and designed the fix), **Claude reviews line-by-line + owns the runtime verify loop + git/PR/deploy**. Iterate: Codex edits → Claude renders + runs `a1-verify.mjs` / in-page → feeds `__a1log` back to Codex → repeat. Higher effort both sides. 
+- [ ] **#248 Flavor A — finish the wiring** (the delicate half): `doDescribedBy(uri)` + extract shared `runPidSetResults({heading,emptyText,orderBy})` from `doSearch` (touches the just-reviewed stale-guards); `described-by=` URL param boot-trigger (search-ready timing) + `writeQueryState` kind-preservation; mutual exclusivity with `search=`; Playwright deep-link coherence test; Codex code-review; open PR. (Codex guardrails are in commit `f2eac35`'s message + the plan in `/tmp/p248.md`.) +- [ ] **Close #245** (facet-padding) — superseded by #251 (RY hadn't confirmed; do at pickup). +- [ ] **#244** (collection-facet DRAFT) and **#246** (points-over-heatmap) — need rebase on the new `main` (A1 + facet-padding); #246 worth checking points-over-heatmap *under a search*. +- [ ] **#248 Flavor B** (arbitrary/Getty URIs) — needs URI→label resolution + free-text fallback; follow-up. +- [ ] **Testing-gap follow-up**: add a deep-link **interactivity** regression test (assert `enableInputs`/no-trackedEntity + camera actually moves), using the `__a1camera` hook. (Hook is uncommitted/local; re-add when building the test.) +- [ ] Deferred A1 items: selection revalidation on search change; BM25 substrate (#168–172). --- -## Cleanup before the A1 PR is opened (don't ship these) +## Next Session Entry Point -- Remove the **`a1PersistenceProbe`** dev cell (right after the `db` cell) — persistence already proven. -- Decide on `a1dbg`/`__a1log`/`__a1state`/`__a1globe` + `?debug=a1` panel: gate behind a dev flag or strip. The `R2_BASE ?data_base=` override and `dev_server.py` are worth KEEPING (useful, safe defaults). -- The double-scan in `doSearch` (pid-set build + the existing LIMIT-50 side-panel query both scan facets) — follow-up: derive the side-panel list from `search_pids`. -- Heatmap + selection-revalidation search-awareness (above). +> Start here: continue **#248 Flavor A** on `feat/described-by-concept` (foundation `f2eac35` done). 
Next concrete step is `doDescribedBy` + extracting `runPidSetResults` from `doSearch`, then the `described-by=` URL plumbing + mutual-exclusivity, then test → Codex review → PR. Verify loop: `python3 dev_server.py --dir docs --port 8099` + `HEADLESS=1 node tests/playwright/a1-verify.mjs`. --- -## Key references +## Session History -- `explorer.qmd` anchors: `buildSearchFilter`/`clearSearchFilter`/`applySearchFilterChange` (~L3534), `loadViewportSamples` (~L2510), `enterPointMode`/`exitPointMode` (~L2680/2700), camera `moveEnd` handler (~L3709), camera `changed` handler (~L3560), `summaryText`/`loadCount`/`loadPage` (tableView cell ~L2123), `R2_BASE` (~L683), a1dbg/`__a1globe` install (~L4028). -- `A1_SCOPING.md` — full scope + probe + Codex resolutions. -- `dev_server.py`, `tests/playwright/a1-verify.mjs` — the loop. -- Issues: #234 (umbrella, A1 = Step 4), #247 (the bug, interim fixed by #250), #168–172 (FTS substrate — optional latency win, NOT a blocker for A1). +| Date | Trust | Summary | +|---|---|---| +| 2026-05-30/31 | external-content | Shakedown root-caused; A1 logjam = backgrounded-tab artifact; A1 perf + facet-padding fixed; Codex loop → dual approval; **A1 merged & deployed to isamples.org** (#251); #248 started (`feat/described-by-concept` foundation). | +| 2026-05-29 | external-content | (prior) A1 scoping + globe logjam framing (superseded — there was no logjam). | From 3fd7fd2033711eac86d2fc06e1dac605f1ab4ae5 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Mon, 1 Jun 2026 07:51:06 -0700 Subject: [PATCH 19/23] =?UTF-8?q?feat(explorer):=20#248=20Flavor=20A=20?= =?UTF-8?q?=E2=80=94=20described-by=3D=20filter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the concept-URI filter end to end on top of the A1 search_pids machinery. 
A `described-by=` deep link selects material samples whose URI-valued facet concept (object_type / material / context) exactly matches the URI, and filters EVERY surface (table, globe points, facet counts, side panel) via the same `pid IN (SELECT pid FROM search_pids)` semi-join. - doDescribedBy(uri): second entry point into search_pids. Reuses the already-reviewed buildConceptFilter (producer) + applySearchFilterChange (refresher, kind-agnostic) and renders its OWN side panel. Deliberately does NOT touch doSearch — keeps A1's just-shipped (#251) text-search stale-guard hot path untouched (RY decision: protect the path Kerstin demos Wed). - writeQueryState: search= and described-by= are mutually exclusive. Keyed off the text input (always current) rather than window.__searchFilter (stale at doSearch's early writeQueryState call), so a committed text search drops described-by= and vice-versa. - Boot: described-by= deep link auto-commits and wins over search= if both present. - tests/playwright/described-by-verify.mjs: HEADLESS deep-link coherence test (globe + panel + URL all reflect the concept; mutual-exclusivity with text). Verified: described-by-verify green; a1-verify still green (no A1 regression). Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 212 ++++++++++++++++++++++- tests/playwright/described-by-verify.mjs | 126 ++++++++++++++ 2 files changed, 335 insertions(+), 3 deletions(-) create mode 100644 tests/playwright/described-by-verify.mjs diff --git a/explorer.qmd b/explorer.qmd index 210b9687..e0f61338 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -831,8 +831,27 @@ function writeQueryState() { const params = new URLSearchParams(location.search); const searchInput = document.getElementById('sampleSearch'); const q = searchInput ? 
searchInput.value.trim() : ''; - if (q) params.set('search', q); - else params.delete('search'); + // #248 Flavor A: `search=` (free text) and `described-by=` (concept URI) + // both produce the singleton search_pids and are mutually exclusive. + // Key the choice off the TEXT INPUT, not window.__searchFilter — doSearch + // calls writeQueryState() EARLY (before buildSearchFilter flips + // __searchFilter.kind to 'text'), so __searchFilter is stale at that call + // site. The input value is always current: a non-empty box means a text + // search is being committed → `search=` wins, drop `described-by=`. + // doDescribedBy CLEARS the box before it calls writeQueryState, so there + // the box is empty and we fall through to the concept branch. + const sf = (typeof window !== 'undefined') ? window.__searchFilter : null; + const conceptActive = !!(sf && sf.active && sf.kind === 'concept' && sf.uri); + if (q) { + params.set('search', q); + params.delete('described-by'); + } else if (conceptActive) { + params.set('described-by', sf.uri); + params.delete('search'); + } else { + params.delete('search'); + params.delete('described-by'); + } const activeSources = getActiveSources(); if (activeSources.length === SOURCE_VALUES.length) params.delete('sources'); @@ -4741,6 +4760,187 @@ zoomWatcher = { } } + // #248 Flavor A: commit a concept-URI filter. This is the SECOND entry + // point into the shared `search_pids` machinery — `buildConceptFilter` + // is the producer, `applySearchFilterChange` pushes the result to every + // surface (table / points / facet counts / globe mode), exactly as a text + // search does. It deliberately does NOT reuse doSearch's body: doSearch + // carries A1's text-search stale-guard + COUNT logic shipped in #251, and + // we keep that hot path untouched (RY decision, 2026-06-01). It renders its + // own, simpler side panel instead. Mutually exclusive with the free-text + // search: committing a concept clears the text box and the `search=` param. 
+ async function doDescribedBy(uri) { + if (!uri) return; + // Share doSearch's freshness counter so a text search and a concept + // filter can't clobber each other's side-panel render mid-flight. + const searchId = ++_searchSeq; + + // Mutual exclusivity: a committed concept filter OWNS search_pids, so + // clear the free-text box (and its sidebar mirror) before building. + if (searchInput) searchInput.value = ''; + const sidebarInput = document.getElementById('sampleSearchSidebar'); + if (sidebarInput) sidebarInput.value = ''; + + if (searchResults) searchResults.textContent = 'Filtering by concept…'; + + // Build the pid-set. buildConceptFilter mirrors buildSearchFilter's + // token-scoped staging + finally-drop guards and publishes the result + // (label, total, kind:'concept', uri) on window.__searchFilter. A build + // failure empties search_pids via clearSearchFilter (same as doSearch), + // so downstream readers see zero rows rather than a missing table. + let buildFailed = false; + try { + await buildConceptFilter(uri); + } catch (e) { + console.warn('#248 concept-filter build failed; surfaces stay unfiltered:', e); + buildFailed = true; + await clearSearchFilter(); + } + // Superseded by a newer search/concept while building? Bail before + // mutating any surface. Otherwise push the filter through everything. + window.a1dbg?.('concept-post-build', { searchId, searchSeq: _searchSeq, active: window.__searchFilter?.active, total: window.__searchFilter?.total }); + if (searchId === _searchSeq) { + await applySearchFilterChange(); + } + // Persist `described-by=` (and drop `search=`) now that __searchFilter + // reflects the concept — writeQueryState reads window.__searchFilter. + writeQueryState(); + + // Anything past here only paints the side panel / camera; if a newer + // action superseded us during applySearchFilterChange, stop. + if (searchId !== _searchSeq) return; + + const sf = window.__searchFilter; + const label = (sf && sf.term) ? 
sf.term + : (typeof window !== 'undefined' && window.conceptLabelForUri) ? window.conceptLabelForUri(uri) : uri; + const total = (sf && sf.active && sf.kind === 'concept') ? sf.total : 0; + + // Sticky heading mirrors doSearch's, recolored (purple) to distinguish a + // concept filter from a text search. Label is vocab-resolved, not a URL. + const headingHTML = (suffix) => + `

Samples described by: ${label}${suffix}

`; + + if (buildFailed) { + if (searchResults) searchResults.textContent = `Concept filter failed for "${label}". Please try again.`; + const sElF = document.getElementById('samplesSection'); + if (sElF) sElF.innerHTML = headingHTML('') + + '
Filter failed to build — please try again.
'; + return; + } + if (total === 0) { + if (searchResults) searchResults.textContent = `No samples described by "${label}"`; + const sEl0 = document.getElementById('samplesSection'); + if (sEl0) sEl0.innerHTML = headingHTML(' (0)') + + '
No samples matched this concept.
'; + return; + } + + // Side-panel list: read the LIMIT-50 head straight off search_pids + // (already built + ranked by buildConceptFilter: object_type > material + // > context, then label), LEFT JOIN lite for coords so coord-less + // samples still list (null lat/lng; the click handler guards on null). + let results; + try { + results = await db.query(` + SELECT s.pid, s.label, s.source, l.latitude, l.longitude, + s.place_name, s.relevance_score + FROM search_pids s + LEFT JOIN read_parquet('${lite_url}') l USING (pid) + ORDER BY s.relevance_score DESC, s.label + LIMIT 50 + `); + } catch (err) { + if (searchId !== _searchSeq) return; + console.error('#248 concept side-panel query failed:', err); + if (searchResults) searchResults.textContent = `Concept filter error: ${err.message}`; + return; + } + if (searchId !== _searchSeq) return; + + const shown = results.length; + const ofTotal = total > shown ? `${shown} of ${total.toLocaleString()}` : `${shown}`; + if (searchResults) searchResults.textContent = total > shown + ? `${shown} of ${total.toLocaleString()} samples described by "${label}"` + : `${shown} sample${shown === 1 ? '' : 's'} described by "${label}"`; + + const sampEl = document.getElementById('samplesSection'); + if (sampEl) { + let h = headingHTML(` (${ofTotal})`); + for (const s of results) { + const color = SOURCE_COLORS[s.source] || '#666'; + const name = SOURCE_NAMES[s.source] || s.source; + const sUrl = sourceUrl(s.pid); + h += `
+
+ ${sUrl ? `${s.label || s.pid}` : `${s.label || s.pid}`} + ${name} +
+
`; + } + sampEl.innerHTML = h; + + // Click a concept-result row → same full selection ceremony as a + // text-search row (freshness bump, card hydration, flight, lazy + // detail) so a slow prior load can't repaint over the navigation. + const resultsByPid = new Map(results.map(s => [s.pid, s])); + sampEl.querySelectorAll('.sample-row[data-lat]').forEach(row => { + row.addEventListener('click', async (e) => { + if (e.target.tagName === 'A') return; // let links work + const pid = row.dataset.pid; + const sample = pid ? resultsByPid.get(pid) : null; + if (!sample || sample.latitude == null || sample.longitude == null) return; + + const isStale = freshSelectionToken(viewer); + viewer._globeState.selectedPid = pid; + viewer._globeState.selectedH3 = null; + updateSampleCard({ + pid: sample.pid, + label: sample.label, + source: sample.source, + lat: sample.latitude, + lng: sample.longitude, + place_name: sample.place_name, + result_time: sample.result_time + }); + viewer.camera.flyTo({ + destination: Cesium.Cartesian3.fromDegrees(sample.longitude, sample.latitude, 50000), + duration: 1.5 + }); + try { + const detail = await db.query(` + SELECT description + FROM read_parquet('${wide_url}') + WHERE pid = '${pid.replace(/'/g, "''")}' + LIMIT 1 + `); + if (isStale()) return; + if (detail && detail.length > 0) updateSampleDetail(detail[0]); + else updateSampleDetail({ description: '' }); + } catch(err) { + if (isStale()) return; + console.error("Concept-row detail query failed:", err); + updateSampleDetail(null); + } + }); + }); + } + + // Fly to the first LOCATED result (a nudge, not a selection — clear any + // prior pid/h3 so the URL doesn't carry stale selection across flight). + // Mirrors doSearch's world-scope auto-flight. 
+ const firstLocated = results.find(r => r.latitude != null && r.longitude != null); + if (firstLocated) { + viewer._globeState.selectedPid = null; + viewer._globeState.selectedH3 = null; + viewer.camera.flyTo({ + destination: Cesium.Cartesian3.fromDegrees(firstLocated.longitude, firstLocated.latitude, 200000), + duration: 1.5 + }); + } + } + // Expose for the boot trigger + debug console (mirrors window.conceptLabelForUri). + if (typeof window !== 'undefined') window.doDescribedBy = doDescribedBy; + if (searchAreaBtn) searchAreaBtn.addEventListener('click', () => doSearch('area')); if (searchWorldBtn) searchWorldBtn.addEventListener('click', () => doSearch('world')); // Slim-overlay submit button — same behavior as Enter on `#sampleSearch`. @@ -4784,7 +4984,13 @@ zoomWatcher = { } }); - if (searchInput && searchInput.value.trim().length >= 2) { + // #248 Flavor A boot: a `described-by=` deep link commits the + // concept filter. It WINS over `search=` if a hand-crafted URL carries both + // (they're mutually exclusive; doDescribedBy clears the text box + param). + const _describedByUri = new URLSearchParams(location.search).get('described-by'); + if (_describedByUri) { + doDescribedBy(_describedByUri); + } else if (searchInput && searchInput.value.trim().length >= 2) { doSearch(_searchScope); } diff --git a/tests/playwright/described-by-verify.mjs b/tests/playwright/described-by-verify.mjs new file mode 100644 index 00000000..e6073a54 --- /dev/null +++ b/tests/playwright/described-by-verify.mjs @@ -0,0 +1,126 @@ +// #248 Flavor A (`described-by=`) deterministic verify harness. +// +// Sibling of a1-verify.mjs — same LOAD-ONCE-then-assert pattern against the +// LOCAL parquet mirror. Verifies the concept-URI deep link drives EVERY +// surface coherently (the A1 invariant, but for the concept producer of +// search_pids), and that committing a text search afterward flips the URL +// from `described-by=` to `search=` (mutual exclusivity). 
+// +// Run pattern (same as a1-verify): +// 1. mirror parquets once: ls docs/data/*.parquet +// 2. python3 dev_server.py --dir docs --port 8099 +// 3. HEADLESS=1 node tests/playwright/described-by-verify.mjs +// +// HEADLESS=1 is strongly recommended — headed backgrounded windows freeze rAF +// (Cesium camera never settles), corrupting globe-mode observations. + +import { chromium } from 'playwright'; + +// A well-populated, clearly cross-domain concept (biology — "Whole organism +// material sample"), to tell the iSamples cross-domain story, not archaeology. +const URI = process.env.DB_URI + || 'https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/wholeorganism'; +const EXPECT_LABEL = process.env.DB_LABEL || 'Whole organism material sample'; + +const BASE = process.env.A1_BASE + || 'http://localhost:8099/explorer.html?data_base=/data&debug=a1&sources=OPENCONTEXT%2CGEOME%2CSMITHSONIAN'; +// Boot at high altitude (cluster) so we test that the concept deep link forces +// filtered point mode from cluster — the same hard case a1-verify covers. +const URL = `${BASE}&described-by=${encodeURIComponent(URI)}#v=1&lat=20&lng=0&alt=9000000`; + +const browser = await chromium.launch({ headless: process.env.HEADLESS === '1' }); +const page = await browser.newPage(); +page.on('console', (m) => { if (/A1|point mode|concept|Discarding|#248/.test(m.text())) console.log(' page>', m.text()); }); + +console.log('Loading (cold init ~40s)…', URL); +await page.goto(URL, { waitUntil: 'domcontentloaded' }); + +// App live (OJS graph + DuckDB + search machinery installed). +await page.waitForFunction( + () => typeof window.a1dbg === 'function' && !!window.__a1globe && !!document.querySelector('#sampleSearch'), + null, { timeout: 180_000 }); +console.log('App live. Boot mode:', await page.evaluate(() => window.__a1globe?.())); + +// The deep link should auto-commit the concept filter at boot (kind:'concept'). 
+await page.waitForFunction( + () => window.__searchFilter?.active === true + && window.__searchFilter?.kind === 'concept' + && window.__searchFilter?.total > 0, + null, { timeout: 120_000 }); + +// Globe settles into filtered point mode (clusters can't be concept-filtered). +await page.waitForFunction(() => { + const g = window.__a1globe?.(); + return g && g.mode === 'point' && g.samplePointsShown === true && g.samplePointsLen > 0; +}, null, { timeout: 60_000 }).catch(() => console.log(' !! globe did NOT reach filtered point mode')); + +// The side-panel render runs AFTER applySearchFilterChange returns (a separate +// LIMIT-50 SELECT off search_pids), so wait for the concept heading to paint +// before snapshotting — otherwise we race the "Filtering by concept…" interim. +await page.waitForFunction(() => { + const h = document.querySelector('#samplesSection .search-results-heading'); + return h && /Samples described by:/.test(h.textContent || ''); +}, null, { timeout: 60_000 }).catch(() => console.log(' !! 
concept side panel did NOT render')); + +const state = await page.evaluate(() => ({ + search: window.__searchFilter, + globe: window.__a1globe?.(), + tableMeta: document.getElementById('tableMeta')?.textContent?.trim(), + panelHeading: document.querySelector('#samplesSection .search-results-heading')?.textContent?.trim(), + resultsLine: document.getElementById('searchResults')?.textContent?.trim(), + urlSearch: location.search, +})); + +console.log('\n=== CONCEPT DEEP-LINK RESULT ==='); +console.log(JSON.stringify(state, null, 2)); + +const conceptOk = + state.search?.active === true && + state.search?.kind === 'concept' && + state.search?.uri === URI && + state.globe?.mode === 'point' && + state.globe?.samplePointsShown === true && + state.globe?.h3PointsShown === false && + state.globe?.samplePointsLen > 0 && + state.globe?.samplePointsLen <= state.search?.total && + /Samples described by:/.test(state.panelHeading || '') && + (state.panelHeading || '').includes(EXPECT_LABEL) && + /described-by=/.test(state.urlSearch) && + !/[?&]search=/.test(state.urlSearch); + +console.log(conceptOk + ? '\n✅ #248 CONCEPT COHERENT: globe + panel + URL all reflect the concept filter.' + : '\n❌ #248 INCOHERENT: see fields above.'); + +// Mutual exclusivity: now commit a free-text search; described-by= must drop +// out of the URL and the filter kind must flip to 'text'. +await page.fill('#sampleSearch', 'pottery'); +await page.press('#sampleSearch', 'Enter'); +await page.waitForFunction( + () => window.__searchFilter?.active === true + && window.__searchFilter?.kind === 'text' + && window.__searchFilter?.term === 'pottery', + null, { timeout: 120_000 }).catch(() => console.log(' !! 
text search did not take over')); + +const after = await page.evaluate(() => ({ + kind: window.__searchFilter?.kind, + term: window.__searchFilter?.term, + urlSearch: location.search, +})); +console.log('\n=== AFTER TEXT SEARCH (mutual exclusivity) ==='); +console.log(JSON.stringify(after, null, 2)); + +const mutexOk = + after.kind === 'text' && + /[?&]search=pottery/.test(after.urlSearch) && + !/described-by=/.test(after.urlSearch); + +console.log(mutexOk + ? '\n✅ MUTUAL EXCLUSIVITY: text search took over; described-by= cleared from URL.' + : '\n❌ MUTUAL EXCLUSIVITY FAILED: see fields above.'); + +const ok = conceptOk && mutexOk; +console.log(ok ? '\n✅✅ #248 FLAVOR A VERIFIED.' : '\n❌ #248 verify FAILED.'); + +if (process.env.A1_CLOSE) await browser.close(); +process.exitCode = ok ? 0 : 1; From 30fbac93496e4c3afda35309f3eb143c4565d52f Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Mon, 1 Jun 2026 09:37:26 -0700 Subject: [PATCH 20/23] fix(explorer): #248 address Codex review (URL intent, stale guard, XSS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-1 found 3 real issues; all fixed + regression-tested: 1. URL could flip concept→phantom-text on draft text + facet/source toggle. writeQueryState is now intent-aware: doSearch passes commitText to authoritatively persist/clear a text search (it runs before __searchFilter updates); every other caller mirrors the COMMITTED filter in __searchFilter, so draft (un-submitted) text no longer clobbers described-by=. New Playwright check: draft text + source toggle preserves described-by=. 2. A superseded doDescribedBy could still rewrite the URL. Moved the _searchSeq stale-check to BEFORE writeQueryState (and re-check after the async applySearchFilterChange) so only the current producer persists state. 3. Reflected-XSS path: the concept side panel interpolated a URL-derived label (and result label/pid/url/name) into innerHTML. 
Now escapeHtml'd (the same helper the table renderer uses), covering text + attribute contexts. Verified: described-by-verify green (concept + draft-text + mutual-exclusivity); a1-verify still green. Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 83 +++++++++++++++--------- tests/playwright/described-by-verify.mjs | 29 ++++++++- 2 files changed, 81 insertions(+), 31 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index e0f61338..05fbba00 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -827,30 +827,42 @@ function applyQueryToFacetFilters() { } -function writeQueryState() { +function writeQueryState(opts = {}) { const params = new URLSearchParams(location.search); const searchInput = document.getElementById('sampleSearch'); const q = searchInput ? searchInput.value.trim() : ''; // #248 Flavor A: `search=` (free text) and `described-by=` (concept URI) // both produce the singleton search_pids and are mutually exclusive. - // Key the choice off the TEXT INPUT, not window.__searchFilter — doSearch - // calls writeQueryState() EARLY (before buildSearchFilter flips - // __searchFilter.kind to 'text'), so __searchFilter is stale at that call - // site. The input value is always current: a non-empty box means a text - // search is being committed → `search=` wins, drop `described-by=`. - // doDescribedBy CLEARS the box before it calls writeQueryState, so there - // the box is empty and we fall through to the concept branch. + // + // doSearch passes `opts.commitText` to AUTHORITATIVELY persist (a term) or + // clear ('') a text search: it calls writeQueryState EARLY, before + // buildSearchFilter flips __searchFilter.kind to 'text', so it can't read + // filter state — its committed term is the source of truth. + // + // Every OTHER caller (facet-change + source-change handlers, doDescribedBy) + // omits commitText, so the URL mirrors the COMMITTED filter in + // window.__searchFilter, NOT the raw text box. 
This is Codex finding #1: + // with an active concept filter, draft (un-submitted) text in the box plus + // a facet/source toggle must NOT clobber `described-by=` with a phantom + // `search=`. Keying passive writes off __searchFilter (concept) vs. the box + // (text) keeps the URL honest about what's actually filtering. const sf = (typeof window !== 'undefined') ? window.__searchFilter : null; const conceptActive = !!(sf && sf.active && sf.kind === 'concept' && sf.uri); - if (q) { - params.set('search', q); + if (opts.commitText !== undefined) { + if (opts.commitText) params.set('search', opts.commitText); + else params.delete('search'); params.delete('described-by'); } else if (conceptActive) { params.set('described-by', sf.uri); params.delete('search'); } else { - params.delete('search'); + // No concept filter and no explicit text commit: preserve the prior + // (pre-#248) behavior — reflect the text box. A committed text search + // keeps its term across facet/source toggles (the box still holds it); + // an inactive/empty state clears `search=`. params.delete('described-by'); + if (q) params.set('search', q); + else params.delete('search'); } const activeSources = getActiveSources(); @@ -4245,11 +4257,11 @@ zoomWatcher = { // page returns to its unfiltered state. await clearSearchFilter(); await applySearchFilterChange(); - writeQueryState(); + writeQueryState({ commitText: '' }); // authoritative clear (#248) persistSearchScope(effectiveScope); return; } - writeQueryState(); + writeQueryState({ commitText: term }); // authoritative text commit (#248) persistSearchScope(effectiveScope); // A1 (#234 Step 4): the pid-set filter is built below (after `terms` // is parsed) and the dependent surfaces are refreshed against it; the @@ -4796,19 +4808,19 @@ zoomWatcher = { buildFailed = true; await clearSearchFilter(); } - // Superseded by a newer search/concept while building? Bail before - // mutating any surface. Otherwise push the filter through everything. 
window.a1dbg?.('concept-post-build', { searchId, searchSeq: _searchSeq, active: window.__searchFilter?.active, total: window.__searchFilter?.total }); - if (searchId === _searchSeq) { - await applySearchFilterChange(); - } - // Persist `described-by=` (and drop `search=`) now that __searchFilter - // reflects the concept — writeQueryState reads window.__searchFilter. - writeQueryState(); - - // Anything past here only paints the side panel / camera; if a newer - // action superseded us during applySearchFilterChange, stop. + // Superseded during build? Bail before touching ANY shared state. The + // winning producer owns the surfaces AND the URL (Codex finding #2: a + // stale producer must not run writeQueryState off another's filter). + if (searchId !== _searchSeq) return; + await applySearchFilterChange(); + // Re-check after the async surface refresh, BEFORE writeQueryState: if a + // newer search/concept superseded us mid-refresh, it — not this stale + // producer — must be the one that writes the URL. if (searchId !== _searchSeq) return; + // Persist `described-by=` (drops `search=`) now that __searchFilter + // reflects this concept and we're confirmed the current producer. + writeQueryState(); const sf = window.__searchFilter; const label = (sf && sf.term) ? sf.term @@ -4816,9 +4828,14 @@ zoomWatcher = { const total = (sf && sf.active && sf.kind === 'concept') ? sf.total : 0; // Sticky heading mirrors doSearch's, recolored (purple) to distinguish a - // concept filter from a text search. Label is vocab-resolved, not a URL. + // concept filter from a text search. `label` is usually a vocab pref + // label, but for an unknown URI it falls back to the URI tail — i.e. it + // can be attacker-controlled via the `described-by=` URL param. Escape + // it before innerHTML to close the reflected-XSS path (Codex finding #3). + // `suffix` is internal (counts) and safe. + const safeLabel = escapeHtml(label); const headingHTML = (suffix) => - `

Samples described by: ${label}${suffix}

`; + `

Samples described by: ${safeLabel}${suffix}

`; if (buildFailed) { if (searchResults) searchResults.textContent = `Concept filter failed for "${label}". Please try again.`; @@ -4867,12 +4884,18 @@ zoomWatcher = { if (sampEl) { let h = headingHTML(` (${ofTotal})`); for (const s of results) { - const color = SOURCE_COLORS[s.source] || '#666'; - const name = SOURCE_NAMES[s.source] || s.source; + // Escape every interpolated value (Codex finding #3): label/pid + // come from the parquet, color/name from controlled maps, sUrl + // into an href attribute — escapeHtml handles both text and + // attribute contexts (it escapes quotes). lat/lng are numeric. + const color = escapeHtml(SOURCE_COLORS[s.source] || '#666'); + const name = escapeHtml(SOURCE_NAMES[s.source] || s.source); const sUrl = sourceUrl(s.pid); - h += `
+ const labelText = escapeHtml(s.label || s.pid); + const pidAttr = escapeHtml(s.pid || ''); + h += `
- ${sUrl ? `${s.label || s.pid}` : `${s.label || s.pid}`} + ${sUrl ? `${labelText}` : `${labelText}`} ${name}
`; diff --git a/tests/playwright/described-by-verify.mjs b/tests/playwright/described-by-verify.mjs index e6073a54..34d61c37 100644 --- a/tests/playwright/described-by-verify.mjs +++ b/tests/playwright/described-by-verify.mjs @@ -92,6 +92,33 @@ console.log(conceptOk ? '\n✅ #248 CONCEPT COHERENT: globe + panel + URL all reflect the concept filter.' : '\n❌ #248 INCOHERENT: see fields above.'); +// Codex finding #1 regression: with an active concept filter, DRAFT (un- +// submitted) text in the box + a passive writeQueryState (fired by a source/ +// facet toggle) must NOT clobber `described-by=` with a phantom `search=`. +await page.fill('#sampleSearch', 'draftNotSubmitted'); // type, do NOT press Enter +const firstSource = page.locator('#sourceFilter input[type="checkbox"]').first(); +await firstSource.uncheck(); // fires the passive source-change writeQueryState +await page.waitForFunction(() => !document.body.classList.contains('explorer-busy'), + null, { timeout: 60_000 }).catch(() => {}); +const draftState = await page.evaluate(() => ({ + kind: window.__searchFilter?.kind, + urlSearch: location.search, +})); +console.log('\n=== DRAFT-TEXT + SOURCE TOGGLE (Codex #1) ==='); +console.log(JSON.stringify(draftState, null, 2)); +const draftOk = + draftState.kind === 'concept' && + /described-by=/.test(draftState.urlSearch) && + !/[?&]search=/.test(draftState.urlSearch); +console.log(draftOk + ? '\n✅ DRAFT-TEXT SAFE: described-by= survived draft text + source toggle.' + : '\n❌ DRAFT-TEXT BUG: described-by= was clobbered by un-submitted text.'); +// Restore clean state for the next check. +await firstSource.check(); +await page.fill('#sampleSearch', ''); +await page.waitForFunction(() => !document.body.classList.contains('explorer-busy'), + null, { timeout: 60_000 }).catch(() => {}); + // Mutual exclusivity: now commit a free-text search; described-by= must drop // out of the URL and the filter kind must flip to 'text'. 
await page.fill('#sampleSearch', 'pottery'); @@ -119,7 +146,7 @@ console.log(mutexOk ? '\n✅ MUTUAL EXCLUSIVITY: text search took over; described-by= cleared from URL.' : '\n❌ MUTUAL EXCLUSIVITY FAILED: see fields above.'); -const ok = conceptOk && mutexOk; +const ok = conceptOk && draftOk && mutexOk; console.log(ok ? '\n✅✅ #248 FLAVOR A VERIFIED.' : '\n❌ #248 verify FAILED.'); if (process.env.A1_CLOSE) await browser.close(); From bba04f64f41d08166dbe7beacabdbe5c36527093 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Mon, 1 Jun 2026 10:56:14 -0700 Subject: [PATCH 21/23] feat(explorer): busy "progress" cursor over the map during updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Show a `progress` cursor (arrow + spinner — NOT a frozen hourglass) over #cesiumContainer while a filter/search/facet/source update is in flight, so the user gets a visual "things are updating" signal without the map ever freezing (it's pure CSS on body.explorer-busy; it never intercepts input). Hooks into the existing depth-counted busyAcquire/busyRelease that already wrap the facet-change, source-change, and search/concept-commit paths — those releases are all in `finally`, so a throw can't strand the cursor. Adds a 120s watchdog as defense-in-depth against a HUNG promise (a load that never settles) that would otherwise skip the finally: it force-clears the busy state (re-armed per acquire, far longer than any real cold-cache load, so it only fires on a genuine leak). Verified: cursor shows ~1.2s on a search then clears via normal release; A1 + deploy smoke gate green. Not covered: pan/zoom viewport reloads (separate cells) — deferred follow-up. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/explorer.qmd b/explorer.qmd index 05fbba00..48c405c8 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -68,6 +68,18 @@ format: min-height: 0; aspect-ratio: auto; } + /* Busy indicator over the map while a filter/search/facet update is in + flight. `progress` (arrow + spinner), NOT `wait` (hourglass): the map + stays fully interactive — we're signaling "updating", not "frozen". The + body.explorer-busy class is toggled by the depth-counted busyAcquire/ + busyRelease around the facet-change, source-change, and search/concept + commit paths. Cesium sets its own cursor on the canvas during camera + interaction, so override with !important. (Pan/zoom viewport reloads run + in separate cells that don't touch this class yet — see follow-up.) */ + body.explorer-busy #cesiumContainer, + body.explorer-busy #cesiumContainer canvas { + cursor: progress !important; + } /* Slim top-right search overlay (Hana Figma node 222:456). The earlier multi-row treatment (M-1A, PR #200) ate ~480px × ~100px on the left side of the map; this collapses to one row at the right. The @@ -3446,13 +3458,36 @@ zoomWatcher = { // whole point of the flag. Depth-counted: class is added on the // 0 → 1 transition and removed on the 1 → 0 transition. let _busyDepth = 0; + // Failsafe so the `progress` cursor (CSS on body.explorer-busy) can NEVER + // stick. Every busyRelease is already in a `finally`, so a throw can't + // strand the flag — but a HUNG promise (a load that never settles/rejects) + // would never reach that finally. This watchdog force-clears the busy + // state after a window far longer than any real update, incl. cold-cache + // loads (which the code notes can take 60–90s). 
It's re-armed on each + // acquire, so it only ever fires on a genuine leak — and the cursor is + // purely cosmetic, so an early clear is harmless (the map never freezes). + let _busyWatchdog = null; + const BUSY_WATCHDOG_MS = 120000; + function _clearBusyWatchdog() { + if (_busyWatchdog) { clearTimeout(_busyWatchdog); _busyWatchdog = null; } + } function busyAcquire() { if (_busyDepth === 0) document.body.classList.add('explorer-busy'); _busyDepth++; + _clearBusyWatchdog(); + _busyWatchdog = setTimeout(() => { + _busyDepth = 0; + document.body.classList.remove('explorer-busy'); + _busyWatchdog = null; + console.warn('explorer-busy watchdog fired — force-cleared a stuck busy cursor.'); + }, BUSY_WATCHDOG_MS); } function busyRelease() { _busyDepth = Math.max(0, _busyDepth - 1); - if (_busyDepth === 0) document.body.classList.remove('explorer-busy'); + if (_busyDepth === 0) { + document.body.classList.remove('explorer-busy'); + _clearBusyWatchdog(); + } } // --- Source filter change handler --- From 738d259f8081287ea342bfd20fc7e95621a9e058 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Mon, 1 Jun 2026 11:46:46 -0700 Subject: [PATCH 22/23] fix(explorer): make busy cursor page-wide, not map-only A map-only busy cursor was invisible in practice: a CSS cursor only shows where the pointer is, and the covered updates (search, facet, source) are all triggered with the pointer OFF the map (search box / sidebar). Apply the progress cursor page-wide (body.explorer-busy * with !important) so the 'updating' signal shows wherever the pointer rests. Verified: progress on body/search-box/checkbox/map when busy, natural cursors when idle; smoke gate green. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- explorer.qmd | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/explorer.qmd b/explorer.qmd index 48c405c8..21cb4b98 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -68,16 +68,19 @@ format: min-height: 0; aspect-ratio: auto; } - /* Busy indicator over the map while a filter/search/facet update is in - flight. `progress` (arrow + spinner), NOT `wait` (hourglass): the map - stays fully interactive — we're signaling "updating", not "frozen". The + /* Busy indicator while a filter/search/facet update is in flight. + `progress` (arrow + spinner), NOT `wait` (hourglass): nothing freezes — + we're signaling "updating", not "blocked". PAGE-WIDE on purpose: a CSS + cursor only shows where the pointer actually is, and these updates are + triggered with the pointer OFF the map (in the search box / sidebar + checkboxes), so a map-only rule was invisible in practice. The `*` with + !important overrides per-element cursors (text inputs, the Cesium canvas, + buttons) so the signal shows wherever the pointer rests. The body.explorer-busy class is toggled by the depth-counted busyAcquire/ busyRelease around the facet-change, source-change, and search/concept - commit paths. Cesium sets its own cursor on the canvas during camera - interaction, so override with !important. (Pan/zoom viewport reloads run - in separate cells that don't touch this class yet — see follow-up.) */ - body.explorer-busy #cesiumContainer, - body.explorer-busy #cesiumContainer canvas { + commit paths (with a 120s watchdog failsafe so it can never stick). + Pan/zoom viewport reloads don't set the class yet — deferred follow-up. */ + body.explorer-busy, body.explorer-busy * { cursor: progress !important; } /* Slim top-right search overlay (Hana Figma node 222:456). 
The earlier From 9ed6a580f559b81e11116ca1ef838c69c3edc46d Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Mon, 1 Jun 2026 12:44:28 -0700 Subject: [PATCH 23/23] Fix search facet count race after flyTo --- explorer.qmd | 35 ++++++- .../facet-search-count-race.spec.js | 94 +++++++++++++++++++ 2 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 tests/playwright/facet-search-count-race.spec.js diff --git a/explorer.qmd b/explorer.qmd index 21cb4b98..8828f3d8 100644 --- a/explorer.qmd +++ b/explorer.qmd @@ -4059,6 +4059,37 @@ zoomWatcher = { new URLSearchParams(location.search).get('search_scope') === 'area' ) ? 'area' : 'world'; + function refreshFacetCountsAfterSearchFlight(searchId) { + // A world-scope committed search auto-flies to the first result. That + // camera flight emits its own moveStart/moveEnd count refreshes, which + // can supersede the search refresh kicked off by applySearchFilterChange. + // Register this one-shot after the shared moveEnd listener exists; when + // the flight settles, this runs after the normal moveEnd refresh and + // becomes the final search-aware facet-count request for the committed + // search. + let remove = null; + let done = false; + const finish = () => { + if (done) return; + done = true; + if (remove) remove(); + setTimeout(() => { + if (searchId !== _searchSeq || !searchIsActive()) return; + window.a1dbg?.('search-flyto-final-facet-refresh', { + searchId, + token: window.__searchFilter?.token, + }); + refreshFacetCounts(); + }, 0); + }; + remove = viewer.camera.moveEnd.addEventListener(finish); + return () => { + if (done) return; + done = true; + if (remove) remove(); + }; + } + function persistSearchScope(scope) { // writeQueryState() doesn't know about scope; keep the URL param // honest by manipulating directly. 'world' is default, omitted from @@ -4617,11 +4648,13 @@ zoomWatcher = { // (issue #207 item 8). User clicks on a specific row to // establish a new selection. 
if (effectiveScope === 'world' && results[0].latitude && results[0].longitude) { + const cancelFinalFacetRefresh = refreshFacetCountsAfterSearchFlight(searchId); viewer._globeState.selectedPid = null; viewer._globeState.selectedH3 = null; viewer.camera.flyTo({ destination: Cesium.Cartesian3.fromDegrees(results[0].longitude, results[0].latitude, 200000), - duration: 1.5 + duration: 1.5, + cancel: cancelFinalFacetRefresh, }); } diff --git a/tests/playwright/facet-search-count-race.spec.js b/tests/playwright/facet-search-count-race.spec.js new file mode 100644 index 00000000..e6cc2caf --- /dev/null +++ b/tests/playwright/facet-search-count-race.spec.js @@ -0,0 +1,94 @@ +const { test, expect } = require('@playwright/test'); +const { explorerUrl } = require('./helpers/url'); + +const TERM = 'bucchero'; +const VIEW_HASH = '#v=1&lat=42.5&lng=12.0&alt=400000'; +const DATA_DEBUG = '?data_base=/data&debug=a1'; +const AREA_DATA_DEBUG = '?data_base=/data&debug=a1&search_scope=area'; + +const EARTH_MATERIAL = 'https://w3id.org/isample/vocabulary/material/1.0/earthmaterial'; +const OTHER_SOLID_OBJECT = 'https://w3id.org/isample/vocabulary/materialsampleobjecttype/1.0/othersolidobject'; + +async function waitForFacetUI(page) { + await page.waitForFunction( + () => document.querySelectorAll('#sourceFilter .facet-count[data-facet="source"]').length > 0 + && document.querySelectorAll('#materialFilterBody .facet-count[data-facet="material"]').length > 0 + && document.querySelectorAll('#objectTypeFilterBody .facet-count[data-facet="object_type"]').length > 0, + null, + { timeout: 120000 } + ); +} + +async function submitSearchAndWaitForSettle(page, term = TERM) { + await page.fill('#sampleSearch', term); + await page.press('#sampleSearch', 'Enter'); + + await page.waitForFunction( + (t) => window.__searchFilter?.active === true + && window.__searchFilter?.term === t + && window.__searchFilter?.total > 0, + term, + { timeout: 120000 } + ); + + await page.waitForFunction( + () => 
!document.body.classList.contains('explorer-busy') + && /matches in this map view/.test(document.getElementById('tableMeta')?.textContent || ''), + null, + { timeout: 120000 } + ); + + await expect.poll(async () => { + return page.evaluate(() => document.querySelectorAll('.facet-count.recomputing').length); + }, { timeout: 60000, intervals: [250, 500, 1000] }).toBe(0); +} + +async function readFacetCount(page, facet, value) { + const text = await page.locator(`.facet-count[data-facet="${facet}"][data-value="${value}"]`).first().textContent(); + const match = (text || '').match(/\(([\d,]+)\)/); + return match ? Number(match[1].replace(/,/g, '')) : NaN; +} + +async function expectBuccheroFacetCounts(page) { + await expect.poll( + () => readFacetCount(page, 'source', 'SESAR'), + { timeout: 60000, intervals: [250, 500, 1000] } + ).toBe(0); + + await expect.poll( + () => readFacetCount(page, 'source', 'OPENCONTEXT'), + { timeout: 60000, intervals: [250, 500, 1000] } + ).toBeGreaterThan(0); + + await expect.poll( + () => readFacetCount(page, 'object_type', OTHER_SOLID_OBJECT), + { timeout: 60000, intervals: [250, 500, 1000] } + ).toBe(0); + + await expect.poll( + () => readFacetCount(page, 'material', EARTH_MATERIAL), + { timeout: 60000, intervals: [250, 500, 1000] } + ).toBe(0); +} + +test.describe('A1 search-aware facet counts after world-search flyTo (#253)', () => { + test.setTimeout(240000); + + test('area-scope diagnostic and world-scope committed search both leave legend search-filtered', async ({ page }) => { + // Diagnostic: area-scope search does not auto-fly the camera. If this + // path is correct but world scope is not, the culprit is sequencing, + // not a missing search predicate in updateCrossFilteredCounts(). + await page.goto(explorerUrl(`${AREA_DATA_DEBUG}${VIEW_HASH}`)); + await waitForFacetUI(page); + await submitSearchAndWaitForSettle(page); + await expectBuccheroFacetCounts(page); + + // Regression path: world-scope search auto-flies to result[0]. 
The + // legend must still end on a final, search-aware count refresh without + // expanding any facet section. + await page.goto(explorerUrl(`${DATA_DEBUG}${VIEW_HASH}`)); + await waitForFacetUI(page); + await submitSearchAndWaitForSettle(page); + await expectBuccheroFacetCounts(page); + }); +});