diff --git a/docs/CICs/GaulLookupEnricher.md b/docs/CICs/GaulLookupEnricher.md index a864612..a9d7ef0 100644 --- a/docs/CICs/GaulLookupEnricher.md +++ b/docs/CICs/GaulLookupEnricher.md @@ -61,9 +61,13 @@ views-datafactory area-majority join), so this class does only a table join. - Output: the input frame (or, with `only_metadata=True`, just `pg_id_col` + `time_id_col`) left-merged with the 9 metadata columns. -- Side effects: logs the lookup size at construction (INFO); logs a WARNING with - the count and sample of unmatched cell ids when any occur; logs ignored - mapper-only kwargs at DEBUG. No file writes, no network. +- Public attribute: `lookup_version` — a short, stampable id read from the + lookup's embedded provenance at construction (`<region>@<digest8>`, or + `"unknown"` if the lookup carries none). The manager stamps it on each + delivery so a delivery is traceable to the exact lookup build. +- Side effects: logs the lookup size + version at construction (INFO); logs a + WARNING with the count and sample of unmatched cell ids when any occur; logs + ignored mapper-only kwargs at DEBUG. No file writes, no network. --- diff --git a/reports/predeploy_safe_checks.md b/reports/predeploy_safe_checks.md new file mode 100644 index 0000000..713f5e0 --- /dev/null +++ b/reports/predeploy_safe_checks.md @@ -0,0 +1,29 @@ +# Pre-deploy safe checks (read-only) + +Date: 2026-06-18. These only read files. No code changed. 
+ +| # | Check | Result | +|---|-------|--------| +| 1 | Lookup data clean — 64,742 cells, no nulls, no bad codes, coordinates in range | **PASS** | +| 2 | Disputed places labeled (see below) | **PASS** (for review) | +| 3 | New code runs without the heavy map library (geopandas) | **PASS** | +| 4 | A delivery is traceable to a lookup version | **PARTIAL** | + +## Check 2 — how GAUL labels sensitive places (for your eyes before FAO's) + +| Place | ISO code in data | GAUL name | Cells | +|-------|------------------|-----------|-------| +| Jammu & Kashmir | `xJK` | Jammu And Kashmir | 75 | +| Arunachal Pradesh | `xAP` | Arunachal Pradesh | 23 | +| Aksai Chin | `xAC` | Aksai Chin | 11 | +| Taiwan | `TWN` | "Taiwan Province of China" | 23 | +| Palestine | `PSE` | Palestine | 3 | +| Western Sahara | `ESH` | Western Sahara | 101 | +| Somaliland area | `SOM` | Somalia (the old "-99" is gone) | 234 | +| Crimea | — | not present in the lookup region | 0 | + +Non-standard `x*` ISO codes present: `xAB, xAC, xAP, xHT, xIT, xJK, xMS` (GAUL's codes for 7 disputed zones with no official ISO code). + +## Check 4 — the partial + +The lookup file itself is stamped with its version (region + source digests). But when a delivery is uploaded to FAO, the description only says a generic "ADR-011 lookup" — not the exact version. So today you can trace a delivery to "the lookup", but not to "this exact lookup build". Small gap; fixable later. 
@staticmethod
def _read_version(path: Path) -> str:
    """Return a short, stampable version id from the lookup's provenance.

    Reads the parquet file's key-value metadata and builds
    ``<region>@<digest8>`` (e.g. ``land_gaul@f74d3b2b``) so a delivery can
    be traced to the exact lookup build. The region comes from the
    ``region`` entry; the digest is the first 8 characters of
    ``content_digest`` under ``land_gaul_region`` in the JSON-encoded
    ``source_provenance`` entry.

    Args:
        path: Location of the lookup parquet file.

    Returns:
        ``"<region>@<digest>"``, where a half that is missing or cannot be
        read is rendered as ``"?"``; ``"unknown"`` when both halves are
        unavailable.
    """
    raw_meta = pq.read_metadata(path).metadata or {}

    def _decoded(key: bytes, default: str) -> str:
        # Decode only the requested entry, and leniently: an unrelated or
        # non-UTF-8 metadata value must not abort construction over what
        # is only a traceability stamp.
        value = raw_meta.get(key)
        if value is None:
            return default
        return value.decode("utf-8", errors="replace")

    region = _decoded(b"region", "?")
    digest = "?"

    try:
        prov = json.loads(_decoded(b"source_provenance", "{}"))
    except ValueError:
        # Malformed provenance JSON: keep the "?" placeholder.
        prov = None

    # Validate the shape explicitly instead of relying on exceptions: a
    # JSON null / number / list at any level would otherwise surface as an
    # uncaught TypeError from the slice below.
    if isinstance(prov, dict):
        source = prov.get("land_gaul_region")
        if isinstance(source, dict):
            content_digest = source.get("content_digest")
            if isinstance(content_digest, str):
                digest = content_digest[:8]

    if region == "?" and digest == "?":
        return "unknown"
    return f"{region}@{digest}"