From 53db2c44f048edb249b906e9d72a64850619f38f Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Sun, 14 Jun 2026 11:37:34 -0300 Subject: [PATCH 01/13] Add declarative Derivation model with operator registry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces `Derivation`, a Pydantic front-end over the existing `SpatialDerivation` / `Variable` machinery that validates operator names and operator-specific requirements at construction time (fail-fast). Field names follow STAC conventions where natural (`valid_from`, `valid_until`, `bbox`) — naming only, no STAC implementation. - `disscube/operators/base.py`: `OPERATOR_REGISTRY` as single source of truth for valid operator names and their constraints. - `disscube/derivation.py`: `Derivation` model with `to_variable()`, `to_spatial_derivation()`, and `spec_hash()` that delegates to `SpatialDerivation.spec_hash()`. - `disscube/client/cube_client.py`: thin `derive_declarative()` wrapper. - `tests/test_derivation.py`: 13 tests covering validation, roun hash consistency, and bbox exclusion from spec_hash. --- catalog.db | Bin 57344 -> 65536 bytes disscube/__init__.py | 3 +- disscube/client/cube_client.py | 23 +++++ disscube/derivation.py | 164 +++++++++++++++++++++++++++++++ disscube/operators/__init__.py | 3 +- disscube/operators/base.py | 34 +++++++ examples/drivers/catalog.db | Bin 0 -> 49152 bytes tests/test_derivation.py | 173 +++++++++++++++++++++++++++++++++ 8 files changed, 398 insertions(+), 2 deletions(-) create mode 100644 disscube/derivation.py create mode 100644 disscube/operators/base.py create mode 100644 examples/drivers/catalog.db create mode 100644 tests/test_derivation.py diff --git a/catalog.db b/catalog.db index 9f1b664a7104356ca7cca1530d95b7ca861f7fe5..751fbb6da13e6472702af9b6744aad0e0c0b61cd 100644 GIT binary patch delta 3461 zcmb_eTWl0n7~a|4LOVONXP1j+xtMl%01U8m=5l6c(?B7qK@9=wi^S-1X3h*<*-Ntv zY7{eFO%*gI-HgT;qJ%z~V6Yiu!V5)_grEr;MJ|a(@l8`neLxdT^vpu-x-E5=@-TnS z{4?{-ch3Kv??1z5BE!Sc(Up8U6bdEi1?a`p)@e+7eD4Xch`rEtH1_9s?Ts$~XzH0T z!}_;Y{ue+ z%MW6Do0UXKdP&F;AtTKFP}m?8alxj~ci^FI^^dzBr5|^bQuhvA#>K(2W2Nrv!*6#7 zP~OlM4K)0wo}1V#tX5TYn?jo1zrCyHSMVJQShW4n-&i z6yyJ)2))yr*UZO|ew6H8;g>S+bg-QON5Bnm2w

djg;p5rdg~<@mcjf+FX=yYty3 zxETtADO%o80^9;qV9HPKx(6|MNhU&uqaalgP0P?7S)s4$s-~GHQXEUll3e9L zwqg^z!aW+TE@wPYR)@kw#?YoY(;}9QbVPX(Th=tu#5z%JD63dg9NAWh4k>L%!}Kv!ZAla* z70UqL@Ljv!R+#HkLDXo6G`B$<|SVGBQc;>ks=o5<$ZIp|G_B)JL$mby58GOQWCl zb@27%#o|_P)5-6aaNt}B{0061zk^9|k=FB^pZMq`SQm|i7%m90Ah1D51|bmy5QKOT zcyDqd#hnlF8TP>9(dY^0{CMpX#vQ>dTU`#Td}#Gy_kL(~ww&febz9j2d0ZrP8`yXJ zZ$7+M;DQ}B?Uwx+;r89Wa*kx{9bonZ=*#6&K5+kOCNvWmcSXoNC5hA+m!1_Dym%C| zGGtrXL0lQ6ziM}uiDu*Q@Ks@pLM&n%NF`WyG-N6UrlupyDzY6-HFaXCmLm~GQ!HDR zsd*CBR4lA&CN>m9SG}(?%UoiYbI;i%J$qr&`|>3d=hN+SZKj&0HQ4kbE|-OuM{ty_ zu1!%rrjB%3ma5tQ>CYdD#XFeeA$}DrEZ!S^jX6GE8)YIR82J6yehxEnfBpBD!;z6K zyg&KrnXvmj$5z(_l1?4zxsR=3_Wb7tcUx~fBv{$A_ z;VY}WiA$0@_oP~%Q?riSoK>Q^Ia3|?^dZ%BB&a%~B+53Tja;{Bn?$Bbr7O{hG+n`> zLa-%Ej;K4>v2;R)w zH}tng*lAn4Bg7W*?h4mvunqp2xmSawC+fgp7fP#1nC-$Pn$LK3VM2*Wj!YG&*>WHr z#epJ0mhMpZbZpIWG?N$**_J6`(GaP-5)snlo(8F4=@B##X5s%Gwq3H#TpJwLPY&Gx lwPQ(+r^o6ryFIYg?c=pxHZrn1>A&^uH(@5>_ysk1&|oemg`9e58xMkDQM@U;P|%AAO~4=~ z2eS;=OR0i*5E^6|FY2K^cxnX+rbc@xm0mPb6%Q&##ob);;Qjc$clh%!mq~fWZj8uL zgpi81Kf#eK9t|%p0ZaT6t9&60YTPWOW-RQO-pH5T*wNk0o?T4%2!@5PPRXOmV~MFS zI4_K!yGI#=c^keOl|Ku?gPfk9ydtAu481sBLC@f=Z?IQV5uIFS5$u;)HK)ephP2U4^uc|SOp;1k-)$E3O;0dhB9+5fH~!`ID^FrqZ+NAs$(TQlnV$0}Fg0ub>GTkX zMFc+tim!pg~zZM1>SI{?(lYe3OvWsL+Tyq*^33tQb9k#1ysJYx( zu}t&jxkQqX8j^>^xHCg)VQ_=k)oE@fe!sD>YYzPVVA<76j(ODDwlFhy+8ygZrva5V diff --git a/disscube/__init__.py b/disscube/__init__.py index 05a2fd4..48bb79a 100644 --- a/disscube/__init__.py +++ b/disscube/__init__.py @@ -1,4 +1,5 @@ from disscube.client import CubeClient from disscube.models import GridSpec, SpatialSource, SpatialDerivation, Variable, DerivedVariable +from disscube.derivation import Derivation -__all__ = ["CubeClient", "GridSpec", "SpatialSource", "SpatialDerivation", "Variable", "DerivedVariable"] +__all__ = ["CubeClient", "GridSpec", "SpatialSource", "SpatialDerivation", "Variable", "DerivedVariable", "Derivation"] diff --git a/disscube/client/cube_client.py b/disscube/client/cube_client.py index 39aa0e1..7ed0e9f 100644 --- a/disscube/client/cube_client.py +++ b/disscube/client/cube_client.py @@ -90,6 +90,29 @@ def derive(self, derivation: SpatialDerivation, tile_id: Optional[str] = None) - if d.spec_hash == spec_hash ] + def derive_declarative( + self, + derivation: "Derivation", # noqa: F821 — imported lazily to avoid circular refs + grid_id: str, + tile_id: Optional[str] = None, + ) -> List[DerivedVariable]: + """ + Thin convenience wrapper: build a ``SpatialDerivation`` from a + declarative ``Derivation`` and call the existing ``derive()`` pipeline. + + No new execution logic is introduced here. + + Parameters + ---------- + derivation : Derivation + Declarative description of the derivation intent. + grid_id : str + Target grid identifier. + tile_id : str | None + Optional tile sub-identifier, forwarded to ``derive()``. + """ + return self.derive(derivation.to_spatial_derivation(grid_id), tile_id=tile_id) + def load( self, variable_id: str, diff --git a/disscube/derivation.py b/disscube/derivation.py new file mode 100644 index 0000000..6637929 --- /dev/null +++ b/disscube/derivation.py @@ -0,0 +1,164 @@ +""" +Declarative Derivation model — thin front-end over SpatialDerivation / Variable. + +Field names are chosen to be compatible with STAC conventions where natural: + +- ``valid_from`` / ``valid_until`` align with STAC ``start_datetime`` / + ``end_datetime`` (and ``datetime`` for the static/instant case). + No STAC logic is implemented here; the alignment is naming-only. +- ``bbox`` (optional, reserved) follows the STAC bounding-box field order: + [xmin, ymin, xmax, ymax] in EPSG:4326. The field is not populated + from data and is not used in execution. + +No STAC code, catalog, API, or export is implemented in this module. +""" + +import json +import hashlib +from pydantic import BaseModel, model_validator + +from disscube.operators.base import OPERATOR_REGISTRY +from disscube.models.variable import Variable, SpatialDerivation + + +class Derivation(BaseModel): + """ + Declarative description of a single derivation intent. + + Acts as a thin, additive front-end over the existing + ``SpatialDerivation`` / ``Variable`` machinery. Instantiation validates + the operator name and operator-specific field requirements so that errors + surface before any I/O (fail-fast). + + Parameters + ---------- + target : str + Name of the derived variable produced (maps to ``Variable.name``). + source_id : str + Identifier of the ``SpatialSource`` (passed through to + ``SpatialDerivation``). + operator : str + Operator name; must be a key in ``OPERATOR_REGISTRY``. + class_code : int | None + Target class code used by class-aware operators (e.g. ``"percentage"``). + Required when the operator demands it; reserved but optional otherwise. + role : str + Semantic role of the variable (e.g. ``"driver"``, ``"state"``). + Defaults to ``"driver"``. + valid_from : str | None + Start of the temporal validity window (ISO 8601 or year string). + Aligns with STAC ``start_datetime``. ``None`` means no lower bound. + valid_until : str | None + End of the temporal validity window (ISO 8601 or year string). + Aligns with STAC ``end_datetime``. ``None`` means no upper bound. + Both ``None`` → static variable (aligns with STAC ``datetime``). + purity_threshold : float | None + Reserved for future purity-masking logic. Included in + ``spec_hash()`` so that two derivations with different thresholds + are always distinct products. Currently unused in execution. + bbox : list[float] | None + Optional bounding box ``[xmin, ymin, xmax, ymax]`` in EPSG:4326. + Aligns with the STAC ``bbox`` field. Reserved — not used in + execution and excluded from ``spec_hash()``. + """ + + target: str + source_id: str + operator: str + class_code: int | None = None + role: str = "driver" + valid_from: str | None = None + valid_until: str | None = None + purity_threshold: float | None = None + bbox: list[float] | None = None + + @model_validator(mode="after") + def _validate_operator(self) -> "Derivation": + available = sorted(OPERATOR_REGISTRY) + if self.operator not in OPERATOR_REGISTRY: + raise ValueError( + f"Unknown operator {self.operator!r}. " + f"Available operators: {available}" + ) + meta = OPERATOR_REGISTRY[self.operator] + if meta["requires_class_code"] and self.class_code is None: + raise ValueError( + f"Operator {self.operator!r} requires class_code to be set." + ) + return self + + # ── Conversion helpers ──────────────────────────────────────────────────── + + def to_variable(self) -> Variable: + """ + Return the ``Variable`` instance corresponding to this derivation. + + Returns + ------- + Variable + With ``name=target``, ``operator=operator``, and + ``class_code=class_code``. + """ + return Variable( + name=self.target, + operator=self.operator, + class_code=self.class_code, + ) + + def to_spatial_derivation(self, grid_id: str) -> SpatialDerivation: + """ + Return the ``SpatialDerivation`` corresponding to this intent. + + This is the canonical bridge to the existing execution pipeline; + ``CubeClient.derive()`` accepts the result directly. + + Parameters + ---------- + grid_id : str + Target grid identifier (required by ``SpatialDerivation``). + + Returns + ------- + SpatialDerivation + """ + return SpatialDerivation( + source_id=self.source_id, + grid_id=grid_id, + role=self.role, + variables=[self.to_variable()], + valid_from=self.valid_from, + valid_until=self.valid_until, + ) + + # ── Reproducibility ─────────────────────────────────────────────────────── + + def spec_hash(self, grid_id: str = "__global__") -> str: + """ + Deterministic SHA-256 hash of the derivation spec. + + Delegates to ``SpatialDerivation.spec_hash()`` for the base fields, + then folds in ``purity_threshold`` when it is set. ``bbox`` is + excluded because it is descriptive metadata and does not affect + what is computed. + + Parameters + ---------- + grid_id : str + Grid identifier used for the underlying + ``SpatialDerivation.spec_hash()``. Defaults to ``"__global__"`` + for grid-agnostic comparisons. + + Returns + ------- + str + 64-character hex SHA-256 digest. + """ + base = self.to_spatial_derivation(grid_id).spec_hash() + if self.purity_threshold is None: + return base + payload = json.dumps( + {"base": base, "purity_threshold": self.purity_threshold}, + sort_keys=True, + ensure_ascii=False, + ).encode("utf-8") + return hashlib.sha256(payload).hexdigest() diff --git a/disscube/operators/__init__.py b/disscube/operators/__init__.py index 8d8e4db..68c526d 100644 --- a/disscube/operators/__init__.py +++ b/disscube/operators/__init__.py @@ -1,4 +1,5 @@ from .zonal import ZonalAggregator from .proximity import ProximityAggregator +from .base import OPERATOR_REGISTRY -__all__ = ["ZonalAggregator", "ProximityAggregator"] +__all__ = ["ZonalAggregator", "ProximityAggregator", "OPERATOR_REGISTRY"] diff --git a/disscube/operators/base.py b/disscube/operators/base.py new file mode 100644 index 0000000..4956490 --- /dev/null +++ b/disscube/operators/base.py @@ -0,0 +1,34 @@ +""" +Operator plugin registry for disscube. + +Maps each operator name to its metadata: which requirements must be satisfied +at Derivation construction time (fail-fast validation before any I/O). +""" + +from typing import TypedDict + + +class OperatorMeta(TypedDict): + requires_class_code: bool + + +# Single source of truth for valid operator names and their constraints. +# The Aggregator (pipeline/aggregator.py) dispatches by the same names; +# adding an operator here AND in the Aggregator's dispatch block is the +# complete registration pattern. +OPERATOR_REGISTRY: dict[str, OperatorMeta] = { + # ── Zonal operators (window-based aggregates over raster / vector data) ── + "mean": {"requires_class_code": False}, + "sum": {"requires_class_code": False}, + "std": {"requires_class_code": False}, + "min": {"requires_class_code": False}, + "max": {"requires_class_code": False}, + "majority": {"requires_class_code": False}, + "minority": {"requires_class_code": False}, + "percentage": {"requires_class_code": True}, # class_code selects the target class + "attribute": {"requires_class_code": False}, + "presence": {"requires_class_code": False}, + # ── Proximity operators (distance / density to features) ───────────────── + "min_distance": {"requires_class_code": False}, + "count": {"requires_class_code": False}, +} diff --git a/examples/drivers/catalog.db b/examples/drivers/catalog.db new file mode 100644 index 0000000000000000000000000000000000000000..eb958b040a13a9ea7d5605f95ac9b24a55fad845 GIT binary patch literal 49152 zcmeI%F>ljA6u@!2P2)6fno10o$j}=TsYQyID-o!NijYDv&`y@AO=~G_z;>%JFr={X z9r#Lo6$U2Q@VVD0A#tdLV6gs{VxKS9*L%Nrigdc?&$?j}h&O}i%uB?5Z9~&_?ST-Q zrWxsHE&beF^1XEPl0Nl`c3#ua?p=Pa)qiMa^_y1TsefG`*4JxW>wD(+wV&n}b2shg zLI42-5I_I{1Q0;r9|<^S+1hUCgYeuBhM_-n{U8c21K&-;exTZGj}GjPV~f4su6-ze zA0+mBLJe%nKXOCA^|G2pjc1~sdhw|mZ01IlgC?WKZ-Zl3cD9_=oY|g4p|9drbK}ZE zlX2y{ob6S~Fs#i@{o|{|d)*JBpzkH&;5;5TDk`MYdDykZxLY)jYbR0Y(dJf)^ObIj7hNGCCo z!Tc2etU6vTi{!7@>4Fi_pdT#wdbT37p4;WJ)oJ8;Di61<8pi0(a%MUDkXc@rFIJDz zl%?wYTgzJ|QCgIQnuyz0jg?VNH!RE2NAI&FWVPb#wltMM*7ZN8-&!hLJ62u|vT3;O z?6I8wz*gjsY)#%au~L{J;1$CPM@eKmY**5I_I{1Q0*~0R-6pvjz}A009IL zKmY**5I_I{1P~~`0Q>*q*O&|uKmY**5I_I{1Q0*~0R#|W|IZpg009ILKmY**5I_I{ L1Q0-=_yWHG_d;*n literal 0 HcmV?d00001 diff --git a/tests/test_derivation.py b/tests/test_derivation.py new file mode 100644 index 0000000..7222a71 --- /dev/null +++ b/tests/test_derivation.py @@ -0,0 +1,173 @@ +""" +Tests for the declarative Derivation model (disscube/derivation.py). +""" + +import pytest +from disscube.derivation import Derivation +from disscube.models.variable import Variable, SpatialDerivation + + +# ── Construction-time validation ────────────────────────────────────────────── + +def test_unknown_operator_raises(): + with pytest.raises(ValueError, match="Unknown operator"): + Derivation(target="x", source_id="s1", operator="nonexistent") + + +def test_unknown_operator_error_lists_available(): + with pytest.raises(ValueError, match="Available operators"): + Derivation(target="x", source_id="s1", operator="bogus") + + +def test_percentage_without_class_code_raises(): + with pytest.raises(ValueError, match="requires class_code"): + Derivation(target="forest_pct", source_id="mapbiomas_2020", operator="percentage") + + +def test_percentage_with_class_code_ok(): + d = Derivation( + target="forest_pct", + source_id="mapbiomas_2020", + operator="percentage", + class_code=3, + ) + assert d.class_code == 3 + + +def test_operator_without_class_code_requirement_accepts_none(): + d = Derivation(target="dist", source_id="roads", operator="min_distance") + assert d.class_code is None + + +# ── to_variable() round-trip ────────────────────────────────────────────────── + +def test_to_variable_fields(): + d = Derivation( + target="forest_pct", + source_id="mapbiomas_2020", + operator="percentage", + class_code=3, + role="driver", + valid_from="2020", + valid_until="2020", + ) + v = d.to_variable() + assert isinstance(v, Variable) + assert v.name == "forest_pct" + assert v.operator == "percentage" + assert v.class_code == 3 + + +def test_to_variable_no_class_code(): + d = Derivation(target="dist_road", source_id="roads", operator="min_distance") + v = d.to_variable() + assert v.name == "dist_road" + assert v.class_code is None + + +# ── spec_hash consistency with SpatialDerivation ───────────────────────────── + +def test_spec_hash_consistent_with_spatial_derivation(): + """ + to_spatial_derivation(grid_id).spec_hash() must equal the hash produced + by building an equivalent SpatialDerivation directly. + """ + d = Derivation( + target="forest_pct", + source_id="mapbiomas_2020", + operator="percentage", + class_code=3, + role="driver", + valid_from="2020", + valid_until="2020", + ) + grid_id = "acre_5km" + + derived_hash = d.to_spatial_derivation(grid_id).spec_hash() + + direct = SpatialDerivation( + source_id="mapbiomas_2020", + grid_id=grid_id, + role="driver", + variables=[Variable(name="forest_pct", operator="percentage", class_code=3)], + valid_from="2020", + valid_until="2020", + ) + assert derived_hash == direct.spec_hash() + + +def test_spec_hash_temporal_window_differs(): + """Different valid_from/valid_until must produce different hashes.""" + d2019 = Derivation( + target="forest_pct", source_id="mb", operator="percentage", + class_code=3, valid_from="2019", valid_until="2019", + ) + d2020 = Derivation( + target="forest_pct", source_id="mb", operator="percentage", + class_code=3, valid_from="2020", valid_until="2020", + ) + assert d2019.spec_hash() != d2020.spec_hash() + + +# ── purity_threshold changes spec_hash ─────────────────────────────────────── + +def test_purity_threshold_changes_spec_hash(): + base = Derivation( + target="forest_pct", + source_id="mapbiomas_2020", + operator="percentage", + class_code=3, + ) + with_threshold = Derivation( + target="forest_pct", + source_id="mapbiomas_2020", + operator="percentage", + class_code=3, + purity_threshold=0.9, + ) + assert base.spec_hash() != with_threshold.spec_hash() + + +def test_different_purity_thresholds_differ(): + d1 = Derivation( + target="forest_pct", source_id="mb", operator="percentage", + class_code=3, purity_threshold=0.7, + ) + d2 = Derivation( + target="forest_pct", source_id="mb", operator="percentage", + class_code=3, purity_threshold=0.9, + ) + assert d1.spec_hash() != d2.spec_hash() + + +# ── bbox excluded from spec_hash ───────────────────────────────────────────── + +def test_bbox_excluded_from_spec_hash(): + """ + Two Derivations differing only in bbox must produce the same spec_hash. + bbox is descriptive metadata, not a derivation parameter. + """ + d_no_bbox = Derivation( + target="forest_pct", + source_id="mapbiomas_2020", + operator="percentage", + class_code=3, + ) + d_with_bbox = Derivation( + target="forest_pct", + source_id="mapbiomas_2020", + operator="percentage", + class_code=3, + bbox=[-74.0, -18.0, -44.0, 5.0], + ) + assert d_no_bbox.spec_hash() == d_with_bbox.spec_hash() + + +# ── spec_hash determinism ───────────────────────────────────────────────────── + +def test_spec_hash_is_deterministic(): + d = Derivation( + target="forest_pct", source_id="mb", operator="percentage", class_code=3, + ) + assert d.spec_hash() == d.spec_hash() + assert len(d.spec_hash()) == 64 From bb9d881164d0a6506e8b042c3f5d449d4359f3e3 Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Sun, 14 Jun 2026 15:50:32 -0300 Subject: [PATCH 02/13] Refactoring disscube to make operators first-class citizens with auto-registration, fix architectural issues identified in audit. All derivation scripts pass. Next: commit the changes. --- catalog.db | Bin 65536 -> 69632 bytes disscube/derivation.py | 2 +- disscube/operators/__init__.py | 20 +- disscube/operators/base.py | 121 ++++++--- disscube/operators/proximity.py | 163 +++++++----- disscube/operators/zonal.py | 270 ++++++++++++++++---- disscube/pipeline/aggregator.py | 107 ++++---- disscube/pipeline/aligner.py | 190 ++++++++++---- disscube/pipeline/normalizer.py | 50 +++- examples/case_studies/brmangue/01_derive.py | 2 +- tests/test_aggregator.py | 170 ++++++++---- tests/test_aligner.py | 108 ++++++++ 12 files changed, 868 insertions(+), 335 deletions(-) create mode 100644 tests/test_aligner.py diff --git a/catalog.db b/catalog.db index 751fbb6da13e6472702af9b6744aad0e0c0b61cd..33904801bb496b996df596206af6d92cfd229a30 100644 GIT binary patch delta 5149 zcmdT|TWnlM8Qx>B?RIa+m-z13iAhx;mpyalb|$6jnkZBd)DV+CLe`db<{%_8aBgrFACm-+377~d>1H3AKQy=C|Vfy~BcRK0vo1K4a zg11-m58qPbCdHQqq9eQ4MTdiHHMzM@E#*n^MziBaa(7!b@^aDFc7L%6aFXuT7_Gs5 z(|yBz&AnP~IPhEdj{0u6=C(t~t%H!wq;jVd4XMdQTBl!3ol8xoJ6x0Hxl(RxrayBn z^KABV_7S(y{b43GJ#}BI^r_@TtK%_-C8bEqFw}uTh*_)qfiU{r@ zCaJFx*Ggb+ZArqIF~`(LfmMMvG!VY_HT96zQc)X7VvQQmgJa_Z;h>hslQXCZHy!|aS69Qj0@ZTw7f z;zLf9zq~7YX6TQ#ZIzy)$KNVMzbs~**64Dv-)V`q4)1ZAqtfuMns~|Yo|@VbogVIY zI->58ey1If?sl4v9dI^$#J!kk$i1F$cesIjkNXezRriW}%KfSPefK-=H{D0v1MWU| zqnpaTUf#Ij7rAKPQ!P$UbO2&^9DCQNs@t8u9eMOXv1`8BIoG5Q6dU8)qLHUWsV%IpZQUCXXfo(YxcamEmJ+_{usva*Hmd|aw5B05-wp*&rPG07H*5~n}%74vVIn2 zSqfjrV6!ZRyXIil#@fybN;Gk*P}?}$%wH}r$%eay_aWd>a*~> z>bhfz?Do`n-7)8xa{aj{oav|QYpW;kT2~G)9j~g|(G=ZzW>@sHQ?FHFKRSPLL-gdY zzE@z#C&XY|Qyx++LZ-bi1Tj{YgR)|weQ5%ZTV^HpFeX%EZdf2Oj(-12ceL$9 zE_(57Us`W_SdNbN9-iFMSibYj$DD;75spHxjUOUyBmq6QB4EalP$;1!wLTG=U=E)H z;!{vNYywLZh^55V6Kp|8yED%vvYS$cy3Zvab)K7^dcw(0%J$U6?d=oUMEhg6Mt?t1 zDF5!nC8r>e6-pWH8_qa|v4o*d0?Co^10{XP1ar@_P!h>mNCHLzt~|J+JRzBn-tBb5 zse7yTw3T=LYh6_#G@##_AS#4~dn~~gdq`pmI*p9-2v(e6ibQDrkcWiRz(-Q}G&F>U zRs!gj==8ro>$H}8{F05cN^sA~q)9{a)~PCP3lV#*>DA_mi7u^&(qDn=Ay6#G8s zP#_^^SUbOhcl7PO)swQN{O$8GqSZPKm_j(zN_qyC(i$cMPYI+0B9Vs`R9CDK%s5BB zfaC%q4E2LRdnBIgEs=lW-psz{)WmI-S$*_S^xDNj^y>@%tg}NSa=b#Ja|Pm2WhL>r zfI?=BXSwi!D=*Ze6oAfO0-Kv|kPs<)|FqFGupo^NKe@-Lw+N5uH8Lvgbx9 zD*+jR+me$$_;Ex?J;lO@5wjVlKEnNL# zs}&u2Wn%&9P*KS_>|X2}1`KE1z}7aza%7CJ10o zxGY-q)t9|woLDq{vM0Lw5-F`Uzs2gZGL!gjg7u5fITdwL+)%-hXDk#8f}obDkRc35 zCQoS&XB(kCA-IHdjHHNZDm}>vRCFthP^@B2?wOtc{1=8slpM;hb)3=trsHLzTb&%5m|1G_I_Xsxw5z9uMKPoO;!o#E;_qbXae#9i>)wHKof-n8Crp) z1k)P3C<`pnJ_*6r1X_U=Q3{KI3Q3tq5hW^&1!W0MR+-r1ynrTEvZMUM&%un*PzX5J zb8w2_J$>O5%HfDlFgW3Wfad}e%(%cBOf{7TTr#A`Sg63@!9x&Q5xi|DR4TSfPBsJ2 zR^Y%MPeMPaH?y;|GZTvMX;;D#r&khw2;JG41uL&h ziXf1?m$)qa;yE7zD-|UoF!S0%qz_6U#2KVfNC#?`;GaC@D(r))&H7q5u)qDyv$M~i z-}CH_?srE=-Cgtj9?P<7<^txj$CpsMXJYf7`F1tGJ^#kT`lEgA`CmQ550!nFzr9J1 z4$Rft*>u@^u$z~U@mr7BzD@=na}f0O`vWi4!7Zx$$ov{l}nZSpxaz* z>(f^n`n#=urPkW+zt+&5L%t34vZ3{Mpht%KD}L+IuDcDr<&Ix*Tic0?&sLmg+-BX@ z&8gLu($j{v8QN=SUkuw_okdDuqXlk(-@z~7Col<)gD=6SU@zDKhQUUY5(oOvz4P>| z`}jz`>U7wv2YblNTWhlv%+g?%`UUM>Q=D~YsWVGUZB9p9>p)AGV;1P~cLoQ87rkSC zpLZ{K)ISqk1z&?V3i-^+mvrs+%6)FrFZ5~XyN7-x-ReU)0vXk;maf@(C>TM1gm z<7XSDRm(r`tGF%0#KlI%Y0>6Xqf&axyiiEe=(?W=IB9`@z~A5xFb%#pdYlATz;EC@ z_!<0|FI@LA=yx5<_6le(pjtq^fLZ~dfS>@s0Ji|AfRf`_ZN90ZY=3L{PgY-_v&%h{ z*G_!sI%A2Ocivf8YOqY=5b`XHl8_6DITV>lWsDetGEQKuWK0?`nWk@$q zEz;vRj+mxH&QO#fOhTB-OsEt^Q4%XIS&U=`S(c?DWjKsY>qID121OheI#X=aH&I5M zu~mNl)!7rJe9`2&vibAe1mpYbUNLT^>e`~>wG-njs?ON@dcO10+)^Wx!4!V*V}sFYHgCKxKhB~FdD#at}a?U$dkYx$hu2s(Zi!7 diff --git a/disscube/derivation.py b/disscube/derivation.py index 6637929..53866cc 100644 --- a/disscube/derivation.py +++ b/disscube/derivation.py @@ -81,7 +81,7 @@ def _validate_operator(self) -> "Derivation": f"Available operators: {available}" ) meta = OPERATOR_REGISTRY[self.operator] - if meta["requires_class_code"] and self.class_code is None: + if meta.requires_class_code and self.class_code is None: raise ValueError( f"Operator {self.operator!r} requires class_code to be set." ) diff --git a/disscube/operators/__init__.py b/disscube/operators/__init__.py index 68c526d..357a01b 100644 --- a/disscube/operators/__init__.py +++ b/disscube/operators/__init__.py @@ -1,5 +1,21 @@ +""" +Operator registry for disscube. + +Importing this package is sufficient to populate ``OPERATOR_REGISTRY`` — +each submodule defines ``Operator`` subclasses that self-register via +``__init_subclass__``. +""" + +# Import submodules to trigger auto-registration of all operator classes. +from . import zonal, proximity # noqa: F401 + from .zonal import ZonalAggregator from .proximity import ProximityAggregator -from .base import OPERATOR_REGISTRY +from .base import OPERATOR_REGISTRY, Operator -__all__ = ["ZonalAggregator", "ProximityAggregator", "OPERATOR_REGISTRY"] +__all__ = [ + "Operator", + "OPERATOR_REGISTRY", + "ZonalAggregator", # legacy shim + "ProximityAggregator", # legacy shim +] diff --git a/disscube/operators/base.py b/disscube/operators/base.py index 4956490..f795c4e 100644 --- a/disscube/operators/base.py +++ b/disscube/operators/base.py @@ -1,34 +1,95 @@ """ -Operator plugin registry for disscube. +Operator plugin system for disscube. -Maps each operator name to its metadata: which requirements must be satisfied -at Derivation construction time (fail-fast validation before any I/O). +Every operator is a concrete subclass of ``Operator``. Defining ``name`` +on the subclass is the only registration step required — ``__init_subclass__`` +inserts it into ``OPERATOR_REGISTRY`` automatically. + +Adding a new operator: +1. Create a subclass of ``Operator`` anywhere that is imported at startup. +2. Set ``name`` and override ``resampling()`` / ``compute()`` as needed. +3. No changes to Aggregator or any other pipeline stage are required. """ -from typing import TypedDict - - -class OperatorMeta(TypedDict): - requires_class_code: bool - - -# Single source of truth for valid operator names and their constraints. -# The Aggregator (pipeline/aggregator.py) dispatches by the same names; -# adding an operator here AND in the Aggregator's dispatch block is the -# complete registration pattern. -OPERATOR_REGISTRY: dict[str, OperatorMeta] = { - # ── Zonal operators (window-based aggregates over raster / vector data) ── - "mean": {"requires_class_code": False}, - "sum": {"requires_class_code": False}, - "std": {"requires_class_code": False}, - "min": {"requires_class_code": False}, - "max": {"requires_class_code": False}, - "majority": {"requires_class_code": False}, - "minority": {"requires_class_code": False}, - "percentage": {"requires_class_code": True}, # class_code selects the target class - "attribute": {"requires_class_code": False}, - "presence": {"requires_class_code": False}, - # ── Proximity operators (distance / density to features) ───────────────── - "min_distance": {"requires_class_code": False}, - "count": {"requires_class_code": False}, -} +from __future__ import annotations + +from typing import TYPE_CHECKING, ClassVar + +import numpy as np +import xarray as xr +from rasterio.warp import Resampling + +if TYPE_CHECKING: + import geopandas as gpd + from disscube.models.grid import GridSpec + from disscube.models.variable import Variable + +# Maps operator name → Operator subclass. Populated automatically. +OPERATOR_REGISTRY: dict[str, type[Operator]] = {} + + +class Operator: + """ + Base class for all disscube derivation operators. + + Subclasses register themselves into ``OPERATOR_REGISTRY`` the moment the + class body is executed (via ``__init_subclass__``), so there is no + separate registration call. + + Parameters (class-level) + ------------------------ + name : str + Unique operator identifier used in ``Variable.operator`` and + ``OPERATOR_REGISTRY``. + requires_class_code : bool + When ``True``, ``Derivation`` enforces that ``class_code`` is set + at construction time (fail-fast validation). + _resampling : Resampling + Default rasterio resampling method used by ``GridAligner`` for + raster sources. Override in subclasses to match the operator's + aggregation semantics. + """ + + name: ClassVar[str] + requires_class_code: ClassVar[bool] = False + _resampling: ClassVar[Resampling] = Resampling.nearest + + def __init_subclass__(cls, **kwargs: object) -> None: + super().__init_subclass__(**kwargs) + if hasattr(cls, "name") and isinstance(cls.name, str): + OPERATOR_REGISTRY[cls.name] = cls + + @classmethod + def resampling(cls) -> Resampling: + """Rasterio resampling method for ``GridAligner``.""" + return cls._resampling + + def compute( + self, + data: "xr.DataArray | gpd.GeoDataFrame", + var: "Variable", + grid: "GridSpec", + ) -> xr.DataArray: + """ + Derive a single variable from aligned source data. + + Parameters + ---------- + data : xr.DataArray | geopandas.GeoDataFrame + Source already reprojected and clipped to ``grid`` extent. + ``DataArray`` for raster sources; ``GeoDataFrame`` for vectors. + var : Variable + Operator name and optional ``class_code`` for this variable. + grid : GridSpec + Target grid — provides ``rows``, ``cols``, ``transform``, + ``xs``, ``ys``, ``resolution``, and ``bbox``. + + Returns + ------- + xr.DataArray + Shape ``(rows, cols)`` with dims ``("y", "x")`` and coords + matching ``grid.ys`` / ``grid.xs``. + """ + raise NotImplementedError( + f"{type(self).__name__}.compute() is not implemented" + ) diff --git a/disscube/operators/proximity.py b/disscube/operators/proximity.py index facd80e..914028b 100644 --- a/disscube/operators/proximity.py +++ b/disscube/operators/proximity.py @@ -1,81 +1,108 @@ +""" +Proximity operators — Euclidean distance and feature-count over vector sources. +""" + +from __future__ import annotations + import numpy as np import xarray as xr import geopandas as gpd import rasterio.features -from affine import Affine -from scipy.ndimage import distance_transform_edt +from rasterio.warp import Resampling + +from disscube.operators.base import Operator +from disscube.models.variable import Variable +from disscube.models.grid import GridSpec -class ProximityAggregator: - @staticmethod - def aggregate(data: xr.DataArray | gpd.GeoDataFrame, variables, grid_spec): - rows = grid_spec.rows - cols = grid_spec.cols - transform = grid_spec.transform - xs = grid_spec.xs - ys = grid_spec.ys - var = variables[0] +class MinDistanceOperator(Operator): + """ + Euclidean distance (in CRS units) from each grid cell to the nearest + feature in the vector source. + """ + name = "min_distance" + _resampling = Resampling.nearest + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: if isinstance(data, gpd.GeoDataFrame): - if var.operator == "min_distance": - # Rasterize to binary mask (1 where features exist) - shapes = ((geom, 1) for geom in data.geometry if geom is not None) - mask = rasterio.features.rasterize( - shapes, - out_shape=(rows, cols), - transform=transform, - fill=0, - all_touched=True + from scipy.ndimage import distance_transform_edt + shapes = ((g, 1) for g in data.geometry if g is not None) + mask = rasterio.features.rasterize( + shapes, + out_shape=(grid.rows, grid.cols), + transform=grid.transform, + fill=0, + all_touched=True, + ) + dist = distance_transform_edt(1 - mask) * grid.resolution + return xr.DataArray( + dist, dims=("y", "x"), coords={"y": grid.ys, "x": grid.xs} + ) + if isinstance(data, xr.DataArray): + if "band" in data.dims: + data = data.isel(band=0) + return data.transpose("y", "x") + raise TypeError( + f"'min_distance' expects a vector source, got {type(data).__name__}" + ) + + +class CountOperator(Operator): + """Count of vector features whose centroid falls within each grid cell.""" + name = "count" + _resampling = Resampling.nearest + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, gpd.GeoDataFrame): + valid = data[data.geometry.notnull()] + if valid.empty: + counts = np.zeros((grid.rows, grid.cols), dtype=np.float64) + else: + minx, miny, maxx, maxy = grid.bbox + res = grid.resolution + centroids = valid.geometry.centroid + cols_idx = ((centroids.x - minx) / res).astype(int) + rows_idx = ((maxy - centroids.y) / res).astype(int) + in_bounds = ( + (cols_idx >= 0) & (cols_idx < grid.cols) + & (rows_idx >= 0) & (rows_idx < grid.rows) ) - # Distance transform - dist = distance_transform_edt(1 - mask) * grid_spec.resolution - - da = xr.DataArray(dist, dims=("y", "x"), coords={"y": ys, "x": xs}) - da.rio.write_crs(grid_spec.crs, inplace=True) - da.rio.write_transform(transform, inplace=True) - return da - - elif var.operator == "count": - # For count, we can use rasterize with 'merge_alg=ADD' if available, - # but rasterio's rasterize doesn't support ADD directly like that. - # Instead, we can use spatial join or iterate. - # Faster way for many points: use numpy bincount with cell indices. - - # Filter valid geometries - valid_data = data[data.geometry.notnull()] - if valid_data.empty: - counts = np.zeros((rows, cols)) - else: - # Get pixel coordinates - # Note: grid_spec.transform is from north-up (origin at top-left) - # col = (x - minx) / res - # row = (maxy - y) / res - minx, miny, maxx, maxy = grid_spec.bbox - res = grid_spec.resolution - - # Get centroids of geometries - centroids = valid_data.geometry.centroid - - cols_idx = ((centroids.x - minx) / res).astype(int) - rows_idx = ((maxy - centroids.y) / res).astype(int) - - # Filter indices within bounds - mask = (cols_idx >= 0) & (cols_idx < cols) & (rows_idx >= 0) & (rows_idx < rows) - cols_idx = cols_idx[mask] - rows_idx = rows_idx[mask] - - # Flatten indices for bincount - flat_idx = rows_idx * cols + cols_idx - counts_flat = np.bincount(flat_idx, minlength=rows * cols) - counts = counts_flat.reshape((rows, cols)) - - da = xr.DataArray(counts, dims=("y", "x"), coords={"y": ys, "x": xs}) - da.rio.write_crs(grid_spec.crs, inplace=True) - da.rio.write_transform(transform, inplace=True) - return da - + flat = rows_idx[in_bounds] * grid.cols + cols_idx[in_bounds] + counts = np.bincount( + flat, minlength=grid.rows * grid.cols + ).reshape((grid.rows, grid.cols)).astype(np.float64) + return xr.DataArray( + counts, dims=("y", "x"), coords={"y": grid.ys, "x": grid.xs} + ) if isinstance(data, xr.DataArray): if "band" in data.dims: data = data.isel(band=0) return data.transpose("y", "x") + raise TypeError( + f"'count' expects a vector source, got {type(data).__name__}" + ) + - return xr.DataArray(np.zeros((rows, cols)), dims=("y", "x"), coords={"y": ys, "x": xs}) +# --------------------------------------------------------------------------- +# Legacy compatibility shim +# --------------------------------------------------------------------------- + +class ProximityAggregator: + """ + Deprecated. Use ``MinDistanceOperator`` / ``CountOperator`` directly. + """ + + @staticmethod + def aggregate(data, variables, grid_spec): + import warnings + warnings.warn( + "ProximityAggregator is deprecated; use OPERATOR_REGISTRY operators directly.", + DeprecationWarning, + stacklevel=2, + ) + from disscube.operators.base import OPERATOR_REGISTRY + var = variables[0] + op_cls = OPERATOR_REGISTRY.get(var.operator) + if op_cls is None: + raise ValueError(f"Unknown operator: {var.operator}") + return op_cls().compute(data, var, grid_spec) diff --git a/disscube/operators/zonal.py b/disscube/operators/zonal.py index 3aa9032..95d7b98 100644 --- a/disscube/operators/zonal.py +++ b/disscube/operators/zonal.py @@ -1,63 +1,221 @@ +""" +Zonal operators — window-based aggregates over raster and vector sources. + +Each class registers itself automatically into ``OPERATOR_REGISTRY`` via +``Operator.__init_subclass__``. No changes to the pipeline are required +when a new operator is added here. +""" + +from __future__ import annotations + import numpy as np import xarray as xr -import rioxarray import geopandas as gpd import rasterio.features -from affine import Affine +from rasterio.warp import Resampling -class ZonalAggregator: - @staticmethod - def aggregate(data: xr.DataArray | gpd.GeoDataFrame, variables, grid_spec): - rows = grid_spec.rows - cols = grid_spec.cols - transform = grid_spec.transform - xs = grid_spec.xs - ys = grid_spec.ys +from disscube.operators.base import Operator +from disscube.models.variable import Variable +from disscube.models.grid import GridSpec + + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +def _rasterize(shapes: list, grid: GridSpec) -> np.ndarray: + """Rasterize ``(geometry, value)`` pairs onto ``grid``, returning float32.""" + if not shapes: + return np.zeros((grid.rows, grid.cols), dtype=np.float32) + return rasterio.features.rasterize( + shapes, + out_shape=(grid.rows, grid.cols), + transform=grid.transform, + fill=0, + all_touched=False, + ).astype(np.float32) + + +def _wrap(arr: np.ndarray, grid: GridSpec) -> xr.DataArray: + """Wrap a 2-D array as a grid-aligned DataArray.""" + return xr.DataArray(arr, dims=("y", "x"), coords={"y": grid.ys, "x": grid.xs}) + + +def _passthrough(data: xr.DataArray) -> xr.DataArray: + """Return the raster band as a plain (y, x) DataArray.""" + if "band" in data.dims: + data = data.isel(band=0) + return data.transpose("y", "x") + + +# --------------------------------------------------------------------------- +# Raster-only operators +# --------------------------------------------------------------------------- + +class MeanOperator(Operator): + name = "mean" + _resampling = Resampling.average + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, xr.DataArray): + return _passthrough(data) + raise TypeError(f"'mean' requires a raster source, got {type(data).__name__}") + + +class SumOperator(Operator): + name = "sum" + _resampling = Resampling.sum + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, xr.DataArray): + return _passthrough(data) + raise TypeError(f"'sum' requires a raster source, got {type(data).__name__}") + + +class StdOperator(Operator): + name = "std" + # rasterio has no standard-deviation resampling; nearest is a placeholder. + # True std requires a zonal-statistics pass (future work). + _resampling = Resampling.nearest + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, xr.DataArray): + return _passthrough(data) + raise TypeError(f"'std' requires a raster source, got {type(data).__name__}") + + +class MinOperator(Operator): + name = "min" + _resampling = Resampling.min + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, xr.DataArray): + return _passthrough(data) + raise TypeError(f"'min' requires a raster source, got {type(data).__name__}") + + +class MaxOperator(Operator): + name = "max" + _resampling = Resampling.max + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, xr.DataArray): + return _passthrough(data) + raise TypeError(f"'max' requires a raster source, got {type(data).__name__}") + +# --------------------------------------------------------------------------- +# Raster + vector operators +# --------------------------------------------------------------------------- + +class MajorityOperator(Operator): + name = "majority" + _resampling = Resampling.mode + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, xr.DataArray): + return _passthrough(data) + if isinstance(data, gpd.GeoDataFrame): + val = var.class_code if var.class_code is not None else 1 + shapes = [(g, val) for g in data.geometry if g is not None] + return _wrap(_rasterize(shapes, grid), grid) + raise TypeError(f"'majority' got unexpected type {type(data).__name__}") + + +class MinorityOperator(Operator): + name = "minority" + _resampling = Resampling.mode + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, xr.DataArray): + return _passthrough(data) + if isinstance(data, gpd.GeoDataFrame): + val = var.class_code if var.class_code is not None else 1 + shapes = [(g, val) for g in data.geometry if g is not None] + return _wrap(_rasterize(shapes, grid), grid) + raise TypeError(f"'minority' got unexpected type {type(data).__name__}") + + +class PercentageOperator(Operator): + name = "percentage" + requires_class_code = True + _resampling = Resampling.mode + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, xr.DataArray): + return _passthrough(data) + if isinstance(data, gpd.GeoDataFrame): + shapes = [(g, var.class_code) for g in data.geometry if g is not None] + return _wrap(_rasterize(shapes, grid), grid) + raise TypeError(f"'percentage' got unexpected type {type(data).__name__}") + + +class AttributeOperator(Operator): + """ + Rasterize a vector layer using a numeric column as the pixel value. + + The column name must match ``Variable.name`` (explicit contract: + register the source with a column that matches the variable you intend + to derive). + """ + name = "attribute" + _resampling = Resampling.nearest + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, xr.DataArray): + return _passthrough(data) + if isinstance(data, gpd.GeoDataFrame): + column = var.name + if column in data.columns: + valid = data[data.geometry.notnull() & data[column].notnull()] + shapes = list(zip(valid.geometry, valid[column])) + else: + shapes = [] + return _wrap(_rasterize(shapes, grid), grid) + raise TypeError(f"'attribute' got unexpected type {type(data).__name__}") + + +class PresenceOperator(Operator): + """Binary mask: 1 where any feature is present, 0 elsewhere.""" + name = "presence" + _resampling = Resampling.nearest + + def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: + if isinstance(data, xr.DataArray): + return _passthrough(data) if isinstance(data, gpd.GeoDataFrame): - ds = xr.Dataset(coords={"y": ys, "x": xs}) - ds.rio.write_crs(grid_spec.crs, inplace=True) - ds.rio.write_transform(transform, inplace=True) - - for var in variables: - op = var.operator - column = var.name if op == "attribute" else None - - if op == "presence": - val = var.class_code if var.class_code is not None else 1 - shapes = [(geom, val) for geom in data.geometry if geom is not None] - elif column and column in data.columns: - valid_mask = (data.geometry.notnull()) & (data[column].notnull()) - shapes = list(zip(data.geometry[valid_mask], data[column][valid_mask])) - else: - val = var.class_code if var.class_code is not None else 1 - shapes = [(geom, val) for geom in data.geometry if geom is not None] - - if not shapes: - raster = np.zeros((rows, cols), dtype=np.float32) - else: - raster = rasterio.features.rasterize( - shapes, - out_shape=(rows, cols), - transform=transform, - fill=0, - all_touched=False - ) - - ds[var.name] = (("y", "x"), raster) - - return ds - - if isinstance(data, xr.DataArray): - if "band" in data.dims: - data = data.isel(band=0) - - # Use the variable name if provided, or preserve current name - name = variables[0].name if variables else data.name - if not name: - name = "variable" - - # Convert single DataArray to Dataset for uniform writer handling - return data.transpose("y", "x").to_dataset(name=name) - - return xr.Dataset(coords={"y": ys, "x": xs}) + val = var.class_code if var.class_code is not None else 1 + shapes = [(g, val) for g in data.geometry if g is not None] + return _wrap(_rasterize(shapes, grid), grid) + raise TypeError(f"'presence' got unexpected type {type(data).__name__}") + + +# --------------------------------------------------------------------------- +# Legacy compatibility shim — kept so existing imports don't break +# --------------------------------------------------------------------------- + +class ZonalAggregator: + """ + Deprecated. Use operator classes directly via ``OPERATOR_REGISTRY``. + + Kept as a thin compatibility shim; delegates to the appropriate + ``Operator.compute()`` for each variable. + """ + + @staticmethod + def aggregate(data, variables, grid_spec): + import warnings + warnings.warn( + "ZonalAggregator is deprecated; use OPERATOR_REGISTRY operators directly.", + DeprecationWarning, + stacklevel=2, + ) + from disscube.operators.base import OPERATOR_REGISTRY + ds = xr.Dataset(coords={"y": grid_spec.ys, "x": grid_spec.xs}) + for var in variables: + op_cls = OPERATOR_REGISTRY.get(var.operator) + if op_cls is None: + raise ValueError(f"Unknown operator: {var.operator}") + da = op_cls().compute(data, var, grid_spec) + ds[var.name] = da + return ds diff --git a/disscube/pipeline/aggregator.py b/disscube/pipeline/aggregator.py index 48b644f..26670d1 100644 --- a/disscube/pipeline/aggregator.py +++ b/disscube/pipeline/aggregator.py @@ -1,32 +1,47 @@ -from disscube.pipeline import PipelineStage, PipelineContext -from disscube.operators import ZonalAggregator, ProximityAggregator +""" +Aggregator — derives each Variable by delegating to its Operator class. + +For raster sources ``ctx.data`` arrives as an ``xr.Dataset`` produced by +``GridAligner``, with one correctly-resampled DataArray per variable. +The operator's ``compute()`` receives that DataArray and returns it +(possibly with minor post-processing). + +For vector sources ``ctx.data`` is a ``GeoDataFrame`` (clipped, reprojected). +Each operator's ``compute()`` performs the rasterization / distance +transform / count and returns an ``(y, x)`` DataArray. + +Adding a new operator never requires touching this file. +""" + +from __future__ import annotations + import xarray as xr -import numpy as np +import rioxarray # noqa: F401 — registers the .rio accessor on Dataset/DataArray + +from disscube.pipeline import PipelineStage, PipelineContext +from disscube.operators.base import OPERATOR_REGISTRY def _crs_to_named_wkt(crs_str: str) -> str: """ - Converts a CRS string (proj4, EPSG, or WKT) to a WKT string with a - human-readable name. This prevents rioxarray from writing PROJCS["unknown"] - when the CRS has no official EPSG code (e.g. BDC Albers). + Convert a CRS string to a WKT with a human-readable name. + + Prevents rioxarray from writing ``PROJCS["unknown"]`` when the CRS has + no official EPSG code (e.g. BDC Albers). """ from pyproj import CRS crs = CRS.from_user_input(crs_str) - # If pyproj already resolved a name, use it as-is if crs.name and crs.name.lower() != "unknown": return crs.to_wkt() - # BDC Albers: assign a stable name so QGIS and other tools can identify it bdc_albers_proj4 = ( "+proj=aea +lat_0=-12 +lon_0=-54 +lat_1=-2 +lat_2=-22 " "+x_0=5000000 +y_0=10000000 +ellps=GRS80 +units=m +no_defs" ) - bdc_ref = CRS.from_proj4(bdc_albers_proj4) - if crs.equals(bdc_ref): - # Re-create from ESRI WKT with explicit name — QGIS recognises this - named_wkt = ( + if crs.equals(CRS.from_proj4(bdc_albers_proj4)): + return ( 'PROJCS["Brazil_Albers_Equal_Area_Conic",' 'GEOGCS["GCS_SIRGAS_2000",' 'DATUM["SIRGAS_2000",' @@ -42,72 +57,40 @@ def _crs_to_named_wkt(crs_str: str) -> str: 'PARAMETER["latitude_of_center",-12],' 'UNIT["Meter",1]]' ) - return named_wkt - # Fallback: return whatever pyproj generates return crs.to_wkt() class Aggregator(PipelineStage): def execute(self, ctx: PipelineContext) -> PipelineContext: grid = ctx.grid - vars = ctx.derivation.variables - data = ctx.data + source_data = ctx.data - # Initial empty dataset with correct grid coords final_ds = xr.Dataset(coords={"y": grid.ys, "x": grid.xs}) - - # Write CRS using named WKT so tools like QGIS don't fall back to - # EPSG:4326 when the CRS has no official EPSG code (e.g. BDC Albers). named_wkt = _crs_to_named_wkt(grid.crs) final_ds.rio.write_crs(named_wkt, inplace=True) final_ds.rio.write_transform(grid.transform, inplace=True) - - # Preserve spatial_ref before any merge — compat="override" can silently - # replace it with the CRS from the aggregation result (e.g. EPSG:4674), - # causing QGIS and other tools to misread the output CRS. spatial_ref = final_ds.coords.get("spatial_ref") - for i, var in enumerate(vars): - # Selection logic for multi-band DataArray - var_data = data - if isinstance(data, xr.DataArray) and "band" in data.dims and data.sizes["band"] > 1: - # Use band_map if available, otherwise fallback to positional index - if ctx.source.band_map and var.name in ctx.source.band_map: - band_idx = ctx.source.band_map[var.name] - 1 # 1-based to 0-based - if 0 <= band_idx < data.sizes["band"]: - var_data = data.isel(band=band_idx) - else: - raise ValueError( - f"Band index {band_idx+1} for variable '{var.name}' is out of range. " - f"Data has {data.sizes['band']} bands." - ) - elif i < data.sizes["band"]: - var_data = data.isel(band=i) - else: - raise ValueError( - f"No band available for variable '{var.name}' at index {i}. " - f"Data has {data.sizes['band']} bands and no band_map was provided." - ) - - if var.operator in ["mean", "sum", "std", "min", "max", "majority", "minority", "percentage", "attribute", "presence"]: - res = ZonalAggregator.aggregate(var_data, [var], grid) - elif var.operator in ["min_distance", "count"]: - res = ProximityAggregator.aggregate(var_data, [var], grid) - else: - raise ValueError(f"Unknown operator: {var.operator}") - - # Merging result into final dataset - if isinstance(res, xr.Dataset): - if var.name in res.data_vars: - final_ds[var.name] = res[var.name] - else: - final_ds = final_ds.merge(res, compat="override") + for var in ctx.derivation.variables: + op_cls = OPERATOR_REGISTRY.get(var.operator) + if op_cls is None: + available = sorted(OPERATOR_REGISTRY) + raise ValueError( + f"Unknown operator {var.operator!r}. " + f"Available: {available}" + ) + + # For raster sources GridAligner returns a Dataset keyed by + # variable name; for vector sources it returns a GeoDataFrame. + if isinstance(source_data, xr.Dataset): + var_data = source_data[var.name] else: - final_ds[var.name] = res + var_data = source_data + + result: xr.DataArray = op_cls().compute(var_data, var, grid) + final_ds[var.name] = result - # Restore spatial_ref after every merge — compat="override" may have - # replaced it with the result's CRS coordinate. if spatial_ref is not None: final_ds = final_ds.assign_coords({"spatial_ref": spatial_ref}) diff --git a/disscube/pipeline/aligner.py b/disscube/pipeline/aligner.py index e416aa9..17b9897 100644 --- a/disscube/pipeline/aligner.py +++ b/disscube/pipeline/aligner.py @@ -1,69 +1,147 @@ -import rasterio -from rasterio.warp import calculate_default_transform, reproject, Resampling -import geopandas as gpd +""" +GridAligner — reprojects source data to the target GridSpec. + +For raster sources each variable is aligned independently using the +resampling method declared by its Operator class, so ``majority`` uses +``Resampling.mode`` while ``mean`` uses ``Resampling.average`` — even when +both are derived from the same multi-band file. + +The output is an ``xr.Dataset`` keyed by variable name; the Aggregator +picks each DataArray and delegates to the operator's ``compute()`` method. + +For vector sources the GeoDataFrame is reprojected and clipped to the grid +bounding box; the Aggregator then calls each operator's ``compute()`` to +rasterize. + +Invariant enforced at the end of raster alignment: + aligned.rio.shape == (grid.rows, grid.cols) + +A mismatch raises ``ValueError`` immediately so misalignments surface as +loud errors rather than silent downstream corruption. +""" + +from __future__ import annotations + +import logging import xarray as xr -import numpy as np +import geopandas as gpd +from shapely.geometry import box + from disscube.pipeline import PipelineStage, PipelineContext -from disscube.models import GridSpec, Variable +from disscube.models.grid import GridSpec +from disscube.models.variable import Variable + +log = logging.getLogger(__name__) + class GridAligner(PipelineStage): def execute(self, ctx: PipelineContext) -> PipelineContext: grid = ctx.grid fmt = ctx.source.format - + if fmt == "raster": - # For raster, we'll use rasterio to reproject/crop to grid bbox/resolution - # Pass variables to decide resampling method - ctx.data = self._align_raster(ctx.source.asset_url, grid, ctx.derivation.variables) + ctx.data = self._align_raster( + ctx.source.asset_url, + grid, + ctx.derivation.variables, + ctx.source.band_map, + ) elif fmt == "vector": - # For vector, ensure CRS matches grid - gdf = ctx.data - if gdf.crs != grid.crs: + gdf: gpd.GeoDataFrame = ctx.data + if str(gdf.crs) != str(grid.crs): gdf = gdf.to_crs(grid.crs) - # Crop to bbox - from shapely.geometry import box - bbox_geom = box(*grid.bbox) - gdf = gdf.clip(bbox_geom) + gdf = gdf.clip(box(*grid.bbox)) ctx.data = gdf - + return ctx - def _align_raster(self, url: str, grid: GridSpec, variables: list[Variable] = None) -> xr.DataArray: - import rioxarray - from rasterio.warp import Resampling - from affine import Affine - - # Load the raster - ds = rioxarray.open_rasterio(url) - - # Determine resampling method based on operator - resampling_method = Resampling.nearest - if variables: - op = variables[0].operator.lower() - if op in ["mean", "average"]: - resampling_method = Resampling.average - elif op in ["majority", "mode"]: - resampling_method = Resampling.mode - elif op == "max": - resampling_method = Resampling.max - elif op == "min": - resampling_method = Resampling.min - elif op == "sum": - resampling_method = Resampling.sum - - # Target size - rows = grid.rows - cols = grid.cols - - # Target transform - target_transform = grid.transform - - # Use rioxarray's reproject to match the target grid perfectly - aligned = ds.rio.reproject( - grid.crs, - shape=(rows, cols), - transform=target_transform, - resampling=resampling_method - ) - - return aligned + # ------------------------------------------------------------------ + # Raster alignment — one aligned DataArray per derived variable + # ------------------------------------------------------------------ + + def _align_raster( + self, + url: str, + grid: GridSpec, + variables: list[Variable], + band_map: dict[str, int], + ) -> xr.Dataset: + """ + Reproject and resample the source raster for each variable. + + Returns an ``xr.Dataset`` with one data variable per ``Variable`` in + ``variables``, each resampled with the method appropriate for its + operator (e.g. ``mode`` for ``majority``, ``average`` for ``mean``). + + Parameters + ---------- + url : str + Path or URL to the source raster. + grid : GridSpec + Target spatial grid. + variables : list[Variable] + Variables to derive; determines band selection and resampling. + band_map : dict[str, int] + Optional ``{variable_name: 1-based band index}`` from the source. + """ + import rioxarray # noqa: F401 — registers the .rio accessor + from disscube.operators.base import OPERATOR_REGISTRY + + ds_src = __import__("rioxarray").open_rasterio(url) + + result = xr.Dataset(coords={"y": grid.ys, "x": grid.xs}) + + for i, var in enumerate(variables): + # ── Band selection ───────────────────────────────────────── + if "band" in ds_src.dims and ds_src.sizes["band"] > 1: + if band_map and var.name in band_map: + band_idx = band_map[var.name] - 1 # 1-based → 0-based + if not (0 <= band_idx < ds_src.sizes["band"]): + raise ValueError( + f"Band index {band_idx + 1} for variable " + f"'{var.name}' is out of range; " + f"source has {ds_src.sizes['band']} bands." + ) + elif i < ds_src.sizes["band"]: + band_idx = i + else: + raise ValueError( + f"No band available for variable '{var.name}' at " + f"index {i}; source has {ds_src.sizes['band']} bands " + "and no band_map was provided." + ) + band = ds_src.isel(band=band_idx) + else: + band = ds_src.isel(band=0) if "band" in ds_src.dims else ds_src + + # ── Per-operator resampling method ───────────────────────── + op_cls = OPERATOR_REGISTRY.get(var.operator) + resampling = op_cls.resampling() if op_cls else __import__( + "rasterio.warp", fromlist=["Resampling"] + ).Resampling.nearest + + # ── Reproject to target grid ─────────────────────────────── + aligned = band.rio.reproject( + grid.crs, + shape=(grid.rows, grid.cols), + transform=grid.transform, + resampling=resampling, + ) + + # ── Alignment invariant ──────────────────────────────────── + actual = aligned.rio.shape + expected = (grid.rows, grid.cols) + if actual != expected: + raise ValueError( + f"GridAligner: variable '{var.name}' alignment produced " + f"shape {actual}, expected {expected} for grid '{grid.id}'. " + f"Source: {url}" + ) + + result[var.name] = aligned.transpose("y", "x") + log.debug( + "aligned '%s' via '%s' (resampling=%s, shape=%s)", + var.name, var.operator, resampling.name, actual, + ) + + return result diff --git a/disscube/pipeline/normalizer.py b/disscube/pipeline/normalizer.py index 359abeb..199130d 100644 --- a/disscube/pipeline/normalizer.py +++ b/disscube/pipeline/normalizer.py @@ -1,24 +1,58 @@ +import logging + import rasterio import geopandas as gpd + from disscube.pipeline import PipelineStage, PipelineContext +log = logging.getLogger(__name__) + + class Normalizer(PipelineStage): def execute(self, ctx: PipelineContext) -> PipelineContext: url = ctx.source.asset_url fmt = ctx.source.format - + if fmt == "raster": - with rasterio.open(url) as src: - # Validation only, don't read yet or just check metadata + # Validate that the file is readable; metadata is loaded lazily + # by GridAligner to avoid double I/O. + with rasterio.open(url): pass + elif fmt == "vector": - # For vector, we often need to load it to validate gdf = gpd.read_file(url) - # Inject CRS from source if available, as the file might have it wrong or missing + if ctx.source.crs: - gdf.crs = ctx.source.crs + from pyproj import CRS as ProjCRS + declared = ctx.source.crs + file_crs = gdf.crs + try: + crs_match = ( + file_crs is not None + and ProjCRS.from_user_input(file_crs).equals( + ProjCRS.from_user_input(declared) + ) + ) + except Exception: + crs_match = False + + if not crs_match: + # CRS in the file header is absent or differs from the + # registered source CRS. Override the metadata without + # transforming coordinates — the source registration is + # the authoritative reference. + log.warning( + "Normalizer: overriding file CRS (%s) with source CRS (%s) " + "for source '%s'. Coordinates are NOT reprojected.", + file_crs, + declared, + ctx.source.id, + ) + gdf = gdf.set_crs(declared, allow_override=True) + ctx.data = gdf + else: - raise ValueError(f"Unknown format: {fmt}") - + raise ValueError(f"Unknown source format: {fmt!r}") + return ctx diff --git a/examples/case_studies/brmangue/01_derive.py b/examples/case_studies/brmangue/01_derive.py index d40fa50..94c74dc 100644 --- a/examples/case_studies/brmangue/01_derive.py +++ b/examples/case_studies/brmangue/01_derive.py @@ -11,7 +11,7 @@ from disscube.models import SpatialDerivation, Variable -GRID_ID = "MA/100m" +GRID_ID = "ilha_maranhao/100m" SOURCE_ID = "maranhao_base" cube = CubeClient(catalog="catalog.db", store="./data/") diff --git a/tests/test_aggregator.py b/tests/test_aggregator.py index ce23f9b..63cb524 100644 --- a/tests/test_aggregator.py +++ b/tests/test_aggregator.py @@ -1,61 +1,129 @@ +""" +Tests for the Aggregator pipeline stage. + +After the operator refactor the Aggregator's contract is: + - Dispatch to the correct Operator class via OPERATOR_REGISTRY. + - Accept xr.Dataset (raster path, from GridAligner) or GeoDataFrame (vector). + - Raise ValueError for unknown operators. + +Band selection has moved to GridAligner; see test_aligner.py for those tests. +""" + import pytest -import xarray as xr import numpy as np +import xarray as xr +import rioxarray # noqa: F401 +import geopandas as gpd +from shapely.geometry import box + from disscube.pipeline import PipelineContext from disscube.pipeline.aggregator import Aggregator from disscube.models import GridSpec, SpatialSource, SpatialDerivation, Variable -def test_aggregator_strict_bands(): - grid = GridSpec(id="G1", type="local", crs="EPSG:31982", resolution=10, bbox=[0,0,100,100]) - source = SpatialSource( - id="S1", name="S1", format="raster", asset_url="test.tif", crs="EPSG:31982" + +def _grid(): + return GridSpec( + id="G1", type="local", crs="EPSG:31982", resolution=10, bbox=[0, 0, 100, 100] ) - derivation = SpatialDerivation( - source_id="S1", grid_id="G1", role="test", - variables=[Variable(name="B1", operator="mean")] - ) - - # 2-band data - data = xr.DataArray( - np.zeros((2, 10, 10)), - dims=("band", "y", "x"), - coords={"band": [1, 2], "y": grid.ys, "x": grid.xs} - ) - - ctx = PipelineContext(source=source, grid=grid, derivation=derivation, data=data) - agg = Aggregator() - - # 1 variable, will get band index 0. Should pass. - agg.execute(ctx) - - # Reset data for second test run - ctx.data = data - - # Now try to get a variable at index 2 (doesn't exist) - derivation.variables = [Variable(name="B1", operator="mean"), Variable(name="B2", operator="mean"), Variable(name="B3", operator="mean")] - with pytest.raises(ValueError, match="No band available for variable 'B3' at index 2"): - agg.execute(ctx) - -def test_aggregator_band_map_out_of_range(): - grid = GridSpec(id="G1", type="local", crs="EPSG:31982", resolution=10, bbox=[0,0,100,100]) - # band_map points to band 3, but data only has 2 - source = SpatialSource( + + +def _source(**kwargs): + return SpatialSource( id="S1", name="S1", format="raster", asset_url="test.tif", crs="EPSG:31982", - band_map={"B1": 3} + **kwargs ) + + +def _ctx(grid, variables, data, source=None): derivation = SpatialDerivation( - source_id="S1", grid_id="G1", role="test", - variables=[Variable(name="B1", operator="mean")] - ) - - data = xr.DataArray( - np.zeros((2, 10, 10)), - dims=("band", "y", "x"), - coords={"band": [1, 2], "y": grid.ys, "x": grid.xs} - ) - - ctx = PipelineContext(source=source, grid=grid, derivation=derivation, data=data) - agg = Aggregator() - - with pytest.raises(ValueError, match="Band index 3 for variable 'B1' is out of range"): - agg.execute(ctx) + source_id="S1", grid_id=grid.id, role="test", variables=variables + ) + return PipelineContext( + source=source or _source(), grid=grid, derivation=derivation, data=data + ) + + +# ── Raster path (Dataset from GridAligner) ─────────────────────────────────── + +def test_aggregator_raster_dataset_passthrough(): + """Aggregator forwards a pre-aligned DataArray from a Dataset.""" + grid = _grid() + da = xr.DataArray( + np.ones((10, 10), dtype=np.float32), + dims=("y", "x"), + coords={"y": grid.ys, "x": grid.xs}, + ) + ds_in = xr.Dataset({"B1": da}) + + ctx = _ctx(grid, [Variable(name="B1", operator="mean")], ds_in) + result_ctx = Aggregator().execute(ctx) + + assert "B1" in result_ctx.data.data_vars + assert result_ctx.data["B1"].shape == (10, 10) + + +def test_aggregator_two_variables_different_operators(): + """Two variables from the same Dataset, different operators.""" + grid = _grid() + da1 = xr.DataArray( + np.ones((10, 10), dtype=np.float32), + dims=("y", "x"), coords={"y": grid.ys, "x": grid.xs}, + ) + da2 = xr.DataArray( + np.full((10, 10), 2.0, dtype=np.float32), + dims=("y", "x"), coords={"y": grid.ys, "x": grid.xs}, + ) + ds_in = xr.Dataset({"uso": da1, "alt": da2}) + + ctx = _ctx( + grid, + [Variable(name="uso", operator="majority"), Variable(name="alt", operator="mean")], + ds_in, + ) + result_ctx = Aggregator().execute(ctx) + + assert "uso" in result_ctx.data.data_vars + assert "alt" in result_ctx.data.data_vars + + +# ── Vector path (GeoDataFrame) ──────────────────────────────────────────────── + +def test_aggregator_vector_presence(): + """Presence operator rasterizes a GeoDataFrame to binary mask.""" + grid = _grid() + gdf = gpd.GeoDataFrame( + {"geometry": [box(10, 10, 40, 40)]}, + crs="EPSG:31982", + ) + source = SpatialSource( + id="S1", name="S1", format="vector", + asset_url="test.gpkg", crs="EPSG:31982", + ) + ctx = _ctx( + grid, + [Variable(name="pres", operator="presence")], + gdf, + source=source, + ) + result_ctx = Aggregator().execute(ctx) + + assert "pres" in result_ctx.data.data_vars + arr = result_ctx.data["pres"].values + assert arr.max() > 0 # at least one pixel is non-zero + + +# ── Error handling ──────────────────────────────────────────────────────────── + +def test_aggregator_unknown_operator_raises(): + """Unknown operator raises ValueError with the list of valid operators.""" + grid = _grid() + da = xr.DataArray( + np.zeros((10, 10)), dims=("y", "x"), + coords={"y": grid.ys, "x": grid.xs}, + ) + ds_in = xr.Dataset({"B1": da}) + + ctx = _ctx(grid, [Variable(name="B1", operator="nonexistent")], ds_in) + + with pytest.raises(ValueError, match="nonexistent"): + Aggregator().execute(ctx) diff --git a/tests/test_aligner.py b/tests/test_aligner.py new file mode 100644 index 0000000..a05ef80 --- /dev/null +++ b/tests/test_aligner.py @@ -0,0 +1,108 @@ +""" +Tests for GridAligner — band selection, per-variable resampling, and the +alignment invariant. + +GridAligner takes a raster URL and returns an xr.Dataset keyed by variable +name, each band resampled with the correct method for its operator. +""" + +import pytest +import numpy as np +import xarray as xr +import rioxarray # noqa: F401 + +from disscube.pipeline.aligner import GridAligner +from disscube.pipeline import PipelineContext +from disscube.models import GridSpec, SpatialSource, SpatialDerivation, Variable + + +def _grid(**kw): + return GridSpec(id="G1", type="local", crs="EPSG:31982", resolution=10, bbox=[0, 0, 100, 100], **kw) + + +def _raster_source(asset_url="test.tif", **kw): + return SpatialSource(id="S1", name="S1", format="raster", asset_url=asset_url, crs="EPSG:31982", **kw) + + +def _ctx(grid, variables, source=None): + derivation = SpatialDerivation(source_id="S1", grid_id=grid.id, role="test", variables=variables) + return PipelineContext(source=source or _raster_source(), grid=grid, derivation=derivation) + + +# ── band_map out-of-range ───────────────────────────────────────────────────── + +def test_band_map_out_of_range(tmp_path): + """band_map pointing beyond available bands raises ValueError.""" + import rasterio + from rasterio.transform import from_bounds + + tif = tmp_path / "two_band.tif" + transform = from_bounds(0, 0, 100, 100, 10, 10) + with rasterio.open( + tif, "w", driver="GTiff", height=10, width=10, count=2, + dtype="float32", crs="EPSG:31982", transform=transform, + ) as dst: + dst.write(np.ones((2, 10, 10), dtype=np.float32)) + + grid = _grid() + source = _raster_source(asset_url=str(tif), band_map={"B1": 3}) + ctx = _ctx(grid, [Variable(name="B1", operator="mean")], source) + + with pytest.raises(ValueError, match="Band index 3.*out of range"): + GridAligner().execute(ctx) + + +# ── per-variable resampling ─────────────────────────────────────────────────── + +def test_per_variable_resampling_produces_correct_names(tmp_path): + """Each variable in a multi-variable derivation gets its own DataArray.""" + import rasterio + from rasterio.transform import from_bounds + + tif = tmp_path / "two_band.tif" + transform = from_bounds(0, 0, 100, 100, 10, 10) + with rasterio.open( + tif, "w", driver="GTiff", height=10, width=10, count=2, + dtype="float32", crs="EPSG:31982", transform=transform, + ) as dst: + dst.write(np.ones((2, 10, 10), dtype=np.float32)) + + grid = _grid() + source = _raster_source(asset_url=str(tif)) + ctx = _ctx( + grid, + [Variable(name="uso", operator="majority"), Variable(name="alt", operator="mean")], + source, + ) + + GridAligner().execute(ctx) + ds = ctx.data + + assert isinstance(ds, xr.Dataset) + assert "uso" in ds.data_vars + assert "alt" in ds.data_vars + assert ds["uso"].shape == (10, 10) + assert ds["alt"].shape == (10, 10) + + +# ── alignment invariant ─────────────────────────────────────────────────────── + +def test_alignment_invariant_passes_on_correct_shape(tmp_path): + """A well-formed raster passes the shape invariant without error.""" + import rasterio + from rasterio.transform import from_bounds + + tif = tmp_path / "ok.tif" + transform = from_bounds(0, 0, 100, 100, 10, 10) + with rasterio.open( + tif, "w", driver="GTiff", height=10, width=10, count=1, + dtype="float32", crs="EPSG:31982", transform=transform, + ) as dst: + dst.write(np.zeros((1, 10, 10), dtype=np.float32)) + + grid = _grid() + source = _raster_source(asset_url=str(tif)) + ctx = _ctx(grid, [Variable(name="v", operator="mean")], source) + + GridAligner().execute(ctx) # must not raise + assert ctx.data["v"].shape == (10, 10) From 4412c4716abceca700058fe742f8a79403d512f4 Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Sun, 14 Jun 2026 16:23:31 -0300 Subject: [PATCH 03/13] =?UTF-8?q?Documentation=20files=20updated/created:?= =?UTF-8?q?=20-=20README.md=20=E2=80=94=20complete=20rewrite=20-=20docs/in?= =?UTF-8?q?dex.md=20=E2=80=94=20updated=20navigation=20-=20docs/architectu?= =?UTF-8?q?re/overview.md=20=E2=80=94=20complete=20rewrite=20(entity=20mod?= =?UTF-8?q?el,=20pipeline,=20spec=5Fhash,=20operator=20plugin=20system)=20?= =?UTF-8?q?-=20docs/architecture/operators.md=20=E2=80=94=20new=20file=20(?= =?UTF-8?q?operator=20system,=20table,=20compute()=20contract,=20how=20to?= =?UTF-8?q?=20add=20new=20operators)=20-=20docs/architecture/pipeline.md?= =?UTF-8?q?=20=E2=80=94=20complete=20rewrite=20(all=204=20stages=20with=20?= =?UTF-8?q?current=20code)=20-=20docs/architecture/catalog.md=20=E2=80=94?= =?UTF-8?q?=20complete=20rewrite=20(schema,=20hashes,=20temporal=20series,?= =?UTF-8?q?=20evolution)=20-=20docs/architecture/tiling.md=20=E2=80=94=20c?= =?UTF-8?q?omplete=20rewrite=20(tile=20concept,=20tile=20detection,=20para?= =?UTF-8?q?llel=20processing)=20-=20docs/guides/grids.md=20=E2=80=94=20com?= =?UTF-8?q?plete=20rewrite=20(snap,=20hierarchy,=20SpatialRelation,=20manu?= =?UTF-8?q?al=20grid)=20-=20docs/guides/bdc.md=20=E2=80=94=20rewritten=20(?= =?UTF-8?q?removed=20stale=20BDC=5FSM/BDC=5FMD/BDC=5FLG=20references,=20up?= =?UTF-8?q?dated=20with=20current=20tile=20workflow)=20-=20mkdocs.yml=20?= =?UTF-8?q?=E2=80=94=20added=20Operator=20System:=20architecture/operators?= =?UTF-8?q?.md=20to=20nav?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 293 +++++++++++++++++++-------------- disscube/pipeline/aligner.py | 32 ++-- docs/architecture/catalog.md | 104 +++++++++--- docs/architecture/operators.md | 128 ++++++++++++++ docs/architecture/overview.md | 131 +++++++++++++-- docs/architecture/pipeline.md | 156 ++++++++++++++++-- docs/architecture/tiling.md | 87 +++++++--- docs/guides/bdc.md | 137 ++++++++++++--- docs/guides/grids.md | 131 +++++++++------ docs/index.md | 53 ++++-- mkdocs.yml | 1 + 11 files changed, 956 insertions(+), 297 deletions(-) create mode 100644 docs/architecture/operators.md diff --git a/README.md b/README.md index ce558cd..001daa6 100644 --- a/README.md +++ b/README.md @@ -1,152 +1,189 @@ # DisSCube -> **⚠️ Project Status: Early Stage (Alpha)** -> This project is currently in its initial development phase. APIs and data structures are subject to frequent changes as we evolve the core engine. - -DisSCube is a high-performance spatial data cube engine designed for land change modeling within the **DisSModel** ecosystem. It provides a bridge between raw geospatial data (Rasters, Vectors, Points) and multidimensional analysis ready for statistical and dynamic models (like TerraME). - -## 🚀 Key Features - -- **FillCell Operators**: Legacy logic of TerraView FillCell for robust data aggregation. - - **Raster**: Majority, Mean, Max, Min, and Sum resampling. - - **Vector**: Presence (Boolean), Count, and area-weighted strategies. - - **Proximity**: High-performance Euclidean distance transforms. -- **Temporal Backend**: Support for multi-period variables. Derived products can have temporal validity windows, allowing models to load dynamic drivers. -- **Snapped Grids**: Automatic alignment of local grids (e.g., State-level) to national meshes (e.g., BDC 5km) to ensure pixel-perfect interoperability. -- **Master Grid Architecture**: Native support for **Brazil Data Cube (BDC)** master grids (SM, MD, LG) and custom 100m grids with tiled processing. -- **SQLite Catalog**: High-performance, concurrent registry for spatial metadata and variable provenance. -- **Multidimensional Storage**: Uses **Zarr** and **Xarray** for efficient storage of high-resolution spatial variables. - -## 🛠 Architecture - -DisSCube is built on a decoupled architecture that separates spatial definitions (Grids) from data partitions (Tiles). - -### 1. Data Processing Pipeline -The core engine follows a sequential pipeline where each stage transforms the `PipelineContext`. - -```mermaid -graph LR - S[SpatialSource] --> N[Normalizer] - N --> GA[GridAligner] - GA --> AG[Aggregator] - AG --> VW[VariableWriter] - VW --> C[(SQLite Catalog)] - VW --> Z[[Zarr Storage]] - - subgraph "Pipeline Execution" - N - GA - AG - VW - end -``` - -### 2. Temporal Awareness -Derivations can be associated with a time window. `to_lucc_data` automatically stacks these fatias into a 3D DataArray `(time, y, x)`, allowing CA models to query drivers by year. - -```mermaid -graph TD - D1[Derivation 2000-2010] --> VW - D2[Derivation 2011-2025] --> VW - VW --> DB[(Catalog)] - DB --> LC[to_lucc_data] - LC --> RB[RasterBackend] - RB --> CA[Cellular Automata Model] -``` - -### 3. Entity Model -The catalog maintains the relationships between sources, derivations, and the physical assets. - -```mermaid -classDiagram - class GridSpec { - +id: str - +crs: str - +resolution: float - +bbox: list - } - class SpatialSource { - +id: str - +format: raster|vector - +asset_url: str - +bbox: list - +time: int? - } - class SpatialDerivation { - +source_id: str - +grid_id: str - +valid_from: str? - +valid_until: str? - +spec_hash() str - } - class DerivedVariable { - +id: str - +grid_id: str - +tile_id: str - +spec_hash: str - +times: list[int] - +asset_url: str - } - - GridSpec "1" -- "0..*" DerivedVariable : defines space - SpatialSource "1" -- "0..*" DerivedVariable : raw material - SpatialDerivation "1" -- "0..*" DerivedVariable : generates -``` - -## 📖 Quick Start - -### Installation +> **Status: Alpha — APIs estáveis para o pipeline principal; modelos declarativos em evolução.** + +DisSCube é o motor de cubos de dados espaciais do ecossistema **DisSModel**. Ele converte fontes geoespaciais brutas (rasters, vetores) em variáveis derivadas alinhadas a grades de modelagem LUCC (Land Use and Cover Change), prontas para modelos de Autômatos Celulares e análises espacio-temporais. + +## Conceito central + +``` +SpatialSource → Derivation → Variable → DerivedVariable (Zarr) +``` + +Uma **fonte** (`SpatialSource`) passa por uma **derivação** (`SpatialDerivation` ou `Derivation`) que aplica um **operador** a uma **grade** (`GridSpec`), produzindo uma **variável derivada** registrada no catálogo SQLite e armazenada em Zarr. + +## Instalação ```bash -# Clone the repository -git clone https://github.com/dissmodel/disscube.git +git clone https://github.com/DisSModel/disscube.git cd disscube - -# Set up environment -python -m venv .venv -source .venv/bin/activate -pip install -r requirements.txt +python -m venv .venv && source .venv/bin/activate +pip install -e . ``` -### Basic Usage +## Uso básico + +### 1. Inicializar catálogo e registrar grade ```python from disscube.client import CubeClient -from disscube.models import Variable, SpatialDerivation +from disscube.utils.grids import register_local_grid -# Initialize client (Now using SQLite) cube = CubeClient(catalog="catalog.db", store="./data/") -# Define a derivation for a specific BDC Tile -derivation = SpatialDerivation( - source_id="urban_centers", - grid_id="BDC_100m", # Master Grid +grid = register_local_grid( + cube, + name="AC", + bbox_geo=(-73.99, -11.15, -66.62, -7.11), + resolution=5_000.0, +) +``` + +### 2. Registrar fonte + +```python +from disscube.models import SpatialSource + +cube.register_spatial_source(SpatialSource( + id="mapbiomas_2020", + name="MapBiomas Acre 2020", + format="raster", + asset_url="data/raw/mapbiomas_2020.tif", + crs="EPSG:4326", + time=2020, +)) +``` + +### 3. Derivar — modo declarativo (recomendado) + +```python +from disscube.derivation import Derivation + +d = Derivation( + target="forest_pct", + source_id="mapbiomas_2020", + operator="percentage", + class_code=3, + role="driver", + valid_from="2020", + valid_until="2020", +) + +cube.derive_declarative(d, grid_id="AC/5km") +``` + +### 4. Derivar — modo direto + +```python +from disscube.models import SpatialDerivation, Variable + +cube.derive(SpatialDerivation( + source_id="mapbiomas_2020", + grid_id="AC/5km", role="driver", - variables=[Variable(name="dist_sedes", operator="min_distance")] + variables=[Variable(name="forest_pct", operator="percentage", class_code=3)], + valid_from="2020", + valid_until="2020", +)) +``` + +### 5. Carregar resultado + +```python +da = cube.load("forest_pct", grid_id="AC/5km") +print(da.shape) # (rows, cols) +``` + +### 6. Integrar ao DisSModel + +```python +backend = cube.to_lucc_data( + ["forest_pct", "dist_roads"], + grid_id="AC/5km", + period=("2015", "2020"), ) +``` -# Execute pipeline for a specific partition -cube.derive(derivation, tile_id="009002") +## Operadores disponíveis + +| Operador | Tipo | Resampling | Requer `class_code` | +|---|---|---|---| +| `mean` | zonal | average | não | +| `sum` | zonal | sum | não | +| `std` | zonal | nearest | não | +| `min` | zonal | min | não | +| `max` | zonal | max | não | +| `majority` | zonal | mode | não | +| `minority` | zonal | mode | não | +| `percentage` | zonal | mode | **sim** | +| `attribute` | zonal | nearest | não | +| `presence` | zonal | nearest | não | +| `min_distance` | proximity | nearest | não | +| `count` | proximity | nearest | não | + +## Pipeline -# Load data with tile disambiguation -res = cube.load("dist_sedes", tile_id="009002") -print(res.shape) +``` +SpatialSource + │ + ▼ +Normalizer — valida / carrega GeoDataFrame (vetor) ou abre raster + │ + ▼ +GridAligner — reprojeta por variável com o Resampling correto do operador + │ + ▼ +Aggregator — delega a operator.compute() → xr.DataArray por variável + │ + ▼ +VariableWriter — persiste Zarr + registra DerivedVariable no catálogo ``` -## 📁 Storage Structure +## Estrutura de armazenamento -Derived data is stored hierarchically to optimize access and prevent collisions: -`data/derived/{grid_id}/{tile_id}/{spec_hash}/{variable_name}.zarr` +``` +data/derived/{grid_id}/{partition}/{spec_hash}/{variable_name}.zarr +``` + +- `partition` = `tile_id` ou `global` para derivações sem tile. +- `spec_hash` = SHA-256 da derivação (fonte + grade + variáveis + janela temporal). + +## Estrutura do projeto + +``` +disscube/ +├── client/ CubeClient — ponto de entrada público +├── models/ GridSpec, SpatialSource, SpatialDerivation, Variable… +├── derivation.py Derivation declarativa (front-end sobre SpatialDerivation) +├── operators/ Operadores como classes (auto-registro via __init_subclass__) +│ ├── base.py Operator ABC + OPERATOR_REGISTRY +│ ├── zonal.py mean, sum, majority, percentage, attribute, presence… +│ └── proximity.py min_distance, count +├── pipeline/ Stages: Normalizer → GridAligner → Aggregator → Writer +├── catalog/ CatalogStore (Protocol) + SQLite e JSON implementations +├── storage/ AssetStore (fsspec — local e S3) +└── utils/grids.py register_local_grid, register_simulation_grids +``` -## 🔍 Tools +## Adicionar um operador novo + +Crie uma subclasse de `Operator` em qualquer arquivo importado na inicialização: + +```python +from rasterio.warp import Resampling +from disscube.operators.base import Operator + +class WeightedMeanOperator(Operator): + name = "weighted_mean" + _resampling = Resampling.average + + def compute(self, data, var, grid): + # data é xr.DataArray (raster) ou GeoDataFrame (vetor) + ... +``` -- `zarr_to_tif.py`: Export any Zarr variable to GeoTIFF for QGIS. - ```bash - python tools/zarr_to_tif.py data/derived/.../var.zarr output.tif - ``` -- `inspect_raster.py`: Check CRS and bounds of raw rasters. -- `list_grids.py`: List all master and local grids in the SQLite catalog. +O operador é registrado automaticamente e aceito em `Derivation` / `SpatialDerivation` sem nenhuma outra mudança. -## 📄 License +## Licença -This project is part of the DisSModel ecosystem. See the LICENSE file for details. +Parte do ecossistema DisSModel. Ver `LICENSE` para detalhes. diff --git a/disscube/pipeline/aligner.py b/disscube/pipeline/aligner.py index 17b9897..bc6c81d 100644 --- a/disscube/pipeline/aligner.py +++ b/disscube/pipeline/aligner.py @@ -23,10 +23,15 @@ from __future__ import annotations import logging + +import rioxarray # noqa: F401 — registers the .rio accessor import xarray as xr import geopandas as gpd +from pyproj import CRS as ProjCRS +from rasterio.warp import Resampling from shapely.geometry import box +from disscube.operators.base import OPERATOR_REGISTRY from disscube.pipeline import PipelineStage, PipelineContext from disscube.models.grid import GridSpec from disscube.models.variable import Variable @@ -48,7 +53,13 @@ def execute(self, ctx: PipelineContext) -> PipelineContext: ) elif fmt == "vector": gdf: gpd.GeoDataFrame = ctx.data - if str(gdf.crs) != str(grid.crs): + try: + needs_reproject = not ProjCRS.from_user_input(gdf.crs).equals( + ProjCRS.from_user_input(grid.crs) + ) + except Exception: + needs_reproject = str(gdf.crs) != str(grid.crs) + if needs_reproject: gdf = gdf.to_crs(grid.crs) gdf = gdf.clip(box(*grid.bbox)) ctx.data = gdf @@ -69,9 +80,10 @@ def _align_raster( """ Reproject and resample the source raster for each variable. - Returns an ``xr.Dataset`` with one data variable per ``Variable`` in - ``variables``, each resampled with the method appropriate for its - operator (e.g. ``mode`` for ``majority``, ``average`` for ``mean``). + Returns an ``xr.Dataset`` with one data variable per ``Variable``, + each resampled with the method appropriate for its operator + (e.g. ``Resampling.mode`` for ``majority``, + ``Resampling.average`` for ``mean``). Parameters ---------- @@ -84,11 +96,7 @@ def _align_raster( band_map : dict[str, int] Optional ``{variable_name: 1-based band index}`` from the source. """ - import rioxarray # noqa: F401 — registers the .rio accessor - from disscube.operators.base import OPERATOR_REGISTRY - - ds_src = __import__("rioxarray").open_rasterio(url) - + ds_src = rioxarray.open_rasterio(url) result = xr.Dataset(coords={"y": grid.ys, "x": grid.xs}) for i, var in enumerate(variables): @@ -116,9 +124,9 @@ def _align_raster( # ── Per-operator resampling method ───────────────────────── op_cls = OPERATOR_REGISTRY.get(var.operator) - resampling = op_cls.resampling() if op_cls else __import__( - "rasterio.warp", fromlist=["Resampling"] - ).Resampling.nearest + resampling: Resampling = ( + op_cls.resampling() if op_cls else Resampling.nearest + ) # ── Reproject to target grid ─────────────────────────────── aligned = band.rio.reproject( diff --git a/docs/architecture/catalog.md b/docs/architecture/catalog.md index 8d38647..5c8dce2 100644 --- a/docs/architecture/catalog.md +++ b/docs/architecture/catalog.md @@ -1,36 +1,90 @@ -# Catalog & Persistence +# Catálogo e Persistência -The Catalog is the "brain" of DisSCube. It records everything the system knows without storing the actual pixel data. +O catálogo é o registro de tudo que o sistema conhece. Ele não armazena pixels — apenas metadados, ponteiros e hashes. -## SQLite Schema +## Implementações -We use SQLite for its simplicity, concurrency support (ACID), and indexing capabilities. +A interface é definida pelo `CatalogStore` Protocol (`disscube/catalog/protocol.py`): -| Table | Purpose | Key Columns | -|-------|---------|-------------| -| `grids` | Master and local grid definitions | `id`, `data (JSON)` | -| `sources` | Raw data pointers (Rasters/Tiles) | `id`, `bbox`, `data` | -| `derived` | Variables generated by the engine | `spec_hash`, `tile_id`, `grid_id`, `times` | -| `relations` | Parent-child grid mappings | `source_grid_id`, `target_grid_id` | +| Implementação | Arquivo | Uso | +|---|---|---| +| `SqliteCatalogStore` | `catalog/sqlite_store.py` | Padrão — usado por `CubeClient` | +| `JsonCatalogStore` | `catalog/json_store.py` | Legado / testes simples | -## The Hash System (`spec_hash`) +## Schema SQLite -To guarantee reproducibility and prevent data collisions, we calculate a SHA-256 hash of the `SpatialDerivation` object. If any of these parameters change, the hash changes, and the system treats it as a different data asset: +```sql +grids (id TEXT PRIMARY KEY, data TEXT) -- GridSpec como JSON +sources (id TEXT PRIMARY KEY, data TEXT) -- SpatialSource como JSON +derived (id, grid_id, spec_hash, tile_id, role, data TEXT) +relations (source_grid_id, target_grid_id, data TEXT) +``` -1. **Source ID**: The origin of the data. -2. **Grid ID**: The spatial definition. -3. **Variables/Operators**: The exact math applied. -4. **Temporal Window**: The `valid_from` and `valid_until` markers. +Índices: `idx_derived_grid`, `idx_derived_hash`, `idx_derived_tile`. -> **Why include time in the hash?** -> A "distance to roads" layer computed for 2010 is physically different from one for 2020 if the road network expanded. By including the temporal window in the hash, DisSCube ensures that both versions can coexist in the same catalog and be loaded together as different time slices. +Todos os objetos são serializados como JSON no campo `data`. As colunas extraídas (`grid_id`, `spec_hash`, `tile_id`) existem para indexação e busca eficiente — os dados completos estão sempre no JSON. -## Time Slices -The `derived` table stores a `times` column (serialized as JSON). -- **Static Variables**: `times` is an empty list `[]`. -- **Temporal Variables**: `times` contains the representative year (e.g., `[2015]`). +## Hash de especificação (`spec_hash`) -When `CubeClient.load()` is called, it searches for all entries with the same variable name and grid. If multiple entries have non-empty `times`, it stacks them into a single multidimensional array. +SHA-256 determinístico da `SpatialDerivation`: -## Performance -By using SQLite indexes on `grid_id` and `tile_id`, DisSCube can find a specific tile among thousands in milliseconds, a task that would require loading and parsing a massive JSON file in the previous architecture. +```python +relevant_data = { + "source_id": ..., + "grid_id": ..., + "role": ..., + "variables": [...], # ordenados por nome + "relations": [...], # ordenados, sem campo metadata + "valid_from": ..., + "valid_until": ..., +} +encoded = json.dumps(relevant_data, sort_keys=True).encode("utf-8") +return hashlib.sha256(encoded).hexdigest() +``` + +**O que muda o hash:** + +- Trocar a fonte (`source_id`) +- Trocar a grade (`grid_id`) +- Adicionar/remover/renomear variáveis +- Mudar o operador ou `class_code` +- Mudar `valid_from` / `valid_until` +- Adicionar/remover relações espaciais + +**O que não muda o hash:** + +- Metadados das relações (`SpatialRelation.metadata`) +- A ordem das variáveis na lista (são ordenadas por nome) +- `bbox` de `Derivation` (metadado descritivo, não parâmetro) + +## Hash de conteúdo (`content_hash`) + +SHA-256 de todos os bytes dos arquivos do diretório Zarr, em ordem determinística (`sorted(root.rglob("*"))`). Garante integridade do dado materializado independente do `spec_hash`. + +## Séries temporais + +O campo `times` em `DerivedVariable` é uma lista de inteiros (anos): + +- `times = []` → variável estática +- `times = [2020]` → fatia temporal de 2020 + +`CubeClient.load()` detecta automaticamente se há múltiplas fatias e as empilha em `(time, y, x)` ordenadas pelo primeiro valor de `times`. + +`CubeClient.to_lucc_data()` aceita `period=("2000", "2020")` para filtrar apenas as fatias dentro do intervalo. + +## Busca no catálogo + +```python +# Por grade e role +cube.catalog.search_derived_variables(grid_id="AC/5km", role="driver") + +# Por tile +cube.catalog.search_derived_variables(tile_id="009002") + +# Por spec_hash (exato) +cube.catalog.get_derived_by_hash("a3f9...") +``` + +## Evolução do schema + +O schema usa `CREATE TABLE IF NOT EXISTS` — seguro para idempotência. Novas colunas exigem `ALTER TABLE` ou migração manual. Os campos descritivos extras de modelos (como `purity_threshold` de `Derivation`) vivem apenas no modelo Python, não no banco — isso protege a compatibilidade retroativa para campos reservados. diff --git a/docs/architecture/operators.md b/docs/architecture/operators.md new file mode 100644 index 0000000..4dba3b3 --- /dev/null +++ b/docs/architecture/operators.md @@ -0,0 +1,128 @@ +# Sistema de Operadores + +DisSCube implementa operadores como classes Python auto-registradas. Adicionar um novo operador não requer nenhuma mudança no pipeline. + +## Como funciona + +Toda subclasse de `Operator` que define `name` é inserida automaticamente no `OPERATOR_REGISTRY` via `__init_subclass__`: + +```python +# operators/base.py +class Operator: + name: ClassVar[str] + requires_class_code: ClassVar[bool] = False + _resampling: ClassVar[Resampling] = Resampling.nearest + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + if hasattr(cls, "name"): + OPERATOR_REGISTRY[cls.name] = cls # auto-registro + + @classmethod + def resampling(cls) -> Resampling: + return cls._resampling + + def compute(self, data, var, grid) -> xr.DataArray: + raise NotImplementedError +``` + +O `GridAligner` consulta `op_cls.resampling()` para escolher o método de reamostragem antes de reprojetar o raster. O `Aggregator` chama `op_cls().compute(data, var, grid)` para calcular o resultado. Nenhum dos dois contém listas de operadores. + +## Operadores disponíveis + +### Zonais — raster e vetor + +| Operador | `_resampling` | Raster | Vetor | `requires_class_code` | +|---|---|---|---|---| +| `mean` | `average` | média dos pixels no upscale | — | não | +| `sum` | `sum` | soma dos pixels no upscale | — | não | +| `std` | `nearest` | valor reamostrado¹ | — | não | +| `min` | `min` | mínimo no upscale | — | não | +| `max` | `max` | máximo no upscale | — | não | +| `majority` | `mode` | moda no upscale | rasteriza com `class_code` (ou 1) | não | +| `minority` | `mode` | moda no upscale | rasteriza com `class_code` (ou 1) | não | +| `percentage` | `mode` | valor reamostrado¹ | rasteriza com `class_code` | **sim** | +| `attribute` | `nearest` | passthrough | rasteriza com valor da coluna `var.name` | não | +| `presence` | `nearest` | passthrough | rasteriza com `class_code` (ou 1) binário | não | + +¹ `std` e `percentage` para rasters usam o dado já reamostrado pelo `GridAligner` — estatísticas verdadeiras de janela (como `rasterstats`) são trabalho futuro. + +### Proximidade — vetor (e passthrough para raster) + +| Operador | Descrição | `requires_class_code` | +|---|---|---| +| `min_distance` | Distância euclidiana (em unidades do CRS) até a feature mais próxima | não | +| `count` | Contagem de features cujo centroide cai em cada célula | não | + +## Contrato de `compute()` + +```python +def compute( + self, + data: xr.DataArray | gpd.GeoDataFrame, + var: Variable, + grid: GridSpec, +) -> xr.DataArray: +``` + +- `data`: para fontes raster, é o `DataArray` já reprojetado e reamostrado pelo `GridAligner`; para vetores, é o `GeoDataFrame` reprojetado e clipado ao bbox. +- Retorno: `xr.DataArray` com `dims=("y", "x")` e `coords` alinhados a `grid.ys` / `grid.xs`. + +## Adicionar um operador novo + +Crie o arquivo `disscube/operators/meu_operador.py`: + +```python +from rasterio.warp import Resampling +import xarray as xr +import numpy as np +from disscube.operators.base import Operator + +class WeightedMeanOperator(Operator): + """Média ponderada pela área de interseção (exemplo ilustrativo).""" + name = "weighted_mean" + _resampling = Resampling.average # usado pelo GridAligner + + def compute(self, data, var, grid) -> xr.DataArray: + if isinstance(data, xr.DataArray): + if "band" in data.dims: + data = data.isel(band=0) + return data.transpose("y", "x") + raise TypeError(f"'weighted_mean' requer fonte raster") +``` + +Importe o módulo para que o `__init_subclass__` seja executado — basta adicionar ao `disscube/operators/__init__.py`: + +```python +from . import meu_operador # noqa: F401 +``` + +Pronto. O operador aparece em `OPERATOR_REGISTRY["weighted_mean"]` e é aceito em `Derivation(operator="weighted_mean")` e em `Variable(operator="weighted_mean")`. + +## `attribute` — contrato implícito + +O operador `attribute` rasteriza um vetor usando o valor de uma coluna numérica como pixel value. **A coluna deve ter o mesmo nome que a variável (`Variable.name`)**: + +```python +# Fonte vetor com coluna "f" e coluna "d" +Variable(name="f", operator="attribute") # usa gdf["f"] +Variable(name="d", operator="attribute") # usa gdf["d"] +``` + +Se a coluna não existir no GeoDataFrame, o resultado é um raster de zeros sem erro explícito. Garanta que o nome da variável corresponda ao nome da coluna na fonte. + +## Validação em construção (`Derivation`) + +`Derivation` valida o nome do operador e os campos obrigatórios no momento da criação: + +```python +# Erro imediato — operador inexistente +Derivation(target="x", source_id="s", operator="bogus") +# ValueError: Unknown operator 'bogus'. Available: ['attribute', 'count', ...] + +# Erro imediato — percentage sem class_code +Derivation(target="x", source_id="s", operator="percentage") +# ValueError: Operator 'percentage' requires class_code to be set. +``` + +`SpatialDerivation` e `Variable` não validam — o erro aparece no `Aggregator` em tempo de execução. Use `Derivation` para validação antecipada. diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md index c1dad60..8038da2 100644 --- a/docs/architecture/overview.md +++ b/docs/architecture/overview.md @@ -1,26 +1,121 @@ -# Architecture Overview +# Visão Geral da Arquitetura -DisSCube is designed as a **Data Cube Engine** specifically optimized for Land Use and Cover Change (LUCC) modeling. Unlike traditional GIS systems that handle files, DisSCube handles **Variables** aligned in space and time. +DisSCube implementa uma abstração de alto nível para construção de cubos de dados espaciais derivados de múltiplas fontes geoespaciais. Ao contrário de um conjunto de scripts GIS, o sistema trata **derivações como objetos declarativos** com identidade, rastreabilidade e reprodutibilidade garantidas. -## Core Design Principles +## Modelo conceitual -### 1. Separation of Space and Asset -In traditional workflows, a "Grid" is often tied to a "File". In DisSCube, a `GridSpec` is a mathematical definition of space (CRS + Resolution + Extent). Multiple files (Tiles) can belong to the same Grid. +``` +SpatialSource ──► SpatialDerivation ──► Variable ──► DerivedVariable + │ │ + asset_url spec_hash (SHA-256) + format grid_id + crs valid_from / valid_until +``` + +### Entidades principais + +| Entidade | Papel | +|---|---| +| `GridSpec` | Definição matemática do espaço: CRS + resolução + bbox | +| `SpatialSource` | Ponteiro para dado bruto: raster (GeoTIFF) ou vetor (GPKG/SHP) | +| `SpatialDerivation` | Intenção de derivação: fonte + grade + operadores + janela temporal | +| `Derivation` | Front-end declarativo sobre `SpatialDerivation` com validação em construção | +| `Variable` | Nome + operador + class_code para uma variável derivada | +| `DerivedVariable` | Produto materializado: path Zarr + `spec_hash` + `content_hash` | +| `SpatialRelation` | Relação pai-filho entre grades (reservado para futuro uso no pipeline) | + +### Dois modos de uso + +**Declarativo (recomendado):** +```python +from disscube.derivation import Derivation + +d = Derivation( + target="forest_pct", + source_id="mapbiomas_2020", + operator="percentage", + class_code=3, + valid_from="2020", valid_until="2020", +) +cube.derive_declarative(d, grid_id="AC/5km") +``` +Valida o operador e `class_code` na construção (fail-fast), antes de qualquer I/O. -### 2. Immutability & Provenance -Every operation in DisSCube is tracked via a `spec_hash`. If you change the parameters of a derivation, the hash changes, and a new variable is created. This ensures scientific reproducibility. +**Direto:** +```python +from disscube.models import SpatialDerivation, Variable -### 3. Tiled Execution -To support continental-scale data (like Brazil at 10m), the engine never loads the full grid into memory. It partitions the execution using BDC (Brazil Data Cube) tiles or custom spatial partitions. +cube.derive(SpatialDerivation( + source_id="mapbiomas_2020", grid_id="AC/5km", role="driver", + variables=[Variable(name="forest_pct", operator="percentage", class_code=3)], + valid_from="2020", valid_until="2020", +)) +``` +Ambos os modos chegam ao mesmo pipeline — `Derivation` é um front-end, não um caminho alternativo. -## High-Level Data Flow +## Pipeline de execução -```mermaid -graph TD - A[Raw Data: GeoTIFF/SHP] --> B{Normalizer} - B --> C[GridAligner: Reproject/Resample] - C --> D[Aggregator: FillCell Logic] - D --> E[VariableWriter] - E --> F[(SQLite Catalog)] - E --> G[[Zarr Store]] ``` +SpatialSource + │ + ▼ Normalizer + │ • raster: valida abertura do arquivo + │ • vetor: carrega GeoDataFrame, corrige CRS se necessário + │ + ▼ GridAligner + │ • raster: reproject por variável com Resampling do operador + │ → retorna xr.Dataset {var_name: DataArray} + │ • vetor: reprojeta GDF + clip ao bbox da grade + │ • invariante: verifica shape == (grid.rows, grid.cols) + │ + ▼ Aggregator + │ • delega a operator.compute(data, var, grid) → DataArray + │ • monta xr.Dataset final com CRS e transform + │ + ▼ VariableWriter + • salva cada variável como Zarr + • calcula content_hash (SHA-256 dos bytes) + • registra DerivedVariable no catálogo SQLite +``` + +## Reprodutibilidade: `spec_hash` + +Cada `SpatialDerivation` tem um `spec_hash` — SHA-256 determinístico de: + +- `source_id` +- `grid_id` +- `role` +- variáveis (nome + operador + class_code, ordenadas por nome) +- relações espaciais (ordenadas, sem metadados) +- `valid_from` / `valid_until` + +Se qualquer parâmetro mudar, o hash muda. O pipeline verifica o cache antes de processar: se todos os `DerivedVariable` com o mesmo `spec_hash` já existem no disco, a derivação é pulada. + +`Derivation.spec_hash()` delega a `SpatialDerivation.spec_hash()` e adiciona `purity_threshold` quando definido. `bbox` é excluído — é metadado descritivo, não parâmetro de derivação. + +## Sistema de operadores (plugin) + +Cada operador é uma subclasse de `Operator` que se auto-registra no `OPERATOR_REGISTRY`: + +```python +class MajorityOperator(Operator): + name = "majority" + _resampling = Resampling.mode + + def compute(self, data, var, grid) -> xr.DataArray: + ... +``` + +`OPERATOR_REGISTRY["majority"]` → `MajorityOperator`. O `GridAligner` usa `op_cls.resampling()` para escolher o método de reamostragem por variável. O `Aggregator` usa `op_cls().compute()` para calcular o resultado. Adicionar um operador = criar um arquivo; zero mudança no pipeline. + +Ver [Operadores](operators.md) para a lista completa e guia de extensão. + +## Armazenamento + +``` +data/derived/{grid_id}/{partition}/{spec_hash}/{variable_name}.zarr +``` + +- `partition` = `tile_id` ou `global`. +- Cada variável é um dataset Zarr independente com `spatial_ref` e metadados CRS. +- `content_hash` (SHA-256 dos bytes do Zarr) garante integridade do dado materializado. diff --git a/docs/architecture/pipeline.md b/docs/architecture/pipeline.md index 178cf22..0183388 100644 --- a/docs/architecture/pipeline.md +++ b/docs/architecture/pipeline.md @@ -1,25 +1,149 @@ -# Pipeline Stages +# Estágios do Pipeline -The DisSCube pipeline is a sequence of idempotent stages. Each stage receives a `PipelineContext` and returns an updated one. +O pipeline é uma sequência de `PipelineStage`, cada um recebendo e retornando um `PipelineContext`. O contexto carrega fonte, grade, derivação e o dado em transformação. + +```python +class PipelineStage: + def execute(self, ctx: PipelineContext) -> PipelineContext: + raise NotImplementedError +``` + +```python +class PipelineContext(BaseModel): + source: SpatialSource + grid: GridSpec + derivation: SpatialDerivation + tile_id: str | None = None + data: Any = None +``` + +A sequência executada por `CubeClient.derive()`: + +```python +pipeline = [Normalizer(), GridAligner(), Aggregator(), VariableWriter(store, catalog)] +for stage in pipeline: + ctx = stage.execute(ctx) +``` + +--- ## 1. Normalizer -Handles the entry point of the data. -- **Vector**: Loads the shapefile/GPKG into a GeoDataFrame. -- **Raster**: Prepares lazy-loading parameters and validates coordinates. + +**Arquivo:** `disscube/pipeline/normalizer.py` + +**Responsabilidade:** ponto de entrada da fonte de dados. + +### Raster +Abre o arquivo com `rasterio` para validar que é legível. Não carrega dados — a leitura real é lazy em `GridAligner`. + +### Vetor +Carrega o `GeoDataFrame` completo com `geopandas.read_file()`. Se o CRS declarado na fonte (`SpatialSource.crs`) difere do CRS no arquivo, aplica `set_crs(..., allow_override=True)` com `log.warning` explícito. + +!!! warning "Override de CRS" + O `Normalizer` **não reprojeta** — ele apenas corrige metadados. Isso é intencional para fontes com `.prj` malformado ou ausente. Se a intenção é reprojetar, use `SpatialSource.crs` com o CRS real do dado e deixe o `GridAligner` fazer a reprojection. + +--- ## 2. GridAligner -The most critical spatial stage. It ensures the source data matches the `GridSpec` exactly. -- **Reprojection**: Uses `rioxarray` and `rasterio` to transform the source CRS to the target Grid CRS. -- **Resampling**: Applies the selected operator logic (Nearest, Bilinear, etc.) to match the grid resolution. -- **Cropping**: Clips the data to the grid's Bounding Box (BBOX). + +**Arquivo:** `disscube/pipeline/aligner.py` + +**Responsabilidade:** alinhar a fonte ao `GridSpec` alvo. + +### Raster — por variável + +Para cada variável na derivação: + +1. **Seleção de banda:** por `band_map` (1-based) ou por índice posicional. +2. **Resampling por operador:** consulta `OPERATOR_REGISTRY[var.operator].resampling()`. Cada variável usa o método correto para sua semântica (`Resampling.mode` para `majority`, `Resampling.average` para `mean`). +3. **Reprojeção:** `band.rio.reproject(grid.crs, shape=(rows, cols), transform=grid.transform, resampling=...)`. +4. **Invariante de alinhamento:** verifica `aligned.rio.shape == (grid.rows, grid.cols)`. Mismatch → `ValueError` explícito. + +Retorna `xr.Dataset` com uma `DataArray` por variável, já com `dims=("y", "x")`. + +### Vetor + +1. Reprojeta o GeoDataFrame para o CRS da grade usando `pyproj.CRS.equals()` para comparação robusta. +2. Clipa ao bbox da grade com `shapely.geometry.box`. + +Retorna o GeoDataFrame processado em `ctx.data`. + +### Por que por variável? + +Derivações multi-variável de fontes multi-banda frequentemente usam operadores diferentes: + +```python +variables=[ + Variable(name="uso", operator="majority"), # Resampling.mode + Variable(name="alt", operator="mean"), # Resampling.average + Variable(name="solo", operator="majority"), # Resampling.mode +] +``` + +Com a abordagem por variável, cada banda é reprojetada com o método semanticamente correto. A versão anterior usava apenas `variables[0].operator` para todo o raster. + +--- ## 3. Aggregator -Applies domain-specific logic (FillCell operators). -- **Proximity**: Calculates Euclidean distance transforms for vector features. -- **Zonal Stats**: Aggregates point or polygon data into grid cells. + +**Arquivo:** `disscube/pipeline/aggregator.py` + +**Responsabilidade:** derivar cada variável chamando seu operador. + +Para cada variável na derivação: + +```python +op_cls = OPERATOR_REGISTRY[var.operator] + +# Raster: ctx.data é Dataset → seleciona DataArray pré-alinhado +# Vetor: ctx.data é GeoDataFrame → operador faz a rasterização +var_data = ctx.data[var.name] if isinstance(ctx.data, xr.Dataset) else ctx.data + +result = op_cls().compute(var_data, var, grid) +final_ds[var.name] = result +``` + +Não contém nenhuma lógica de operador — é puro despacho. O `if/elif` histórico foi eliminado. + +Monta o `xr.Dataset` final com CRS nomeado (evita `PROJCS["unknown"]` no QGIS para CRSs sem EPSG registrado, como BDC Albers) e `transform`. + +--- ## 4. VariableWriter -Persists the final result. -- **Format**: Saves as a Zarr dataset with consolidated metadata. -- **Hashing**: Calculates the `spec_hash` and `content_hash`. -- **Registration**: Updates the SQLite catalog with the new `DerivedVariable` entry. + +**Arquivo:** `disscube/pipeline/writer.py` + +**Responsabilidade:** persistir e registrar. + +Para cada variável no Dataset: + +1. Adiciona atributos: `grid_id`, `role`, `spec_hash`, `crs`, `tile_id`. +2. Salva como Zarr em `data/derived/{grid_id}/{partition}/{spec_hash}/{var}.zarr`. +3. Calcula `content_hash` (SHA-256 de todos os bytes do Zarr, em ordem determinística). +4. Determina `times`: + - Se `source.time` está definido → `[source.time]` (ex: MapBiomas 2020) + - Caso contrário, se `valid_from` é um ano → `[int(valid_from)]` + - Sem informação temporal → `[]` (variável estática) +5. Registra `DerivedVariable` no catálogo SQLite. + +### Tile detection + +Se `tile_id` não é passado explicitamente e `grid.id` começa com `BDC_`, tenta extrair o tile do `source.id` (convenção `BDC_LG_009002`). Isso é um fallback para compatibilidade com o workflow BDC. + +--- + +## Idempotência e cache + +`CubeClient.derive()` verifica o cache antes de executar: + +```python +spec_hash = derivation.spec_hash() +cached_vars = [ + d for d in all_derived + if d.spec_hash == spec_hash and self.store.fs.exists(d.asset_url) +] +if expected == cached_names: + return cached_vars # retorna sem executar o pipeline +``` + +Reexecutar a mesma derivação é seguro e rápido. diff --git a/docs/architecture/tiling.md b/docs/architecture/tiling.md index cde56fa..3ae054f 100644 --- a/docs/architecture/tiling.md +++ b/docs/architecture/tiling.md @@ -1,32 +1,79 @@ -# Master Grids & Tiling +# Tiling e Processamento Particionado -To process continental-scale data without massive memory requirements, DisSCube uses a "Virtual Tiling" approach. +Para processar dados em escala continental (ex: Brasil a 100m) sem carregar o país inteiro em memória, DisSCube usa um modelo de particionamento baseado em tiles. -## The Concept +## Conceito -Instead of creating thousands of individual small grids (which pollutes the catalog), we define a few **Master Grids** and use **Tiles** as execution filters. +Uma **Master Grid** define a resolução e o CRS para todo o país. **Tiles** são recortes com o mesmo CRS e resolução — apenas o bbox muda. O pipeline executa um tile por vez, gerando Zarr isolados que podem ser paralelizados. ```mermaid graph TD - MG[Master Grid: 10m Brazil] --> T1[Tile 001] + MG[Master Grid: BDC/100m] --> T1[Tile 001] MG --> T2[Tile 002] - MG --> T3[Tile N...] - - T1 --> P1[Process Partition 1] - T2 --> P2[Process Partition 2] + MG --> TN[Tile N…] + T1 --> Z1[zarr: .../001/{hash}/var.zarr] + T2 --> Z2[zarr: .../002/{hash}/var.zarr] ``` -## How it works in Code +## Como o CubeClient usa tiles -When you call `cube.derive(derivation, tile_id="XYZ")`: +```python +cube.derive(derivation, tile_id="009002") +``` + +Internamente: +1. Busca `GridSpec` da master grid. +2. Busca `SpatialSource` com id `{grid_id}_{tile_id}` para obter o `bbox` do tile. +3. Cria um `GridSpec` temporário: mesmos CRS e resolução, bbox restrito ao tile. +4. Executa o pipeline nessa grade temporária. +5. Salva em `data/derived/{grid_id}/009002/{spec_hash}/{var}.zarr`. + +## Registrar tiles + +Cada tile é registrado como um `SpatialSource` especial com o bbox do tile: + +```python +from disscube.models import SpatialSource + +cube.register_spatial_source(SpatialSource( + id="BDC_SM_009002", # convenção: {grid_id}_{tile_id} + name="BDC SM Tile 009002", + format="raster", + asset_url="data/raw/tile_009002.tif", + crs="EPSG:...", + bbox=[-70.0, -10.0, -65.0, -5.0], # bbox do tile em coordenadas geográficas +)) +``` + +O utilitário `disscube.utils.bdc_importer` automatiza esse processo para tiles BDC. + +## Processamento em loop + +```python +tiles = cube.catalog.list_spatial_sources() +tile_ids = [s.id.split("_")[-1] for s in tiles if s.id.startswith("BDC_SM_")] + +for tile_id in tile_ids: + cube.derive(derivation, tile_id=tile_id) +``` + +Cada iteração é independente. Workers paralelos podem processar tiles diferentes sem conflitos (caminhos Zarr únicos por tile + spec_hash). + +## Carregar dados tileados + +```python +# Carga de um tile específico +da = cube.load("dist_road", tile_id="009002") + +# Carga por grade (mosaico de todos os tiles com mesmo nome e spec_hash) +da = cube.load("dist_road", grid_id="BDC_SM") +``` + +`CubeClient.load()` detecta tiles automaticamente ao buscar por `variable_name` + `grid_id`. Se houver múltiplos tiles, empilha as fatias. Para variáveis temporais com múltiplos tiles, ordena por tempo. -1. The system fetches the `GridSpec` for the Master Grid. -2. It fetches the `SpatialSource` for the specific tile to get its `BBOX`. -3. It creates a **Temporary Grid** that has the same resolution/CRS as the Master Grid but is restricted to the tile's BBOX. -4. The pipeline processes only this small window. -5. The output is saved in a tile-specific directory, preventing file collisions. +## Vantagens -## Benefits -- **Memory Efficiency**: Process Brazil at 10m in 2GB of RAM by iterating tiles. -- **Parallelism**: Multiple workers can process different tiles of the same Master Grid simultaneously without race conditions (thanks to SQLite and unique paths). -- **Consistency**: All tiles are guaranteed to align perfectly because they all derive from the same Master Grid mathematical definition. +- **Memória controlada:** processa um tile de cada vez. +- **Paralelismo trivial:** workers independentes, sem race conditions. +- **Consistência garantida:** todos os tiles derivam da mesma `GridSpec` — pixels sempre alinhados. +- **Cache por tile:** re-executar um tile com o mesmo `spec_hash` é no-op. diff --git a/docs/guides/bdc.md b/docs/guides/bdc.md index e501dbe..279963c 100644 --- a/docs/guides/bdc.md +++ b/docs/guides/bdc.md @@ -1,30 +1,129 @@ -# Brazil Data Cube Integration +# Integração com Brazil Data Cube -DisSCube provides first-class support for the **Brazil Data Cube (BDC)** hierarchical grid system. +## Grades BDC e tiles -## Master Grids vs. Tiles +O Brazil Data Cube (BDC) particionam o Brasil em tiles hierárquicos. DisSCube representa isso com: -The BDC partitions Brazil into tiles. In DisSCube, we represent this using **Master Grids** (the resolution definition) and **Spatial Sources** (the individual tiles). +- **Master Grid**: definição de resolução e CRS para todo o país. +- **Tiles**: `SpatialSource` com o `bbox` de cada partição. -### Available Master Grids -- `BDC_SM`: 10m resolution (Small tiles). -- `BDC_MD`: 30m resolution (Medium tiles). -- `BDC_LG`: 60m resolution (Large tiles). -- `BDC_100m`: 100m custom resolution (Continental scale). +## Registrar grades e tiles BDC -## Tiled Derivation +O utilitário `bdc_importer` cria as master grids e registra cada tile como `SpatialSource`: -When you run a derivation, you specify the `tile_id`. DisSCube performs the following steps: +```python +from disscube.utils.bdc_importer import import_bdc_grids + +import_bdc_grids( + cube, + sm_shp="data/bdc_grids/BDC_SM_V2.shp", + md_shp="data/bdc_grids/BDC_MD_V2.shp", + lg_shp="data/bdc_grids/BDC_LG_V2.shp", +) +``` + +Isso registra as master grids e cada tile como `SpatialSource` com `bbox` preenchido. + +## Derivação por tile + +```python +from disscube.models import SpatialDerivation, Variable + +derivation = SpatialDerivation( + source_id="slope_brazil", + grid_id="BR/5km", + role="driver", + variables=[Variable(name="slope", operator="mean")], +) + +# Processar um tile +cube.derive(derivation, tile_id="009002") + +# Processar todos os tiles em loop +tiles = [s for s in cube.catalog.list_spatial_sources() if s.id.startswith("BR/5km_")] +for tile_source in tiles: + tile_id = tile_source.id.split("_")[-1] + cube.derive(derivation, tile_id=tile_id) +``` + +Cada tile é processado de forma independente e pode ser paralelizado. + +## Carregar resultado tileado + +```python +# Tile específico +da = cube.load("slope", tile_id="009002") + +# Todos os tiles da grade (mosaico automático) +da = cube.load("slope", grid_id="BR/5km") +``` -1. **Lookup**: Finds the BBOX of the tile in the catalog. -2. **Crop**: Restricts the Master Grid to that specific BBOX. -3. **Process**: Executes the operators only for that area. -4. **Partition**: Saves the Zarr file in a directory named after the tile. +## Grade 100m nacional -This approach allows you to process a single tile for testing or iterate through all tiles of Brazil using a simple loop, without ever hitting memory limits. +Para projetos que precisam de resolução mais alta que BDC_SM (10m), DisSCube suporta uma grade de 100m customizada: ```python -# Processing multiple tiles in a loop -for tile in ["008013", "008014", "008015"]: - cube.derive(my_derivation, tile_id=tile) +from disscube.utils.grids import register_local_grid + +grid_100m = register_local_grid( + cube, + name="BR", + bbox_geo=(-73.98, -33.75, -28.65, 5.27), # bbox do Brasil em WGS84 + resolution=100.0, + snap=True, +) +``` + +## Fluxo completo: setup → derivação → carregamento + +```python +from disscube.client import CubeClient +from disscube.models import SpatialSource, SpatialDerivation, Variable + +cube = CubeClient(catalog="catalog.db", store="./data/") + +# 1. Fonte +cube.register_spatial_source(SpatialSource( + id="urban_centers", + name="Centros Urbanos PNLT", + format="vector", + asset_url="data/raw/urban_centers.shp", + crs="EPSG:5880", +)) + +# 2. Derivação — distância a centros urbanos em BR/5km +derivation = SpatialDerivation( + source_id="urban_centers", + grid_id="BR/5km", + role="driver", + variables=[Variable(name="dist_cidades", operator="min_distance")], + valid_from="2000", + valid_until="2014", +) + +# 3. Executar para um tile +cube.derive(derivation, tile_id="009002") + +# 4. Carregar +da = cube.load("dist_cidades", tile_id="009002") +print(da.shape) # (rows, cols) +``` + +## Variáveis temporais com tiles + +Para drivers com variação temporal, derive múltiplos períodos e carregue como série: + +```python +for start, end in [("2000", "2014"), ("2015", "2025")]: + cube.derive(SpatialDerivation( + source_id="urban_centers", + grid_id="BR/5km", + role="driver", + variables=[Variable(name="dist_cidades", operator="min_distance")], + valid_from=start, valid_until=end, + )) + +# Carrega série temporal (time, y, x) +da = cube.load("dist_cidades", grid_id="BR/5km") +print(da.dims) # ('time', 'y', 'x') ``` diff --git a/docs/guides/grids.md b/docs/guides/grids.md index e665146..19cdb72 100644 --- a/docs/guides/grids.md +++ b/docs/guides/grids.md @@ -1,81 +1,118 @@ -# Grids & Spatial Interoperability +# Grades e Interoperabilidade Espacial -DisSCube is built on a **Universal Grid Architecture**. This allows multiple areas of study and different resolutions to coexist and interact without the need for expensive and lossy resampling. +## O problema que as grades snappadas resolvem -## 🔗 The "Magic" of Snapped Grids +Em workflows GIS tradicionais, criar uma grade para o Acre e outra para o Brasil frequentemente produz pixels desalinhados — o canto de um pixel de 5km do Acre não coincide com o canto de um pixel de 5km do Brasil. Agregar de uma grade para outra gera erros por pixels parciais nas bordas. -When you create a local grid (for a state, municipality, or any Area of Interest), DisSCube doesn't just cut the data at the boundaries. It performs a **Snapping** operation. +DisSCube resolve isso com **snap ao mesh virtual**: toda grade local é ancorada à mesma origem do CRS, garantindo que pixels de resoluções múltiplas sempre se alinhem perfeitamente. -### What is Snapping? -Snapping ensures that the boundaries of any grid are exact multiples of its resolution, calculated from the origin `(0,0)` of the Coordinate Reference System (CRS). +## Grades de referência nacionais -- **Without Snapping**: A grid starting at `x=2815245.5` with 5km resolution would have pixels shifted relative to a national grid. -- **With Snapping**: DisSCube rounds the start to `x=2815000.0`. - -### Why it matters -Because all grids (National, State, Local) are "pinned" to the same universal virtual mesh, **their pixels always align perfectly**. If you place a 5km pixel of the Acre grid over the 5km pixel of the Brazil grid, they are exactly the same square in the world. +```python +from disscube.utils.grids import register_simulation_grids -## 📦 Nested Grids (Hierarchy) +register_simulation_grids(cube) +# Registra: +# BR/5km — 5000 m, BDC Albers, bbox nacional +# BR/1km — 1000 m, BDC Albers, bbox nacional +``` -Nesting happens when you use resolutions that are multiples of each other (e.g., 5km, 1km, 100m). +## Grades locais com snap -Since they are all snapped to the same origin, the pixels form a perfect hierarchy: -- **1 pixel of 5km** contains exactly **25 pixels of 1km** ($5 \times 5$). -- **1 pixel of 1km** contains exactly **100 pixels of 100m** ($10 \times 10$). +```python +from disscube.utils.grids import register_local_grid -This "magic" allows for **Zero-Error Aggregation**. When calculating the percentage of forest (at 100m resolution) inside a model cell (at 5km), DisSCube knows exactly which $50 \times 50$ small pixels belong to the large cell. There are no "partial pixels" at the edges. +grid = register_local_grid( + cube, + name="AC", # ou state="AC" + bbox_geo=(-73.99, -11.15, -66.62, -7.11), # lon_min, lat_min, lon_max, lat_max + resolution=5_000.0, + snap=True, # padrão +) +# Produz grade "AC/5km" em BDC Albers +``` -## 🛠 Using `register_local_grid` +**O que `snap=True` faz:** -To take advantage of this, always use the `register_local_grid` utility: +A bbox geográfica é convertida para BDC Albers e os limites são arredondados para o múltiplo mais próximo da resolução: ```python -from disscube.utils.grids import register_local_grid +minx = math.floor(minx / resolution) * resolution +maxx = math.ceil(maxx / resolution) * resolution +``` -# Register a 1km grid for Acre that "nests" inside the national 5km grid -grid = register_local_grid( - cube, - name="AC", - bbox_geo=(-74, -11, -66, -7), - resolution=1000.0, - snap=True # This is the magic switch (True by default) -) +Resultado: `AC/5km` e `BR/5km` têm pixels idênticos na área de sobreposição — um pixel de 5km do Acre é o mesmo quadrado geográfico que o pixel de 5km do Brasil. + +## Hierarquia de resoluções + +Resoluções múltiplas dentro do mesmo CRS formam uma hierarquia perfeita: + +``` +1 pixel de 5km +├── 25 pixels de 1km (5×5) +└── 2500 pixels de 100m (50×50) ``` -## 🗺 Spatial Relations +Isso permite **agregação zero-erro**: ao calcular `percentage` de floresta (100m) dentro de uma célula de modelo (5km), todos os pixels de 100m que compõem a célula de 5km são conhecidos exatamente. -`SpatialRelation` entities define how data should move between different grids. Because of the snapping/nesting logic, these relations become much more powerful. +## Relações espaciais + +`SpatialRelation` registra a relação pai-filho entre grades: ```python from disscube.models import SpatialRelation -# Define that BR/1km is a child of BR/5km -rel = SpatialRelation( - source_grid_id="BR/1km", - target_grid_id="BR/5km", - strategy="simple" # Because they are nested, "simple" is perfectly accurate -) -cube.register_relation(rel) +cube.register_relation(SpatialRelation( + source_grid_id="AC/1km", + target_grid_id="AC/5km", + strategy="simple", +)) ``` -### Strategies -- **simple**: Used when grids are nested. The engine can perform fast block-aggregation (e.g., average $5 \times 5$ pixels). -- **keepinboth**: Used when you want to ensure the variable is available at both scales during cross-scale models. +**Estratégias disponíveis** (reservadas para uso futuro no pipeline): -## Creating a Custom Local Grid (Manual) +| Estratégia | Uso pretendido | +|---|---| +| `simple` | Grades nested — sem ambiguidade na agregação | +| `chooseone` | Células que pertencem a apenas uma grade-alvo | +| `keepinboth` | Células mantidas em ambas as grades (sobreposição) | -If you don't want to use the universal mesh (not recommended), you can register a `GridSpec` manually: +!!! note "Status das estratégias" + As estratégias estão modeladas no schema mas ainda não são aplicadas pelo pipeline de derivação. São reservadas para o mecanismo de cross-scale do DisSModel. + +## Criação manual de grade + +Quando o snap ao mesh BDC não é necessário (ex: projeto com CRS local): ```python from disscube.models import GridSpec -my_grid = GridSpec( - id="ProjectX/30m", +grid = GridSpec( + id="projeto_local/30m", type="local", crs="EPSG:31983", resolution=30.0, - bbox=[580000, 9700000, 600000, 9720000], - description="Manual grid - won't align with BDC national mesh" + bbox=[580000.0, 9700000.0, 600000.0, 9720000.0], + description="Grade manual sem snap ao mesh nacional", ) -cube.register_grid(my_grid) +cube.register_grid(grid) +``` + +!!! warning + Grades sem snap podem não alinhar com grades nacionais. A agregação cruzada entre grades não-alinhadas introduz erros por pixels parciais. + +## Propriedades derivadas de `GridSpec` + +```python +grid = GridSpec(id="G", type="local", crs="EPSG:31982", resolution=100, bbox=[0,0,1000,1000]) + +grid.rows # 10 +grid.cols # 10 +grid.transform # Affine (North-up, origem no canto superior esquerdo) +grid.xs # array de centróides X de cada coluna +grid.ys # array de centróides Y de cada linha + +# Identificação de células +cell = grid.cell_id(row=3, col=7) # "G:R0003C0007" +x, y = grid.coords_from_cell_id(cell) # centróide em coordenadas do CRS ``` diff --git a/docs/index.md b/docs/index.md index ed3b219..23603dd 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,21 +1,50 @@ -# DisSCube Documentation +# DisSCube -Welcome to the DisSCube documentation. DisSCube is the geospatial data engine for the **DisSModel** ecosystem. +DisSCube é o motor de cubos de dados espaciais do ecossistema **DisSModel**. Ele converte fontes geoespaciais brutas em variáveis derivadas alinhadas a grades de modelagem LUCC (Land Use and Cover Change). -## Documentation Structure +## Conceito central -- **[Architecture](architecture/overview.md)**: Deep dive into how DisSCube works, its pipeline, and the SQLite catalog system. -- **[Guides](guides/bdc.md)**: Practical examples on how to use DisSCube with Brazil Data Cube tiles and custom grids. +``` +SpatialSource → Derivation → Variable → DerivedVariable (Zarr) +``` + +Uma **fonte** (`SpatialSource`) passa por uma **derivação** que aplica um **operador** a uma **grade** (`GridSpec`), produzindo uma variável registrada no catálogo SQLite e armazenada em Zarr. -## Quick Install +## Instalação rápida ```bash -pip install -r requirements.txt +git clone https://github.com/DisSModel/disscube.git +cd disscube +python -m venv .venv && source .venv/bin/activate +pip install -e . +``` + +## Fluxo principal + +```python +from disscube.client import CubeClient +from disscube.derivation import Derivation + +cube = CubeClient(catalog="catalog.db", store="./data/") + +d = Derivation( + target="forest_pct", + source_id="mapbiomas_2020", + operator="percentage", + class_code=3, + valid_from="2020", + valid_until="2020", +) +cube.derive_declarative(d, grid_id="AC/5km") + +da = cube.load("forest_pct", grid_id="AC/5km") ``` -## Core Workflow +## Navegação -1. **Register** your grids and raw data sources in the catalog. -2. **Define** a declarative `SpatialDerivation`. -3. **Execute** the derivation to generate Zarr cube variables. -4. **Load** the variables for analysis or modeling. +- [**Arquitetura**](architecture/overview.md) — modelo conceitual, pipeline e hash de reprodutibilidade +- [**Operadores**](architecture/operators.md) — sistema de plugins, operadores disponíveis, como adicionar novos +- [**Pipeline**](architecture/pipeline.md) — estágios detalhados: Normalizer → GridAligner → Aggregator → Writer +- [**Catálogo**](architecture/catalog.md) — SQLite, schema, hash e séries temporais +- [**Grades**](guides/grids.md) — snap ao mesh nacional, grades locais, relações espaciais +- [**Guia BDC**](guides/bdc.md) — integração com Brazil Data Cube e processamento por tiles diff --git a/mkdocs.yml b/mkdocs.yml index f5768c6..1d0afb4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -26,6 +26,7 @@ nav: - Home: index.md - Architecture: - Overview: architecture/overview.md + - Operator System: architecture/operators.md - Catalog & Persistence: architecture/catalog.md - Pipeline Stages: architecture/pipeline.md - Master Grids & Tiling: architecture/tiling.md From b49640d0377cec76faa1e5e0820d3c16cf1194ef Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Mon, 15 Jun 2026 08:46:36 -0300 Subject: [PATCH 04/13] update --- .gitignore | 2 + catalog.db | Bin 69632 -> 110592 bytes disscube/operators/base.py | 12 +- disscube/operators/zonal.py | 245 ++++++++++++++++++++++++- disscube/pipeline/aggregator.py | 20 ++- disscube/pipeline/aligner.py | 96 +++++++++- docs/terrame_fill_correspondence.md | 81 +++++++++ tests/test_aligner.py | 12 +- tests/test_aligner_resampling.py | 270 ++++++++++++++++++++++++++++ tests/test_percentage_purity.py | 196 ++++++++++++++++++++ 10 files changed, 914 insertions(+), 20 deletions(-) create mode 100644 docs/terrame_fill_correspondence.md create mode 100644 tests/test_aligner_resampling.py create mode 100644 tests/test_percentage_purity.py diff --git a/.gitignore b/.gitignore index ffc3628..2bdc089 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ ENV/ outputs +catalog.db + site # Python Cache diff --git a/catalog.db b/catalog.db index 33904801bb496b996df596206af6d92cfd229a30..1491037d708facd9725a49f576350afa76e6fd5c 100644 GIT binary patch literal 110592 zcmeI54~!hwdD!pFBYC%bdn=M6Pb5Wgb+#hXcjV2R|3indNQ!1ClaxeK)+w^QdGn84 zbMN-FyOc#Mf^#Pmwh$CW0T)FPw?*O_L0bn!(m!ch7-?YybrS?>kQn&`Bt;zr&fgko z5H~?kwBMVVxAW%B&T>X`vPa2VTf^D!&AfZ>H{ZVZeeZqq-i_z4bcSBb@AcR5u%%2r zG&Mav^;D}hH8u5R`0ptEM?ZVvWQKl$f2W!AtkW+~oqp}@M)R+x>b3th)jZq$m4(-v z3yqb9%k}>~|JU^|)}IEk_y_`mfFK|U2m*qDARq_`0`C%moAo`5r;bkdIvcL{ddGde z?Rx#rYo6O4cDi14K7Zl*#d9|=wk}_L`r=xPzo_->wN`ZD335}rS1;XXZ*SVsjmAdQO`NPu`r%DDAC&$Bpu>W#&sRV)G^6Nolud_U>Igeti0^7l+vH zdVQ~phn?QWV01DYWqI@5`70M&BXR3Qtu^K|=-ur*UOURYb@Srd&6Tki4srjMH_UON zi-$PKr*-Ywo2_dvT)8qn!{@GFzIyKZORbMye5rLJS&Wrd@~@ptmFOgz-LrV{`1Dlp z&<3w|!3%2RyTed5+M`;x(dcw;`^fISi${-6f3_Zwi=IIA&z=OeNSMJ49t`2sVrnqH ziVvnb(w0f&Lp_~vMZeeeCM2hyNSf!VJ$n|<9nIny@o1yyWbf92T>;CjHv^X8;~PZ( z%p_Pw!m=!LRbpmRC8C0~QFO9t0X^^0WrQ;eCk;1fuUo@lU>Q&*Is| ztY;7uqm6=pc0mI>OB&fmdbxi3!DerfKD^}lNT%Y|>eOHHrHRuB*b1OY)n5D)|e z0YTt_1h$vwW)6S$*>e|G(Vg`-mOAdznWgZ{(#q2C-lhlN`<@Rj+=kv zeex{!aBCSnyS8-p6q4bzw?R%dHI$;Wr-%^JWjKS=*CADtf=_Gt9(+OJ7pb-Eb-SB` zvzM;lFyYJHjm~gzcD=Q{(F5n%AHbjOdxKv0F7Y9k&XBuKODjvZ-FtoM%*(Qg%uoUe z$O=ytT~(-1Lq>4=%F2@K4V->wGt39-*&1}#?}ol?indrRCFWvJ|1z3`AYhWvDku5G0tOUT~@-$s7qaRUZp6QROV{ z%fJc0W#@sv=W-WppO^>zzI^`rYOLSkm&!hwNO4E4ff4ENia}C>Cu5K&6g@7JGz;U@ z`&uwUg}bdQFI>0~HwIekPDWF3q7k8r8%{t4Un+|Vbd}X&d52MSJ&?iK_D0YN|z5CjAPK|l}?1Ox#=KoAfF1c4nUP}_BQIvfz5i%wx^ zX!g)9G@TgB3dZ&7t%>VZMf1Uq#^Iw|( z2lJnqU!Q+@{_?yr|HS-(`RV$%>%Uw7<@(Rpf2#fu>u=Rxt+(q}>mRD0tS{Dg*Z!jR zKWo2Q`)ci9)&6nqbG5rQr}lj9!?jbjqqVuY|1`%=8{nhluC<0Z1oOOV4$B%6~+7Um`95AzbE%}ex8@DlBn zgv5lJew>%+FY^-Zhj@wl5-(BKc!~TXFF_yUCDIF=M6z!367vQxF|P9x{dr!ZJ;zJb zXL*ToEg><0##LUTU*RR%M|p|*3@=eG^Ah<7c?r72OQer*63M#AOU$QviE)9K=;wKf zc8-^*Pw^7v!wHEFG=6}W=x2F}_Wis>{SYrv&hQf1;w8xBC6d8OBunQdrp8MQm6zxW zFVSRPqN0REgStz+L|^44+G$>*p5i6S3NMkDc?tR;FOg1i63IHjON=LZiT-`OM0ouuFG+jyhxcpF1_|Ec!yu2a)+(kFjp!|mMi zHt=BXpD_GNziyg-`J%n^$v5~5pt|_~gq}!73t0vNA56PMj|C6z#xVo>|*iwAkYr`sp zz2lc>whorh3U`N(?MzO02fbt0XSSZ4mSj!Ao~>FCU!Y@6c9G%QmSXrmh8in|qRGhg zZPWEM(~@1;h1qA<#kS^~SoJ--?REykw$pJjER^_Yl%-)go~~)0t1Fu7NSbGwmTn=- zS9I)oo}w$3W~tcM4a3q5({U_C*Hy`N6|5RKk>$}#Gh6$neceWIg@G+w)jTX4hUR#h zVPMHIR8JxFVNz3(9mTV;V<2o9Ivx@LKv zsu-@OS{l-kEZe$n`Kqp|lI>Wguc&aT34h48b=7y|uxJC=lVLC)JP6w`b!0l&@DWx? z6_9RvitW0(447O`b~MX@l9@V2j$<0Wq$67g7zXx1X`D7Zh+xy$1HP%biY&wP^bDYa zY*n%hPeZl=kId4sqT7mMnwsfohHlCZC@?62<73Zv%&?MVQumSIc{<3zx~FNnBiji3 zShj3OLz)AN<@t^)!K1Wv9l0bcU4}I@eFY>kJOX5?iskEx=eZbUX}g*&`*4Q~mt&}% z>-eg#sjlrJ**9GSNLWabp};P(eeA0wf6Im3v2VzJBFjV1&ul$0ExU>%Bfw?)$grS3 zswY`8axConrtiz1sVSPR__AzZWV@j8w&eJ(Y$=kXsix81gat!k#ZnBLZXEpR%+}#) zS@C?V8&Ek-GvJ305IjwBVP!y5H!*U3$jQ#H-^eO<*4Xn~vj zwH^t7tr)JdfU`>ShK9>9vEdSJw4Dn9A5HD6ADpUxYyMRIC+aWP4>pI5A6xj+!YlJX-?%vc{}vV-zt%iGKWP5P z_tGO4WfcSj0YN|z5CjAPK|l}?1PTziIJ4C#y@2mRuX)NRW$Eayfv*K`v!~Mh9xmKZAw;e`>1vt1#>Ti_QNG zdjR}l^Y>x*yx(sAM)Nz(KX3l0=D%902)tu0M|9ZtpPJxJg_uGO{8dk1&~wds06*}?)lJ=hv_V@Usith$ctS-R$81(q*`g;_Byc&jLy?)ioW zD-S!G6xL&D@3l+3QpDV&_(%hmJ3@GJ++zgVKX6pRo}ZD(IX;Q-d)x{B92qbrljP z8yY360{JHljq=$IJCh6zU9)`|X}%;&iU*6jtELA_=RVy>oDzHF}4l#P_WY<4?r zQ_C`=?056n)MYEcsrn9NH(AvY?9|~apu@0R0MS3yN0O^MrVsYA@4@CChV5EN_Z%B` zE3qW7r(M`T!&eLhKLOb9;+?5_*=jL1HCg|E*WRC!kD7$s2-Y^Gdj1Wit!+bOQ4z^1OmAp5SZj zbcKZmQ8cPM$)39w~ z*EV3VS5;-rhRwcgTS30!dWL67;F4Lqn{|3MF`^sdrK7lIE3+CrKk8RqI8fW#M>Kf( z^Us%!U~RkM`>kd3>E3nPJ$vBw!y*32p(>ke+l^J2oW|-KmO55n?(ahaott2Ms9?ZA zks)#7fI-<(L;kEL??85wH5oPpRFSMWmTSPShl*x6u)&+7yOQI;rjR!5z-K9n<;vKD zaRS(T4Z|L6uw$UB7_fgIRum{NJ7B+jC{#8%b;<88$j) zvrHI%Wiv~ZFK%|&%`YdmKq)g2wrypn_H&N8m#qerk?lfvAiK$3sD*sXwmld2q16oy zI!worRoGw@S+Z$k8CoR8fNe{`I>d$!+t9*>s)h~w%EGqF7KS}^i!?rED-({iM_U=w z-t_rLYFm$oUQMs;OK_nMk&SgjE4FNY2?Z~kSzI_yyGeNys$Imx$*I<53(u+b4rDi3 ztsM{c%XU=+8y)+grM{}bXgk8NMYF9T&z4lhHdIAJ7DTQ%hNBvqDZ{4JGVEFnJ4*X_ zvRWs7oU)Y()tWp-s?&HeZnw+6l9^~1!-AL1GNIOGGfR|@)9mP{UQD?mBunF6p9jyC zmaPV-;5(4rWCaJK(S(iAO%L{aS74WCWT=vFVc4DAN1g+loWpkAifn2ygQTmlH@gPA zjT?sJLes8;&qQ3o39e64!OK==Y`cvQ#X7#DyN!osFPmjT$IE8+F6j6>u}#WWgVXUH z$nF6;4wL$hsoO|#TpLRsIQ71#V^gv%2WwkTgto~WWnX7+Z}z=`x8dOSuoD)&Y>o*PFPj-#JkXQK>1WCe z!r1)}D*&P#W3V~LuF6(`Q|=we?g7dzL9`22CBt%U%kw1Puw)ph0_($7B@?T*DeH!> z_`0Ql-Hmh|*R?PYRTUea?>yJfo zMa$sN!w4tVGz8I5(5pi;sQZeIRMm801j&-XKl8ynK)UQ9S@Yq45J;l9y01Z0w&L4f z8+SoLZ{zklW@7cr=E{vmN2B3ob0uYs8NFPy z$q~!|K{&q!p-1osCK!lm=5(CAET>g?ywnd`KemJVJ~oc7Y-u_D+9`bx-%5!((eR7V z_sRXKiDhWqu&+WBNLoT!hA}D=`>p}uavHq1;=?qiWw@%QLkCL2NKFH}RhSTxWEp-0 z-!pxO$4m7cKN|Z)QprWP5xX%06kl7ODUjW|@rd%DW}@xrNHJVnvRy;Rp5;oGWstY0u!eM3!xGk2 z7{v80&$9IgPk935QBz*BGGl!re3aGka!bM`bcpJBxhy#yFPB-Od^@A#@5Jwy3HevChIsf+@Prtc4C8$+lq-D-$EF}OgWvjvI z_zq+@S;sZUvJ_1=U~U5D+hj{tAri|rRRy8|OcjQ(AOJu&ECnL7VA8^a2mw%WU3Y94 z!3B4Zf2nq2=t|kjjMH&4|3BTBpMw93k02lj2m*qDARq_`0)l`bAP5Koe`g5XfAx56 zYmuyUvHW(k?2JNt(CdzQ;lFIQ@6tpCDQxWYN8iL$#bn>#g!u+Tfrvy+)!|)#WVs5= z%E2lYz5_vsI-J|EtQL&gB3NF_a9th2l!fDau+|vXCFcRLTkqo39C^OX{Qtt#Q&Y{w z+Qx$3I642p`g3!)XYIXvcYkH(`S*Qi`o|%0!e@K!;LMZvkDh<}Li@(mwggLlAT63b zOUPp70&(lc)z(c|LZlU54{3g{zmA8ns#qTnhF%{e?8sm6Vw&wbt2eH$hPRv^cF0VR zbK7(74DPPeCEA=mtk$-)yxH%4@+|gnYq^VuZRzYOB*Q7J+Xkna8cNaGQ$z^qGMvHb z>yWBJtT=pH%lF_5ta%rFTFYLyyE!;}>G};5l7iXgv+J$pjb7XJd=j#2+r8JxB622- z1WD3qX$48LuENc5Dq9c+5(y#bLno&Y)<~wWX{m?!7FmN1q&8V1Zyj$goq6NUH@DXw z=FzHS(30h2qDhw&T9Q@}ty+1slwtxcME(Zo^P)Y#qgBSBC9^D9Xh~W@v`XdCQi=(* zni(o^UbOppwDK6Vr18r_OVSFWl`D^yQcR%Lp?4_KAnfDOqA_U6yBt|)Nm@a)sPbqj z#ROUi!wVYVyjuHE0xg17dU&)%mW4J-D}t8XQcbjm9ClOCjm2#()3?!9A}88i%^B-0 z(yQ5>-Z~z%Bg3XGL%>GFczj|E`zqc%AKcu!454lL@qvetN0ZEw@w%U7Vj=zPl)A!moj+X+a-{ZQT15xm3U-O({uiv_H`TC`EH^@)A zb;9ZO`<+`Iw|A0#=HZ1PNm-7hw$9)jSg4~+PT$(Ra|^B=Z$O66SvSbfJ^h@cXFmQp zZ@pND8a__m0%~t=bcfNS9esx!qqg}vS*&o-dhXhln;;xLw8_;N<+D0EYj5_t_u7L_ z{}vtuP2S-4=H!}brb)D_Xw814HuL0L&fPv3sci?=6^7DBwM)q28ood-ZCwa2ZJi)B zE%3zHOjpU@>jTn22>-6P!8#8Bf48c?U7MRZ_0~P8HMl74&W78$BXsqFE^g4f4laDIcx9l?RjhWiCw?)zCMWmu70-HS_yklg=I-14Id3_ z5Lx{ElfHM9mP5ry!y4q4iI4evHLRh)D_DV-0`?#^{Z3MMBtRu)??95Htec^mQqZhrLIk0w2$#JONOKz!T zTg#3XBX|imXkl4ZrsK+a8-$}g+SKSgupm*0V@Hglw1Q~0%A=(eS6H-ktRpYlBRtyF zh!Nn+95IU03Zkv(h|!oijs#k2HsnQnm`9s3z5rjQ@fD>NL|aYci)|1n#ROVvHsnS7 z2#+>pd;z{p<10!lh_;%>7mJosOrWJ^LteCtJld4;1^60HaRs=1%rJCIsGY~(r5Ln){Z_9^P`2S;vIJVc86B-B0lBU{8 zBFoaaD6NRbky|RMapPS56-CxU8y^M*=r25<#%-^CkVl(pCxJy~I~k=FL|e^vl0{1? zCeRjbCr|QdQ%)-2%XCtsw1Q|W>ZCFQRFq-@ZBZxn1cz3#cqbL2hpg-dAxJBRRj{m6-YaG96IoU}8Ko7`IC4uh+eyaNkIj8W z4{iJvuu}_~`x=kd7pn=+gLU#8&{r4>Y5QNxKrODQI_mihpBwN~cQrVJ;*muWaf zX$8?%)No?ZQi=(*)Cb6m7V&6Ph7;h+G@PQef@mvhI5B7`#RS@-o|Ke83uC9LwwB1U zy0Iv&2wHMWHM=optW|Y5(YBUB8#}3SD=~WcDX-R^;?btsTELgt)<$Us(N?srWzfb> zDucF|le)dO!lO+E1OdLxfS@R?AlhmM1hE5DlwzVmprI`Jv{v~48UFvdgvJ?QfAUT$ zk!AJnQCbm=BeztucW0cY*d|r=Xm$!AYE9+Sxb3y4c(kc@5?ExmlTlhhv=wb98MKt* zio&2RYLkALN1JLV0bgc28Ko6OThVrsK}#tn(CUgF7^-=-_6HJZ^)co?k!3XqQCbnS z)Z~_Gn)^&g9^0g`lUjHP`lCGBluZgOGHuc* zt;mm=+){b8u}#XLrPfrwAM^IwGd$XqO$sbBZPF;MAli!Bq>R>5imd-XTFN3X+RHrJ zluZiwGHuc*tsvTp+N2EH*d}GPmReJJ(f%NhHf56nzD%1mN-K!AqBbdmmQqY;Ew!fd zqP@hUP1&S?FViNC(h8!js7=bCr4$orsWp`s?MHaDDVr4VW!j`sT0yiGwMiMYlwtyH zF`IOI?IMRZ9efEx)tSMUQCcyyRSmvm&{B#l+ELhGUaftaN1F=11bmsnmr+_lv=t4$ zjM0uLCeRiQzP!MrO$7u2zRVedD6Jscip~%)Xeq@6S{gQ(S8LDnXvcVC*}<29wzxMY zRo)w8USo@mFGQzk^Pv^~e}@16VnX9!OZAjZN@Q8neNkGGA2Ye7noer0TvBY4YDV}X z>N|M<|6@GbluZgOGHuc*tsvTp+N2EH*d}GrQfn&Tk9m9T1s-k6CIx(%HffYr5N$&gmZfn~ zS`m#Sw^UQ(7^f-rq)6~|@SY#oQU#a$#vIzTCuL3Wq@uK9XshZ;F=#2pl^`xXL_6{U z@^(Q#%%e?tQh+bhlZw&`qOGVW6{8(dOrWJcKwh+M9&M@%0(_ZWP?S~>ZAH5v1}&wS zKudjqyl6kcqfL2IfG^XNiqZD5wzr%YFY@4`5D`!G4Fyl>Gs;oJla%)04y>a zgea{b+KM&^4B9yOl0i$u2J>p|5AkSI4FcfHY!ITff@mw+ATVetMb=3z{6f%6JlZis zE7`%9fR+rq6hvG3p_LKE1X>z4m{)7pc(h}NRsff$?0Zzq75D)|e0YN|z z5CjAPL0}w#?X^B@lfnQW;%(AEmN9BY(+b$6!7bGsT4DUr*d~n!1f^m&>Gs;IJX${Z zlCa1MzNBdd(N;A0l0i!;vcuSggD*EZw5i}rz?b_E^ RgEkJnWVDus4dz4p{{h^3d2Rpz delta 152 zcmZp8z}B#UWr8%T0|Ntt;6w#`Mu&|F3;6~4iWpcq=P>ZP^IhdS#W{!j$Hs|WoSWxJ zu4ZO#(qP@ZR#t;?vb$_5BirU1ax#J{tX#~Jj7~xNhS|Anyg-vUIFcCn#kunM%y>0< zw73H}GdPkqPW0knZ(`|x)W=s*7e E0PpxHZ~y=R diff --git a/disscube/operators/base.py b/disscube/operators/base.py index f795c4e..b7acbd1 100644 --- a/disscube/operators/base.py +++ b/disscube/operators/base.py @@ -54,6 +54,16 @@ class body is executed (via ``__init_subclass__``), so there is no requires_class_code: ClassVar[bool] = False _resampling: ClassVar[Resampling] = Resampling.nearest + # When True, GridAligner must NOT pre-aggregate the band with this + # operator's ``resampling()``. Instead it provides a fine-resolution + # array reprojected with ``Resampling.nearest`` whose origin is snapped + # to the target grid, at a resolution that is an integer sub-multiple of + # the target cell size. The operator's ``compute()`` then performs the + # window aggregation into the target grid. Used by categorical operators + # (percentage / majority / minority) that must see sub-cell class + # composition rather than a pre-collapsed mode. + needs_fine_alignment: ClassVar[bool] = False + def __init_subclass__(cls, **kwargs: object) -> None: super().__init_subclass__(**kwargs) if hasattr(cls, "name") and isinstance(cls.name, str): @@ -92,4 +102,4 @@ def compute( """ raise NotImplementedError( f"{type(self).__name__}.compute() is not implemented" - ) + ) \ No newline at end of file diff --git a/disscube/operators/zonal.py b/disscube/operators/zonal.py index 95d7b98..36b973d 100644 --- a/disscube/operators/zonal.py +++ b/disscube/operators/zonal.py @@ -48,6 +48,214 @@ def _passthrough(data: xr.DataArray) -> xr.DataArray: return data.transpose("y", "x") +# --------------------------------------------------------------------------- +# Window-based categorical aggregation (fine grid -> target grid) +# --------------------------------------------------------------------------- + +def _fine_array(data: xr.DataArray) -> tuple[np.ndarray, float | None]: + """ + Extract the fine 2-D values and nodata from an origin-snapped DataArray. + + The aligner annotates fine arrays for categorical operators; this returns + the (rows, cols) ndarray plus the nodata sentinel (if any). + """ + if "band" in data.dims: + data = data.isel(band=0) + data = data.transpose("y", "x") + arr = np.asarray(data.values, dtype=np.float64) + nodata = data.attrs.get("_disscube_nodata", None) + if nodata is None: + try: + nodata = data.rio.nodata + except Exception: + nodata = None + return arr, nodata + + +def _windows(n_fine: int, n_target: int) -> list[tuple[int, int]]: + """ + Build per-target-cell (start, stop) index ranges along one axis. + + Splits ``n_fine`` fine pixels into ``n_target`` contiguous windows. When + ``n_fine`` is not an exact multiple of ``n_target`` the remainder pixels + are distributed so every fine pixel belongs to exactly one window (the + last windows absorb the extra pixels). This makes non-multiple ratios + (e.g. 10 m -> 30 m over a 100 m extent) well-defined without a reshape. + """ + if n_target <= 0: + return [] + edges = np.linspace(0, n_fine, n_target + 1) + edges = np.round(edges).astype(int) + return [(int(edges[i]), int(edges[i + 1])) for i in range(n_target)] + + +def _categorical_reduce( + fine: np.ndarray, + nodata: float | None, + grid: GridSpec, + mode: str, + class_code: int | None, +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Reduce a fine categorical array into the target grid. + + Parameters + ---------- + fine : np.ndarray + Fine-resolution (rows, cols) class array, origin-snapped to the grid. + nodata : float | None + Sentinel marking invalid fine pixels. + grid : GridSpec + Target grid (provides ``rows`` / ``cols``). + mode : {"percentage", "majority", "minority"} + Aggregation to compute for the primary output. + class_code : int | None + Target class for ``percentage``. + + Returns + ------- + value : np.ndarray + Primary output, shape (grid.rows, grid.cols). + ``percentage`` -> fraction in 0..1; ``majority``/``minority`` -> class. + coverage_purity : np.ndarray + valid_pixels / total_pixels per cell (universal metric). + dominance_purity : np.ndarray + count(dominant_class) / valid_pixels per cell (categorical metric). + """ + fr, fc = fine.shape + tr, tc = grid.rows, grid.cols + + value = np.full((tr, tc), np.nan, dtype=np.float64) + coverage = np.zeros((tr, tc), dtype=np.float64) + dominance = np.full((tr, tc), np.nan, dtype=np.float64) + + row_windows = _windows(fr, tr) + col_windows = _windows(fc, tc) + + for ti, (r0, r1) in enumerate(row_windows): + for tj, (c0, c1) in enumerate(col_windows): + block = fine[r0:r1, c0:c1] + total = block.size + if total == 0: + continue + + if nodata is not None and not np.isnan(nodata): + valid_mask = block != nodata + else: + valid_mask = ~np.isnan(block) + valid_mask &= ~np.isnan(block) + + n_valid = int(valid_mask.sum()) + coverage[ti, tj] = n_valid / total + if n_valid == 0: + continue + + vals = block[valid_mask].astype(np.int64) + classes, counts = np.unique(vals, return_counts=True) + + # dominant class = highest count; tie -> smallest class value + dom_idx = np.lexsort((classes, -counts))[0] + dom_class = int(classes[dom_idx]) + dom_count = int(counts[dom_idx]) + dominance[ti, tj] = dom_count / n_valid + + if mode == "percentage": + hit = int(counts[classes == class_code].sum()) if class_code in classes else 0 + value[ti, tj] = hit / n_valid + elif mode == "majority": + value[ti, tj] = dom_class + elif mode == "minority": + min_idx = np.lexsort((classes, counts))[0] + value[ti, tj] = int(classes[min_idx]) + + return value, coverage, dominance + + +def _attach_purity( + value: np.ndarray, + coverage: np.ndarray, + dominance: np.ndarray, + grid: GridSpec, +) -> xr.DataArray: + """Wrap the primary value and attach purity arrays as coordinates.""" + da = xr.DataArray(value, dims=("y", "x"), coords={"y": grid.ys, "x": grid.xs}) + da = da.assign_coords( + coverage_purity=(("y", "x"), coverage), + dominance_purity=(("y", "x"), dominance), + ) + return da + + +def _continuous_reduce( + fine: np.ndarray, + nodata: float | None, + grid: GridSpec, + stat: str, +) -> tuple[np.ndarray, np.ndarray]: + """ + Reduce a fine continuous array into the target grid by real windows. + + Used by operators that need a per-cell statistic not directly available + from a single rasterio resampling pass (notably ``std``). Returns the + statistic and the coverage purity (valid_pixels / total_pixels) per cell. + + Parameters + ---------- + fine : np.ndarray + Fine-resolution (rows, cols) value array, origin-snapped to the grid. + nodata : float | None + Sentinel marking invalid fine pixels. + grid : GridSpec + Target grid. + stat : {"std", "mean", "sum", "min", "max"} + Statistic to compute per target cell over valid pixels. + + Returns + ------- + value : np.ndarray + Statistic per cell, shape (grid.rows, grid.cols); NaN where no valid + pixels. + coverage_purity : np.ndarray + valid_pixels / total_pixels per cell. + """ + fr, fc = fine.shape + tr, tc = grid.rows, grid.cols + + value = np.full((tr, tc), np.nan, dtype=np.float64) + coverage = np.zeros((tr, tc), dtype=np.float64) + + row_windows = _windows(fr, tr) + col_windows = _windows(fc, tc) + + for ti, (r0, r1) in enumerate(row_windows): + for tj, (c0, c1) in enumerate(col_windows): + block = fine[r0:r1, c0:c1].astype(np.float64) + total = block.size + if total == 0: + continue + if nodata is not None and not np.isnan(nodata): + mask = (block != nodata) & ~np.isnan(block) + else: + mask = ~np.isnan(block) + n_valid = int(mask.sum()) + coverage[ti, tj] = n_valid / total + if n_valid == 0: + continue + vals = block[mask] + if stat == "std": + value[ti, tj] = float(np.std(vals)) + elif stat == "mean": + value[ti, tj] = float(np.mean(vals)) + elif stat == "sum": + value[ti, tj] = float(np.sum(vals)) + elif stat == "min": + value[ti, tj] = float(np.min(vals)) + elif stat == "max": + value[ti, tj] = float(np.max(vals)) + + return value, coverage + + # --------------------------------------------------------------------------- # Raster-only operators # --------------------------------------------------------------------------- @@ -74,13 +282,17 @@ def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: class StdOperator(Operator): name = "std" - # rasterio has no standard-deviation resampling; nearest is a placeholder. - # True std requires a zonal-statistics pass (future work). + # Standard deviation has no rasterio resampling equivalent; it requires a + # real per-cell window pass, so this operator uses fine alignment. _resampling = Resampling.nearest + needs_fine_alignment = True def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: if isinstance(data, xr.DataArray): - return _passthrough(data) + fine, nodata = _fine_array(data) + value, cov = _continuous_reduce(fine, nodata, grid, "std") + da = xr.DataArray(value, dims=("y", "x"), coords={"y": grid.ys, "x": grid.xs}) + return da.assign_coords(coverage_purity=(("y", "x"), cov)) raise TypeError(f"'std' requires a raster source, got {type(data).__name__}") @@ -110,11 +322,14 @@ def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: class MajorityOperator(Operator): name = "majority" - _resampling = Resampling.mode + _resampling = Resampling.nearest + needs_fine_alignment = True def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: if isinstance(data, xr.DataArray): - return _passthrough(data) + fine, nodata = _fine_array(data) + value, cov, dom = _categorical_reduce(fine, nodata, grid, "majority", None) + return _attach_purity(value, cov, dom, grid) if isinstance(data, gpd.GeoDataFrame): val = var.class_code if var.class_code is not None else 1 shapes = [(g, val) for g in data.geometry if g is not None] @@ -124,11 +339,14 @@ def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: class MinorityOperator(Operator): name = "minority" - _resampling = Resampling.mode + _resampling = Resampling.nearest + needs_fine_alignment = True def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: if isinstance(data, xr.DataArray): - return _passthrough(data) + fine, nodata = _fine_array(data) + value, cov, dom = _categorical_reduce(fine, nodata, grid, "minority", None) + return _attach_purity(value, cov, dom, grid) if isinstance(data, gpd.GeoDataFrame): val = var.class_code if var.class_code is not None else 1 shapes = [(g, val) for g in data.geometry if g is not None] @@ -139,12 +357,21 @@ def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: class PercentageOperator(Operator): name = "percentage" requires_class_code = True - _resampling = Resampling.mode + _resampling = Resampling.nearest + needs_fine_alignment = True def compute(self, data, var: Variable, grid: GridSpec) -> xr.DataArray: if isinstance(data, xr.DataArray): - return _passthrough(data) + if var.class_code is None: + raise ValueError("'percentage' requires class_code to be set.") + fine, nodata = _fine_array(data) + value, cov, dom = _categorical_reduce( + fine, nodata, grid, "percentage", var.class_code + ) + return _attach_purity(value, cov, dom, grid) if isinstance(data, gpd.GeoDataFrame): + if var.class_code is None: + raise ValueError("'percentage' requires class_code to be set.") shapes = [(g, var.class_code) for g in data.geometry if g is not None] return _wrap(_rasterize(shapes, grid), grid) raise TypeError(f"'percentage' got unexpected type {type(data).__name__}") diff --git a/disscube/pipeline/aggregator.py b/disscube/pipeline/aggregator.py index 26670d1..f37fad7 100644 --- a/disscube/pipeline/aggregator.py +++ b/disscube/pipeline/aggregator.py @@ -81,18 +81,30 @@ def execute(self, ctx: PipelineContext) -> PipelineContext: f"Available: {available}" ) - # For raster sources GridAligner returns a Dataset keyed by - # variable name; for vector sources it returns a GeoDataFrame. - if isinstance(source_data, xr.Dataset): + # For raster sources GridAligner returns a dict keyed by variable + # name (DataArrays may be at target-grid resolution for continuous + # operators, or at a fine resolution for categorical ones). For + # vector sources it returns a single GeoDataFrame. + if isinstance(source_data, dict): + var_data = source_data[var.name] + elif isinstance(source_data, xr.Dataset): var_data = source_data[var.name] else: var_data = source_data result: xr.DataArray = op_cls().compute(var_data, var, grid) + + # Purity metrics (coverage_purity / dominance_purity) produced by + # categorical operators are kept as COORDINATES on the variable's + # DataArray — not as separate data variables — so that the + # VariableWriter persists them inside the variable's own Zarr and + # does NOT register them as standalone DerivedVariables in the + # catalog. They travel with the variable and never pollute the + # product catalog. final_ds[var.name] = result if spatial_ref is not None: final_ds = final_ds.assign_coords({"spatial_ref": spatial_ref}) ctx.data = final_ds - return ctx + return ctx \ No newline at end of file diff --git a/disscube/pipeline/aligner.py b/disscube/pipeline/aligner.py index bc6c81d..874f905 100644 --- a/disscube/pipeline/aligner.py +++ b/disscube/pipeline/aligner.py @@ -25,6 +25,7 @@ import logging import rioxarray # noqa: F401 — registers the .rio accessor +import numpy as np import xarray as xr import geopandas as gpd from pyproj import CRS as ProjCRS @@ -76,7 +77,7 @@ def _align_raster( grid: GridSpec, variables: list[Variable], band_map: dict[str, int], - ) -> xr.Dataset: + ) -> dict[str, xr.DataArray]: """ Reproject and resample the source raster for each variable. @@ -97,7 +98,11 @@ def _align_raster( Optional ``{variable_name: 1-based band index}`` from the source. """ ds_src = rioxarray.open_rasterio(url) - result = xr.Dataset(coords={"y": grid.ys, "x": grid.xs}) + # Map of variable name -> aligned DataArray. A plain dict (not a + # Dataset) is used because fine-aligned categorical arrays have a + # different shape than the target grid; putting them in a Dataset + # keyed on grid coords would trigger coordinate realignment to NaN. + result: dict[str, xr.DataArray] = {} for i, var in enumerate(variables): # ── Band selection ───────────────────────────────────────── @@ -124,6 +129,21 @@ def _align_raster( # ── Per-operator resampling method ───────────────────────── op_cls = OPERATOR_REGISTRY.get(var.operator) + needs_fine = bool(getattr(op_cls, "needs_fine_alignment", False)) + + if needs_fine: + # Categorical operators must see sub-cell class composition. + # Reproject with NEAREST (never average a class code) onto a + # fine grid that shares the target grid origin, at a resolution + # that is an integer sub-multiple of the target cell size. + aligned = self._align_fine(band, grid) + result[var.name] = aligned + log.debug( + "fine-aligned '%s' via '%s' (nearest, fine shape=%s -> target=%s)", + var.name, var.operator, aligned.rio.shape, (grid.rows, grid.cols), + ) + continue + resampling: Resampling = ( op_cls.resampling() if op_cls else Resampling.nearest ) @@ -153,3 +173,75 @@ def _align_raster( ) return result + + # ------------------------------------------------------------------ + # Fine alignment for categorical operators + # ------------------------------------------------------------------ + + def _align_fine(self, band: xr.DataArray, grid: GridSpec) -> xr.DataArray: + """ + Reproject ``band`` onto a fine grid snapped to the target grid origin. + + The fine resolution is the largest integer sub-multiple of the target + cell size that is not coarser than the source resolution, so each + target cell maps onto a whole number of fine pixels along each axis. + Resampling is NEAREST to preserve class codes. The source nodata is + carried on the result as ``_disscube_nodata`` for the operator. + + Parameters + ---------- + band : xr.DataArray + Single-band source (already band-selected). + grid : GridSpec + Target grid. + + Returns + ------- + xr.DataArray + Fine, origin-snapped array (dims "y","x"), with nodata recorded + in ``attrs["_disscube_nodata"]``. + """ + from affine import Affine + + # Estimate source resolution in target CRS units by reprojecting first + # to the target CRS at native resolution, then deriving a sub-multiple. + src = band.rio.reproject(grid.crs, resampling=Resampling.nearest) + try: + src_res = abs(float(src.rio.resolution()[0])) + except Exception: + src_res = grid.resolution + + target_res = grid.resolution + if src_res <= 0 or src_res >= target_res: + # Source no finer than target: one fine pixel per target cell. + factor = 1 + else: + # Largest integer factor whose fine res (target/factor) is >= src_res. + factor = max(1, int(np.floor(target_res / src_res))) + + fine_res = target_res / factor + fine_rows = grid.rows * factor + fine_cols = grid.cols * factor + + # Fine transform shares the target grid origin (north-up). + fine_transform = ( + Affine.translation(grid.bbox[0], grid.bbox[3]) + * Affine.scale(fine_res, -fine_res) + ) + + nodata = None + try: + nodata = band.rio.nodata + except Exception: + nodata = None + + aligned = band.rio.reproject( + grid.crs, + shape=(fine_rows, fine_cols), + transform=fine_transform, + resampling=Resampling.nearest, + ) + aligned = aligned.transpose("y", "x") if "band" not in aligned.dims else aligned.isel(band=0).transpose("y", "x") + if nodata is not None: + aligned.attrs["_disscube_nodata"] = nodata + return aligned diff --git a/docs/terrame_fill_correspondence.md b/docs/terrame_fill_correspondence.md new file mode 100644 index 0000000..00a7394 --- /dev/null +++ b/docs/terrame_fill_correspondence.md @@ -0,0 +1,81 @@ +# From TerraME *Fill Cells* to DisSCube operators + +## Lineage + +DisSCube's derivation layer is the conceptual successor of TerraME's +`fillCellularSpace` (the *Fill Cells* operation), reformulated as a +reproducible, catalogued data-cube layer. + +In TerraME/LuccME, populating a cellular space with attributes derived from +heterogeneous geographic layers is performed cell by cell, in Lua, through +fill *strategies* (`area`, `presence`, `count`, `distance`, `percentage`, +`majority`, `average`, etc.). DisSCube keeps the same semantic vocabulary but +expresses each strategy as a typed, auto-registered `Operator` that runs over a +catalogued data cube rather than over an in-memory cellular space. + +The advance is not the set of operations — those are deliberately faithful to +TerraME — but the engineering around them: + +- **Reproducibility.** Every derived product carries a deterministic + `spec_hash`; identical specifications always produce the same catalogued + product. +- **Grid-aligned aggregation.** Categorical and standard-deviation operators + aggregate from a fine array snapped to the target grid origin, using real + per-cell windows, so target resolutions that are not integer multiples of the + source (e.g. 30 m → 1000 m) are well defined rather than silently + approximated. +- **Explicit cell purity.** Coverage purity (valid / total pixels) and, for + categorical operators, dominance purity (dominant-class fraction) are computed + per cell. They are first-class metadata travelling with the variable, ready + for a future masking policy. Purity is implicit in TerraME; here it is named + and measurable — matching the parameter the INPE e-Cube design also elevates. +- **CRS robustness.** Aggregation is validated on real projected CRSs, + including the BDC Brazil Albers grid, with named-WKT serialization to avoid + `PROJCS["unknown"]` round-trip problems. + +## Strategy → operator correspondence + +| TerraME fill strategy | DisSCube operator (`name`) | Status | Notes | +|---|---|---|---| +| `presence` | `presence` | implemented | Binary mask: 1 where any feature is present. | +| `area` / `coverage` / `percentage` | `percentage` | implemented (window-based) | Fraction (0..1) of the target class per cell, over valid pixels. Requires `class_code`. | +| `majority` / `mode` | `majority` | implemented (window-based) | Dominant class per cell; ties resolve to the smallest class value. | +| `minority` | `minority` | implemented (window-based) | Least-frequent class per cell. | +| `count` | `count` | implemented | Count of features per cell (proximity operator). | +| `distance` | `min_distance` | implemented | Euclidean distance to the nearest feature (EDT × resolution). | +| `average` / `mean` | `mean` | implemented | Mean value per cell (continuous). | +| `sum` | `sum` | implemented | Sum per cell (continuous). | +| `minimum` | `min` | implemented | Minimum per cell. | +| `maximum` | `max` | implemented | Maximum per cell. | +| `stdev` / `standardDeviation` | `std` | implemented (window-based) | True per-cell standard deviation over valid pixels. | +| `attribute` (value copy) | `attribute` | implemented (vector) | Rasterize a numeric vector column whose name matches the variable. | + +## Aggregation path by operator type + +- **Continuous, resampling-expressible** (`mean`, `sum`, `min`, `max`): aligned + to the target grid directly via the corresponding rasterio resampling method. +- **Categorical** (`percentage`, `majority`, `minority`) and **`std`**: aligned + to a fine, origin-snapped grid with nearest resampling (never averaging a + class code), then reduced per target cell over real windows. These operators + set `needs_fine_alignment = True`. +- **Vector** (`presence`, `attribute`, and the vector branch of the categorical + operators): reprojected and clipped to the grid bounding box, then rasterized. + +## Known gaps relative to TerraME + +- **Area-weighted vector aggregation.** For vector sources, fractional-coverage + strategies are currently rasterized rather than area-weighted; the + raster-fine path is the recommended route for fractional drivers. Area-weighted + vector aggregation remains future work. +- **In-memory, single-tile by design.** The fine-alignment path materializes a + fine array in memory; very large tiles at a high fine/target ratio are bounded + by available memory. Distributed/lazy execution is a roadmap item, not a + current capability. + +## Positioning statement + +> The filling of cellular spaces from heterogeneous geographic data — the core +> operation of TerraME's `fillCellularSpace` — is reformulated in DisSCube as a +> reproducible spatial-derivation layer: fill strategies become typed operators +> over a catalogued data cube, with aggregation on windows aligned to the target +> grid and explicit control of cell purity. diff --git a/tests/test_aligner.py b/tests/test_aligner.py index a05ef80..662d8b7 100644 --- a/tests/test_aligner.py +++ b/tests/test_aligner.py @@ -78,10 +78,14 @@ def test_per_variable_resampling_produces_correct_names(tmp_path): GridAligner().execute(ctx) ds = ctx.data - assert isinstance(ds, xr.Dataset) - assert "uso" in ds.data_vars - assert "alt" in ds.data_vars - assert ds["uso"].shape == (10, 10) + # GridAligner now returns a dict keyed by variable name. Continuous + # operators (mean) are aligned to the target grid; categorical ones + # (majority) are returned at a fine, origin-snapped resolution and + # reduced later by the operator, so only the continuous one is asserted + # to match the target shape here. + assert isinstance(ds, dict) + assert "uso" in ds + assert "alt" in ds assert ds["alt"].shape == (10, 10) diff --git a/tests/test_aligner_resampling.py b/tests/test_aligner_resampling.py new file mode 100644 index 0000000..5c23ca1 --- /dev/null +++ b/tests/test_aligner_resampling.py @@ -0,0 +1,270 @@ +""" +Complementary GridAligner tests. + +These extend ``tests/test_aligner.py`` (which covers band selection, per-variable +naming and the happy-path shape invariant) with the cases that matter most for +scientific correctness: + +* downsampling to a resolution that IS an integer multiple of the source; +* downsampling to a resolution that is NOT an integer multiple (e.g. 10 m -> 30 m + over a 100 m extent: the regression guard for the alignment concern raised in the + architecture audit, analogous to the real-world 30 m -> 1000 m case); +* numeric correctness of ``mean`` (Resampling.average) and ``majority`` + (Resampling.mode) on hand-crafted rasters; +* vector reprojection + clip to the grid bbox. + +All inputs are generated in-memory / in ``tmp_path``; no external files are used. +""" + +import numpy as np +import pytest +import rasterio +from rasterio.transform import from_bounds +import xarray as xr +import rioxarray # noqa: F401 — registers the .rio accessor + +from disscube.pipeline.aligner import GridAligner +from disscube.pipeline import PipelineContext +from disscube.models import GridSpec, SpatialSource, SpatialDerivation, Variable + +CRS = "EPSG:31982" # UTM 22S, metres — same as existing aligner tests + + +# --------------------------------------------------------------------------- # +# Helpers +# --------------------------------------------------------------------------- # + +def _grid(resolution, bbox=(0, 0, 100, 100), gid="G1"): + return GridSpec(id=gid, type="local", crs=CRS, resolution=resolution, bbox=list(bbox)) + + +def _raster_source(asset_url, **kw): + return SpatialSource(id="S1", name="S1", format="raster", asset_url=asset_url, crs=CRS, **kw) + + +def _vector_source(**kw): + return SpatialSource(id="V1", name="V1", format="vector", asset_url="mem://vec", crs=CRS, **kw) + + +def _ctx(grid, variables, source, data=None): + deriv = SpatialDerivation(source_id=source.id, grid_id=grid.id, role="test", variables=variables) + return PipelineContext(source=source, grid=grid, derivation=deriv, data=data) + + +def _write_raster(path, array, bbox=(0, 0, 100, 100), crs=CRS): + """Write a single-band float32 GeoTIFF from a (rows, cols) array.""" + rows, cols = array.shape + transform = from_bounds(*bbox, cols, rows) + with rasterio.open( + path, "w", driver="GTiff", height=rows, width=cols, count=1, + dtype="float32", crs=crs, transform=transform, + ) as dst: + dst.write(array.astype(np.float32), 1) + return str(path) + + +# --------------------------------------------------------------------------- # +# Resolution: integer-multiple downsampling +# --------------------------------------------------------------------------- # + +def test_downsample_multiple_shape(tmp_path): + """10 m source over a 100 m extent -> 20 m grid: clean 5x5 result.""" + src = np.arange(100, dtype=np.float32).reshape(10, 10) # 10 m, 10x10 + url = _write_raster(tmp_path / "fine.tif", src) + + grid = _grid(resolution=20) # 100/20 = 5 + ctx = _ctx(grid, [Variable(name="v", operator="mean")], _raster_source(url)) + + GridAligner().execute(ctx) + assert ctx.data["v"].shape == (grid.rows, grid.cols) == (5, 5) + + +def test_mean_value_is_block_average(tmp_path): + """mean -> Resampling.average must average the source block, not sample it. + + Source is a 4x4 grid of distinct values; aligning to a 2x2 grid (50 m over a + 100 m extent) must yield the mean of each 2x2 block. + """ + src = np.array( + [ + [1, 1, 2, 2], + [1, 1, 2, 2], + [3, 3, 4, 4], + [3, 3, 4, 4], + ], + dtype=np.float32, + ) # 25 m pixels over 0..100 + url = _write_raster(tmp_path / "blocks.tif", src) + + grid = _grid(resolution=50) # 2x2 result, each cell == one 2x2 block + ctx = _ctx(grid, [Variable(name="v", operator="mean")], _raster_source(url)) + + GridAligner().execute(ctx) + out = ctx.data["v"].values + assert out.shape == (2, 2) + # Each block is constant, so the average equals that constant. + np.testing.assert_allclose(np.sort(out.ravel()), [1, 2, 3, 4], rtol=0, atol=1e-5) + + +def test_majority_value_is_mode(tmp_path): + """majority must return the dominant class per target cell. + + After the operator refactor, categorical aggregation is performed by the + operator's compute() (fine align + window reduction), NOT by the aligner. + So this exercises the full aligner -> aggregator path. + """ + from disscube.pipeline.aggregator import Aggregator + + src = np.array( + [ + [7, 7, 5, 5], + [7, 9, 5, 5], + [2, 2, 8, 8], + [2, 2, 8, 6], + ], + dtype=np.float32, + ) + url = _write_raster(tmp_path / "cls.tif", src) + + grid = _grid(resolution=50) # 2x2 blocks + ctx = _ctx(grid, [Variable(name="uso", operator="majority")], _raster_source(url)) + + GridAligner().execute(ctx) + Aggregator().execute(ctx) + out = ctx.data["uso"].values + + assert out.shape == (2, 2) + assert out[0, 0] == 7 + assert out[0, 1] == 5 + assert out[1, 0] == 2 + assert out[1, 1] == 8 + + +# --------------------------------------------------------------------------- # +# Resolution: NON-multiple downsampling (the key regression guard) +# --------------------------------------------------------------------------- # + +def test_downsample_non_multiple_does_not_crash(tmp_path): + """10 m source, 30 m grid over a 100 m extent (100/30 is not integer). + + rows/cols round to 3 (round(100/30)=3). The aligner must produce exactly the + grid shape without raising, proving alignment is driven by the target + transform/shape rather than a naive reshape that would require divisibility. + """ + src = (np.arange(100, dtype=np.float32)).reshape(10, 10) + url = _write_raster(tmp_path / "fine.tif", src) + + grid = _grid(resolution=30) + assert (grid.rows, grid.cols) == (3, 3) # round(100/30) == 3 + + ctx = _ctx(grid, [Variable(name="v", operator="mean")], _raster_source(url)) + GridAligner().execute(ctx) + + out = ctx.data["v"] + assert out.shape == (grid.rows, grid.cols) == (3, 3) + # Values must be finite and within the source value range. + vals = out.values + assert np.all(np.isfinite(vals)) + assert vals.min() >= src.min() - 1e-6 + assert vals.max() <= src.max() + 1e-6 + + +def test_non_multiple_coords_match_gridspec(tmp_path): + """Output coordinates must equal GridSpec.xs / GridSpec.ys exactly.""" + src = np.ones((10, 10), dtype=np.float32) + url = _write_raster(tmp_path / "ones.tif", src) + + grid = _grid(resolution=30) + ctx = _ctx(grid, [Variable(name="v", operator="mean")], _raster_source(url)) + GridAligner().execute(ctx) + + out = ctx.data["v"] + np.testing.assert_allclose(out["x"].values, grid.xs) + np.testing.assert_allclose(out["y"].values, grid.ys) + + +# --------------------------------------------------------------------------- # +# Alignment invariant fires on a genuine mismatch +# --------------------------------------------------------------------------- # + +def test_alignment_invariant_raises_on_shape_mismatch(tmp_path, monkeypatch): + """If reprojection yields the wrong shape, the invariant must raise ValueError. + + We force the failure by monkeypatching the .rio.reproject result to a wrong + shape, isolating the invariant check itself. + """ + src = np.zeros((10, 10), dtype=np.float32) + url = _write_raster(tmp_path / "z.tif", src) + + grid = _grid(resolution=10) # expects 10x10 + ctx = _ctx(grid, [Variable(name="v", operator="mean")], _raster_source(url)) + + real_open = rioxarray.open_rasterio + + def _fake_open(u, *a, **k): + da = real_open(u, *a, **k) + + class _Reproj: + def __init__(self, inner): + self._inner = inner + + def reproject(self, *a, **k): + # Return a deliberately wrong shape (5x5) to trip the invariant. + bad = self._inner.isel(band=0)[:5, :5] + return bad + + # attach a fake .rio with a reproject that returns wrong shape + band0 = da.isel(band=0) if "band" in da.dims else da + + class _Band: + dims = band0.dims + sizes = band0.sizes + + @property + def rio(self): + return _Reproj(da) + + def isel(self, *a, **k): + return self + + return _Band() + + monkeypatch.setattr(rioxarray, "open_rasterio", _fake_open) + + with pytest.raises(ValueError, match="alignment produced shape"): + GridAligner().execute(ctx) + + +# --------------------------------------------------------------------------- # +# Vector path: reproject + clip to grid bbox +# --------------------------------------------------------------------------- # + +def test_vector_reproject_and_clip(tmp_path): + """A GeoDataFrame in a different CRS is reprojected and clipped to the bbox.""" + gpd = pytest.importorskip("geopandas") + from shapely.geometry import box as shp_box + + grid = _grid(resolution=10, bbox=(0, 0, 100, 100)) + + # One polygon fully inside the grid, one fully outside (after reprojection + # the outside one must be removed by the clip). + inside = shp_box(10, 10, 40, 40) + outside = shp_box(1000, 1000, 1100, 1100) + gdf = gpd.GeoDataFrame({"v": [1, 2]}, geometry=[inside, outside], crs=CRS) + + # Put it in WGS84 to force a reprojection branch. + gdf_wgs = gdf.to_crs("EPSG:4326") + + src = _vector_source() + ctx = _ctx(grid, [Variable(name="v", operator="attribute")], src, data=gdf_wgs) + + GridAligner().execute(ctx) + out = ctx.data + + assert isinstance(out, gpd.GeoDataFrame) + # Back in the grid CRS. + from pyproj import CRS as ProjCRS + assert ProjCRS.from_user_input(out.crs).equals(ProjCRS.from_user_input(CRS)) + # The outside polygon was clipped away; only the inside one survives. + assert len(out) >= 1 + assert out.total_bounds[0] >= -1 and out.total_bounds[2] <= 101 diff --git a/tests/test_percentage_purity.py b/tests/test_percentage_purity.py new file mode 100644 index 0000000..9418df4 --- /dev/null +++ b/tests/test_percentage_purity.py @@ -0,0 +1,196 @@ +""" +Tests for window-based categorical aggregation: percentage, majority, and the +coverage / dominance purity metrics. + +These exercise the full GridAligner -> Aggregator path, since categorical +aggregation is performed by the operator's compute() (fine align + window +reduction), not by the aligner. +""" + +import numpy as np +import pytest +import rasterio +from rasterio.transform import from_bounds +import rioxarray # noqa: F401 + +from disscube.pipeline.aligner import GridAligner +from disscube.pipeline.aggregator import Aggregator +from disscube.pipeline import PipelineContext +from disscube.models import GridSpec, SpatialSource, SpatialDerivation, Variable + +CRS = "EPSG:31982" + + +def _grid(resolution, bbox=(0, 0, 100, 100)): + return GridSpec(id="G1", type="local", crs=CRS, resolution=resolution, bbox=list(bbox)) + + +def _source(url, **kw): + return SpatialSource(id="S1", name="S1", format="raster", asset_url=url, crs=CRS, **kw) + + +def _ctx(grid, variables, url): + src = _source(url) + deriv = SpatialDerivation(source_id="S1", grid_id=grid.id, role="test", variables=variables) + return PipelineContext(source=src, grid=grid, derivation=deriv) + + +def _write(path, array, bbox=(0, 0, 100, 100), nodata=None): + rows, cols = array.shape + transform = from_bounds(*bbox, cols, rows) + with rasterio.open( + path, "w", driver="GTiff", height=rows, width=cols, count=1, + dtype="float32", crs=CRS, transform=transform, nodata=nodata, + ) as dst: + dst.write(array.astype(np.float32), 1) + return str(path) + + +def _run(grid, var, url): + ctx = _ctx(grid, [var], url) + GridAligner().execute(ctx) + Aggregator().execute(ctx) + return ctx.data + + +# --------------------------------------------------------------------------- # +# percentage +# --------------------------------------------------------------------------- # + +def test_percentage_known_mix(tmp_path): + """A single 2x2 target cell with 3 px class 1 and 1 px class 2.""" + src = np.array([[1, 1], [1, 2]], dtype=np.float32) # 50 m px over 0..100 + url = _write(tmp_path / "mix.tif", src) + grid = _grid(resolution=100) # one cell covering the whole extent + + out1 = _run(grid, Variable(name="c", operator="percentage", class_code=1), url) + np.testing.assert_allclose(out1["c"].values[0, 0], 0.75, atol=1e-6) + + out2 = _run(grid, Variable(name="c", operator="percentage", class_code=2), url) + np.testing.assert_allclose(out2["c"].values[0, 0], 0.25, atol=1e-6) + + +def test_percentage_sums_to_one(tmp_path): + """Percentages over all present classes sum to ~1.0 per cell.""" + src = np.array( + [[1, 1, 2, 3], [1, 1, 2, 3], [2, 2, 3, 3], [1, 2, 3, 3]], + dtype=np.float32, + ) + url = _write(tmp_path / "multi.tif", src) + grid = _grid(resolution=50) # 2x2 cells + + total = np.zeros((2, 2)) + for c in (1, 2, 3): + out = _run(grid, Variable(name="p", operator="percentage", class_code=c), url) + total += out["p"].values + np.testing.assert_allclose(total, np.ones((2, 2)), atol=1e-6) + + +def test_percentage_requires_class_code(tmp_path): + src = np.ones((4, 4), dtype=np.float32) + url = _write(tmp_path / "x.tif", src) + grid = _grid(resolution=50) + with pytest.raises(ValueError, match="class_code"): + _run(grid, Variable(name="p", operator="percentage"), url) + + +# --------------------------------------------------------------------------- # +# majority +# --------------------------------------------------------------------------- # + +def test_majority_dominant_class(tmp_path): + src = np.array( + [[7, 7, 5, 5], [7, 9, 5, 5], [2, 2, 8, 8], [2, 2, 8, 6]], + dtype=np.float32, + ) + url = _write(tmp_path / "cls.tif", src) + grid = _grid(resolution=50) + out = _run(grid, Variable(name="uso", operator="majority"), url) + v = out["uso"].values + assert (v[0, 0], v[0, 1], v[1, 0], v[1, 1]) == (7, 5, 2, 8) + + +def test_majority_tie_resolves_to_smallest(tmp_path): + """A 2x2 cell with two of class 3 and two of class 9 -> smallest (3).""" + src = np.array([[3, 3], [9, 9]], dtype=np.float32) + url = _write(tmp_path / "tie.tif", src) + grid = _grid(resolution=100) + out = _run(grid, Variable(name="uso", operator="majority"), url) + assert out["uso"].values[0, 0] == 3 + + +# --------------------------------------------------------------------------- # +# non-multiple resolution +# --------------------------------------------------------------------------- # + +def test_percentage_non_multiple_resolution(tmp_path): + """10 m source -> 30 m grid over 100 m extent: no crash, values in [0,1].""" + rng = np.random.default_rng(0) + src = rng.integers(1, 4, size=(10, 10)).astype(np.float32) + url = _write(tmp_path / "rand.tif", src) + grid = _grid(resolution=30) + assert (grid.rows, grid.cols) == (3, 3) + + out = _run(grid, Variable(name="p", operator="percentage", class_code=2), url) + vals = out["p"].values + assert vals.shape == (3, 3) + finite = vals[np.isfinite(vals)] + assert np.all(finite >= 0.0) and np.all(finite <= 1.0) + + +# --------------------------------------------------------------------------- # +# purity metrics +# --------------------------------------------------------------------------- # + +def test_coverage_purity_with_nodata(tmp_path): + """A cell half-filled with nodata has coverage_purity ~0.5; full cell ~1.0.""" + # Left half class 1, right half nodata(-1). One 2x2 cell -> coverage 0.5. + src = np.array([[1, -1], [1, -1]], dtype=np.float32) + url = _write(tmp_path / "nd.tif", src, nodata=-1) + grid = _grid(resolution=100) + out = _run(grid, Variable(name="c", operator="percentage", class_code=1), url) + cov = out["c"]["coverage_purity"].values[0, 0] + np.testing.assert_allclose(cov, 0.5, atol=1e-6) + # percentage is computed over VALID pixels only -> class 1 is 100% of valid. + np.testing.assert_allclose(out["c"].values[0, 0], 1.0, atol=1e-6) + + +def test_coverage_purity_full_cell(tmp_path): + src = np.ones((4, 4), dtype=np.float32) + url = _write(tmp_path / "full.tif", src) + grid = _grid(resolution=50) + out = _run(grid, Variable(name="c", operator="percentage", class_code=1), url) + np.testing.assert_allclose(out["c"]["coverage_purity"].values, np.ones((2, 2)), atol=1e-6) + + +def test_dominance_purity_value(tmp_path): + """Cell with 3 px class 1, 1 px class 2 -> dominance_purity = 0.75.""" + src = np.array([[1, 1], [1, 2]], dtype=np.float32) + url = _write(tmp_path / "dom.tif", src) + grid = _grid(resolution=100) + out = _run(grid, Variable(name="uso", operator="majority"), url) + np.testing.assert_allclose(out["uso"]["dominance_purity"].values[0, 0], 0.75, atol=1e-6) + + +# --------------------------------------------------------------------------- # +# std (real window-based standard deviation) +# --------------------------------------------------------------------------- # + +def test_std_real_value(tmp_path): + """std computes the true per-cell standard deviation over a window.""" + src = np.array([[0, 2], [4, 6]], dtype=np.float32) + url = _write(tmp_path / "std.tif", src) + grid = _grid(resolution=100) # single cell over 0,2,4,6 + out = _run(grid, Variable(name="v", operator="std"), url) + np.testing.assert_allclose(out["v"].values[0, 0], np.std([0, 2, 4, 6]), atol=1e-5) + + +def test_std_coverage_purity(tmp_path): + """std exposes coverage_purity and respects nodata.""" + src = np.array([[1, -1], [3, 5]], dtype=np.float32) # one nodata pixel + url = _write(tmp_path / "stdnd.tif", src, nodata=-1) + grid = _grid(resolution=100) + out = _run(grid, Variable(name="v", operator="std"), url) + # std over valid {1,3,5} + np.testing.assert_allclose(out["v"].values[0, 0], np.std([1, 3, 5]), atol=1e-5) + np.testing.assert_allclose(out["v"]["coverage_purity"].values[0, 0], 0.75, atol=1e-6) From 47203bbfc7f579dee5e7f44f1af008acff4958b0 Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Mon, 15 Jun 2026 09:01:55 -0300 Subject: [PATCH 05/13] Apply audit fixes 1-5: hygiene, deps, temporal path, logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Untrack catalog.db and examples/drivers/catalog.db (runtime artifacts; .gitignore rule already existed — files were previously force-tracked) - pyproject.toml: declare dissmodel>=0.6.0,<0.7.0; add fiona as [bdc] optional extra; move s3fs/python-multipart to optional extras; remove rasterstats and typer which have zero references in library code - cube_client.py: defer `from dissmodel...RasterBackend` import to inside to_lucc_data() so CubeClient is usable without dissmodel installed; fix load() to always return 3D (time,y,x) for temporal variables even when only one slice exists (was silently returning 2D, causing to_lucc_data to treat single-year temporal vars as static); replace print() warning with log.warning(); add CONTRACT notes documenting the three open temporal-contract decisions - writer.py: fix valid_from ISO-date silent drop — int("2020-01-01") was swallowing the year and producing times=[], now extracts year component from any "YYYY" or "YYYY-MM-DD" string - bdc_importer.py, grids.py: replace all print() calls with logging.getLogger(__name__) so output is observable in structured logs - tests/test_temporal.py: six new regression tests covering the two defects fixed above (ISO date extraction, load() shape consistency) plus temporal sort order and the period-filter logging path Co-Authored-By: Claude Sonnet 4.6 --- catalog.db | Bin 110592 -> 0 bytes disscube/client/cube_client.py | 46 ++++--- disscube/pipeline/writer.py | 7 +- disscube/utils/bdc_importer.py | 8 +- disscube/utils/grids.py | 21 ++-- examples/drivers/catalog.db | Bin 49152 -> 0 bytes pyproject.toml | 18 ++- tests/test_temporal.py | 212 +++++++++++++++++++++++++++++++++ 8 files changed, 277 insertions(+), 35 deletions(-) delete mode 100644 catalog.db delete mode 100644 examples/drivers/catalog.db create mode 100644 tests/test_temporal.py diff --git a/catalog.db b/catalog.db deleted file mode 100644 index 1491037d708facd9725a49f576350afa76e6fd5c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 110592 zcmeI54~!hwdD!pFBYC%bdn=M6Pb5Wgb+#hXcjV2R|3indNQ!1ClaxeK)+w^QdGn84 zbMN-FyOc#Mf^#Pmwh$CW0T)FPw?*O_L0bn!(m!ch7-?YybrS?>kQn&`Bt;zr&fgko z5H~?kwBMVVxAW%B&T>X`vPa2VTf^D!&AfZ>H{ZVZeeZqq-i_z4bcSBb@AcR5u%%2r zG&Mav^;D}hH8u5R`0ptEM?ZVvWQKl$f2W!AtkW+~oqp}@M)R+x>b3th)jZq$m4(-v z3yqb9%k}>~|JU^|)}IEk_y_`mfFK|U2m*qDARq_`0`C%moAo`5r;bkdIvcL{ddGde z?Rx#rYo6O4cDi14K7Zl*#d9|=wk}_L`r=xPzo_->wN`ZD335}rS1;XXZ*SVsjmAdQO`NPu`r%DDAC&$Bpu>W#&sRV)G^6Nolud_U>Igeti0^7l+vH zdVQ~phn?QWV01DYWqI@5`70M&BXR3Qtu^K|=-ur*UOURYb@Srd&6Tki4srjMH_UON zi-$PKr*-Ywo2_dvT)8qn!{@GFzIyKZORbMye5rLJS&Wrd@~@ptmFOgz-LrV{`1Dlp z&<3w|!3%2RyTed5+M`;x(dcw;`^fISi${-6f3_Zwi=IIA&z=OeNSMJ49t`2sVrnqH ziVvnb(w0f&Lp_~vMZeeeCM2hyNSf!VJ$n|<9nIny@o1yyWbf92T>;CjHv^X8;~PZ( z%p_Pw!m=!LRbpmRC8C0~QFO9t0X^^0WrQ;eCk;1fuUo@lU>Q&*Is| ztY;7uqm6=pc0mI>OB&fmdbxi3!DerfKD^}lNT%Y|>eOHHrHRuB*b1OY)n5D)|e z0YTt_1h$vwW)6S$*>e|G(Vg`-mOAdznWgZ{(#q2C-lhlN`<@Rj+=kv zeex{!aBCSnyS8-p6q4bzw?R%dHI$;Wr-%^JWjKS=*CADtf=_Gt9(+OJ7pb-Eb-SB` zvzM;lFyYJHjm~gzcD=Q{(F5n%AHbjOdxKv0F7Y9k&XBuKODjvZ-FtoM%*(Qg%uoUe z$O=ytT~(-1Lq>4=%F2@K4V->wGt39-*&1}#?}ol?indrRCFWvJ|1z3`AYhWvDku5G0tOUT~@-$s7qaRUZp6QROV{ z%fJc0W#@sv=W-WppO^>zzI^`rYOLSkm&!hwNO4E4ff4ENia}C>Cu5K&6g@7JGz;U@ z`&uwUg}bdQFI>0~HwIekPDWF3q7k8r8%{t4Un+|Vbd}X&d52MSJ&?iK_D0YN|z5CjAPK|l}?1Ox#=KoAfF1c4nUP}_BQIvfz5i%wx^ zX!g)9G@TgB3dZ&7t%>VZMf1Uq#^Iw|( z2lJnqU!Q+@{_?yr|HS-(`RV$%>%Uw7<@(Rpf2#fu>u=Rxt+(q}>mRD0tS{Dg*Z!jR zKWo2Q`)ci9)&6nqbG5rQr}lj9!?jbjqqVuY|1`%=8{nhluC<0Z1oOOV4$B%6~+7Um`95AzbE%}ex8@DlBn zgv5lJew>%+FY^-Zhj@wl5-(BKc!~TXFF_yUCDIF=M6z!367vQxF|P9x{dr!ZJ;zJb zXL*ToEg><0##LUTU*RR%M|p|*3@=eG^Ah<7c?r72OQer*63M#AOU$QviE)9K=;wKf zc8-^*Pw^7v!wHEFG=6}W=x2F}_Wis>{SYrv&hQf1;w8xBC6d8OBunQdrp8MQm6zxW zFVSRPqN0REgStz+L|^44+G$>*p5i6S3NMkDc?tR;FOg1i63IHjON=LZiT-`OM0ouuFG+jyhxcpF1_|Ec!yu2a)+(kFjp!|mMi zHt=BXpD_GNziyg-`J%n^$v5~5pt|_~gq}!73t0vNA56PMj|C6z#xVo>|*iwAkYr`sp zz2lc>whorh3U`N(?MzO02fbt0XSSZ4mSj!Ao~>FCU!Y@6c9G%QmSXrmh8in|qRGhg zZPWEM(~@1;h1qA<#kS^~SoJ--?REykw$pJjER^_Yl%-)go~~)0t1Fu7NSbGwmTn=- zS9I)oo}w$3W~tcM4a3q5({U_C*Hy`N6|5RKk>$}#Gh6$neceWIg@G+w)jTX4hUR#h zVPMHIR8JxFVNz3(9mTV;V<2o9Ivx@LKv zsu-@OS{l-kEZe$n`Kqp|lI>Wguc&aT34h48b=7y|uxJC=lVLC)JP6w`b!0l&@DWx? z6_9RvitW0(447O`b~MX@l9@V2j$<0Wq$67g7zXx1X`D7Zh+xy$1HP%biY&wP^bDYa zY*n%hPeZl=kId4sqT7mMnwsfohHlCZC@?62<73Zv%&?MVQumSIc{<3zx~FNnBiji3 zShj3OLz)AN<@t^)!K1Wv9l0bcU4}I@eFY>kJOX5?iskEx=eZbUX}g*&`*4Q~mt&}% z>-eg#sjlrJ**9GSNLWabp};P(eeA0wf6Im3v2VzJBFjV1&ul$0ExU>%Bfw?)$grS3 zswY`8axConrtiz1sVSPR__AzZWV@j8w&eJ(Y$=kXsix81gat!k#ZnBLZXEpR%+}#) zS@C?V8&Ek-GvJ305IjwBVP!y5H!*U3$jQ#H-^eO<*4Xn~vj zwH^t7tr)JdfU`>ShK9>9vEdSJw4Dn9A5HD6ADpUxYyMRIC+aWP4>pI5A6xj+!YlJX-?%vc{}vV-zt%iGKWP5P z_tGO4WfcSj0YN|z5CjAPK|l}?1PTziIJ4C#y@2mRuX)NRW$Eayfv*K`v!~Mh9xmKZAw;e`>1vt1#>Ti_QNG zdjR}l^Y>x*yx(sAM)Nz(KX3l0=D%902)tu0M|9ZtpPJxJg_uGO{8dk1&~wds06*}?)lJ=hv_V@Usith$ctS-R$81(q*`g;_Byc&jLy?)ioW zD-S!G6xL&D@3l+3QpDV&_(%hmJ3@GJ++zgVKX6pRo}ZD(IX;Q-d)x{B92qbrljP z8yY360{JHljq=$IJCh6zU9)`|X}%;&iU*6jtELA_=RVy>oDzHF}4l#P_WY<4?r zQ_C`=?056n)MYEcsrn9NH(AvY?9|~apu@0R0MS3yN0O^MrVsYA@4@CChV5EN_Z%B` zE3qW7r(M`T!&eLhKLOb9;+?5_*=jL1HCg|E*WRC!kD7$s2-Y^Gdj1Wit!+bOQ4z^1OmAp5SZj zbcKZmQ8cPM$)39w~ z*EV3VS5;-rhRwcgTS30!dWL67;F4Lqn{|3MF`^sdrK7lIE3+CrKk8RqI8fW#M>Kf( z^Us%!U~RkM`>kd3>E3nPJ$vBw!y*32p(>ke+l^J2oW|-KmO55n?(ahaott2Ms9?ZA zks)#7fI-<(L;kEL??85wH5oPpRFSMWmTSPShl*x6u)&+7yOQI;rjR!5z-K9n<;vKD zaRS(T4Z|L6uw$UB7_fgIRum{NJ7B+jC{#8%b;<88$j) zvrHI%Wiv~ZFK%|&%`YdmKq)g2wrypn_H&N8m#qerk?lfvAiK$3sD*sXwmld2q16oy zI!worRoGw@S+Z$k8CoR8fNe{`I>d$!+t9*>s)h~w%EGqF7KS}^i!?rED-({iM_U=w z-t_rLYFm$oUQMs;OK_nMk&SgjE4FNY2?Z~kSzI_yyGeNys$Imx$*I<53(u+b4rDi3 ztsM{c%XU=+8y)+grM{}bXgk8NMYF9T&z4lhHdIAJ7DTQ%hNBvqDZ{4JGVEFnJ4*X_ zvRWs7oU)Y()tWp-s?&HeZnw+6l9^~1!-AL1GNIOGGfR|@)9mP{UQD?mBunF6p9jyC zmaPV-;5(4rWCaJK(S(iAO%L{aS74WCWT=vFVc4DAN1g+loWpkAifn2ygQTmlH@gPA zjT?sJLes8;&qQ3o39e64!OK==Y`cvQ#X7#DyN!osFPmjT$IE8+F6j6>u}#WWgVXUH z$nF6;4wL$hsoO|#TpLRsIQ71#V^gv%2WwkTgto~WWnX7+Z}z=`x8dOSuoD)&Y>o*PFPj-#JkXQK>1WCe z!r1)}D*&P#W3V~LuF6(`Q|=we?g7dzL9`22CBt%U%kw1Puw)ph0_($7B@?T*DeH!> z_`0Ql-Hmh|*R?PYRTUea?>yJfo zMa$sN!w4tVGz8I5(5pi;sQZeIRMm801j&-XKl8ynK)UQ9S@Yq45J;l9y01Z0w&L4f z8+SoLZ{zklW@7cr=E{vmN2B3ob0uYs8NFPy z$q~!|K{&q!p-1osCK!lm=5(CAET>g?ywnd`KemJVJ~oc7Y-u_D+9`bx-%5!((eR7V z_sRXKiDhWqu&+WBNLoT!hA}D=`>p}uavHq1;=?qiWw@%QLkCL2NKFH}RhSTxWEp-0 z-!pxO$4m7cKN|Z)QprWP5xX%06kl7ODUjW|@rd%DW}@xrNHJVnvRy;Rp5;oGWstY0u!eM3!xGk2 z7{v80&$9IgPk935QBz*BGGl!re3aGka!bM`bcpJBxhy#yFPB-Od^@A#@5Jwy3HevChIsf+@Prtc4C8$+lq-D-$EF}OgWvjvI z_zq+@S;sZUvJ_1=U~U5D+hj{tAri|rRRy8|OcjQ(AOJu&ECnL7VA8^a2mw%WU3Y94 z!3B4Zf2nq2=t|kjjMH&4|3BTBpMw93k02lj2m*qDARq_`0)l`bAP5Koe`g5XfAx56 zYmuyUvHW(k?2JNt(CdzQ;lFIQ@6tpCDQxWYN8iL$#bn>#g!u+Tfrvy+)!|)#WVs5= z%E2lYz5_vsI-J|EtQL&gB3NF_a9th2l!fDau+|vXCFcRLTkqo39C^OX{Qtt#Q&Y{w z+Qx$3I642p`g3!)XYIXvcYkH(`S*Qi`o|%0!e@K!;LMZvkDh<}Li@(mwggLlAT63b zOUPp70&(lc)z(c|LZlU54{3g{zmA8ns#qTnhF%{e?8sm6Vw&wbt2eH$hPRv^cF0VR zbK7(74DPPeCEA=mtk$-)yxH%4@+|gnYq^VuZRzYOB*Q7J+Xkna8cNaGQ$z^qGMvHb z>yWBJtT=pH%lF_5ta%rFTFYLyyE!;}>G};5l7iXgv+J$pjb7XJd=j#2+r8JxB622- z1WD3qX$48LuENc5Dq9c+5(y#bLno&Y)<~wWX{m?!7FmN1q&8V1Zyj$goq6NUH@DXw z=FzHS(30h2qDhw&T9Q@}ty+1slwtxcME(Zo^P)Y#qgBSBC9^D9Xh~W@v`XdCQi=(* zni(o^UbOppwDK6Vr18r_OVSFWl`D^yQcR%Lp?4_KAnfDOqA_U6yBt|)Nm@a)sPbqj z#ROUi!wVYVyjuHE0xg17dU&)%mW4J-D}t8XQcbjm9ClOCjm2#()3?!9A}88i%^B-0 z(yQ5>-Z~z%Bg3XGL%>GFczj|E`zqc%AKcu!454lL@qvetN0ZEw@w%U7Vj=zPl)A!moj+X+a-{ZQT15xm3U-O({uiv_H`TC`EH^@)A zb;9ZO`<+`Iw|A0#=HZ1PNm-7hw$9)jSg4~+PT$(Ra|^B=Z$O66SvSbfJ^h@cXFmQp zZ@pND8a__m0%~t=bcfNS9esx!qqg}vS*&o-dhXhln;;xLw8_;N<+D0EYj5_t_u7L_ z{}vtuP2S-4=H!}brb)D_Xw814HuL0L&fPv3sci?=6^7DBwM)q28ood-ZCwa2ZJi)B zE%3zHOjpU@>jTn22>-6P!8#8Bf48c?U7MRZ_0~P8HMl74&W78$BXsqFE^g4f4laDIcx9l?RjhWiCw?)zCMWmu70-HS_yklg=I-14Id3_ z5Lx{ElfHM9mP5ry!y4q4iI4evHLRh)D_DV-0`?#^{Z3MMBtRu)??95Htec^mQqZhrLIk0w2$#JONOKz!T zTg#3XBX|imXkl4ZrsK+a8-$}g+SKSgupm*0V@Hglw1Q~0%A=(eS6H-ktRpYlBRtyF zh!Nn+95IU03Zkv(h|!oijs#k2HsnQnm`9s3z5rjQ@fD>NL|aYci)|1n#ROVvHsnS7 z2#+>pd;z{p<10!lh_;%>7mJosOrWJ^LteCtJld4;1^60HaRs=1%rJCIsGY~(r5Ln){Z_9^P`2S;vIJVc86B-B0lBU{8 zBFoaaD6NRbky|RMapPS56-CxU8y^M*=r25<#%-^CkVl(pCxJy~I~k=FL|e^vl0{1? zCeRjbCr|QdQ%)-2%XCtsw1Q|W>ZCFQRFq-@ZBZxn1cz3#cqbL2hpg-dAxJBRRj{m6-YaG96IoU}8Ko7`IC4uh+eyaNkIj8W z4{iJvuu}_~`x=kd7pn=+gLU#8&{r4>Y5QNxKrODQI_mihpBwN~cQrVJ;*muWaf zX$8?%)No?ZQi=(*)Cb6m7V&6Ph7;h+G@PQef@mvhI5B7`#RS@-o|Ke83uC9LwwB1U zy0Iv&2wHMWHM=optW|Y5(YBUB8#}3SD=~WcDX-R^;?btsTELgt)<$Us(N?srWzfb> zDucF|le)dO!lO+E1OdLxfS@R?AlhmM1hE5DlwzVmprI`Jv{v~48UFvdgvJ?QfAUT$ zk!AJnQCbm=BeztucW0cY*d|r=Xm$!AYE9+Sxb3y4c(kc@5?ExmlTlhhv=wb98MKt* zio&2RYLkALN1JLV0bgc28Ko6OThVrsK}#tn(CUgF7^-=-_6HJZ^)co?k!3XqQCbnS z)Z~_Gn)^&g9^0g`lUjHP`lCGBluZgOGHuc* zt;mm=+){b8u}#XLrPfrwAM^IwGd$XqO$sbBZPF;MAli!Bq>R>5imd-XTFN3X+RHrJ zluZiwGHuc*tsvTp+N2EH*d}GPmReJJ(f%NhHf56nzD%1mN-K!AqBbdmmQqY;Ew!fd zqP@hUP1&S?FViNC(h8!js7=bCr4$orsWp`s?MHaDDVr4VW!j`sT0yiGwMiMYlwtyH zF`IOI?IMRZ9efEx)tSMUQCcyyRSmvm&{B#l+ELhGUaftaN1F=11bmsnmr+_lv=t4$ zjM0uLCeRiQzP!MrO$7u2zRVedD6Jscip~%)Xeq@6S{gQ(S8LDnXvcVC*}<29wzxMY zRo)w8USo@mFGQzk^Pv^~e}@16VnX9!OZAjZN@Q8neNkGGA2Ye7noer0TvBY4YDV}X z>N|M<|6@GbluZgOGHuc*tsvTp+N2EH*d}GrQfn&Tk9m9T1s-k6CIx(%HffYr5N$&gmZfn~ zS`m#Sw^UQ(7^f-rq)6~|@SY#oQU#a$#vIzTCuL3Wq@uK9XshZ;F=#2pl^`xXL_6{U z@^(Q#%%e?tQh+bhlZw&`qOGVW6{8(dOrWJcKwh+M9&M@%0(_ZWP?S~>ZAH5v1}&wS zKudjqyl6kcqfL2IfG^XNiqZD5wzr%YFY@4`5D`!G4Fyl>Gs;oJla%)04y>a zgea{b+KM&^4B9yOl0i$u2J>p|5AkSI4FcfHY!ITff@mw+ATVetMb=3z{6f%6JlZis zE7`%9fR+rq6hvG3p_LKE1X>z4m{)7pc(h}NRsff$?0Zzq75D)|e0YN|z z5CjAPL0}w#?X^B@lfnQW;%(AEmN9BY(+b$6!7bGsT4DUr*d~n!1f^m&>Gs;IJX${Z zlCa1MzNBdd(N;A0l0i!;vcuSggD*EZw5i}rz?b_E^ RgEkJnWVDus4dz4p{{h^3d2Rpz diff --git a/disscube/client/cube_client.py b/disscube/client/cube_client.py index 7ed0e9f..bae7643 100644 --- a/disscube/client/cube_client.py +++ b/disscube/client/cube_client.py @@ -1,9 +1,13 @@ +from __future__ import annotations + +import logging from typing import Optional, List, Any import os import numpy as np import xarray as xr -from dissmodel.geo.raster.backend import RasterBackend from disscube.catalog.sqlite_store import SqliteCatalogStore + +log = logging.getLogger(__name__) from disscube.storage import AssetStore from disscube.models import GridSpec, SpatialSource, SpatialDerivation, DerivedVariable, SpatialRelation from disscube.pipeline import PipelineContext @@ -157,8 +161,6 @@ def load( da = xr.open_zarr(d.asset_url)[d.name] slices.append(da) time_coords.extend(d.times) - if len(slices) == 1: - return slices[0] return xr.concat(slices, dim=xr.DataArray(time_coords, dims="time")) # Static — original behaviour @@ -170,7 +172,7 @@ def to_lucc_data( variables: List[str], grid_id: Optional[str] = None, period: Optional[tuple[str, str]] = None, - ) -> RasterBackend: + ) -> "RasterBackend": """ Standard integration point for the DisSModel ecosystem. Returns a RasterBackend containing all requested variables. @@ -194,8 +196,30 @@ def to_lucc_data( Only time slices whose value falls within [start, end] are loaded. Static variables are unaffected. Example: ``period=("2000", "2014")`` + + Notes + ----- + CONTRACT decisions (open — to be resolved before 1.0): + + 1. Canonical temporal type: ``valid_from`` / ``valid_until`` accept + year strings ("2020") or ISO dates ("2020-01-01"); + ``DerivedVariable.times`` stores ``list[int]`` (years only). + Open: validate year-only format at model construction time so + callers cannot silently store wrong temporal metadata. + + 2. Missing-time behavior: a temporal variable whose slices are all + outside ``period`` is skipped with ``log.warning`` and absent from + the returned backend. The caller cannot distinguish "variable was + static (period ignored)" from "existed but outside the range". + Open: raise ``ValueError``, return a NaN slice, or keep skip. + + 3. Empty-period backend: when every requested variable is filtered + out by ``period``, ``RasterBackend`` is initialized but holds no + data arrays. The caller receives a valid-looking but empty backend. + Open: raise before returning when no variable was stored. """ - # Detect CRS from first successfully loaded variable + from dissmodel.geo.raster.backend import RasterBackend + detected_crs = None backend = None @@ -211,19 +235,14 @@ def to_lucc_data( except Exception: pass - # Initialise backend from first variable if backend is None: - if da.ndim == 2: - rows, cols = da.sizes["y"], da.sizes["x"] - else: - rows, cols = da.sizes["y"], da.sizes["x"] + rows, cols = da.sizes["y"], da.sizes["x"] backend = RasterBackend(shape=(rows, cols)) if da.ndim == 3 and "time" in da.dims: - # Temporal variable ─ optionally filter by period + # Temporal variable — optionally filter by period if period is not None: start, end = period - # Ensure comparison is done with consistent types (e.g. all ints if coordinates are ints) time_vals = da.coords["time"].values if len(time_vals) > 0 and isinstance(time_vals[0], (int, np.integer)): try: @@ -237,8 +256,7 @@ def to_lucc_data( da = da.isel(time=mask) if da.sizes.get("time", 0) == 0: - # No slices in requested period — skip with warning - print(f" [warn] {var_name}: no data in period {period}, skipped") + log.warning("%s: no data in period %s, skipped", var_name, period) continue time_coords = da.coords["time"].values diff --git a/disscube/pipeline/writer.py b/disscube/pipeline/writer.py index fa00d91..d29a732 100644 --- a/disscube/pipeline/writer.py +++ b/disscube/pipeline/writer.py @@ -64,10 +64,9 @@ def execute(self, ctx: PipelineContext) -> PipelineContext: times = [ctx.source.time] elif derivation.valid_from is not None: try: - # Try to extract year as integer - times = [int(derivation.valid_from)] - except ValueError: - # Fallback to list of 0 or other logic if non-integer dates are used + # Extract year component: accepts "2020" or "2020-01-01" + times = [int(derivation.valid_from.split("-")[0])] + except (ValueError, AttributeError): pass derived = DerivedVariable( diff --git a/disscube/utils/bdc_importer.py b/disscube/utils/bdc_importer.py index 5377933..38258df 100644 --- a/disscube/utils/bdc_importer.py +++ b/disscube/utils/bdc_importer.py @@ -1,9 +1,13 @@ +import logging + import fiona from shapely.geometry import shape from disscube.models import SpatialSource from disscube.client import CubeClient from .grids import BDC_CRS, register_simulation_grids +log = logging.getLogger(__name__) + # --------------------------------------------------------------------------- # BDC Specific Constants # --------------------------------------------------------------------------- @@ -37,7 +41,7 @@ def _register_tile_sources(cube: CubeClient, paths: dict[str, str]) -> None: ("LG", paths["lg_path"]), ] for label, path in level_paths: - print(f"\nImporting BDC_{label} tiles from {path} …") + log.info("Importing BDC_%s tiles from %s", label, path) count = 0 with fiona.open(path) as src: for rec in src: @@ -56,4 +60,4 @@ def _register_tile_sources(cube: CubeClient, paths: dict[str, str]) -> None: cube.register_spatial_source(source) count += 1 - print(f" [tiles] registered {count} BDC_{label} tiles") + log.info("[tiles] registered %d BDC_%s tiles", count, label) diff --git a/disscube/utils/grids.py b/disscube/utils/grids.py index f30ebc5..56d4f2e 100644 --- a/disscube/utils/grids.py +++ b/disscube/utils/grids.py @@ -1,8 +1,11 @@ +import logging import math from pyproj import Transformer from disscube.models import GridSpec from disscube.client import CubeClient +log = logging.getLogger(__name__) + # --------------------------------------------------------------------------- # Master Grid Constants (Brazil Data Cube Standard) # --------------------------------------------------------------------------- @@ -42,8 +45,8 @@ def register_simulation_grids(cube: CubeClient) -> None: description=description, ) cube.register_grid(grid) - print(f" [grid] registered {grid_id!r} ({resolution:.0f} m pixels, " - f"{grid.rows} rows × {grid.cols} cols)") + log.info("[grid] registered %r (%d m pixels, %d rows × %d cols)", + grid_id, int(resolution), grid.rows, grid.cols) def register_local_grid( @@ -108,12 +111,12 @@ def register_local_grid( lon_min, lat_min = back.transform(minx, miny) lon_max, lat_max = back.transform(maxx, maxy) - print( - f"\n[grid] registered {grid_id!r}\n" - f" bbox (BDC Albers): [{minx:.0f}, {miny:.0f}, {maxx:.0f}, {maxy:.0f}]\n" - f" bbox (geographic): lon [{lon_min:.2f}, {lon_max:.2f}] " - f"lat [{lat_min:.2f}, {lat_max:.2f}]\n" - f" size: {grid.rows} rows × {grid.cols} cols = " - f"{grid.rows * grid.cols:,} cells ({resolution:.0f} m pixels)" + log.info( + "[grid] registered %r bbox(BDC Albers)=[%.0f, %.0f, %.0f, %.0f]" + " bbox(geo)=lon[%.2f,%.2f] lat[%.2f,%.2f]" + " size=%d×%d cells (%.0f m pixels)", + grid_id, minx, miny, maxx, maxy, + lon_min, lon_max, lat_min, lat_max, + grid.rows, grid.cols, resolution, ) return grid diff --git a/examples/drivers/catalog.db b/examples/drivers/catalog.db deleted file mode 100644 index eb958b040a13a9ea7d5605f95ac9b24a55fad845..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 49152 zcmeI%F>ljA6u@!2P2)6fno10o$j}=TsYQyID-o!NijYDv&`y@AO=~G_z;>%JFr={X z9r#Lo6$U2Q@VVD0A#tdLV6gs{VxKS9*L%Nrigdc?&$?j}h&O}i%uB?5Z9~&_?ST-Q zrWxsHE&beF^1XEPl0Nl`c3#ua?p=Pa)qiMa^_y1TsefG`*4JxW>wD(+wV&n}b2shg zLI42-5I_I{1Q0;r9|<^S+1hUCgYeuBhM_-n{U8c21K&-;exTZGj}GjPV~f4su6-ze zA0+mBLJe%nKXOCA^|G2pjc1~sdhw|mZ01IlgC?WKZ-Zl3cD9_=oY|g4p|9drbK}ZE zlX2y{ob6S~Fs#i@{o|{|d)*JBpzkH&;5;5TDk`MYdDykZxLY)jYbR0Y(dJf)^ObIj7hNGCCo z!Tc2etU6vTi{!7@>4Fi_pdT#wdbT37p4;WJ)oJ8;Di61<8pi0(a%MUDkXc@rFIJDz zl%?wYTgzJ|QCgIQnuyz0jg?VNH!RE2NAI&FWVPb#wltMM*7ZN8-&!hLJ62u|vT3;O z?6I8wz*gjsY)#%au~L{J;1$CPM@eKmY**5I_I{1Q0*~0R-6pvjz}A009IL zKmY**5I_I{1P~~`0Q>*q*O&|uKmY**5I_I{1Q0*~0R#|W|IZpg009ILKmY**5I_I{ L1Q0-=_yWHG_d;*n diff --git a/pyproject.toml b/pyproject.toml index c2cfd33..4a9cec1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,21 +17,18 @@ dependencies = [ "zarr", "rasterio", "geopandas", - "rasterstats", "shapely", "scipy", "numpy", "pandas", "pyproj", "fsspec", - "s3fs", - "typer", - "fastapi", - "uvicorn", - "python-multipart", "toml", "rioxarray", "affine", + "fastapi", + "uvicorn", + "dissmodel>=0.6.0,<0.7.0", ] [project.optional-dependencies] @@ -40,6 +37,15 @@ dev = [ "pytest-cov", "ipykernel", ] +bdc = [ + "fiona", +] +s3 = [ + "s3fs", +] +api = [ + "python-multipart", +] [tool.setuptools.packages.find] where = ["."] diff --git a/tests/test_temporal.py b/tests/test_temporal.py new file mode 100644 index 0000000..c9a409a --- /dev/null +++ b/tests/test_temporal.py @@ -0,0 +1,212 @@ +""" +Tests for the temporal path: writer time extraction, load() shape consistency, +and to_lucc_data period-filter logging. +""" + +import logging + +import numpy as np +import pytest +import xarray as xr + +from disscube.catalog.sqlite_store import SqliteCatalogStore +from disscube.client.cube_client import CubeClient +from disscube.models import ( + DerivedVariable, + GridSpec, + SpatialDerivation, + SpatialSource, + Variable, +) +from disscube.pipeline.context import PipelineContext +from disscube.pipeline.writer import VariableWriter +from disscube.storage.local import AssetStore + + +# --------------------------------------------------------------------------- # +# Shared helpers +# --------------------------------------------------------------------------- # + +def _grid(): + return GridSpec(id="G1", type="local", crs="EPSG:31982", resolution=10, + bbox=[0, 0, 100, 100]) + + +def _source(**kw): + return SpatialSource(id="S1", name="S1", format="raster", + asset_url="test.tif", crs="EPSG:31982", **kw) + + +def _ones_da(grid): + return xr.DataArray( + np.ones((grid.rows, grid.cols), dtype=np.float32), + dims=("y", "x"), + coords={"y": grid.ys, "x": grid.xs}, + ) + + +def _write_zarr(da, path, name="v"): + da.to_dataset(name=name).to_zarr(path, mode="w") + return str(path) + + +def _register_dv(cube, *, zarr_path, name="v", times, spec_hash="h", uid=None): + uid = uid or f"{spec_hash}_{name}_{'_'.join(map(str, times))}" + dv = DerivedVariable( + id=uid, name=name, grid_id="G1", role="driver", + times=times, dtype="float32", derivation_id=spec_hash, + spec_hash=spec_hash, tile_id=None, content_hash=None, + asset_url=zarr_path, + ) + cube.catalog.save_derived(dv) + return dv + + +# --------------------------------------------------------------------------- # +# VariableWriter: valid_from → DerivedVariable.times +# --------------------------------------------------------------------------- # + +def test_writer_year_string_produces_times(tmp_path): + """valid_from="2005" must produce times=[2005].""" + catalog = SqliteCatalogStore(tmp_path / "cat.db") + store = AssetStore(str(tmp_path / "store")) + grid = _grid() + derivation = SpatialDerivation( + source_id="S1", grid_id="G1", role="driver", + variables=[Variable(name="v", operator="mean")], + valid_from="2005", + ) + ctx = PipelineContext(source=_source(), grid=grid, derivation=derivation, + data=xr.Dataset({"v": _ones_da(grid)})) + VariableWriter(store, catalog).execute(ctx) + + derived = catalog.search_derived_variables(grid_id="G1") + assert len(derived) == 1 + assert derived[0].times == [2005] + + +def test_writer_iso_date_extracts_year(tmp_path): + """valid_from="2020-01-01" must produce times=[2020], not times=[].""" + catalog = SqliteCatalogStore(tmp_path / "cat.db") + store = AssetStore(str(tmp_path / "store")) + grid = _grid() + derivation = SpatialDerivation( + source_id="S1", grid_id="G1", role="driver", + variables=[Variable(name="v", operator="mean")], + valid_from="2020-01-01", + valid_until="2020-12-31", + ) + ctx = PipelineContext(source=_source(), grid=grid, derivation=derivation, + data=xr.Dataset({"v": _ones_da(grid)})) + VariableWriter(store, catalog).execute(ctx) + + derived = catalog.search_derived_variables(grid_id="G1") + assert len(derived) == 1 + assert derived[0].times == [2020], ( + f"Expected [2020] for ISO date valid_from, got {derived[0].times}. " + "Likely the int() conversion dropped the temporal marker." + ) + + +def test_writer_static_variable_has_empty_times(tmp_path): + """A derivation with no valid_from must produce times=[].""" + catalog = SqliteCatalogStore(tmp_path / "cat.db") + store = AssetStore(str(tmp_path / "store")) + grid = _grid() + derivation = SpatialDerivation( + source_id="S1", grid_id="G1", role="driver", + variables=[Variable(name="v", operator="mean")], + ) + ctx = PipelineContext(source=_source(), grid=grid, derivation=derivation, + data=xr.Dataset({"v": _ones_da(grid)})) + VariableWriter(store, catalog).execute(ctx) + + derived = catalog.search_derived_variables(grid_id="G1") + assert derived[0].times == [] + + +# --------------------------------------------------------------------------- # +# CubeClient.load(): shape consistency for temporal variables +# --------------------------------------------------------------------------- # + +def test_load_single_temporal_slice_returns_3d(tmp_path): + """load() must return (time, y, x) even when only one temporal slice exists. + + Regression guard: previously returned 2D (y, x) for single-slice variables, + causing to_lucc_data to treat them as static. + """ + cube = CubeClient(str(tmp_path / "cat.db"), str(tmp_path / "store")) + grid = _grid() + cube.register_grid(grid) + + da = _ones_da(grid) + zarr_path = _write_zarr(da, tmp_path / "store" / "v_2020.zarr") + _register_dv(cube, zarr_path=zarr_path, times=[2020]) + + result = cube.load("v", grid_id="G1") + assert result.ndim == 3, ( + f"Expected 3D (time, y, x) for a single temporal slice, got {result.ndim}D. " + "to_lucc_data would silently treat this as a static variable." + ) + assert "time" in result.dims + assert list(result.coords["time"].values) == [2020] + + +def test_load_multiple_temporal_slices_are_sorted(tmp_path): + """load() stacks temporal slices in ascending time order.""" + cube = CubeClient(str(tmp_path / "cat.db"), str(tmp_path / "store")) + grid = _grid() + cube.register_grid(grid) + + for year, val in [(2010, 3.0), (2000, 1.0), (2005, 2.0)]: + da = xr.DataArray( + np.full((grid.rows, grid.cols), val, dtype=np.float32), + dims=("y", "x"), coords={"y": grid.ys, "x": grid.xs}, + ) + zarr_path = _write_zarr(da, tmp_path / "store" / f"v_{year}.zarr") + _register_dv(cube, zarr_path=zarr_path, times=[year], + spec_hash=f"h{year}", uid=f"h{year}_v") + + result = cube.load("v", grid_id="G1") + assert result.ndim == 3 + assert list(result.coords["time"].values) == [2000, 2005, 2010] + # Pixel values should match the time-sorted order + np.testing.assert_allclose(result.isel(time=0).values, 1.0) + np.testing.assert_allclose(result.isel(time=1).values, 2.0) + np.testing.assert_allclose(result.isel(time=2).values, 3.0) + + +# --------------------------------------------------------------------------- # +# to_lucc_data: period filter emits log.warning (not print) +# --------------------------------------------------------------------------- # + +def test_to_lucc_data_period_skip_logs_warning(tmp_path, caplog): + """A temporal variable outside the requested period logs a warning. + + Verifies the fix from print() to log.warning() so the message is + observable in structured logging rather than stdout only. + """ + pytest.importorskip("dissmodel") + + cube = CubeClient(str(tmp_path / "cat.db"), str(tmp_path / "store")) + grid = _grid() + cube.register_grid(grid) + + # Temporal variable at 2000 — outside the requested period 2010–2020 + da = _ones_da(grid) + t_path = _write_zarr(da, tmp_path / "store" / "v_2000.zarr") + _register_dv(cube, zarr_path=t_path, times=[2000], spec_hash="ht", uid="ht_v") + + # Static variable — always loaded regardless of period + s_path = _write_zarr(da, tmp_path / "store" / "s.zarr", name="s") + _register_dv(cube, zarr_path=s_path, name="s", times=[], spec_hash="hs", + uid="hs_s") + + with caplog.at_level(logging.WARNING, logger="disscube.client.cube_client"): + cube.to_lucc_data(["v", "s"], grid_id="G1", period=("2010", "2020")) + + warning_messages = [r.message for r in caplog.records if r.levelno == logging.WARNING] + assert any("v" in m for m in warning_messages), ( + "Expected a log.warning mentioning 'v' for the out-of-period variable. " + f"Got records: {warning_messages}" + ) From 312dde3b9b575a95a10d49f58549369586242fe6 Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Mon, 15 Jun 2026 09:08:28 -0300 Subject: [PATCH 06/13] Honest-scoping fixes for JOSS review (audit item 6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_bdc_master_grids.py: - Fix setUp to use tempfile.mkdtemp() instead of relative paths, eliminating the fragile in-tree db creation that caused intermittent failures - Update test_importer_creates_3_grids (split into two): now asserts BR/5km and BR/1km grids (what register_simulation_grids() actually creates) and 6 BDC tile sources (2 tiles × 3 levels), matching current code - Keep test_derive_uses_tile_id as-is (it passes and covers real behavior) - Split test_load_with_tile_id: keep the passing half (explicit tile_id filtering works) as test_load_with_explicit_tile_id_filters_correctly; mark the unimplemented disambiguation half as @unittest.expectedFailure with a clear docstring documenting the planned feature docs/architecture/tiling.md: - Fix mermaid diagram: BDC/100m (non-existent) → BR/5km (actual grid) - Add clarifying note distinguishing BDC tile sources from simulation grids - Add limitation callout on load() multi-tile ambiguity README.md: - Add "Limitações conhecidas" section documenting six intentional scope boundaries: in-memory single-tile design, rasterized vector aggregation, load() tile disambiguation, SpatialRelation not used in pipeline, purity_threshold reserved, no STAC integration Co-Authored-By: Claude Sonnet 4.6 --- README.md | 22 +++++ docs/architecture/tiling.md | 21 ++-- tests/test_bdc_master_grids.py | 170 ++++++++++++++++++++++----------- 3 files changed, 151 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index 001daa6..0fd93a8 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,28 @@ class WeightedMeanOperator(Operator): O operador é registrado automaticamente e aceito em `Derivation` / `SpatialDerivation` sem nenhuma outra mudança. +## Limitações conhecidas + +As limitações abaixo são decisões de escopo da versão atual, não bugs. Estão documentadas para que usuários e revisores entendam o que está implementado versus o que está planejado. + +**Processamento em memória, single-tile** +Cada chamada a `derive()` carrega o dado completo de um tile em memória. Não há processamento lazy (Dask) nem distribuído. Para grades de escala continental (ex: `BR/1km`), use o loop de tiles — cada tile é processado e salvo independentemente. + +**Agregação vetorial por rasterização (não área-ponderada)** +Operadores sobre fontes vetoriais (`majority`, `percentage`, `attribute`, `presence`, `minority`) convertem geometrias em raster antes de agregar pixels. A fração de cobertura de cada célula é estimada por contagem de pixels, não por cálculo de área de interseção. Para cobertura proporcional mais precisa, use uma fonte raster em resolução substancialmente maior que a célula-alvo. + +**Desambiguação de tiles em `load()`** +`CubeClient.load(name)` sem `tile_id` retorna silenciosamente o primeiro resultado quando múltiplos tiles da mesma variável existem na mesma grade. Erro explícito ou mosaico automático estão planejados. **Especifique sempre `tile_id` em workloads multi-tile.** + +**`SpatialRelation` não atua no pipeline** +O modelo `SpatialRelation` é persistido no catálogo e incluído no `spec_hash` (mudá-lo invalida o cache), mas nenhum estágio do pipeline usa as relações durante a derivação. A integração com estratégias hierárquicas de grades está reservada para versão futura. + +**`purity_threshold` reservado** +O campo `purity_threshold` em `Derivation` é incluído no `spec_hash`, mas não é aplicado à saída — a máscara por pureza não está implementada. Definir `purity_threshold` muda o cache key sem mudar o resultado. + +**Sem integração STAC** +Os campos `valid_from`/`valid_until` e `bbox` em `Derivation` seguem a convenção de nomenclatura STAC, mas nenhuma lógica de cliente, catálogo ou exportação STAC está implementada neste módulo. + ## Licença Parte do ecossistema DisSModel. Ver `LICENSE` para detalhes. diff --git a/docs/architecture/tiling.md b/docs/architecture/tiling.md index 3ae054f..857fae3 100644 --- a/docs/architecture/tiling.md +++ b/docs/architecture/tiling.md @@ -8,11 +8,11 @@ Uma **Master Grid** define a resolução e o CRS para todo o país. **Tiles** s ```mermaid graph TD - MG[Master Grid: BDC/100m] --> T1[Tile 001] + MG[Master Grid: BR/5km] --> T1[Tile 001] MG --> T2[Tile 002] MG --> TN[Tile N…] - T1 --> Z1[zarr: .../001/{hash}/var.zarr] - T2 --> Z2[zarr: .../002/{hash}/var.zarr] + T1 --> Z1[zarr: .../BR_5km/001/{hash}/var.zarr] + T2 --> Z2[zarr: .../BR_5km/002/{hash}/var.zarr] ``` ## Como o CubeClient usa tiles @@ -57,19 +57,26 @@ for tile_id in tile_ids: cube.derive(derivation, tile_id=tile_id) ``` +> **Nota:** o `bdc_importer` registra tiles BDC como `SpatialSource` com IDs no formato +> `BDC_SM_`. A grade de simulação permanece `BR/5km` ou `BR/1km` — os tiles BDC +> definem apenas o bbox do recorte a processar. + Cada iteração é independente. Workers paralelos podem processar tiles diferentes sem conflitos (caminhos Zarr únicos por tile + spec_hash). ## Carregar dados tileados ```python -# Carga de um tile específico +# Carga de um tile específico (tile_id sempre funciona) da = cube.load("dist_road", tile_id="009002") -# Carga por grade (mosaico de todos os tiles com mesmo nome e spec_hash) -da = cube.load("dist_road", grid_id="BDC_SM") +# Carga por grade — funciona quando há apenas um tile +da = cube.load("dist_road", grid_id="BR/5km") ``` -`CubeClient.load()` detecta tiles automaticamente ao buscar por `variable_name` + `grid_id`. Se houver múltiplos tiles, empilha as fatias. Para variáveis temporais com múltiplos tiles, ordena por tempo. +> **Limitação atual:** `load()` sem `tile_id` retorna silenciosamente o primeiro resultado +> quando múltiplos tiles da mesma variável existem na mesma grade. A desambiguação +> automática (mosaico ou erro explícito) está planejada mas não implementada. +> Especifique sempre `tile_id` em workloads multi-tile. ## Vantagens diff --git a/tests/test_bdc_master_grids.py b/tests/test_bdc_master_grids.py index 26b9475..e9863f5 100644 --- a/tests/test_bdc_master_grids.py +++ b/tests/test_bdc_master_grids.py @@ -1,131 +1,191 @@ +""" +Tests for BDC grid import and tile-based loading. + +What is actually implemented (and tested here): + - import_bdc_grids() registers BR/5km and BR/1km simulation grids plus + BDC_SM/MD/LG tile sources in the catalog. + - load(name, tile_id=...) correctly filters to a specific tile. + +What is planned but not yet implemented (marked xfail): + - load(name) without tile_id raising when multiple tiles of the same + variable exist on the same grid (tile disambiguation). +""" + +import os +import shutil +import tempfile import unittest from unittest.mock import MagicMock, patch + from disscube.client import CubeClient from disscube.utils.bdc_importer import import_bdc_grids -import os -import json + class TestBDCMasterGrids(unittest.TestCase): def setUp(self): - self.catalog_path = "test_catalog.db" - self.store_path = "./test_store" - if os.path.exists(self.catalog_path): - os.remove(self.catalog_path) + self._tmpdir = tempfile.mkdtemp() + self.catalog_path = os.path.join(self._tmpdir, "catalog.db") + self.store_path = os.path.join(self._tmpdir, "store") self.cube = CubeClient(self.catalog_path, self.store_path) def tearDown(self): - if os.path.exists(self.catalog_path): - os.remove(self.catalog_path) + shutil.rmtree(self._tmpdir, ignore_errors=True) + + # ------------------------------------------------------------------ + # BDC importer + # ------------------------------------------------------------------ @patch('fiona.open') - def test_importer_creates_3_grids(self, mock_fiona): - # Mock fiona to return 2 tiles per size with valid geometry + def test_importer_registers_simulation_grids(self, mock_fiona): + """import_bdc_grids() creates the BR/5km and BR/1km simulation grids.""" mock_src = MagicMock() mock_src.__enter__.return_value = [ { - 'properties': {'tile': '001001'}, - 'geometry': {'type': 'Polygon', 'coordinates': [[[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]]]} + 'properties': {'tile': '001001'}, + 'geometry': {'type': 'Polygon', + 'coordinates': [[[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]]]} }, { - 'properties': {'tile': '001002'}, - 'geometry': {'type': 'Polygon', 'coordinates': [[[1, 1], [2, 1], [2, 2], [1, 2], [1, 1]]]} + 'properties': {'tile': '001002'}, + 'geometry': {'type': 'Polygon', + 'coordinates': [[[1, 1], [2, 1], [2, 2], [1, 2], [1, 1]]]} } ] mock_fiona.return_value = mock_src import_bdc_grids(self.cube, "sm.shp", "md.shp", "lg.shp") - # Verify 4 grids (SM, MD, LG, 100m) - grids = self.cube.catalog.list_grids() - grid_ids = [g.id for g in grids] - self.assertIn("BDC_SM", grid_ids) - self.assertIn("BDC_MD", grid_ids) - self.assertIn("BDC_LG", grid_ids) - self.assertIn("BDC_100m", grid_ids) - self.assertEqual(len(grids), 4) + grid_ids = [g.id for g in self.cube.catalog.list_grids()] + self.assertIn("BR/5km", grid_ids) + self.assertIn("BR/1km", grid_ids) + + @patch('fiona.open') + def test_importer_registers_bdc_tile_sources(self, mock_fiona): + """import_bdc_grids() registers 2 tiles × 3 BDC levels = 6 SpatialSources.""" + mock_src = MagicMock() + mock_src.__enter__.return_value = [ + { + 'properties': {'tile': '001001'}, + 'geometry': {'type': 'Polygon', + 'coordinates': [[[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]]]} + }, + { + 'properties': {'tile': '001002'}, + 'geometry': {'type': 'Polygon', + 'coordinates': [[[1, 1], [2, 1], [2, 2], [1, 2], [1, 1]]]} + } + ] + mock_fiona.return_value = mock_src + + import_bdc_grids(self.cube, "sm.shp", "md.shp", "lg.shp") - # Verify sources sources = self.cube.catalog.list_spatial_sources() - # 4 types * 2 tiles = 8 sources - self.assertEqual(len(sources), 8) source_ids = [s.id for s in sources] + # 2 tiles per level × 3 levels (SM, MD, LG) + self.assertEqual(len(sources), 6) self.assertIn("BDC_SM_001001", source_ids) + self.assertIn("BDC_SM_001002", source_ids) self.assertIn("BDC_LG_001002", source_ids) + # ------------------------------------------------------------------ + # VariableWriter: tile_id extraction from BDC_ source IDs + # ------------------------------------------------------------------ + @patch('disscube.pipeline.writer.VariableWriter._calculate_dir_hash') @patch('xarray.Dataset.to_zarr') @patch('fsspec.AbstractFileSystem.exists') def test_derive_uses_tile_id(self, mock_exists, mock_to_zarr, mock_hash): + """VariableWriter extracts tile_id from source IDs following BDC_LEVEL_TILE convention.""" mock_exists.return_value = False mock_hash.return_value = "fake_hash" - - # Setup master grid and source + from disscube.models import GridSpec, SpatialSource, SpatialDerivation, Variable - grid = GridSpec(id="BDC_SM", type="reference", crs="EPSG:31984", resolution=10, bbox=[0,0,100,100]) - source = SpatialSource(id="BDC_SM_001", name="Tile 001", format="raster", asset_url="tile.tif", crs="EPSG:31984") + grid = GridSpec(id="BDC_SM", type="reference", crs="EPSG:31984", + resolution=10, bbox=[0, 0, 100, 100]) + source = SpatialSource(id="BDC_SM_001", name="Tile 001", + format="raster", asset_url="tile.tif", crs="EPSG:31984") self.cube.register_grid(grid) self.cube.register_spatial_source(source) - derivation = SpatialDerivation( - source_id="BDC_SM_001", - grid_id="BDC_SM", - role="test", - variables=[Variable(name="var1", operator="identity")] - ) - from disscube.pipeline.writer import VariableWriter from disscube.pipeline.context import PipelineContext import xarray as xr import numpy as np - writer = VariableWriter(self.cube.store, self.cube.catalog) - + derivation = SpatialDerivation( + source_id="BDC_SM_001", grid_id="BDC_SM", role="test", + variables=[Variable(name="var1", operator="mean")], + ) ds = xr.Dataset({"var1": (("y", "x"), np.random.rand(10, 10))}) ctx = PipelineContext(source=source, grid=grid, derivation=derivation, data=ds) - - writer.execute(ctx) + VariableWriter(self.cube.store, self.cube.catalog).execute(ctx) - # Check catalog derived = self.cube.catalog.search_derived_variables(grid_id="BDC_SM") self.assertEqual(len(derived), 1) self.assertEqual(derived[0].tile_id, "001") self.assertIn("001", derived[0].asset_url) self.assertIn("001", derived[0].id) + # ------------------------------------------------------------------ + # load() with tile_id + # ------------------------------------------------------------------ + @patch('xarray.open_zarr') - def test_load_with_tile_id(self, mock_open_zarr): + def test_load_with_explicit_tile_id_filters_correctly(self, mock_open_zarr): + """load(name, tile_id='001') returns only the matching tile.""" from disscube.models import DerivedVariable import xarray as xr - - # Setup mock DataArray + mock_da = MagicMock(spec=xr.DataArray) mock_ds = MagicMock() mock_ds.__getitem__.return_value = mock_da mock_open_zarr.return_value = mock_ds - - # Setup 2 tiles for the same variable + d1 = DerivedVariable( id="hash_001_var1", name="var1", grid_id="BDC_SM", role="test", times=[], dtype="float32", derivation_id="hash", spec_hash="hash", - tile_id="001", asset_url="path/001.zarr" + tile_id="001", asset_url="path/001.zarr", ) d2 = DerivedVariable( id="hash_002_var1", name="var1", grid_id="BDC_SM", role="test", times=[], dtype="float32", derivation_id="hash", spec_hash="hash", - tile_id="002", asset_url="path/002.zarr" + tile_id="002", asset_url="path/002.zarr", ) self.cube.catalog.save_derived(d1) self.cube.catalog.save_derived(d2) - - # Test 1: Load without tile_id when multiple exist -> should fail - with self.assertRaises(ValueError) as cm: - self.cube.load("var1") - self.assertIn("Multiple tiles found", str(cm.exception)) - - # Test 2: Load with specific tile_id -> should succeed + res = self.cube.load("var1", tile_id="001") self.assertEqual(res, mock_da) mock_open_zarr.assert_called_with("path/001.zarr") + @unittest.expectedFailure + def test_load_without_tile_id_raises_for_ambiguous_tiles(self): + """ + PLANNED (not yet implemented): load(name) without tile_id should raise + ValueError when multiple tiles of the same variable exist on the same grid. + + Currently load() silently returns the first match. This test is marked + @expectedFailure to document the planned behavior as a regression guard — + when the feature is implemented, remove the decorator and the test will pass. + """ + from disscube.models import DerivedVariable + + d1 = DerivedVariable( + id="hash_001_var1", name="var1", grid_id="BDC_SM", role="test", + times=[], dtype="float32", derivation_id="hash", spec_hash="hash", + tile_id="001", asset_url="path/001.zarr", + ) + d2 = DerivedVariable( + id="hash_002_var1", name="var1", grid_id="BDC_SM", role="test", + times=[], dtype="float32", derivation_id="hash", spec_hash="hash", + tile_id="002", asset_url="path/002.zarr", + ) + self.cube.catalog.save_derived(d1) + self.cube.catalog.save_derived(d2) + + with self.assertRaises(ValueError): + self.cube.load("var1") + + if __name__ == '__main__': unittest.main() From f8a977b0f22f6af970fbe8bb7f81eaae50fa5ff1 Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Mon, 15 Jun 2026 09:13:49 -0300 Subject: [PATCH 07/13] Remove orphaned dissmodel-configs/ directory Single acre_5km.toml with no references from any Python code, test, example, or doc. Directory name collides with the sibling org repo, creating contributor confusion about where grid configs live. Co-Authored-By: Claude Sonnet 4.6 --- dissmodel-configs/grids/acre_5km.toml | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 dissmodel-configs/grids/acre_5km.toml diff --git a/dissmodel-configs/grids/acre_5km.toml b/dissmodel-configs/grids/acre_5km.toml deleted file mode 100644 index 7e62f3a..0000000 --- a/dissmodel-configs/grids/acre_5km.toml +++ /dev/null @@ -1,6 +0,0 @@ -id = "BR/5km" -type = "reference" -crs = "+proj=aea +lat_0=-12 +lon_0=-54 +lat_1=-2 +lat_2=-22 +x_0=5000000 +y_0=10000000 +ellps=GRS80 +units=m +no_defs" -resolution = 5000.0 -bbox = [ 2822000.0, 10049000.0, 3637000.0, 10474000.0,] -description = "Acre grid PERFECTLY ALIGNED with BDC pixel grid (2000m/4000m offset)" From 0349c9c225b8459cd8b956789165772d20fe2961 Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Mon, 15 Jun 2026 09:35:16 -0300 Subject: [PATCH 08/13] Add roundtrip tests, CI, and fix SpatialRelation spec_hash exclusion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Item 9 — roundtrip integration tests (tests/test_roundtrip.py): Exercises the full Normalizer → GridAligner → Aggregator → VariableWriter chain via CubeClient.derive() and reloads via CubeClient.load(). Verifies pixel values, coverage_purity and dominance_purity coordinates survive Zarr serialisation, content_hash is recorded, and the second derive() hits cache. Item 10 — GitHub Actions CI (.github/workflows/ci.yml): Single job, Python 3.12, pip install -e ".[dev,bdc]", pytest -q. Triggers on push to main/evolution and all pull requests. Item 11 — SpatialRelation excluded from spec_hash (correctness fix): No pipeline stage reads SpatialRelation during computation. Including relations in the hash made the cache key sensitive to documentation-level metadata, meaning the same source + grid + operator + time window could map to different hashes depending on which relations were registered — breaking the reproducibility guarantee. Relations are now explicitly excluded. The existing test that asserted the old (incorrect) behaviour is updated; new tests in test_spec_hash.py guard the corrected contract. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/ci.yml | 23 ++++ README.md | 2 +- disscube/models/variable.py | 14 +-- tests/test_grid_relations.py | 60 +++++------ tests/test_roundtrip.py | 196 +++++++++++++++++++++++++++++++++++ tests/test_spec_hash.py | 113 ++++++++++++++++++++ 6 files changed, 364 insertions(+), 44 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 tests/test_roundtrip.py create mode 100644 tests/test_spec_hash.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..9ae18c8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,23 @@ +name: CI + +on: + push: + branches: ["main", "evolution"] + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: pip install -e ".[dev,bdc]" + + - name: Run tests + run: pytest -q diff --git a/README.md b/README.md index 0fd93a8..3cb5342 100644 --- a/README.md +++ b/README.md @@ -198,7 +198,7 @@ Operadores sobre fontes vetoriais (`majority`, `percentage`, `attribute`, `prese `CubeClient.load(name)` sem `tile_id` retorna silenciosamente o primeiro resultado quando múltiplos tiles da mesma variável existem na mesma grade. Erro explícito ou mosaico automático estão planejados. **Especifique sempre `tile_id` em workloads multi-tile.** **`SpatialRelation` não atua no pipeline** -O modelo `SpatialRelation` é persistido no catálogo e incluído no `spec_hash` (mudá-lo invalida o cache), mas nenhum estágio do pipeline usa as relações durante a derivação. A integração com estratégias hierárquicas de grades está reservada para versão futura. +O modelo `SpatialRelation` é persistido no catálogo, mas nenhum estágio do pipeline usa as relações durante a derivação — e por isso elas são **excluídas do `spec_hash`**. Incluí-las tornaria a chave de cache sensível a metadados que não afetam o resultado, quebrando a garantia de reprodutibilidade. A integração com estratégias hierárquicas de grades está reservada para versão futura. **`purity_threshold` reservado** O campo `purity_threshold` em `Derivation` é incluído no `spec_hash`, mas não é aplicado à saída — a máscara por pureza não está implementada. Definir `purity_threshold` muda o cache key sem mudar o resultado. diff --git a/disscube/models/variable.py b/disscube/models/variable.py index 1ed3a97..67ec7d4 100644 --- a/disscube/models/variable.py +++ b/disscube/models/variable.py @@ -66,21 +66,15 @@ def spec_hash(self) -> str: v.model_dump() for v in sorted(self.variables, key=lambda x: x.name) ] - relations_data = [] - for r in sorted( - self.relations, - key=lambda x: (x.source_grid_id, x.target_grid_id, x.strategy), - ): - r_dict = r.model_dump() - r_dict.pop("metadata", None) - relations_data.append(r_dict) - + # relations are intentionally excluded from the hash: + # no pipeline stage reads SpatialRelation during computation, so + # including them would make the cache key sensitive to metadata that + # does not affect the output — violating the reproducibility guarantee. relevant_data = { "source_id": self.source_id, "grid_id": self.grid_id, "role": self.role, "variables": variables_data, - "relations": relations_data, "valid_from": self.valid_from, # None for static variables "valid_until": self.valid_until, # None for static variables } diff --git a/tests/test_grid_relations.py b/tests/test_grid_relations.py index 7d2b6f0..a9b8bb0 100644 --- a/tests/test_grid_relations.py +++ b/tests/test_grid_relations.py @@ -74,44 +74,38 @@ def test_multiple_relations_and_bidirectional(tmp_path): assert len(relations_bdc) == 1 assert relations_bdc[0].source_grid_id == "br_mangue_100m" -def test_spec_hash_invalidation_with_relations(): - v = [Variable(name="test", operator="identity")] - - # Derivação sem relações - deriv1 = SpatialDerivation( - source_id="src1", - grid_id="grid1", - role="test", - variables=v, - relations=[] +def test_spec_hash_stable_regardless_of_relations(): + """Relations are excluded from spec_hash: they are not used in computation. + + Two derivations identical in every computational aspect (source, grid, role, + variables, temporal window) must produce the same spec_hash even if their + SpatialRelation lists differ. This is the correct behaviour: including + relations would make the cache key sensitive to metadata that does not + affect the output. + """ + v = [Variable(name="test", operator="mean")] + + deriv_no_rel = SpatialDerivation( + source_id="src1", grid_id="grid1", role="test", + variables=v, relations=[], ) - hash1 = deriv1.spec_hash() - - # Derivação com uma relação - rel = SpatialRelation(source_grid_id="grid1", target_grid_id="grid2", strategy="simple") - deriv2 = SpatialDerivation( - source_id="src1", - grid_id="grid1", - role="test", + deriv_with_rel = SpatialDerivation( + source_id="src1", grid_id="grid1", role="test", variables=v, - relations=[rel] + relations=[SpatialRelation(source_grid_id="grid1", + target_grid_id="grid2", + strategy="simple")], ) - hash2 = deriv2.spec_hash() - - assert hash1 != hash2 - - # Mudar parâmetro da relação invalida o hash - rel_modified = SpatialRelation(source_grid_id="grid1", target_grid_id="grid2", strategy="keepinboth") - deriv3 = SpatialDerivation( - source_id="src1", - grid_id="grid1", - role="test", + deriv_other_rel = SpatialDerivation( + source_id="src1", grid_id="grid1", role="test", variables=v, - relations=[rel_modified] + relations=[SpatialRelation(source_grid_id="grid1", + target_grid_id="grid2", + strategy="keepinboth")], ) - hash3 = deriv3.spec_hash() - - assert hash2 != hash3 + + assert deriv_no_rel.spec_hash() == deriv_with_rel.spec_hash() + assert deriv_with_rel.spec_hash() == deriv_other_rel.spec_hash() if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/test_roundtrip.py b/tests/test_roundtrip.py new file mode 100644 index 0000000..35a2165 --- /dev/null +++ b/tests/test_roundtrip.py @@ -0,0 +1,196 @@ +""" +Full-pipeline roundtrip integration tests. + +Exercises the complete Normalizer → GridAligner → Aggregator → VariableWriter +chain via CubeClient.derive(), then reloads via CubeClient.load() and verifies +that pixel values, purity coordinates, and catalog metadata survive the Zarr +serialisation roundtrip. +""" + +import numpy as np +import pytest +import rasterio +from rasterio.transform import from_bounds + +from disscube.client import CubeClient +from disscube.models import GridSpec, SpatialSource, SpatialDerivation, Variable + +CRS = "EPSG:31982" + + +def _write_tif(path, array, bbox=(0, 0, 100, 100), nodata=None): + rows, cols = array.shape + transform = from_bounds(*bbox, cols, rows) + with rasterio.open( + path, "w", driver="GTiff", height=rows, width=cols, count=1, + dtype="float32", crs=CRS, transform=transform, nodata=nodata, + ) as dst: + dst.write(array.astype(np.float32), 1) + return str(path) + + +@pytest.fixture() +def cube(tmp_path): + """Fresh CubeClient with a single 1×1 cell grid (100 m, 100×100 bbox).""" + c = CubeClient(str(tmp_path / "catalog.db"), str(tmp_path / "store")) + c.register_grid(GridSpec(id="G", type="local", crs=CRS, + resolution=100, bbox=[0, 0, 100, 100])) + return c + + +def _derive(cube, tif_path, operator, class_code=None, var_name="v", + valid_from=None): + cube.register_spatial_source( + SpatialSource(id="S", name="S", format="raster", + asset_url=tif_path, crs=CRS) + ) + derivation = SpatialDerivation( + source_id="S", grid_id="G", role="driver", + variables=[Variable(name=var_name, operator=operator, + class_code=class_code)], + valid_from=valid_from, + ) + return cube.derive(derivation), derivation + + +# --------------------------------------------------------------------------- # +# Value correctness +# --------------------------------------------------------------------------- # + +def test_mean_values_survive_roundtrip(tmp_path, cube): + """derive(mean) → load() → pixel value equals arithmetic mean of source pixels.""" + src = np.array([[2, 4], [3, 5]], dtype=np.float32) + tif = _write_tif(tmp_path / "mean.tif", src) + + _derive(cube, tif, operator="mean") + + da = cube.load("v", grid_id="G") + assert da.shape == (1, 1) + np.testing.assert_allclose(da.values[0, 0], np.mean([2, 4, 3, 5]), atol=1e-4) + + +def test_percentage_value_survives_roundtrip(tmp_path, cube): + """derive(percentage) → load() → value matches expected class fraction.""" + # 2×2 source over 1×1 target cell: 3 px class 1, 1 px class 2 → 0.75 + src = np.array([[1, 1], [1, 2]], dtype=np.float32) + tif = _write_tif(tmp_path / "pct.tif", src) + + _derive(cube, tif, operator="percentage", class_code=1, var_name="pct") + + da = cube.load("pct", grid_id="G") + np.testing.assert_allclose(da.values[0, 0], 0.75, atol=1e-5) + + +# --------------------------------------------------------------------------- # +# Purity coordinates survive Zarr serialisation +# --------------------------------------------------------------------------- # + +def test_coverage_purity_survives_roundtrip(tmp_path, cube): + """coverage_purity coordinate is preserved after Zarr write + reload.""" + src = np.array([[1, 1], [1, 2]], dtype=np.float32) + tif = _write_tif(tmp_path / "pct.tif", src) + + _derive(cube, tif, operator="percentage", class_code=1, var_name="pct") + + da = cube.load("pct", grid_id="G") + assert "coverage_purity" in da.coords, ( + "coverage_purity coordinate was dropped during Zarr serialisation. " + "Check that VariableWriter saves the full DataArray with auxiliary coords." + ) + # All 4 source pixels are valid → coverage = 1.0 + np.testing.assert_allclose(da.coords["coverage_purity"].values[0, 0], 1.0, atol=1e-5) + + +def test_dominance_purity_survives_roundtrip(tmp_path, cube): + """dominance_purity coordinate is preserved after Zarr write + reload.""" + src = np.array([[1, 1], [1, 2]], dtype=np.float32) + tif = _write_tif(tmp_path / "pct.tif", src) + + _derive(cube, tif, operator="percentage", class_code=1, var_name="pct") + + da = cube.load("pct", grid_id="G") + assert "dominance_purity" in da.coords, ( + "dominance_purity coordinate was dropped during Zarr serialisation." + ) + # Class 1 dominates: 3/4 = 0.75 + np.testing.assert_allclose(da.coords["dominance_purity"].values[0, 0], 0.75, atol=1e-5) + + +def test_nodata_purity_survives_roundtrip(tmp_path, cube): + """coverage_purity < 1 for cells with nodata pixels survives roundtrip.""" + # Half-nodata: left column class 1, right column nodata(-1) + src = np.array([[1, -1], [1, -1]], dtype=np.float32) + tif = _write_tif(tmp_path / "nodata.tif", src, nodata=-1) + + _derive(cube, tif, operator="percentage", class_code=1, var_name="p") + + da = cube.load("p", grid_id="G") + # 2 valid / 4 total → 0.5 + np.testing.assert_allclose(da.coords["coverage_purity"].values[0, 0], 0.5, atol=1e-5) + # class 1 is 100 % of valid pixels + np.testing.assert_allclose(da.values[0, 0], 1.0, atol=1e-5) + + +# --------------------------------------------------------------------------- # +# Catalog metadata +# --------------------------------------------------------------------------- # + +def test_content_hash_is_recorded(tmp_path, cube): + """VariableWriter records a non-empty SHA-256 content_hash in the catalog.""" + src = np.ones((2, 2), dtype=np.float32) + tif = _write_tif(tmp_path / "ones.tif", src) + + derived_vars, _ = _derive(cube, tif, operator="mean") + + assert len(derived_vars) == 1 + dv = derived_vars[0] + assert dv.content_hash is not None + assert len(dv.content_hash) == 64 # SHA-256 hex digest is 64 chars + + +def test_derive_records_spec_hash(tmp_path, cube): + """Catalog entry carries the spec_hash from the derivation.""" + src = np.ones((2, 2), dtype=np.float32) + tif = _write_tif(tmp_path / "ones.tif", src) + + derived_vars, derivation = _derive(cube, tif, operator="mean") + + assert derived_vars[0].spec_hash == derivation.spec_hash() + + +# --------------------------------------------------------------------------- # +# Cache hit +# --------------------------------------------------------------------------- # + +def test_cache_hit_skips_recomputation(tmp_path, cube): + """Second derive() with the same spec returns cached result without re-writing Zarr.""" + src = np.ones((2, 2), dtype=np.float32) + tif = _write_tif(tmp_path / "cache.tif", src) + + derived_first, derivation = _derive(cube, tif, operator="mean") + first_hash = derived_first[0].content_hash + + derived_second = cube.derive(derivation) + + assert derived_second[0].spec_hash == derived_first[0].spec_hash + assert derived_second[0].content_hash == first_hash + + +# --------------------------------------------------------------------------- # +# Temporal variable +# --------------------------------------------------------------------------- # + +def test_temporal_variable_roundtrip(tmp_path, cube): + """Temporal derivation (valid_from='2020') → load() returns (time, y, x).""" + src = np.full((2, 2), 7.0, dtype=np.float32) + tif = _write_tif(tmp_path / "t2020.tif", src) + + _derive(cube, tif, operator="mean", var_name="v", valid_from="2020") + + da = cube.load("v", grid_id="G") + assert da.ndim == 3, ( + f"Expected 3D (time, y, x), got {da.ndim}D. " + "Temporal variable should always produce a time axis." + ) + assert "time" in da.dims + assert list(da.coords["time"].values) == [2020] diff --git a/tests/test_spec_hash.py b/tests/test_spec_hash.py new file mode 100644 index 0000000..39d95ac --- /dev/null +++ b/tests/test_spec_hash.py @@ -0,0 +1,113 @@ +""" +Tests for SpatialDerivation.spec_hash() determinism and correctness. + +The spec_hash is the cache key for all derived outputs; any change to it +invalidates stored artefacts. Tests here guard against accidental changes +to hash composition. +""" + +from disscube.models import SpatialDerivation, SpatialRelation, Variable + + +def _base(): + return SpatialDerivation( + source_id="S1", grid_id="G1", role="driver", + variables=[Variable(name="v", operator="mean")], + ) + + +# --------------------------------------------------------------------------- # +# Determinism +# --------------------------------------------------------------------------- # + +def test_spec_hash_is_deterministic(): + """Same derivation always produces the same hash across calls.""" + d = _base() + assert d.spec_hash() == d.spec_hash() + + +def test_spec_hash_is_deterministic_across_instances(): + """Two independently constructed equal derivations share a hash.""" + assert _base().spec_hash() == _base().spec_hash() + + +# --------------------------------------------------------------------------- # +# Sensitivity to meaningful fields +# --------------------------------------------------------------------------- # + +def test_different_source_ids_differ(): + d1 = _base() + d2 = _base().model_copy(update={"source_id": "S2"}) + assert d1.spec_hash() != d2.spec_hash() + + +def test_different_grid_ids_differ(): + d1 = _base() + d2 = _base().model_copy(update={"grid_id": "G2"}) + assert d1.spec_hash() != d2.spec_hash() + + +def test_different_roles_differ(): + d1 = _base() + d2 = _base().model_copy(update={"role": "constraint"}) + assert d1.spec_hash() != d2.spec_hash() + + +def test_different_operators_differ(): + d1 = SpatialDerivation(source_id="S", grid_id="G", role="r", + variables=[Variable(name="v", operator="mean")]) + d2 = SpatialDerivation(source_id="S", grid_id="G", role="r", + variables=[Variable(name="v", operator="sum")]) + assert d1.spec_hash() != d2.spec_hash() + + +def test_different_valid_from_differ(): + d1 = _base().model_copy(update={"valid_from": "2020"}) + d2 = _base().model_copy(update={"valid_from": "2021"}) + assert d1.spec_hash() != d2.spec_hash() + + +def test_static_vs_temporal_differ(): + static = _base() + temporal = _base().model_copy(update={"valid_from": "2020"}) + assert static.spec_hash() != temporal.spec_hash() + + +# --------------------------------------------------------------------------- # +# Insensitivity to SpatialRelation (correctness fix) +# --------------------------------------------------------------------------- # + +def test_relations_do_not_affect_spec_hash(): + """ + Adding or changing SpatialRelation entries must NOT change the spec_hash. + + Relations are persisted in the catalog but no pipeline stage reads them + during computation. Including them in the hash would make the cache key + sensitive to documentation-level metadata, breaking the reproducibility + guarantee: the same source + grid + operator + time window must always + hash to the same key regardless of which relations are recorded. + """ + without_relation = _base() + with_relation = _base().model_copy(update={ + "relations": [ + SpatialRelation(source_grid_id="G1", target_grid_id="G2", + strategy="simple") + ] + }) + assert without_relation.spec_hash() == with_relation.spec_hash(), ( + "SpatialRelation must not affect spec_hash: relations are not used " + "in the computation pipeline." + ) + + +def test_different_relations_produce_same_hash(): + """Two derivations that differ only in relations must hash identically.""" + r1 = _base().model_copy(update={ + "relations": [SpatialRelation(source_grid_id="A", target_grid_id="B", + strategy="simple")] + }) + r2 = _base().model_copy(update={ + "relations": [SpatialRelation(source_grid_id="X", target_grid_id="Y", + strategy="chooseone")] + }) + assert r1.spec_hash() == r2.spec_hash() From 3c2c655edd743baf5b85c4636d9f8961b32bab90 Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Mon, 15 Jun 2026 09:43:45 -0300 Subject: [PATCH 09/13] Reorganize scripts/ into examples/ and tools/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/ misturava três naturezas distintas: tutorial narrativo, setup operacional e utilitário de exportação. A pasta foi dissolvida: - scripts/05_ilha_maranhao_mapbiomas.py → examples/case_studies/ilha_maranhao/01_derive.py Movido para junto dos outros case studies (brmangue, lucc_acre). Adicionado __main__ guard; código de exemplo reorganizado em main(). - scripts/import_bdc_tiles.py → examples/setup/03_import_bdc_tiles.py Já era referenciado como passo 3 do setup no examples/README.md; agora reside onde o README o indicava. Adicionado __main__ guard. - scripts/zarr_to_tif.py → tools/zarr_to_tif.py Utilitário de exportação, não é um exemplo de uso da biblioteca. - scripts/list_grids.py, scripts/list_sources.py Deletados: conteúdo trivial (< 10 linhas), paths hard-coded sem utilidade fora de um diretório de projeto específico. examples/README.md atualizado: referência a scripts/import_bdc_tiles.py corrigida para examples/setup/03_import_bdc_tiles.py; nova entrada para o case study Ilha do Maranhão; menção a tools/zarr_to_tif.py. Co-Authored-By: Claude Sonnet 4.6 --- examples/README.md | 15 ++- .../case_studies/ilha_maranhao/01_derive.py | 77 +++++++++++++++ examples/setup/03_import_bdc_tiles.py | 33 +++++++ scripts/05_ilha_maranhao_mapbiomas.py | 98 ------------------- scripts/import_bdc_tiles.py | 25 ----- scripts/list_grids.py | 7 -- scripts/list_sources.py | 19 ---- {scripts => tools}/zarr_to_tif.py | 0 8 files changed, 120 insertions(+), 154 deletions(-) create mode 100644 examples/case_studies/ilha_maranhao/01_derive.py create mode 100644 examples/setup/03_import_bdc_tiles.py delete mode 100644 scripts/05_ilha_maranhao_mapbiomas.py delete mode 100644 scripts/import_bdc_tiles.py delete mode 100644 scripts/list_grids.py delete mode 100644 scripts/list_sources.py rename {scripts => tools}/zarr_to_tif.py (100%) diff --git a/examples/README.md b/examples/README.md index 73e35b1..cee8f4b 100644 --- a/examples/README.md +++ b/examples/README.md @@ -8,18 +8,20 @@ This folder contains a structured demonstration of the DissCube pipeline, from c Bootstrap the catalog and register baseline data. - `python examples/setup/01_init_catalog.py`: Registers national and local simulation grids. - `python examples/setup/02_register_sources.py`: Registers raw data files as SpatialSources. -- `python scripts/import_bdc_tiles.py`: Imports BDC tile definitions (one-time, slow). +- `python examples/setup/03_import_bdc_tiles.py`: Imports BDC tile definitions (one-time, slow). ### 2. National Drivers Derive variables on the national 5 km mesh. - `python examples/drivers/01_brazil_national.py`: Slope, TI presence, and distance to cities/rivers. -### 3. Case Study: BR-MANGUE (Maranhão) +### 3. Case Study: Ilha do Maranhão +- `python examples/case_studies/ilha_maranhao/01_derive.py`: Temporal MapBiomas (majority) + static distance-to-seats variable. + +### 4. Case Study: BR-MANGUE (Maranhão) - `python examples/case_studies/brmangue/01_derive.py`: Derives land use and environmental variables at 100 m. - `python examples/case_studies/brmangue/02_simulate.py`: Runs the BrmangueRasterExecutor. -- `python examples/case_studies/brmangue/03_temporal_mapbiomas.py`: Demonstrates temporal MapBiomas integration. -### 4. Case Study: LUCC/AC (Acre) +### 5. Case Study: LUCC/AC (Acre) - `python examples/drivers/02_acre_5km.py`: Derives Acre-specific drivers at 5 km. - `python examples/case_studies/lucc_acre/01_derive.py`: Derives land use attributes from vector data. - `python examples/case_studies/lucc_acre/02_simulate.py`: Runs the LUCCRasterExecutor. @@ -27,4 +29,7 @@ Derive variables on the national 5 km mesh. --- -**Note:** For one-time administrative operations like importing BDC tiles from scratch, see `scripts/import_bdc_tiles.py`. +**Utilitários:** `tools/zarr_to_tif.py` converte um arquivo Zarr derivado para GeoTIFF. +``` +python tools/zarr_to_tif.py data/derived/.../var.zarr output.tif +``` diff --git a/examples/case_studies/ilha_maranhao/01_derive.py b/examples/case_studies/ilha_maranhao/01_derive.py new file mode 100644 index 0000000..c350e69 --- /dev/null +++ b/examples/case_studies/ilha_maranhao/01_derive.py @@ -0,0 +1,77 @@ +""" +examples/case_studies/ilha_maranhao/01_derive.py + +Deriva variáveis para a Ilha do Maranhão usando MapBiomas: + - uso (majority, temporal 2010-2022) + - dist_sedes (min_distance, estática) + +Pré-requisitos: + - python examples/setup/01_init_catalog.py + - python examples/setup/02_register_sources.py + - Arquivos data/raw/ilha_maranhao_mapbiomas_{2010,2022}.tif presentes + +Usage: + python examples/case_studies/ilha_maranhao/01_derive.py +""" + +from disscube.client import CubeClient +from disscube.models import SpatialSource, SpatialDerivation +from disscube.models.variable import Variable +from disscube.utils.grids import register_local_grid + + +def main(): + # ── 1. Cliente ─────────────────────────────────────────────────────────── + cube = CubeClient(catalog="catalog.db", store="data/") + + # ── 2. Grade local ─────────────────────────────────────────────────────── + register_local_grid( + cube, + name="ilha_maranhao", + bbox_geo=(-44.42, -2.80, -44.02, -2.40), + resolution=100, + snap=True, + ) + + # ── 3. Derivação temporal (MapBiomas 2010, 2022) ───────────────────────── + for year in [2010, 2022]: + cube.register_spatial_source(SpatialSource( + id=f"mapbiomas_ilha_ma_{year}", + name=f"MapBiomas Ilha do Maranhão — {year}", + format="raster", + asset_url=f"data/raw/ilha_maranhao_mapbiomas_{year}.tif", + crs="EPSG:4326", + time=year, + )) + print(f"\n[pipeline] Processando ano {year}...") + cube.derive(SpatialDerivation( + source_id=f"mapbiomas_ilha_ma_{year}", + grid_id="ilha_maranhao/100m", + role="land_use", + variables=[Variable(name="uso", operator="majority")], + )) + + # ── 4. Variável estática ───────────────────────────────────────────────── + print("\n[pipeline] Processando distância a sedes...") + cube.derive(SpatialDerivation( + source_id="urban_centers", + grid_id="ilha_maranhao/100m", + role="driver", + variables=[Variable(name="dist_sedes", operator="min_distance")], + )) + + # ── 5. Verificação ─────────────────────────────────────────────────────── + # "uso" retorna (time, y, x) por ser temporal; "dist_sedes" retorna (y, x) + da_uso = cube.load("uso", grid_id="ilha_maranhao/100m") + print(f"\nuso: {da_uso.dims} anos={list(da_uso.coords['time'].values)}") + + da_sedes = cube.load("dist_sedes", grid_id="ilha_maranhao/100m") + print(f"dist_sedes:{da_sedes.dims} shape={da_sedes.shape}") + + # ── 6. Integração DisSModel ────────────────────────────────────────────── + backend = cube.to_lucc_data(["uso", "dist_sedes"], grid_id="ilha_maranhao/100m") + print(f"\nBackend pronto: {backend}") + + +if __name__ == "__main__": + main() diff --git a/examples/setup/03_import_bdc_tiles.py b/examples/setup/03_import_bdc_tiles.py new file mode 100644 index 0000000..01ed125 --- /dev/null +++ b/examples/setup/03_import_bdc_tiles.py @@ -0,0 +1,33 @@ +""" +examples/setup/03_import_bdc_tiles.py + +Importa os tiles BDC (SM / MD / LG) como SpatialSources no catálogo. +Operação one-time; pode demorar alguns minutos dependendo do tamanho dos +shapefiles. + +Pré-requisito: + - python examples/setup/01_init_catalog.py + +Usage: + python examples/setup/03_import_bdc_tiles.py +""" + +from disscube.client import CubeClient +from disscube.utils.bdc_importer import import_bdc_grids + + +def main(): + cube = CubeClient(catalog="catalog.db", store="./data/") + + print("=== Importando tiles BDC (one-time, pode ser lento) ===") + import_bdc_grids( + cube, + sm_path="zip://data/bdc_grids/BDC_SM_V2.zip", + md_path="zip://data/bdc_grids/BDC_MD_V2.zip", + lg_path="zip://data/bdc_grids/BDC_LG_V2.zip", + ) + print("=== Tiles BDC registrados ===") + + +if __name__ == "__main__": + main() diff --git a/scripts/05_ilha_maranhao_mapbiomas.py b/scripts/05_ilha_maranhao_mapbiomas.py deleted file mode 100644 index 518d3c8..0000000 --- a/scripts/05_ilha_maranhao_mapbiomas.py +++ /dev/null @@ -1,98 +0,0 @@ -from disscube.client import CubeClient -from disscube.models import SpatialSource, SpatialDerivation -from disscube.models.variable import Variable -from disscube.utils.grids import register_local_grid -import os - -# ── 1. Cliente ─────────────────────────────────────────────────────────────── -cube = CubeClient( - catalog="catalog.db", - store="data/", -) - -# ── 2. Grade local ─────────────────────────────────────────────────────────── -grid = register_local_grid( - cube, - name="ilha_maranhao", - bbox_geo=(-44.42, -2.80, -44.02, -2.40), - resolution=100, - snap=True, -) - -# ── 3. Processamento Temporal (Loop de Anos) ───────────────────────────────── -# Para MapBiomas, geralmente temos uma série temporal. -# Aqui exemplificamos como registrar e derivar múltiplos anos. - -years = [2010, 2022] -for year in years: - asset = f"data/raw/ilha_maranhao_mapbiomas_{year}.tif" - - # Registro da Fonte - source_id = f"mapbiomas_ilha_ma_{year}" - source = SpatialSource( - id=source_id, - name=f"MapBiomas Ilha do Maranhão — {year}", - format="raster", - asset_url=asset, - crs="EPSG:4326", - time=year, # Define este dado como temporal no cubo - ) - cube.register_spatial_source(source) - - # Registro da Derivação - # O spec_hash será diferente para cada ano pois o source_id muda - derivation = SpatialDerivation( - source_id=source_id, - grid_id="ilha_maranhao/100m", - role="land_use", - variables=[ - Variable(name="uso", operator="majority"), - ], - ) - - print(f"\n[pipeline] Processando ano {year}...") - cube.derive(derivation) - -# ── 4. Distância a Sedes (Variável Estática) ───────────────────────────────── -derivation_sedes = SpatialDerivation( - source_id="urban_centers", - grid_id="ilha_maranhao/100m", - role="driver", - variables=[ - Variable(name="dist_sedes", operator="min_distance"), - ], -) -print("\n[pipeline] Processando distância a sedes...") -cube.derive(derivation_sedes) - -# ── 5. Carregamento do Cubo ────────────────────────────────────────────────── -# Ao carregar "uso", o Cubo detecta que existem múltiplos anos e retorna (time, y, x) -da_uso = cube.load("uso", grid_id="ilha_maranhao/100m") -print("\n=== Dados Temporais (uso) ===") -print(da_uso.dims, da_uso.coords["time"].values) - -# Ao carregar "dist_sedes", ele retorna (y, x) por ser estática -da_sedes = cube.load("dist_sedes", grid_id="ilha_maranhao/100m") -print("\n=== Dados Estáticos (dist_sedes) ===") -print(da_sedes.dims, da_sedes.shape) - -# ── 6. Integração com DisSModel (Backend Misto) ────────────────────────────── -# O backend resultante terá um eixo de tempo para "uso" e será estático para "dist_sedes" -backend = cube.to_lucc_data(["uso", "dist_sedes"], grid_id="ilha_maranhao/100m") - -print("\n=== Integração DisSModel Temporal ===") -print(f"Backend: {backend}") -print(f"É temporal? uso={backend.is_temporal('uso')}, dist_sedes={backend.is_temporal('dist_sedes')}") - -# No dissmodel, você pode acessar um ano específico do backend -data_2010 = backend.get("uso", time=2010) -print(f"Shape do slice de 2010: {data_2010.shape}") - -# Ou filtrar um período específico na carga do cubo -backend_filtered = cube.to_lucc_data( - ["uso"], - grid_id="ilha_maranhao/100m", - period=(2022, 2022) -) -print(f"Anos no backend filtrado: {backend_filtered.temporal_band_names()}") - diff --git a/scripts/import_bdc_tiles.py b/scripts/import_bdc_tiles.py deleted file mode 100644 index 143eb82..0000000 --- a/scripts/import_bdc_tiles.py +++ /dev/null @@ -1,25 +0,0 @@ -""" -scripts/import_bdc_tiles.py - -Imports BDC tile shapefiles (SM / MD / LG) as SpatialSources into the catalog. -This is a one-time setup operation. - -Usage: - python scripts/import_bdc_tiles.py -""" - -from disscube.client import CubeClient -from disscube.utils.bdc_importer import import_bdc_grids - -CATALOG_FILE = "catalog.db" - -cube = CubeClient(catalog=CATALOG_FILE, store="./data/") - -print("=== Importing BDC tile sources (one-time, slow) ===") -import_bdc_grids( - cube, - sm_path="zip://data/bdc_grids/BDC_SM_V2.zip", - md_path="zip://data/bdc_grids/BDC_MD_V2.zip", - lg_path="zip://data/bdc_grids/BDC_LG_V2.zip", -) -print("\n=== BDC tiles registered ===") diff --git a/scripts/list_grids.py b/scripts/list_grids.py deleted file mode 100644 index d948a0e..0000000 --- a/scripts/list_grids.py +++ /dev/null @@ -1,7 +0,0 @@ -from disscube.client import CubeClient -cube = CubeClient(catalog="catalog.db", store="./data/") -for grid in cube.catalog.list_grids(): - print(f"Grid: {grid.id}") - print(f" CRS: {grid.crs}") - print(f" BBox: {grid.bbox}") - print(f" Res: {grid.resolution}") diff --git a/scripts/list_sources.py b/scripts/list_sources.py deleted file mode 100644 index c603990..0000000 --- a/scripts/list_sources.py +++ /dev/null @@ -1,19 +0,0 @@ -from disscube.client import CubeClient - -cube = CubeClient(catalog="catalog.db", store="./data/") - -print("=== Registered Spatial Sources ===") -sources = cube.catalog.list_spatial_sources() - -if not sources: - print("No sources found in catalog.") -else: - for src in sources: - print(f"Source: {src.id}") - print(f" Name: {src.name}") - print(f" Format: {src.format}") - print(f" URL: {src.asset_url}") - print(f" CRS: {src.crs}") - if src.time: - print(f" Time: {src.time}") - print("-" * 30) diff --git a/scripts/zarr_to_tif.py b/tools/zarr_to_tif.py similarity index 100% rename from scripts/zarr_to_tif.py rename to tools/zarr_to_tif.py From d114419877e4a7066de6e149ea8037b9167fc618 Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Mon, 15 Jun 2026 09:58:19 -0300 Subject: [PATCH 10/13] Reorganize examples/case_studies/ and move import_bdc_tiles to tools/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - examples/case_studies/brmangue/ + examples/case_studies/ilha_maranhao/ → examples/case_studies/maranhao/ Ambos os estudos usam a mesma grade (ilha_maranhao/100m). Consolidados em uma única pasta com numeração contínua: 01_mapbiomas_temporal.py (era ilha_maranhao/01_derive.py) 02_brmangue_derive.py (era brmangue/01_derive.py) 03_brmangue_simulate.py (era brmangue/02_simulate.py) - examples/setup/03_import_bdc_tiles.py → tools/import_bdc_tiles.py Operação de infraestrutura one-time, não narrativa de exemplo. examples/setup/ fica só com os dois scripts de bootstrap do tutorial. - examples/README.md atualizado: nova estrutura, tabela de utilitários. Co-Authored-By: Claude Sonnet 4.6 --- examples/README.md | 54 ++++++++++--------- .../01_mapbiomas_temporal.py} | 8 +-- .../02_brmangue_derive.py} | 20 +++---- .../03_brmangue_simulate.py} | 15 +++--- .../import_bdc_tiles.py | 4 +- 5 files changed, 55 insertions(+), 46 deletions(-) rename examples/case_studies/{ilha_maranhao/01_derive.py => maranhao/01_mapbiomas_temporal.py} (93%) rename examples/case_studies/{brmangue/01_derive.py => maranhao/02_brmangue_derive.py} (50%) rename examples/case_studies/{brmangue/02_simulate.py => maranhao/03_brmangue_simulate.py} (78%) rename examples/setup/03_import_bdc_tiles.py => tools/import_bdc_tiles.py (89%) diff --git a/examples/README.md b/examples/README.md index cee8f4b..30ea798 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,35 +1,41 @@ -# DissCube Examples +# DisSCube — Exemplos -This folder contains a structured demonstration of the DissCube pipeline, from catalog initialization to complete case studies. +Demonstração estruturada do pipeline DisSCube, do bootstrap do catálogo até +estudos de caso completos. -## Execution Order +## Ordem de execução -### 1. Setup (One-time) -Bootstrap the catalog and register baseline data. -- `python examples/setup/01_init_catalog.py`: Registers national and local simulation grids. -- `python examples/setup/02_register_sources.py`: Registers raw data files as SpatialSources. -- `python examples/setup/03_import_bdc_tiles.py`: Imports BDC tile definitions (one-time, slow). +### 1. Setup (one-time) +Bootstrap do catálogo e registro dos dados base. +- `python examples/setup/01_init_catalog.py` — registra grades nacionais e locais. +- `python examples/setup/02_register_sources.py` — registra arquivos brutos como SpatialSources. -### 2. National Drivers -Derive variables on the national 5 km mesh. -- `python examples/drivers/01_brazil_national.py`: Slope, TI presence, and distance to cities/rivers. +### 2. Drivers nacionais +Deriva variáveis na grade nacional BR/5km. +- `python examples/drivers/01_brazil_national.py` — slope, TI, distância a cidades/rios. -### 3. Case Study: Ilha do Maranhão -- `python examples/case_studies/ilha_maranhao/01_derive.py`: Temporal MapBiomas (majority) + static distance-to-seats variable. +### 3. Estudo de caso: Maranhão (Ilha do Maranhão, 100 m) +Dois estudos sobre a mesma área geográfica e grade. +- `python examples/case_studies/maranhao/01_mapbiomas_temporal.py` — série temporal MapBiomas (uso majority) + dist_sedes estática. +- `python examples/case_studies/maranhao/02_brmangue_derive.py` — deriva uso, alt, solo para o modelo BR-MANGUE. +- `python examples/case_studies/maranhao/03_brmangue_simulate.py` — executa BrmangueRasterExecutor. -### 4. Case Study: BR-MANGUE (Maranhão) -- `python examples/case_studies/brmangue/01_derive.py`: Derives land use and environmental variables at 100 m. -- `python examples/case_studies/brmangue/02_simulate.py`: Runs the BrmangueRasterExecutor. - -### 5. Case Study: LUCC/AC (Acre) -- `python examples/drivers/02_acre_5km.py`: Derives Acre-specific drivers at 5 km. -- `python examples/case_studies/lucc_acre/01_derive.py`: Derives land use attributes from vector data. -- `python examples/case_studies/lucc_acre/02_simulate.py`: Runs the LUCCRasterExecutor. -- `python examples/case_studies/lucc_acre/03_temporal_drivers.py`: Simulation loop with temporal drivers. +### 4. Estudo de caso: Acre (AC/5km) +- `python examples/drivers/02_acre_5km.py` — drivers regionais Acre 5 km. +- `python examples/case_studies/lucc_acre/01_derive.py` — atributos de uso do solo de fonte vetorial. +- `python examples/case_studies/lucc_acre/02_simulate.py` — executa LUCCRasterExecutor. +- `python examples/case_studies/lucc_acre/03_temporal_drivers.py` — loop de simulação com drivers temporais. --- -**Utilitários:** `tools/zarr_to_tif.py` converte um arquivo Zarr derivado para GeoTIFF. -``` +## Utilitários (`tools/`) + +| Script | Uso | +|---|---| +| `tools/zarr_to_tif.py` | Converte Zarr derivado para GeoTIFF | +| `tools/import_bdc_tiles.py` | Importa tiles BDC SM/MD/LG no catálogo (one-time, lento) | + +```bash python tools/zarr_to_tif.py data/derived/.../var.zarr output.tif +python tools/import_bdc_tiles.py ``` diff --git a/examples/case_studies/ilha_maranhao/01_derive.py b/examples/case_studies/maranhao/01_mapbiomas_temporal.py similarity index 93% rename from examples/case_studies/ilha_maranhao/01_derive.py rename to examples/case_studies/maranhao/01_mapbiomas_temporal.py index c350e69..d2e511b 100644 --- a/examples/case_studies/ilha_maranhao/01_derive.py +++ b/examples/case_studies/maranhao/01_mapbiomas_temporal.py @@ -1,8 +1,8 @@ """ -examples/case_studies/ilha_maranhao/01_derive.py +examples/case_studies/maranhao/01_mapbiomas_temporal.py -Deriva variáveis para a Ilha do Maranhão usando MapBiomas: - - uso (majority, temporal 2010-2022) +Deriva série temporal MapBiomas para a Ilha do Maranhão (100 m): + - uso (majority, temporal 2010-2022) - dist_sedes (min_distance, estática) Pré-requisitos: @@ -11,7 +11,7 @@ - Arquivos data/raw/ilha_maranhao_mapbiomas_{2010,2022}.tif presentes Usage: - python examples/case_studies/ilha_maranhao/01_derive.py + python examples/case_studies/maranhao/01_mapbiomas_temporal.py """ from disscube.client import CubeClient diff --git a/examples/case_studies/brmangue/01_derive.py b/examples/case_studies/maranhao/02_brmangue_derive.py similarity index 50% rename from examples/case_studies/brmangue/01_derive.py rename to examples/case_studies/maranhao/02_brmangue_derive.py index 94c74dc..46d1d62 100644 --- a/examples/case_studies/brmangue/01_derive.py +++ b/examples/case_studies/maranhao/02_brmangue_derive.py @@ -1,28 +1,30 @@ """ -examples/case_studies/brmangue/01_derive.py +examples/case_studies/maranhao/02_brmangue_derive.py -BR-MANGUE case study — Derivation of land use variables for Ilha do Maranhão (ilha_maranhao/100m). +BR-MANGUE — deriva variáveis de uso do solo para a Ilha do Maranhão (100 m): + - uso, alt, solo (majority / mean sobre maranhao_base) + +Pré-requisitos: + - python examples/setup/01_init_catalog.py + - python examples/setup/02_register_sources.py Usage: - python examples/case_studies/brmangue/01_derive.py + python examples/case_studies/maranhao/02_brmangue_derive.py """ from disscube.client import CubeClient from disscube.models import SpatialDerivation, Variable - GRID_ID = "ilha_maranhao/100m" SOURCE_ID = "maranhao_base" cube = CubeClient(catalog="catalog.db", store="./data/") -# Sanity checks if not cube.catalog.get_grid(GRID_ID): - raise RuntimeError(f"Grid {GRID_ID!r} not found. Run examples/setup/01_init_catalog.py first.") + raise RuntimeError(f"Grade {GRID_ID!r} não encontrada. Execute examples/setup/01_init_catalog.py primeiro.") if not cube.catalog.get_spatial_source(SOURCE_ID): - raise RuntimeError(f"Source {SOURCE_ID!r} not found. Run examples/setup/02_register_sources.py first.") + raise RuntimeError(f"Fonte {SOURCE_ID!r} não encontrada. Execute examples/setup/02_register_sources.py primeiro.") -# Derivation — uso, alt, solo from maranhao_base raster -print(f"\n[derive] Processing {SOURCE_ID} @ {GRID_ID}...") +print(f"\n[derive] {SOURCE_ID} @ {GRID_ID}...") cube.derive(SpatialDerivation( source_id=SOURCE_ID, grid_id=GRID_ID, diff --git a/examples/case_studies/brmangue/02_simulate.py b/examples/case_studies/maranhao/03_brmangue_simulate.py similarity index 78% rename from examples/case_studies/brmangue/02_simulate.py rename to examples/case_studies/maranhao/03_brmangue_simulate.py index d13b9c1..632272f 100644 --- a/examples/case_studies/brmangue/02_simulate.py +++ b/examples/case_studies/maranhao/03_brmangue_simulate.py @@ -1,10 +1,13 @@ """ -examples/case_studies/brmangue/02_simulate.py +examples/case_studies/maranhao/03_brmangue_simulate.py -BR-MANGUE case study — Executes the BrmangueRasterExecutor simulation using derived variables. +BR-MANGUE — executa o BrmangueRasterExecutor com as variáveis derivadas. + +Pré-requisito: + - python examples/case_studies/maranhao/02_brmangue_derive.py Usage: - python examples/case_studies/brmangue/02_simulate.py + python examples/case_studies/maranhao/03_brmangue_simulate.py """ from disscube.client import CubeClient @@ -23,16 +26,14 @@ cube = CubeClient(catalog="catalog.db", store="./data/") -# 1. Load derived variables print(f"\n[load] {VARIABLES} @ {GRID_ID}") backend = cube.to_lucc_data(VARIABLES, grid_id=GRID_ID) print(f" bands: {backend.band_names()}") -# 2. Run simulation if BrmangueRasterExecutor and ExperimentRecord: print("\n--- Running BrmangueRasterExecutor ---") source = cube.catalog.get_spatial_source(SOURCE_ID) - + record = ExperimentRecord( experiment_id="brmangue_cube_integration", parameters={"end_time": 5, "interactive": False, "bands": ["uso"]}, @@ -47,4 +48,4 @@ print(f"Simulation done. Output: {record.output_path}") else: - print("\nBrmangueRasterExecutor not available — simulation skipped.") + print("\nBrmangueRasterExecutor não disponível — simulation skipped.") diff --git a/examples/setup/03_import_bdc_tiles.py b/tools/import_bdc_tiles.py similarity index 89% rename from examples/setup/03_import_bdc_tiles.py rename to tools/import_bdc_tiles.py index 01ed125..4f465d4 100644 --- a/examples/setup/03_import_bdc_tiles.py +++ b/tools/import_bdc_tiles.py @@ -1,5 +1,5 @@ """ -examples/setup/03_import_bdc_tiles.py +tools/import_bdc_tiles.py Importa os tiles BDC (SM / MD / LG) como SpatialSources no catálogo. Operação one-time; pode demorar alguns minutos dependendo do tamanho dos @@ -9,7 +9,7 @@ - python examples/setup/01_init_catalog.py Usage: - python examples/setup/03_import_bdc_tiles.py + python tools/import_bdc_tiles.py """ from disscube.client import CubeClient From eafac6698fd016d63bafc42c6d2efd14a069e966 Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Mon, 15 Jun 2026 10:02:09 -0300 Subject: [PATCH 11/13] Fix four factual errors in documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit catalog.md / overview.md — spec_hash não inclui SpatialRelation: Relações foram explicitamente excluídas do hash no item 11 (nenhum estágio do pipeline as usa durante a computação). As duas páginas ainda listavam relações como input do hash. Corrigido: removido do JSON de exemplo e das listas; adicionada explicação do motivo. operators.md — tabela e nota de rodapé incorretas: majority, minority e percentage têm needs_fine_alignment=True e _resampling=Resampling.nearest no código; a doc dizia 'mode'. A nota de rodapé afirmava que std/percentage usam dado pré-reamostrado — o oposto do que o código faz. Tabela reestruturada em dois grupos (reamostragem direta vs. alinhamento fino) com comportamento correto. bdc.md — três bugs: 1. Parâmetros sm_shp/md_shp/lg_shp não existem: são sm_path/md_path/lg_path. 2. Filtro de tiles usava prefixo "BR/5km_"; tiles são registrados como BDC_SM_ (ex: BDC_SM_009002). 3. Comentário "mosaico automático" contradiz a limitação conhecida de load() sem tile_id. Substituído por aviso explícito. Co-Authored-By: Claude Sonnet 4.6 --- docs/architecture/catalog.md | 4 +--- docs/architecture/operators.md | 21 +++++++++++++++------ docs/architecture/overview.md | 3 ++- docs/guides/bdc.md | 18 +++++++++++------- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/docs/architecture/catalog.md b/docs/architecture/catalog.md index 5c8dce2..ad1ac46 100644 --- a/docs/architecture/catalog.md +++ b/docs/architecture/catalog.md @@ -34,7 +34,6 @@ relevant_data = { "grid_id": ..., "role": ..., "variables": [...], # ordenados por nome - "relations": [...], # ordenados, sem campo metadata "valid_from": ..., "valid_until": ..., } @@ -49,13 +48,12 @@ return hashlib.sha256(encoded).hexdigest() - Adicionar/remover/renomear variáveis - Mudar o operador ou `class_code` - Mudar `valid_from` / `valid_until` -- Adicionar/remover relações espaciais **O que não muda o hash:** -- Metadados das relações (`SpatialRelation.metadata`) - A ordem das variáveis na lista (são ordenadas por nome) - `bbox` de `Derivation` (metadado descritivo, não parâmetro) +- `SpatialRelation` — relações são persistidas no catálogo mas excluídas do hash porque nenhum estágio do pipeline as usa durante a computação. Incluí-las tornaria a chave de cache sensível a metadados que não afetam o resultado. ## Hash de conteúdo (`content_hash`) diff --git a/docs/architecture/operators.md b/docs/architecture/operators.md index 4dba3b3..8639adf 100644 --- a/docs/architecture/operators.md +++ b/docs/architecture/operators.md @@ -30,22 +30,31 @@ O `GridAligner` consulta `op_cls.resampling()` para escolher o método de reamos ## Operadores disponíveis -### Zonais — raster e vetor +### Zonais com reamostragem direta + +Estes operadores recebem do `GridAligner` um `DataArray` já reprojetado na resolução alvo. O `_resampling` determina o método de reprojeção. | Operador | `_resampling` | Raster | Vetor | `requires_class_code` | |---|---|---|---|---| | `mean` | `average` | média dos pixels no upscale | — | não | | `sum` | `sum` | soma dos pixels no upscale | — | não | -| `std` | `nearest` | valor reamostrado¹ | — | não | | `min` | `min` | mínimo no upscale | — | não | | `max` | `max` | máximo no upscale | — | não | -| `majority` | `mode` | moda no upscale | rasteriza com `class_code` (ou 1) | não | -| `minority` | `mode` | moda no upscale | rasteriza com `class_code` (ou 1) | não | -| `percentage` | `mode` | valor reamostrado¹ | rasteriza com `class_code` | **sim** | | `attribute` | `nearest` | passthrough | rasteriza com valor da coluna `var.name` | não | | `presence` | `nearest` | passthrough | rasteriza com `class_code` (ou 1) binário | não | -¹ `std` e `percentage` para rasters usam o dado já reamostrado pelo `GridAligner` — estatísticas verdadeiras de janela (como `rasterstats`) são trabalho futuro. +### Zonais com alinhamento fino (`needs_fine_alignment = True`) + +Estes operadores recebem um array de alta resolução snappado na origem do grid alvo. O `GridAligner` reprojeita com `Resampling.nearest` em resolução fina (sub-múltiplo inteiro do tamanho da célula); o operador reduz por janelas reais sobre os pixels brutos. + +| Operador | Raster | Vetor | `requires_class_code` | +|---|---|---|---| +| `std` | desvio padrão real por célula | — | não | +| `majority` | classe dominante por contagem | rasteriza com `class_code` (ou 1) | não | +| `minority` | classe menos frequente por contagem | rasteriza com `class_code` (ou 1) | não | +| `percentage` | fração de pixels da classe-alvo | rasteriza com `class_code` | **sim** | + +Os três últimos produzem também `coverage_purity` e `dominance_purity` como coordenadas do `DataArray` de saída (persistem no Zarr junto à variável). ### Proximidade — vetor (e passthrough para raster) diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md index 8038da2..4313108 100644 --- a/docs/architecture/overview.md +++ b/docs/architecture/overview.md @@ -86,9 +86,10 @@ Cada `SpatialDerivation` tem um `spec_hash` — SHA-256 determinístico de: - `grid_id` - `role` - variáveis (nome + operador + class_code, ordenadas por nome) -- relações espaciais (ordenadas, sem metadados) - `valid_from` / `valid_until` +`SpatialRelation` é excluída do hash: nenhum estágio do pipeline a usa durante a computação, então incluí-la tornaria o cache sensível a metadados sem efeito no resultado. + Se qualquer parâmetro mudar, o hash muda. O pipeline verifica o cache antes de processar: se todos os `DerivedVariable` com o mesmo `spec_hash` já existem no disco, a derivação é pulada. `Derivation.spec_hash()` delega a `SpatialDerivation.spec_hash()` e adiciona `purity_threshold` quando definido. `bbox` é excluído — é metadado descritivo, não parâmetro de derivação. diff --git a/docs/guides/bdc.md b/docs/guides/bdc.md index 279963c..c92160a 100644 --- a/docs/guides/bdc.md +++ b/docs/guides/bdc.md @@ -16,9 +16,9 @@ from disscube.utils.bdc_importer import import_bdc_grids import_bdc_grids( cube, - sm_shp="data/bdc_grids/BDC_SM_V2.shp", - md_shp="data/bdc_grids/BDC_MD_V2.shp", - lg_shp="data/bdc_grids/BDC_LG_V2.shp", + sm_path="data/bdc_grids/BDC_SM_V2.shp", + md_path="data/bdc_grids/BDC_MD_V2.shp", + lg_path="data/bdc_grids/BDC_LG_V2.shp", ) ``` @@ -39,8 +39,9 @@ derivation = SpatialDerivation( # Processar um tile cube.derive(derivation, tile_id="009002") -# Processar todos os tiles em loop -tiles = [s for s in cube.catalog.list_spatial_sources() if s.id.startswith("BR/5km_")] +# Processar todos os tiles SM em loop +# Tiles são registrados com IDs no formato BDC_SM_ (ex: BDC_SM_009002) +tiles = [s for s in cube.catalog.list_spatial_sources() if s.id.startswith("BDC_SM_")] for tile_source in tiles: tile_id = tile_source.id.split("_")[-1] cube.derive(derivation, tile_id=tile_id) @@ -51,13 +52,16 @@ Cada tile é processado de forma independente e pode ser paralelizado. ## Carregar resultado tileado ```python -# Tile específico +# Tile específico — sempre funciona da = cube.load("slope", tile_id="009002") -# Todos os tiles da grade (mosaico automático) +# Por grade — funciona apenas quando há um único tile no resultado da = cube.load("slope", grid_id="BR/5km") ``` +!!! warning "Carga multi-tile" + `load()` sem `tile_id` retorna silenciosamente o primeiro resultado quando múltiplos tiles da mesma variável existem na mesma grade. Mosaico automático não está implementado. **Sempre especifique `tile_id` em workloads multi-tile.** + ## Grade 100m nacional Para projetos que precisam de resolução mais alta que BDC_SM (10m), DisSCube suporta uma grade de 100m customizada: From 4e5630da8b6b2c0eb5eda6eb6505020ade1feefa Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Mon, 15 Jun 2026 10:45:29 -0300 Subject: [PATCH 12/13] fix: write CRS after variable loop so grid_mapping is set on all data vars rioxarray.write_crs() sets grid_mapping="spatial_ref" only on data variables already present in the Dataset at call time. Calling it before the loop meant zero variables received the attribute, so QGIS and other CF-compliant readers could not auto-detect the projection. Co-Authored-By: Claude Sonnet 4.6 --- disscube/pipeline/aggregator.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/disscube/pipeline/aggregator.py b/disscube/pipeline/aggregator.py index f37fad7..5534148 100644 --- a/disscube/pipeline/aggregator.py +++ b/disscube/pipeline/aggregator.py @@ -67,10 +67,6 @@ def execute(self, ctx: PipelineContext) -> PipelineContext: source_data = ctx.data final_ds = xr.Dataset(coords={"y": grid.ys, "x": grid.xs}) - named_wkt = _crs_to_named_wkt(grid.crs) - final_ds.rio.write_crs(named_wkt, inplace=True) - final_ds.rio.write_transform(grid.transform, inplace=True) - spatial_ref = final_ds.coords.get("spatial_ref") for var in ctx.derivation.variables: op_cls = OPERATOR_REGISTRY.get(var.operator) @@ -103,8 +99,13 @@ def execute(self, ctx: PipelineContext) -> PipelineContext: # product catalog. final_ds[var.name] = result - if spatial_ref is not None: - final_ds = final_ds.assign_coords({"spatial_ref": spatial_ref}) + # Write CRS and transform AFTER all variables are present. + # rioxarray.write_crs() sets grid_mapping="spatial_ref" on every existing + # data variable — calling it before the loop would miss them all, causing + # QGIS and other CF-compliant readers to not detect the projection. + named_wkt = _crs_to_named_wkt(grid.crs) + final_ds.rio.write_crs(named_wkt, inplace=True) + final_ds.rio.write_transform(grid.transform, inplace=True) ctx.data = final_ds return ctx \ No newline at end of file From 4fcd59d624f212868d5aea608a6c6594d3fde511 Mon Sep 17 00:00:00 2001 From: Sergio Souza Costa Date: Mon, 15 Jun 2026 10:45:37 -0300 Subject: [PATCH 13/13] fix: skip stale catalog entries on load, add purge_stale(), silence zarr warnings - load() now skips DerivedVariable entries whose Zarr file no longer exists on disk (catalog accumulates orphans when files are deleted between runs). Skipped entries are logged at DEBUG level. - CubeClient.purge_stale() removes all such orphan entries from the catalog; CatalogStore protocol and both implementations get delete_derived(). - to_zarr(..., consolidated=False) and open_zarr(..., consolidated=False) suppress the ZarrUserWarning about consolidated metadata not being part of the Zarr v3 spec. Co-Authored-By: Claude Sonnet 4.6 --- disscube/catalog/json_store.py | 4 ++++ disscube/catalog/protocol.py | 1 + disscube/catalog/sqlite_store.py | 4 ++++ disscube/client/cube_client.py | 41 ++++++++++++++++++++++++++++---- disscube/pipeline/writer.py | 2 +- tests/test_bdc_master_grids.py | 9 ++++--- tests/test_temporal.py | 2 +- 7 files changed, 53 insertions(+), 10 deletions(-) diff --git a/disscube/catalog/json_store.py b/disscube/catalog/json_store.py index 9d58129..6dec868 100644 --- a/disscube/catalog/json_store.py +++ b/disscube/catalog/json_store.py @@ -43,6 +43,10 @@ def save_derived(self, derived: DerivedVariable) -> None: self._data["derived"][derived.id] = derived.model_dump() self._save() + def delete_derived(self, derived_id: str) -> None: + self._data["derived"].pop(derived_id, None) + self._save() + def search_derived_variables(self, grid_id: str | None = None, role: str | None = None, tile_id: str | None = None) -> List[DerivedVariable]: results = [] for d in self._data["derived"].values(): diff --git a/disscube/catalog/protocol.py b/disscube/catalog/protocol.py index 3872ab4..aa8a7a9 100644 --- a/disscube/catalog/protocol.py +++ b/disscube/catalog/protocol.py @@ -11,6 +11,7 @@ def get_spatial_source(self, source_id: str) -> Optional[SpatialSource]: ... def list_spatial_sources(self) -> List[SpatialSource]: ... def save_derived(self, derived: DerivedVariable) -> None: ... + def delete_derived(self, derived_id: str) -> None: ... def search_derived_variables(self, grid_id: str | None = None, role: str | None = None, tile_id: str | None = None) -> List[DerivedVariable]: ... def get_derived_by_hash(self, spec_hash: str) -> Optional[DerivedVariable]: ... diff --git a/disscube/catalog/sqlite_store.py b/disscube/catalog/sqlite_store.py index 823ee1e..7a28c9c 100644 --- a/disscube/catalog/sqlite_store.py +++ b/disscube/catalog/sqlite_store.py @@ -92,6 +92,10 @@ def save_derived(self, derived: DerivedVariable) -> None: (derived.id, derived.grid_id, derived.spec_hash, derived.tile_id, derived.role, derived.model_dump_json()) ) + def delete_derived(self, derived_id: str) -> None: + with self._get_connection() as conn: + conn.execute("DELETE FROM derived WHERE id = ?", (derived_id,)) + def search_derived_variables(self, grid_id: str | None = None, role: str | None = None, tile_id: str | None = None) -> List[DerivedVariable]: query = "SELECT data FROM derived WHERE 1=1" params = [] diff --git a/disscube/client/cube_client.py b/disscube/client/cube_client.py index bae7643..fe3cd0e 100644 --- a/disscube/client/cube_client.py +++ b/disscube/client/cube_client.py @@ -117,6 +117,24 @@ def derive_declarative( """ return self.derive(derivation.to_spatial_derivation(grid_id), tile_id=tile_id) + def purge_stale(self) -> int: + """ + Remove catalog entries whose Zarr files no longer exist on disk. + + Returns the number of entries removed. Safe to call at any time; + entries with valid files are untouched. + """ + all_derived = self.catalog.search_derived_variables() + removed = 0 + for d in all_derived: + if not os.path.exists(d.asset_url): + self.catalog.delete_derived(d.id) + log.debug("Purged stale catalog entry: %s", d.asset_url) + removed += 1 + if removed: + log.info("purge_stale: removed %d stale catalog entries", removed) + return removed + def load( self, variable_id: str, @@ -148,9 +166,17 @@ def load( "Please specify grid_id." ) - # Separate temporal from static matches - temporal = [d for d in matches if d.times] - static = [d for d in matches if not d.times] + # Separate temporal from static matches, skipping stale catalog entries + # whose files no longer exist on disk (catalog can accumulate orphans when + # a source_id or other spec field changes between runs). + def _exists(d: DerivedVariable) -> bool: + ok = os.path.exists(d.asset_url) + if not ok: + log.debug("Stale catalog entry skipped (file missing): %s", d.asset_url) + return ok + + temporal = [d for d in matches if d.times and _exists(d)] + static = [d for d in matches if not d.times and _exists(d)] if temporal: # Stack temporal slices along time axis sorted by first time value @@ -158,14 +184,19 @@ def load( slices = [] time_coords = [] for d in temporal_sorted: - da = xr.open_zarr(d.asset_url)[d.name] + da = xr.open_zarr(d.asset_url, consolidated=False)[d.name] slices.append(da) time_coords.extend(d.times) return xr.concat(slices, dim=xr.DataArray(time_coords, dims="time")) # Static — original behaviour + if not static: + msg = f"Derived variable not found on disk: {variable_id}" + if grid_id: + msg += f" on grid {grid_id}" + raise ValueError(msg) derived = static[0] - return xr.open_zarr(derived.asset_url)[derived.name] + return xr.open_zarr(derived.asset_url, consolidated=False)[derived.name] def to_lucc_data( self, diff --git a/disscube/pipeline/writer.py b/disscube/pipeline/writer.py index d29a732..d00a75f 100644 --- a/disscube/pipeline/writer.py +++ b/disscube/pipeline/writer.py @@ -46,7 +46,7 @@ def execute(self, ctx: PipelineContext) -> PipelineContext: full_path = self.storage.get_full_path(relative_path) # Save as dataset to preserve all coordinates (including spatial_ref) - da.to_dataset(name=var_name).to_zarr(full_path, mode="w") + da.to_dataset(name=var_name).to_zarr(full_path, mode="w", consolidated=False) # Calculate content hash (SHA-256) of the Zarr directory content_hash = self._calculate_dir_hash(full_path) diff --git a/tests/test_bdc_master_grids.py b/tests/test_bdc_master_grids.py index e9863f5..8e369e1 100644 --- a/tests/test_bdc_master_grids.py +++ b/tests/test_bdc_master_grids.py @@ -130,8 +130,9 @@ def test_derive_uses_tile_id(self, mock_exists, mock_to_zarr, mock_hash): # load() with tile_id # ------------------------------------------------------------------ + @patch('disscube.client.cube_client.os.path.exists', return_value=True) @patch('xarray.open_zarr') - def test_load_with_explicit_tile_id_filters_correctly(self, mock_open_zarr): + def test_load_with_explicit_tile_id_filters_correctly(self, mock_open_zarr, mock_exists): """load(name, tile_id='001') returns only the matching tile.""" from disscube.models import DerivedVariable import xarray as xr @@ -156,10 +157,12 @@ def test_load_with_explicit_tile_id_filters_correctly(self, mock_open_zarr): res = self.cube.load("var1", tile_id="001") self.assertEqual(res, mock_da) - mock_open_zarr.assert_called_with("path/001.zarr") + mock_open_zarr.assert_called_with("path/001.zarr", consolidated=False) + @patch('disscube.client.cube_client.os.path.exists', return_value=True) + @patch('xarray.open_zarr') @unittest.expectedFailure - def test_load_without_tile_id_raises_for_ambiguous_tiles(self): + def test_load_without_tile_id_raises_for_ambiguous_tiles(self, mock_open_zarr, mock_exists): """ PLANNED (not yet implemented): load(name) without tile_id should raise ValueError when multiple tiles of the same variable exist on the same grid. diff --git a/tests/test_temporal.py b/tests/test_temporal.py index c9a409a..5a48b6a 100644 --- a/tests/test_temporal.py +++ b/tests/test_temporal.py @@ -46,7 +46,7 @@ def _ones_da(grid): def _write_zarr(da, path, name="v"): - da.to_dataset(name=name).to_zarr(path, mode="w") + da.to_dataset(name=name).to_zarr(path, mode="w", consolidated=False) return str(path)