From 66445ebf0dadb99835ec9e44ea1e70310a42ee54 Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Tue, 2 Jun 2026 14:16:05 +0100 Subject: [PATCH 01/10] fix: Forward storage_options to parquet metadata reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Schema/collection/failure-info reads passed `storage_options` (and `credential_provider`) to the data read via `pl.read_parquet` / `pl.scan_parquet`, but the separate embedded-schema metadata read called `pl.read_parquet_metadata` with no options. Against non-AWS S3-compatible stores reached purely through `storage_options` (lakeFS, MinIO, R2, Tigris, …) the metadata read fell back to the default AWS credential chain and endpoint, breaking typed reads. Thread the storage-related options into all metadata reads in `_storage/parquet.py` via a small `_metadata_read_options` helper. Fixes #352 Co-Authored-By: Claude Opus 4.8 --- dataframely/_storage/parquet.py | 42 +++++++++++++---- tests/schema/test_read_write_parquet.py | 62 +++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 9 deletions(-) diff --git a/dataframely/_storage/parquet.py b/dataframely/_storage/parquet.py index 655c595..dbf0e24 100644 --- a/dataframely/_storage/parquet.py +++ b/dataframely/_storage/parquet.py @@ -53,13 +53,13 @@ def write_frame( def scan_frame(self, **kwargs: Any) -> tuple[pl.LazyFrame, SerializedSchema | None]: source = kwargs.pop("source") lf = pl.scan_parquet(source, **kwargs) - metadata = _read_serialized_schema(source) + metadata = _read_serialized_schema(source, **_metadata_read_options(kwargs)) return lf, metadata def read_frame(self, **kwargs: Any) -> tuple[pl.DataFrame, SerializedSchema | None]: source = kwargs.pop("source") df = pl.read_parquet(source, **kwargs) - metadata = _read_serialized_schema(source) + metadata = _read_serialized_schema(source, **_metadata_read_options(kwargs)) return df, metadata # ------------------------------ Collections --------------------------------------- @@ -159,12 +159,18 @@ def _collection_from_parquet( else pl.read_parquet(scan_path, **kwargs).lazy() ) if is_file: - collection_types.append(_read_serialized_collection(source_path)) + collection_types.append( + _read_serialized_collection( + source_path, **_metadata_read_options(kwargs) + ) + ) else: prefix = get_file_prefix(fs) for file in fs.glob(fs.sep.join([source_path, "**", "*.parquet"])): collection_types.append( - _read_serialized_collection(f"{prefix}{file}") + _read_serialized_collection( + f"{prefix}{file}", **_metadata_read_options(kwargs) + ) ) return data, collection_types @@ -234,7 +240,7 @@ def scan_failure_info( file = kwargs.pop("file") # Meta data - metadata = pl.read_parquet_metadata(file) + metadata = pl.read_parquet_metadata(file, **_metadata_read_options(kwargs)) serialized_schema = assert_failure_info_metadata( metadata.get(SCHEMA_METADATA_KEY) ) @@ -245,11 +251,29 @@ def scan_failure_info( return lf, serialized_rules, serialized_schema -def _read_serialized_collection(path: str) -> SerializedCollection | None: - meta = pl.read_parquet_metadata(path) +def _metadata_read_options(kwargs: dict[str, Any]) -> dict[str, Any]: + """Extract the options required to reach the data store for a metadata read. + + The parquet metadata is read separately from the data itself. We must forward the + same storage-related options (e.g. ``storage_options`` and ``credential_provider``) + so that metadata reads against non-AWS S3-compatible stores (lakeFS, MinIO, R2, …) + hit the correct endpoint and credentials instead of falling back to the default AWS + credential chain and endpoint. + """ + return { + key: kwargs[key] + for key in ("storage_options", "credential_provider") + if key in kwargs + } + + +def _read_serialized_collection( + path: str, **read_options: Any +) -> SerializedCollection | None: + meta = pl.read_parquet_metadata(path, **read_options) return meta.get(COLLECTION_METADATA_KEY) -def _read_serialized_schema(path: str) -> SerializedSchema | None: - meta = pl.read_parquet_metadata(path) +def _read_serialized_schema(path: str, **read_options: Any) -> SerializedSchema | None: + meta = pl.read_parquet_metadata(path, **read_options) return meta.get(SCHEMA_METADATA_KEY) diff --git a/tests/schema/test_read_write_parquet.py b/tests/schema/test_read_write_parquet.py index 4b83031..a3a36e0 100644 --- a/tests/schema/test_read_write_parquet.py +++ b/tests/schema/test_read_write_parquet.py @@ -1,11 +1,13 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause +import uuid from pathlib import Path import polars as pl import pytest from fsspec import url_to_fs +from polars.testing import assert_frame_equal import dataframely as dy from dataframely._storage.parquet import SCHEMA_METADATA_KEY @@ -59,3 +61,63 @@ def test_write_parquet_fails_without_mkdir(tmp_path: str) -> None: # Act / Assert with pytest.raises(FileNotFoundError): MySchema.write_parquet(df, file=p) + + +# --------------------------------- STORAGE OPTIONS ---------------------------------- # + + +@pytest.mark.s3 +@pytest.mark.parametrize("lazy", [True, False]) +def test_read_parquet_metadata_uses_storage_options( + s3_server: str, + s3_bucket: str, + monkeypatch: pytest.MonkeyPatch, + lazy: bool, +) -> None: + """The embedded-schema metadata read must use the same ``storage_options`` as the + data read. + + Regression test for https://github.com/Quantco/dataframely/issues/352: against + non-AWS S3-compatible stores (lakeFS, MinIO, R2, …) reached purely via + ``storage_options``, the metadata read previously fell back to the default AWS + credential chain and endpoint, breaking typed reads. + """ + # Arrange: provide credentials and endpoint *only* via `storage_options`, never via + # the environment, so the metadata read fails unless the options are forwarded. + for var in ( + "AWS_ENDPOINT_URL", + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_ALLOW_HTTP", + "AWS_S3_ALLOW_UNSAFE_RENAME", + "AWS_DEFAULT_REGION", + "AWS_REGION", + ): + monkeypatch.delenv(var, raising=False) + + storage_options = { + "aws_access_key_id": "testing", + "aws_secret_access_key": "testing", + "aws_endpoint_url": s3_server, + "aws_region": "us-east-1", + "aws_allow_http": "true", + } + path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" + + df = MySchema.create_empty() + MySchema.write_parquet(df, file=path, storage_options=storage_options) + + # Act: `validation="forbid"` only returns if the schema stored in the metadata is + # read successfully and matches, so a passing read proves the metadata read used the + # forwarded `storage_options`. + if lazy: + out: pl.DataFrame = MySchema.scan_parquet( + path, validation="forbid", storage_options=storage_options + ).collect() + else: + out = MySchema.read_parquet( + path, validation="forbid", storage_options=storage_options + ) + + # Assert + assert_frame_equal(df, out) From b5645a86ce6129511f34b1795f6b8d097b795a7a Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Tue, 2 Jun 2026 15:06:25 +0100 Subject: [PATCH 02/10] fix: Forward storage_options in public parquet-metadata helpers `read_parquet_metadata_schema` and `read_parquet_metadata_collection` read parquet metadata from a (possibly remote) source but accepted no options, so they could not reach non-AWS S3-compatible stores either. Accept and forward `**kwargs` (e.g. `storage_options`, `credential_provider`) to `pl.read_parquet_metadata`, matching `read_parquet`/`scan_parquet`. Add s3-marked regression tests covering both helpers. Co-Authored-By: Claude Opus 4.8 --- dataframely/collection/collection.py | 6 +++- dataframely/schema.py | 6 +++- tests/collection/test_storage.py | 48 +++++++++++++++++++++++++ tests/schema/test_read_write_parquet.py | 4 +++ 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/dataframely/collection/collection.py b/dataframely/collection/collection.py index 43aab21..1c8767c 100644 --- a/dataframely/collection/collection.py +++ b/dataframely/collection/collection.py @@ -1273,17 +1273,21 @@ def _validate_input_keys(cls, data: Mapping[str, FrameType], /) -> None: def read_parquet_metadata_collection( source: str | Path | IO[bytes] | bytes, + **kwargs: Any, ) -> type[Collection] | None: """Read a dataframely Collection type from the metadata of a parquet file. Args: source: Path to a parquet file or a file-like object that contains the metadata. + kwargs: Additional keyword arguments passed directly to + :meth:`polars.read_parquet_metadata`, e.g. ``storage_options`` and + ``credential_provider`` to reach non-AWS S3-compatible stores. Returns: The collection that was serialized to the metadata. `None` if no collection metadata is found or the deserialization fails. """ - metadata = pl.read_parquet_metadata(source) + metadata = pl.read_parquet_metadata(source, **kwargs) if (schema_metadata := metadata.get(COLLECTION_METADATA_KEY)) is not None: return deserialize_collection(schema_metadata, strict=False) return None diff --git a/dataframely/schema.py b/dataframely/schema.py index c7fc92c..af54920 100644 --- a/dataframely/schema.py +++ b/dataframely/schema.py @@ -1416,17 +1416,21 @@ def _rules_match(lhs: dict[str, Rule], rhs: dict[str, Rule]) -> bool: def read_parquet_metadata_schema( source: str | Path | IO[bytes] | bytes, + **kwargs: Any, ) -> type[Schema] | None: """Read a dataframely schema from the metadata of a parquet file. Args: source: Path to a parquet file or a file-like object that contains the metadata. + kwargs: Additional keyword arguments passed directly to + :meth:`polars.read_parquet_metadata`, e.g. ``storage_options`` and + ``credential_provider`` to reach non-AWS S3-compatible stores. Returns: The schema that was serialized to the metadata. `None` if no schema metadata is found or the deserialization fails. """ - metadata = pl.read_parquet_metadata(source) + metadata = pl.read_parquet_metadata(source, **kwargs) if (schema_metadata := metadata.get(SCHEMA_METADATA_KEY)) is not None: return deserialize_schema(schema_metadata, strict=False) diff --git a/tests/collection/test_storage.py b/tests/collection/test_storage.py index 9554ca5..2672414 100644 --- a/tests/collection/test_storage.py +++ b/tests/collection/test_storage.py @@ -1,6 +1,7 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause +import uuid from typing import Any import polars as pl @@ -498,6 +499,53 @@ def test_read_invalid_parquet_metadata_collection( assert collection is None +@pytest.mark.s3 +def test_read_parquet_metadata_collection_uses_storage_options( + s3_server: str, + s3_bucket: str, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """`read_parquet_metadata_collection` must forward `storage_options` to the + metadata read. + + Regression test for https://github.com/Quantco/dataframely/issues/352. + """ + # Arrange: credentials/endpoint are provided *only* via `storage_options`, never via + # the environment, so the metadata read fails unless the options are forwarded. + for var in ( + "AWS_ENDPOINT_URL", + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_ALLOW_HTTP", + "AWS_S3_ALLOW_UNSAFE_RENAME", + "AWS_DEFAULT_REGION", + "AWS_REGION", + ): + monkeypatch.delenv(var, raising=False) + + storage_options = { + "aws_access_key_id": "testing", + "aws_secret_access_key": "testing", + "aws_endpoint_url": s3_server, + "aws_region": "us-east-1", + "aws_allow_http": "true", + } + path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" + pl.DataFrame({"a": [1, 2, 3]}).write_parquet( + path, + metadata={COLLECTION_METADATA_KEY: MyCollection.serialize()}, + storage_options=storage_options, + ) + + # Act + collection = dy.read_parquet_metadata_collection( + path, storage_options=storage_options + ) + + # Assert + assert collection is not None + + @pytest.mark.parametrize( "any_tmp_path", ["tmp_path", pytest.param("s3_tmp_path", marks=pytest.mark.s3)], diff --git a/tests/schema/test_read_write_parquet.py b/tests/schema/test_read_write_parquet.py index a3a36e0..f0628a3 100644 --- a/tests/schema/test_read_write_parquet.py +++ b/tests/schema/test_read_write_parquet.py @@ -121,3 +121,7 @@ def test_read_parquet_metadata_uses_storage_options( # Assert assert_frame_equal(df, out) + + # The standalone metadata helper must forward `storage_options` too. + schema = dy.read_parquet_metadata_schema(path, storage_options=storage_options) + assert schema is not None From bb3b0ab834ce805cee1b6ddcb01b003f1628b7d9 Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Tue, 2 Jun 2026 15:17:39 +0100 Subject: [PATCH 03/10] refactor: Align metadata-read options with original code and forward retries Address review feedback on the storage-options forwarding fix: - Match the file's docstring convention for the public metadata helpers: drop the enumerated `storage_options`/`credential_provider` note and use the same terse "passed directly to :meth:`polars.read_parquet_metadata`" wording as `read_parquet`/`scan_parquet`. - Forward `retries` alongside `storage_options`/`credential_provider` in `_metadata_read_options`, since `read_parquet_metadata` accepts it and it is storage-reaching. Clarify in the docstring why the call sites must filter the scan/read kwargs (the narrower `read_parquet_metadata` signature rejects options like `n_rows`/`columns`) instead of forwarding everything. Co-Authored-By: Claude Opus 4.8 --- dataframely/_storage/parquet.py | 18 ++++++++++-------- dataframely/collection/collection.py | 3 +-- dataframely/schema.py | 3 +-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dataframely/_storage/parquet.py b/dataframely/_storage/parquet.py index dbf0e24..e50ff2d 100644 --- a/dataframely/_storage/parquet.py +++ b/dataframely/_storage/parquet.py @@ -252,17 +252,19 @@ def scan_failure_info( def _metadata_read_options(kwargs: dict[str, Any]) -> dict[str, Any]: - """Extract the options required to reach the data store for a metadata read. - - The parquet metadata is read separately from the data itself. We must forward the - same storage-related options (e.g. ``storage_options`` and ``credential_provider``) - so that metadata reads against non-AWS S3-compatible stores (lakeFS, MinIO, R2, …) - hit the correct endpoint and credentials instead of falling back to the default AWS - credential chain and endpoint. + """Select the storage-related options to forward to a parquet metadata read. + + The metadata is read separately from the data, via + :meth:`polars.read_parquet_metadata`. That function accepts only a narrow subset of + the options understood by :meth:`polars.read_parquet`/:meth:`polars.scan_parquet`, + so we cannot simply forward all ``kwargs``. We forward exactly the options that + determine *how the store is reached* so that metadata reads against non-AWS + S3-compatible stores (lakeFS, MinIO, R2, …) hit the correct endpoint and credentials + instead of falling back to the default AWS credential chain and endpoint. """ return { key: kwargs[key] - for key in ("storage_options", "credential_provider") + for key in ("storage_options", "credential_provider", "retries") if key in kwargs } diff --git a/dataframely/collection/collection.py b/dataframely/collection/collection.py index 1c8767c..6476f20 100644 --- a/dataframely/collection/collection.py +++ b/dataframely/collection/collection.py @@ -1280,8 +1280,7 @@ def read_parquet_metadata_collection( Args: source: Path to a parquet file or a file-like object that contains the metadata. kwargs: Additional keyword arguments passed directly to - :meth:`polars.read_parquet_metadata`, e.g. ``storage_options`` and - ``credential_provider`` to reach non-AWS S3-compatible stores. + :meth:`polars.read_parquet_metadata`. Returns: The collection that was serialized to the metadata. `None` if no collection diff --git a/dataframely/schema.py b/dataframely/schema.py index af54920..d624ae6 100644 --- a/dataframely/schema.py +++ b/dataframely/schema.py @@ -1423,8 +1423,7 @@ def read_parquet_metadata_schema( Args: source: Path to a parquet file or a file-like object that contains the metadata. kwargs: Additional keyword arguments passed directly to - :meth:`polars.read_parquet_metadata`, e.g. ``storage_options`` and - ``credential_provider`` to reach non-AWS S3-compatible stores. + :meth:`polars.read_parquet_metadata`. Returns: The schema that was serialized to the metadata. `None` if no schema metadata From ceac55b6e98ca5ca68e2ba62c6f04b29f90190d7 Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Tue, 2 Jun 2026 15:27:15 +0100 Subject: [PATCH 04/10] test: Tighten storage_options regression tests and trim helper docstring - Shrink `_metadata_read_options` to a one-line comment matching the other private helpers in the module (no oversized docstring). - Extract an `s3_storage_options` fixture (mirrors `s3_tmp_path`, but strips the AWS_* env vars so the store is reachable *only* via `storage_options`) and use it across the schema, collection and failure-info regression tests. - Add a failure-info regression test covering the `scan_failure_info` metadata read, and split the schema test so the typed read and the standalone `read_parquet_metadata_schema` helper are asserted independently. - Drop the end-to-end collection typed-read test: it cannot pass via `storage_options` alone because member discovery goes through fsspec (`url_to_fs`/`fs.exists`), which does not receive `storage_options` -- a separate limitation from the polars metadata read this PR fixes. Co-Authored-By: Claude Opus 4.8 --- dataframely/_storage/parquet.py | 13 ++--- tests/collection/test_storage.py | 32 +++--------- tests/conftest.py | 31 ++++++++++++ tests/failure_info/test_storage.py | 33 +++++++++++++ tests/schema/test_read_write_parquet.py | 65 ++++++++++++------------- 5 files changed, 105 insertions(+), 69 deletions(-) diff --git a/dataframely/_storage/parquet.py b/dataframely/_storage/parquet.py index e50ff2d..e0e5087 100644 --- a/dataframely/_storage/parquet.py +++ b/dataframely/_storage/parquet.py @@ -252,16 +252,9 @@ def scan_failure_info( def _metadata_read_options(kwargs: dict[str, Any]) -> dict[str, Any]: - """Select the storage-related options to forward to a parquet metadata read. - - The metadata is read separately from the data, via - :meth:`polars.read_parquet_metadata`. That function accepts only a narrow subset of - the options understood by :meth:`polars.read_parquet`/:meth:`polars.scan_parquet`, - so we cannot simply forward all ``kwargs``. We forward exactly the options that - determine *how the store is reached* so that metadata reads against non-AWS - S3-compatible stores (lakeFS, MinIO, R2, …) hit the correct endpoint and credentials - instead of falling back to the default AWS credential chain and endpoint. - """ + # Forward only the options that `read_parquet_metadata` accepts (it has a narrower + # signature than `read_parquet`/`scan_parquet`) so the metadata read reaches the + # same store as the data read. return { key: kwargs[key] for key in ("storage_options", "credential_provider", "retries") diff --git a/tests/collection/test_storage.py b/tests/collection/test_storage.py index 2672414..a6debc8 100644 --- a/tests/collection/test_storage.py +++ b/tests/collection/test_storage.py @@ -501,45 +501,25 @@ def test_read_invalid_parquet_metadata_collection( @pytest.mark.s3 def test_read_parquet_metadata_collection_uses_storage_options( - s3_server: str, s3_bucket: str, - monkeypatch: pytest.MonkeyPatch, + s3_storage_options: dict[str, str], ) -> None: - """`read_parquet_metadata_collection` must forward `storage_options` to the - metadata read. + """The standalone `read_parquet_metadata_collection` helper must forward + `storage_options` to the metadata read. Regression test for https://github.com/Quantco/dataframely/issues/352. """ - # Arrange: credentials/endpoint are provided *only* via `storage_options`, never via - # the environment, so the metadata read fails unless the options are forwarded. - for var in ( - "AWS_ENDPOINT_URL", - "AWS_ACCESS_KEY_ID", - "AWS_SECRET_ACCESS_KEY", - "AWS_ALLOW_HTTP", - "AWS_S3_ALLOW_UNSAFE_RENAME", - "AWS_DEFAULT_REGION", - "AWS_REGION", - ): - monkeypatch.delenv(var, raising=False) - - storage_options = { - "aws_access_key_id": "testing", - "aws_secret_access_key": "testing", - "aws_endpoint_url": s3_server, - "aws_region": "us-east-1", - "aws_allow_http": "true", - } + # Arrange path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" pl.DataFrame({"a": [1, 2, 3]}).write_parquet( path, metadata={COLLECTION_METADATA_KEY: MyCollection.serialize()}, - storage_options=storage_options, + storage_options=s3_storage_options, ) # Act collection = dy.read_parquet_metadata_collection( - path, storage_options=storage_options + path, storage_options=s3_storage_options ) # Assert diff --git a/tests/conftest.py b/tests/conftest.py index 75828f5..dda85f8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -44,6 +44,37 @@ def s3_tmp_path(s3_server: str, s3_bucket: str, monkeypatch: pytest.MonkeyPatch) return f"{s3_bucket}/{str(uuid.uuid4())}" +@pytest.fixture() +def s3_storage_options( + s3_server: str, monkeypatch: pytest.MonkeyPatch +) -> dict[str, str]: + """Credentials and endpoint for the moto server, supplied *only* via + ``storage_options``. + + Unlike :func:`s3_tmp_path`, this fixture deletes the ``AWS_*`` environment + variables so the store is unreachable through the default credential chain. A read + therefore succeeds only if ``storage_options`` is forwarded to it -- which is what + the regression tests for https://github.com/Quantco/dataframely/issues/352 assert. + """ + for var in ( + "AWS_ENDPOINT_URL", + "AWS_ACCESS_KEY_ID", + "AWS_SECRET_ACCESS_KEY", + "AWS_ALLOW_HTTP", + "AWS_S3_ALLOW_UNSAFE_RENAME", + "AWS_DEFAULT_REGION", + "AWS_REGION", + ): + monkeypatch.delenv(var, raising=False) + return { + "aws_access_key_id": "testing", + "aws_secret_access_key": "testing", + "aws_endpoint_url": s3_server, + "aws_region": "us-east-1", + "aws_allow_http": "true", + } + + @pytest.fixture() def any_tmp_path(request: pytest.FixtureRequest) -> str: return str(request.getfixturevalue(request.param)) diff --git a/tests/failure_info/test_storage.py b/tests/failure_info/test_storage.py index 10b3e7d..c4580f9 100644 --- a/tests/failure_info/test_storage.py +++ b/tests/failure_info/test_storage.py @@ -1,6 +1,8 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause +import uuid + import polars as pl import pytest from fsspec import AbstractFileSystem, url_to_fs @@ -165,6 +167,37 @@ def test_write_parquet_custom_metadata( assert pl.read_parquet_metadata(p)["custom"] == "test" +@pytest.mark.s3 +@pytest.mark.parametrize("lazy", [True, False]) +def test_read_parquet_uses_storage_options_for_metadata( + s3_bucket: str, + s3_storage_options: dict[str, str], + lazy: bool, +) -> None: + """`FailureInfo.scan_parquet`/`read_parquet` must forward `storage_options` to the + rule/schema metadata read, not just the data read. + + Regression test for https://github.com/Quantco/dataframely/issues/352. + """ + # Arrange + df = pl.DataFrame({"a": [4, 5, 6, 6, 7, 8], "b": [1, 2, 3, 4, 5, 6]}) + _, failure = MySchema.filter(df) + path = f"{s3_bucket}/{uuid.uuid4()}/failure.parquet" + failure.write_parquet(path, storage_options=s3_storage_options) + + # Act: reading failure info always reads the rule/schema metadata, so a successful + # read proves the metadata read used the forwarded `storage_options`. + if lazy: + read = dy.FailureInfo.scan_parquet(path, storage_options=s3_storage_options) + else: + read = dy.FailureInfo.read_parquet(path, storage_options=s3_storage_options) + + # Assert + assert_frame_equal(failure._lf, read._lf) + assert failure._rule_columns == read._rule_columns + assert MySchema.matches(read.schema) + + def test_write_parquet_fails_without_mkdir(tmp_path: str) -> None: # Arrange df = pl.DataFrame( diff --git a/tests/schema/test_read_write_parquet.py b/tests/schema/test_read_write_parquet.py index f0628a3..8b79199 100644 --- a/tests/schema/test_read_write_parquet.py +++ b/tests/schema/test_read_write_parquet.py @@ -68,60 +68,59 @@ def test_write_parquet_fails_without_mkdir(tmp_path: str) -> None: @pytest.mark.s3 @pytest.mark.parametrize("lazy", [True, False]) -def test_read_parquet_metadata_uses_storage_options( - s3_server: str, +def test_read_parquet_uses_storage_options_for_metadata( s3_bucket: str, - monkeypatch: pytest.MonkeyPatch, + s3_storage_options: dict[str, str], lazy: bool, ) -> None: - """The embedded-schema metadata read must use the same ``storage_options`` as the - data read. + """`scan_parquet`/`read_parquet` must forward `storage_options` to the embedded + schema metadata read, not just the data read. Regression test for https://github.com/Quantco/dataframely/issues/352: against non-AWS S3-compatible stores (lakeFS, MinIO, R2, …) reached purely via - ``storage_options``, the metadata read previously fell back to the default AWS + `storage_options`, the metadata read previously fell back to the default AWS credential chain and endpoint, breaking typed reads. """ - # Arrange: provide credentials and endpoint *only* via `storage_options`, never via - # the environment, so the metadata read fails unless the options are forwarded. - for var in ( - "AWS_ENDPOINT_URL", - "AWS_ACCESS_KEY_ID", - "AWS_SECRET_ACCESS_KEY", - "AWS_ALLOW_HTTP", - "AWS_S3_ALLOW_UNSAFE_RENAME", - "AWS_DEFAULT_REGION", - "AWS_REGION", - ): - monkeypatch.delenv(var, raising=False) - - storage_options = { - "aws_access_key_id": "testing", - "aws_secret_access_key": "testing", - "aws_endpoint_url": s3_server, - "aws_region": "us-east-1", - "aws_allow_http": "true", - } + # Arrange + df = MySchema.validate(pl.DataFrame({"a": [1, 2, 3]}), cast=True) path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" - - df = MySchema.create_empty() - MySchema.write_parquet(df, file=path, storage_options=storage_options) + MySchema.write_parquet(df, file=path, storage_options=s3_storage_options) # Act: `validation="forbid"` only returns if the schema stored in the metadata is # read successfully and matches, so a passing read proves the metadata read used the # forwarded `storage_options`. if lazy: out: pl.DataFrame = MySchema.scan_parquet( - path, validation="forbid", storage_options=storage_options + path, validation="forbid", storage_options=s3_storage_options ).collect() else: out = MySchema.read_parquet( - path, validation="forbid", storage_options=storage_options + path, validation="forbid", storage_options=s3_storage_options ) # Assert assert_frame_equal(df, out) - # The standalone metadata helper must forward `storage_options` too. - schema = dy.read_parquet_metadata_schema(path, storage_options=storage_options) + +@pytest.mark.s3 +def test_read_parquet_metadata_schema_uses_storage_options( + s3_bucket: str, + s3_storage_options: dict[str, str], +) -> None: + """The standalone `read_parquet_metadata_schema` helper must forward + `storage_options` to the metadata read. + + Regression test for https://github.com/Quantco/dataframely/issues/352. + """ + # Arrange + path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" + MySchema.write_parquet( + MySchema.create_empty(), file=path, storage_options=s3_storage_options + ) + + # Act + schema = dy.read_parquet_metadata_schema(path, storage_options=s3_storage_options) + + # Assert assert schema is not None + assert schema.matches(MySchema) From 34d3b83ca41de8a9853252f9e2de2cefe7b751ac Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Tue, 2 Jun 2026 15:54:08 +0100 Subject: [PATCH 05/10] test: Trim regression-test docstrings and drop ticket references Documentation should stand on its own, so remove the issue-tracker links from the `s3_storage_options` fixture and the storage-options regression tests, and shrink their docstrings to a single line in line with the surrounding tests. Co-Authored-By: Claude Opus 4.8 --- tests/collection/test_storage.py | 6 +----- tests/conftest.py | 9 +++------ tests/failure_info/test_storage.py | 7 ++----- tests/schema/test_read_write_parquet.py | 21 +++++---------------- 4 files changed, 11 insertions(+), 32 deletions(-) diff --git a/tests/collection/test_storage.py b/tests/collection/test_storage.py index a6debc8..7325838 100644 --- a/tests/collection/test_storage.py +++ b/tests/collection/test_storage.py @@ -504,11 +504,7 @@ def test_read_parquet_metadata_collection_uses_storage_options( s3_bucket: str, s3_storage_options: dict[str, str], ) -> None: - """The standalone `read_parquet_metadata_collection` helper must forward - `storage_options` to the metadata read. - - Regression test for https://github.com/Quantco/dataframely/issues/352. - """ + """`read_parquet_metadata_collection` must forward `storage_options` to the read.""" # Arrange path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" pl.DataFrame({"a": [1, 2, 3]}).write_parquet( diff --git a/tests/conftest.py b/tests/conftest.py index dda85f8..03ceccf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -48,13 +48,10 @@ def s3_tmp_path(s3_server: str, s3_bucket: str, monkeypatch: pytest.MonkeyPatch) def s3_storage_options( s3_server: str, monkeypatch: pytest.MonkeyPatch ) -> dict[str, str]: - """Credentials and endpoint for the moto server, supplied *only* via - ``storage_options``. + """Credentials and endpoint for the moto server, supplied only via storage options. - Unlike :func:`s3_tmp_path`, this fixture deletes the ``AWS_*`` environment - variables so the store is unreachable through the default credential chain. A read - therefore succeeds only if ``storage_options`` is forwarded to it -- which is what - the regression tests for https://github.com/Quantco/dataframely/issues/352 assert. + Unlike :func:`s3_tmp_path`, the ``AWS_*`` environment variables are removed, so the + store is reachable only when ``storage_options`` is forwarded to a read. """ for var in ( "AWS_ENDPOINT_URL", diff --git a/tests/failure_info/test_storage.py b/tests/failure_info/test_storage.py index c4580f9..88e67b6 100644 --- a/tests/failure_info/test_storage.py +++ b/tests/failure_info/test_storage.py @@ -174,11 +174,8 @@ def test_read_parquet_uses_storage_options_for_metadata( s3_storage_options: dict[str, str], lazy: bool, ) -> None: - """`FailureInfo.scan_parquet`/`read_parquet` must forward `storage_options` to the - rule/schema metadata read, not just the data read. - - Regression test for https://github.com/Quantco/dataframely/issues/352. - """ + """`storage_options` must reach the rule/schema metadata read, not just the data + read.""" # Arrange df = pl.DataFrame({"a": [4, 5, 6, 6, 7, 8], "b": [1, 2, 3, 4, 5, 6]}) _, failure = MySchema.filter(df) diff --git a/tests/schema/test_read_write_parquet.py b/tests/schema/test_read_write_parquet.py index 8b79199..9228efb 100644 --- a/tests/schema/test_read_write_parquet.py +++ b/tests/schema/test_read_write_parquet.py @@ -73,22 +73,15 @@ def test_read_parquet_uses_storage_options_for_metadata( s3_storage_options: dict[str, str], lazy: bool, ) -> None: - """`scan_parquet`/`read_parquet` must forward `storage_options` to the embedded - schema metadata read, not just the data read. - - Regression test for https://github.com/Quantco/dataframely/issues/352: against - non-AWS S3-compatible stores (lakeFS, MinIO, R2, …) reached purely via - `storage_options`, the metadata read previously fell back to the default AWS - credential chain and endpoint, breaking typed reads. - """ + """`storage_options` must reach the embedded schema metadata read, not just the + data read.""" # Arrange df = MySchema.validate(pl.DataFrame({"a": [1, 2, 3]}), cast=True) path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" MySchema.write_parquet(df, file=path, storage_options=s3_storage_options) - # Act: `validation="forbid"` only returns if the schema stored in the metadata is - # read successfully and matches, so a passing read proves the metadata read used the - # forwarded `storage_options`. + # Act: `validation="forbid"` only returns if the metadata schema is read and matches, + # so a passing read proves the metadata read used the forwarded `storage_options`. if lazy: out: pl.DataFrame = MySchema.scan_parquet( path, validation="forbid", storage_options=s3_storage_options @@ -107,11 +100,7 @@ def test_read_parquet_metadata_schema_uses_storage_options( s3_bucket: str, s3_storage_options: dict[str, str], ) -> None: - """The standalone `read_parquet_metadata_schema` helper must forward - `storage_options` to the metadata read. - - Regression test for https://github.com/Quantco/dataframely/issues/352. - """ + """`read_parquet_metadata_schema` must forward `storage_options` to the read.""" # Arrange path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" MySchema.write_parquet( From 1f621c1f7a237868d57eeb86b2c688890df3fd83 Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Tue, 2 Jun 2026 15:59:28 +0100 Subject: [PATCH 06/10] test: Use bare `# Act` marker to match the surrounding tests Co-Authored-By: Claude Opus 4.8 --- tests/failure_info/test_storage.py | 5 +++-- tests/schema/test_read_write_parquet.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/failure_info/test_storage.py b/tests/failure_info/test_storage.py index 88e67b6..d327e45 100644 --- a/tests/failure_info/test_storage.py +++ b/tests/failure_info/test_storage.py @@ -182,8 +182,9 @@ def test_read_parquet_uses_storage_options_for_metadata( path = f"{s3_bucket}/{uuid.uuid4()}/failure.parquet" failure.write_parquet(path, storage_options=s3_storage_options) - # Act: reading failure info always reads the rule/schema metadata, so a successful - # read proves the metadata read used the forwarded `storage_options`. + # Act + # Reading failure info always reads the rule/schema metadata, so a successful read + # proves the metadata read used the forwarded `storage_options`. if lazy: read = dy.FailureInfo.scan_parquet(path, storage_options=s3_storage_options) else: diff --git a/tests/schema/test_read_write_parquet.py b/tests/schema/test_read_write_parquet.py index 9228efb..0eec2cd 100644 --- a/tests/schema/test_read_write_parquet.py +++ b/tests/schema/test_read_write_parquet.py @@ -80,8 +80,9 @@ def test_read_parquet_uses_storage_options_for_metadata( path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" MySchema.write_parquet(df, file=path, storage_options=s3_storage_options) - # Act: `validation="forbid"` only returns if the metadata schema is read and matches, - # so a passing read proves the metadata read used the forwarded `storage_options`. + # Act + # `validation="forbid"` only returns if the metadata schema is read and matches, so + # a passing read proves the metadata read used the forwarded `storage_options`. if lazy: out: pl.DataFrame = MySchema.scan_parquet( path, validation="forbid", storage_options=s3_storage_options From 5df26851328f1d130f4656a72f2426b835f9fac9 Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Tue, 2 Jun 2026 16:17:23 +0100 Subject: [PATCH 07/10] refactor: Hoist metadata read options out of the member loop Co-Authored-By: Claude Opus 4.8 --- dataframely/_storage/parquet.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dataframely/_storage/parquet.py b/dataframely/_storage/parquet.py index e0e5087..f0fcadc 100644 --- a/dataframely/_storage/parquet.py +++ b/dataframely/_storage/parquet.py @@ -147,6 +147,7 @@ def _collection_from_parquet( # between lazy and eager reads data = {} collection_types = [] + metadata_options = _metadata_read_options(kwargs) fs: AbstractFileSystem = url_to_fs(path)[0] for key in members: @@ -160,16 +161,14 @@ def _collection_from_parquet( ) if is_file: collection_types.append( - _read_serialized_collection( - source_path, **_metadata_read_options(kwargs) - ) + _read_serialized_collection(source_path, **metadata_options) ) else: prefix = get_file_prefix(fs) for file in fs.glob(fs.sep.join([source_path, "**", "*.parquet"])): collection_types.append( _read_serialized_collection( - f"{prefix}{file}", **_metadata_read_options(kwargs) + f"{prefix}{file}", **metadata_options ) ) From 48793a588de172af12ecae82ffa55d8a6152180b Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Wed, 3 Jun 2026 16:37:59 +0100 Subject: [PATCH 08/10] refactor: Derive metadata read options from read_parquet_metadata signature Introspect pl.read_parquet_metadata's parameters instead of hard-coding the allowlist, so the forwarded options track the installed polars version rather than drifting against it. Co-Authored-By: Claude Opus 4.8 --- dataframely/_storage/parquet.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dataframely/_storage/parquet.py b/dataframely/_storage/parquet.py index f0fcadc..ffc0208 100644 --- a/dataframely/_storage/parquet.py +++ b/dataframely/_storage/parquet.py @@ -1,6 +1,7 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause +import inspect from collections.abc import Iterable from typing import Any @@ -250,15 +251,14 @@ def scan_failure_info( return lf, serialized_rules, serialized_schema +# `read_parquet_metadata` has no `**kwargs`, so unrecognized keys raise `TypeError`. +_METADATA_READ_PARAMS = frozenset( + inspect.signature(pl.read_parquet_metadata).parameters +) - {"source"} + + def _metadata_read_options(kwargs: dict[str, Any]) -> dict[str, Any]: - # Forward only the options that `read_parquet_metadata` accepts (it has a narrower - # signature than `read_parquet`/`scan_parquet`) so the metadata read reaches the - # same store as the data read. - return { - key: kwargs[key] - for key in ("storage_options", "credential_provider", "retries") - if key in kwargs - } + return {k: v for k, v in kwargs.items() if k in _METADATA_READ_PARAMS} def _read_serialized_collection( From f8e717a0000d5dcca12fabacf7dfd897b8bbbf9f Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Thu, 4 Jun 2026 19:54:30 +0200 Subject: [PATCH 09/10] test: Isolate storage_options regression tests with a unique bucket Polars caches object stores per bucket on the Rust side, where monkeypatch.delenv cannot clear them. A bucket configured once from AWS_* env vars stayed reachable without storage_options, so the regression tests passed even when forwarding was dropped. Replace s3_storage_options with s3_isolated, which hands out a freshly-named bucket that is never env-configured. Reaching it requires forwarding storage_options to every read, so the tests now fail without the fix. Also drop a stale comment in parquet.py. Co-Authored-By: Claude Opus 4.8 (1M context) --- dataframely/_storage/parquet.py | 1 - tests/collection/test_storage.py | 10 +++++----- tests/conftest.py | 20 ++++++++++++++------ tests/failure_info/test_storage.py | 12 ++++++------ tests/schema/test_read_write_parquet.py | 22 +++++++++++----------- 5 files changed, 36 insertions(+), 29 deletions(-) diff --git a/dataframely/_storage/parquet.py b/dataframely/_storage/parquet.py index ffc0208..805efbd 100644 --- a/dataframely/_storage/parquet.py +++ b/dataframely/_storage/parquet.py @@ -251,7 +251,6 @@ def scan_failure_info( return lf, serialized_rules, serialized_schema -# `read_parquet_metadata` has no `**kwargs`, so unrecognized keys raise `TypeError`. _METADATA_READ_PARAMS = frozenset( inspect.signature(pl.read_parquet_metadata).parameters ) - {"source"} diff --git a/tests/collection/test_storage.py b/tests/collection/test_storage.py index 7325838..58823a7 100644 --- a/tests/collection/test_storage.py +++ b/tests/collection/test_storage.py @@ -501,21 +501,21 @@ def test_read_invalid_parquet_metadata_collection( @pytest.mark.s3 def test_read_parquet_metadata_collection_uses_storage_options( - s3_bucket: str, - s3_storage_options: dict[str, str], + s3_isolated: tuple[str, dict[str, str]], ) -> None: """`read_parquet_metadata_collection` must forward `storage_options` to the read.""" # Arrange - path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" + bucket, storage_options = s3_isolated + path = f"{bucket}/{uuid.uuid4()}/df.parquet" pl.DataFrame({"a": [1, 2, 3]}).write_parquet( path, metadata={COLLECTION_METADATA_KEY: MyCollection.serialize()}, - storage_options=s3_storage_options, + storage_options=storage_options, ) # Act collection = dy.read_parquet_metadata_collection( - path, storage_options=s3_storage_options + path, storage_options=storage_options ) # Assert diff --git a/tests/conftest.py b/tests/conftest.py index 03ceccf..7b8870c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -45,13 +45,17 @@ def s3_tmp_path(s3_server: str, s3_bucket: str, monkeypatch: pytest.MonkeyPatch) @pytest.fixture() -def s3_storage_options( +def s3_isolated( s3_server: str, monkeypatch: pytest.MonkeyPatch -) -> dict[str, str]: - """Credentials and endpoint for the moto server, supplied only via storage options. +) -> tuple[str, dict[str, str]]: + """A freshly-named bucket that is only reachable via the returned ``storage_options``. - Unlike :func:`s3_tmp_path`, the ``AWS_*`` environment variables are removed, so the - store is reachable only when ``storage_options`` is forwarded to a read. + Polars caches object stores per bucket, and these caches live Rust-side and are not + cleared by ``monkeypatch.delenv``. A bucket configured once from ``AWS_*`` env vars + (e.g. by :func:`s3_tmp_path`) therefore stays reachable without ``storage_options``, + which would let a read silently succeed even if ``storage_options`` was dropped. A + unique bucket has no such cached store, so reaching it requires forwarding + ``storage_options`` to every read. """ for var in ( "AWS_ENDPOINT_URL", @@ -63,7 +67,11 @@ def s3_storage_options( "AWS_REGION", ): monkeypatch.delenv(var, raising=False) - return { + bucket = f"isolated-{uuid.uuid4()}" + boto3.client( + "s3", endpoint_url=s3_server, aws_access_key_id="", aws_secret_access_key="" + ).create_bucket(Bucket=bucket) + return f"s3://{bucket}", { "aws_access_key_id": "testing", "aws_secret_access_key": "testing", "aws_endpoint_url": s3_server, diff --git a/tests/failure_info/test_storage.py b/tests/failure_info/test_storage.py index d327e45..3823342 100644 --- a/tests/failure_info/test_storage.py +++ b/tests/failure_info/test_storage.py @@ -170,25 +170,25 @@ def test_write_parquet_custom_metadata( @pytest.mark.s3 @pytest.mark.parametrize("lazy", [True, False]) def test_read_parquet_uses_storage_options_for_metadata( - s3_bucket: str, - s3_storage_options: dict[str, str], + s3_isolated: tuple[str, dict[str, str]], lazy: bool, ) -> None: """`storage_options` must reach the rule/schema metadata read, not just the data read.""" # Arrange + bucket, storage_options = s3_isolated df = pl.DataFrame({"a": [4, 5, 6, 6, 7, 8], "b": [1, 2, 3, 4, 5, 6]}) _, failure = MySchema.filter(df) - path = f"{s3_bucket}/{uuid.uuid4()}/failure.parquet" - failure.write_parquet(path, storage_options=s3_storage_options) + path = f"{bucket}/{uuid.uuid4()}/failure.parquet" + failure.write_parquet(path, storage_options=storage_options) # Act # Reading failure info always reads the rule/schema metadata, so a successful read # proves the metadata read used the forwarded `storage_options`. if lazy: - read = dy.FailureInfo.scan_parquet(path, storage_options=s3_storage_options) + read = dy.FailureInfo.scan_parquet(path, storage_options=storage_options) else: - read = dy.FailureInfo.read_parquet(path, storage_options=s3_storage_options) + read = dy.FailureInfo.read_parquet(path, storage_options=storage_options) # Assert assert_frame_equal(failure._lf, read._lf) diff --git a/tests/schema/test_read_write_parquet.py b/tests/schema/test_read_write_parquet.py index 0eec2cd..22bae98 100644 --- a/tests/schema/test_read_write_parquet.py +++ b/tests/schema/test_read_write_parquet.py @@ -69,27 +69,27 @@ def test_write_parquet_fails_without_mkdir(tmp_path: str) -> None: @pytest.mark.s3 @pytest.mark.parametrize("lazy", [True, False]) def test_read_parquet_uses_storage_options_for_metadata( - s3_bucket: str, - s3_storage_options: dict[str, str], + s3_isolated: tuple[str, dict[str, str]], lazy: bool, ) -> None: """`storage_options` must reach the embedded schema metadata read, not just the data read.""" # Arrange + bucket, storage_options = s3_isolated df = MySchema.validate(pl.DataFrame({"a": [1, 2, 3]}), cast=True) - path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" - MySchema.write_parquet(df, file=path, storage_options=s3_storage_options) + path = f"{bucket}/{uuid.uuid4()}/df.parquet" + MySchema.write_parquet(df, file=path, storage_options=storage_options) # Act # `validation="forbid"` only returns if the metadata schema is read and matches, so # a passing read proves the metadata read used the forwarded `storage_options`. if lazy: out: pl.DataFrame = MySchema.scan_parquet( - path, validation="forbid", storage_options=s3_storage_options + path, validation="forbid", storage_options=storage_options ).collect() else: out = MySchema.read_parquet( - path, validation="forbid", storage_options=s3_storage_options + path, validation="forbid", storage_options=storage_options ) # Assert @@ -98,18 +98,18 @@ def test_read_parquet_uses_storage_options_for_metadata( @pytest.mark.s3 def test_read_parquet_metadata_schema_uses_storage_options( - s3_bucket: str, - s3_storage_options: dict[str, str], + s3_isolated: tuple[str, dict[str, str]], ) -> None: """`read_parquet_metadata_schema` must forward `storage_options` to the read.""" # Arrange - path = f"{s3_bucket}/{uuid.uuid4()}/df.parquet" + bucket, storage_options = s3_isolated + path = f"{bucket}/{uuid.uuid4()}/df.parquet" MySchema.write_parquet( - MySchema.create_empty(), file=path, storage_options=s3_storage_options + MySchema.create_empty(), file=path, storage_options=storage_options ) # Act - schema = dy.read_parquet_metadata_schema(path, storage_options=s3_storage_options) + schema = dy.read_parquet_metadata_schema(path, storage_options=storage_options) # Assert assert schema is not None From e33a50a2ee4451af65b2541e394940fe77127ce2 Mon Sep 17 00:00:00 2001 From: Mattijs De Paepe Date: Thu, 4 Jun 2026 21:26:54 +0200 Subject: [PATCH 10/10] test: Delete the isolated S3 bucket on fixture teardown Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/conftest.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 7b8870c..4f84c67 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,7 +47,7 @@ def s3_tmp_path(s3_server: str, s3_bucket: str, monkeypatch: pytest.MonkeyPatch) @pytest.fixture() def s3_isolated( s3_server: str, monkeypatch: pytest.MonkeyPatch -) -> tuple[str, dict[str, str]]: +) -> Iterator[tuple[str, dict[str, str]]]: """A freshly-named bucket that is only reachable via the returned ``storage_options``. Polars caches object stores per bucket, and these caches live Rust-side and are not @@ -68,16 +68,23 @@ def s3_isolated( ): monkeypatch.delenv(var, raising=False) bucket = f"isolated-{uuid.uuid4()}" - boto3.client( + client = boto3.client( "s3", endpoint_url=s3_server, aws_access_key_id="", aws_secret_access_key="" - ).create_bucket(Bucket=bucket) - return f"s3://{bucket}", { - "aws_access_key_id": "testing", - "aws_secret_access_key": "testing", - "aws_endpoint_url": s3_server, - "aws_region": "us-east-1", - "aws_allow_http": "true", - } + ) + client.create_bucket(Bucket=bucket) + yield ( + f"s3://{bucket}", + { + "aws_access_key_id": "testing", + "aws_secret_access_key": "testing", + "aws_endpoint_url": s3_server, + "aws_region": "us-east-1", + "aws_allow_http": "true", + }, + ) + for obj in client.list_objects_v2(Bucket=bucket).get("Contents", []): + client.delete_object(Bucket=bucket, Key=obj["Key"]) + client.delete_bucket(Bucket=bucket) @pytest.fixture()