diff --git a/datashare-python/datashare_python/cli/task.py b/datashare-python/datashare_python/cli/task.py index a9d8c6b5..b5509e62 100644 --- a/datashare-python/datashare_python/cli/task.py +++ b/datashare-python/datashare_python/cli/task.py @@ -28,7 +28,7 @@ _TASK_ID_HELP = "task ID" _WATCH_HELP = "watch a task until it's complete" -TaskArgs = str +StrTaskArgs = str task_app = AsyncTyper(name="task") @@ -36,7 +36,7 @@ @task_app.async_command(help=_START_HELP) async def start( name: Annotated[str, typer.Argument(help=_NAME_HELP)], - args: Annotated[TaskArgs, typer.Argument(help=_ARGS_HELP)] = None, + args: Annotated[StrTaskArgs, typer.Argument(help=_ARGS_HELP)] = None, group: Annotated[ str | None, typer.Option("--group", "-g", help=_GROUP_HELP), diff --git a/datashare-python/datashare_python/constants.py b/datashare-python/datashare_python/constants.py index 383a3fa3..091f16e0 100644 --- a/datashare-python/datashare_python/constants.py +++ b/datashare-python/datashare_python/constants.py @@ -11,6 +11,7 @@ DEFAULT_NAMESPACE = "datashare-default" METADATA_JSON = "metadata.json" +MANIFEST_JSON = "manifest.json" TIKA_METADATA_RESOURCENAME = "tika_metadata_resourcename" diff --git a/datashare-python/datashare_python/logging_.py b/datashare-python/datashare_python/logging_.py index 379fb220..2b70734d 100644 --- a/datashare-python/datashare_python/logging_.py +++ b/datashare-python/datashare_python/logging_.py @@ -132,7 +132,7 @@ def _encode_value(value: Any) -> str: return "true" if value else "false" if isinstance(value, numbers.Number): return str(value) - return json.dumps(value).decode() + return json.dumps(value) def _json_formatter(datefmt: str) -> BaseJsonFormatter: diff --git a/datashare-python/datashare_python/objects.py b/datashare-python/datashare_python/objects.py index 7aa8ec91..70d8495c 100644 --- a/datashare-python/datashare_python/objects.py +++ b/datashare-python/datashare_python/objects.py @@ -1,5 +1,6 @@ import logging import os +from abc import ABC from asyncio import Lock from collections.abc import Awaitable, Callable from dataclasses import InitVar, dataclass, field @@ -7,12 +8,15 @@ from enum import StrEnum, unique from io import BytesIO from pathlib import Path -from typing import Annotated, Any, ClassVar, Literal, Self, TypeVar, cast +from typing import Annotated, Any, ClassVar, Generic, Literal, Self, TypeVar, cast import langcodes +from icij_common.registrable import Registrable from lru import LRU from pydantic_core import PydanticCustomError, ValidationError, core_schema -from pydantic_core.core_schema import PlainValidatorFunctionSchema +from pydantic_core.core_schema import ( + PlainValidatorFunctionSchema, +) from pydantic_extra_types.language_code import LanguageName from temporalio import workflow @@ -34,12 +38,17 @@ from icij_common.pydantic_utils import ( icij_config, lowercamel_case_config, + make_enum_discriminator, merge_configs, no_enum_values_config, + tagged_union, ) from pydantic import ( AfterValidator, + AliasChoices, BeforeValidator, + ConfigDict, + Discriminator, Field, GetCoreSchemaHandler, TypeAdapter, @@ -256,13 +265,123 @@ def _is_absolute_path(v: bytes | BytesIO | Path) -> Any: return v -@dataclass(frozen=True) -class DocArtifact: +class ArtifactType(StrEnum): + STRUCTURE = "structure" + ASR_TRANSCRIPTION = "transcription" + + +class ManifestEntryStatus(StrEnum): + COMPLETE = "complete" + + +class TaskArgs(DatashareModel, ABC): + def as_manifest_task_input(self) -> dict[str, Any]: + # This is a base implementation, if the input is too large to be dumped, + # override this and pop large keys + as_manifest = self.model_dump(by_alias=True) + return as_manifest + + +A = TypeVar("A", bound=TaskArgs) + + +class ManifestEntry(DatashareModel, Generic[A], ABC): + status: ManifestEntryStatus + label: str | None = None + input: Annotated[ + dict[str, Any] | None, + Field( + validation_alias=AliasChoices("taskInput", "input"), + serialization_alias="taskInput", + ), + ] + + @classmethod + def complete(cls, args: A, label: str | None = None, **kwargs) -> Self: + return cls( + input=args.as_manifest_task_input(), + label=label, + status=ManifestEntryStatus.COMPLETE, + **kwargs, + ) + + +class PaginationType(StrEnum): + FILESYSTEM = "filesystem" + BYTE_RANGES = "byteRanges" + + +class BasePagination(DatashareModel, Registrable, ABC): + registry_key: ClassVar[str] = Field(frozen=True, default="type") + + total: int + type: ClassVar[PaginationType] = Field(frozen=True) + + +def _validate_pages_range(v: Any) -> None: + if not isinstance(v, list): + msg = f"expected a list, got {type(v)}" + raise TypeError(msg) + previous_end = None + for page_i, (start, end) in enumerate(v): + if not start <= end: + msg = "end of page must be >= start" + raise ValueError(msg) + if previous_end is not None and previous_end != start: + msg = ( + f"start of page {page_i} doesn't match end of previous " + f"page {previous_end}" + ) + raise ValueError(msg) + return v + + +PagesRange = Annotated[list[tuple[int, int]], AfterValidator(_validate_pages_range)] + + +@BasePagination.register(PaginationType.FILESYSTEM) +class FilesystemPagination(BasePagination): + type: ClassVar[PaginationType] = Field( + default=PaginationType.FILESYSTEM, frozen=True + ) + + +@BasePagination.register(PaginationType.BYTE_RANGES) +class ByteRangesPagination(BasePagination): + type: ClassVar[PaginationType] = Field( + default=PaginationType.BYTE_RANGES, frozen=True + ) + byte_ranges: PagesRange + + @model_validator(mode="after") + def byte_ranges_length_should_match_total(self) -> Self: + if len(self.byte_ranges) != self.total: + msg = ( + f"byte_ranges must match total. Found {len(self.byte_ranges)} for" + f" byte_ranges and {self.total} for total." + ) + raise ValueError(msg) + return self + + +pagination_discriminator = make_enum_discriminator("type", PaginationType) +Pagination = Annotated[ + tagged_union(BasePagination.__subclasses__(), lambda x: x.type), + Discriminator(pagination_discriminator), +] + + +class DocArtifact(BaseModel, ABC): + # This object is not used for serde, just as a container, it's OK to allow + # arbitrary types (to allow storing BytesIO) + model_config = ConfigDict(arbitrary_types_allowed=True) + project: str doc_id: str artifact: Annotated[bytes | BytesIO | Path, AfterValidator(_is_absolute_path)] - filename: str - metadata_key: str + filename: ClassVar[str] # Override this + type: ClassVar[ArtifactType] # Override this + manifest_entry: ManifestEntry @unique diff --git a/datashare-python/datashare_python/utils.py b/datashare-python/datashare_python/utils.py index 04dd6ba3..3e4fd02a 100644 --- a/datashare-python/datashare_python/utils.py +++ b/datashare-python/datashare_python/utils.py @@ -36,7 +36,7 @@ SyncProgressRateHandler, ) -from .constants import METADATA_JSON +from .constants import MANIFEST_JSON, METADATA_JSON from .objects import DocArtifact, DocumentLocation, FilesystemDocument from .types_ import RawAsyncProgressHandler @@ -338,34 +338,75 @@ def _metadata_path(doc_id: str, *, project: str) -> Path: return metadata_path -def _read_artifact_metadata(root: Path, artifact: DocArtifact) -> dict: - m_path = root / _metadata_path(artifact.doc_id, project=artifact.project) +def _manifest_path(doc_id: str, *, project: str) -> Path: + manifest_path = artifacts_dir(doc_id, project=project) / MANIFEST_JSON + return manifest_path + + +def _read_artifact_manifest(root: Path, artifact: DocArtifact) -> dict: + m_path = root / _manifest_path(artifact.doc_id, project=artifact.project) + if not m_path.exists(): + m_path = root / _metadata_path(artifact.doc_id, project=artifact.project) + if not m_path.exists(): + msg = f"couldn't find manifest nor metadata for {artifact.doc_id}" + raise FileNotFoundError(msg) return json.loads(m_path.read_text()) def write_artifact(root: Path, artifact: DocArtifact) -> Path: + # TODO: WARNING many writers could write at the time, to avoid inconsistent + # states we should handle this somehow artif_dir = root / artifacts_dir(artifact.doc_id, project=artifact.project) artif_dir.mkdir(exist_ok=True, parents=True) - # TODO: if transcriptions are too large we could also serialize them - # as jsonl - artifact_path: Path = artif_dir / artifact.filename - match artifact.artifact: + artifact_path = artif_dir / artifact.filename + # Read the metadata first (things could go wrong here in case someone is reading + # at the same time). We read in a backward compatible wat and write to that same + # location. We don't take responsibility for migrating the data, the DS back will + # do it + manifest_path, manifest = _read_manifest_backward_compatible(root, artifact) + is_legacy = manifest_path.name == "metadata.json" + # Pop the status key from the manifest before writing + manifest_entry = manifest.get(artifact.type) + if manifest_entry is not None and not is_legacy: + manifest[artifact.type].pop("status", None) + manifest_path.write_text(json.dumps(manifest)) + # Write the artifact + _write_artifact_bytes(artifact_path, artifact.artifact) + # Update the manifest entry with details and new states + if is_legacy: + manifest_entry = str(artifact_path.relative_to(artif_dir)) + else: + manifest_entry = artifact.manifest_entry.model_dump(mode="json", by_alias=True) + manifest[artifact.type] = manifest_entry + manifest_path.write_text(json.dumps(manifest)) + return artifact_path.relative_to(artif_dir) + + +def _read_manifest_backward_compatible( + root: Path, artifact: DocArtifact +) -> tuple[Path, dict[str, Any]]: + manifest_path = root / _manifest_path(artifact.doc_id, project=artifact.project) + if manifest_path.exists(): + return manifest_path, _read_artifact_manifest(root, artifact) + meta_path = root / _metadata_path(artifact.doc_id, project=artifact.project) + if meta_path.exists(): + return meta_path, _read_artifact_manifest(root, artifact) + return manifest_path, dict() + + +def _write_artifact_bytes(path: Path, artifact: bytes | BytesIO | Path) -> None: + match artifact: case bytes(): - artifact_path.write_bytes(artifact.artifact) + path.write_bytes(artifact) case BytesIO(): - with artifact_path.open("wb") as f: - f.write(artifact.artifact.read()) + with path.open("wb") as f: + f.write(artifact.read()) case Path(): - artifact_path.unlink(missing_ok=True) - shutil.move(artifact.artifact, artifact_path) + path.unlink(missing_ok=True) + shutil.move(artifact, path) case _: - msg = f"unsupported artifact type: {artifact.artifact.__class__.__name__}" + msg = f"unsupported artifact type: {artifact.__class__.__name__}" raise ValueError(msg) - meta_path = root / _metadata_path(artifact.doc_id, project=artifact.project) - meta = _read_artifact_metadata(root, artifact) if meta_path.exists() else dict() - meta[artifact.metadata_key] = artifact.filename - meta_path.write_text(json.dumps(meta)) - return artifact_path.relative_to(artif_dir) def debuggable_name( diff --git a/datashare-python/pyproject.toml b/datashare-python/pyproject.toml index ec639283..8ad37d75 100644 --- a/datashare-python/pyproject.toml +++ b/datashare-python/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ "alive-progress~=3.2", "aiohttp~=3.11", "hatchling~=1.27", - "icij-common[elasticsearch]~=0.8.2", + "icij-common[elasticsearch]~=0.8.3", "langcodes~=3.5", "python-json-logger~=4.0", "pyyaml~=6.0", diff --git a/datashare-python/tests/test_objects.py b/datashare-python/tests/test_objects.py index c42f8322..3130592a 100644 --- a/datashare-python/tests/test_objects.py +++ b/datashare-python/tests/test_objects.py @@ -1,3 +1,4 @@ +import json import re from datetime import datetime from pathlib import Path @@ -6,6 +7,8 @@ from datashare_python.conftest import TEST_PROJECT from datashare_python.constants import TIKA_METADATA_RESOURCENAME from datashare_python.objects import ( + BasePagination, + ByteRangesPagination, DatashareLanguage, Document, DocumentLocation, @@ -94,3 +97,20 @@ def test_invalid_datashare_language_should_raise( # When/Then with pytest.raises(ValidationError, match=expected_msg): type_adapter.validate_python(language) + + +def test_pagination_serde() -> None: + # Given + pagination = ByteRangesPagination(total=3, byte_ranges=[(0, 1), (1, 2), (2, 3)]) + ta = TypeAdapter(BasePagination) + # When + serialized = pagination.model_dump_json(by_alias=True) + deserialized = ta.validate_json(serialized) + # Then + expected_serialized = { + "type": "byteRanges", + "total": 3, + "byteRanges": [[0, 1], [1, 2], [2, 3]], + } + assert json.loads(serialized) == expected_serialized + assert deserialized == pagination diff --git a/datashare-python/tests/test_utils.py b/datashare-python/tests/test_utils.py index 52fea31d..acb2971b 100644 --- a/datashare-python/tests/test_utils.py +++ b/datashare-python/tests/test_utils.py @@ -2,9 +2,16 @@ import uuid from datetime import timedelta from pathlib import Path +from typing import ClassVar import pytest -from datashare_python.objects import DatashareModel, DocArtifact +from datashare_python.objects import ( + ArtifactType, + DatashareModel, + DocArtifact, + ManifestEntry, + TaskArgs, +) from datashare_python.types_ import TemporalClient from datashare_python.utils import activity_defn, positional_args_only, write_artifact from datashare_python.worker import datashare_worker @@ -117,20 +124,31 @@ async def test_deserialization_error( assert "2 validation errors for DeserArg" in root_cause.message +class MockedArgs(TaskArgs): + some_value: str + + +class MockedManifestEntry(ManifestEntry): ... + + +class MockedArtifact(DocArtifact): + filename: ClassVar[str] = "mocked-structure" + type: ClassVar[ArtifactType] = ArtifactType.STRUCTURE + + def test_write_artifact(tmp_path: Path) -> None: from datashare_python.conftest import TEST_PROJECT # noqa: PLC0415 # Given + args = MockedArgs(some_value="value") root_dir = Path(tmp_path) artifact_bytes = b"artifacts" - filename = "some_artifact" - metadata_key = "artifact_key" - artifact = DocArtifact( + manifest_entry = MockedManifestEntry.complete(args) + artifact = MockedArtifact( project=TEST_PROJECT, doc_id="doc_id", artifact=artifact_bytes, - filename=filename, - metadata_key=metadata_key, + manifest_entry=manifest_entry, ) # When write_artifact(root_dir, artifact) @@ -138,13 +156,18 @@ def test_write_artifact(tmp_path: Path) -> None: artifact_dir = root_dir / TEST_PROJECT / "do" / "c_" / "doc_id" assert artifact_dir.exists() assert artifact_dir.is_dir() - meta_path = artifact_dir / "metadata.json" - assert meta_path.exists() - meta = json.loads(meta_path.read_text()) - assert meta == {"artifact_key": filename} - artifact_name = meta.get(metadata_key) - assert artifact_name is not None - artifact_path = artifact_dir / artifact_name + manifest_path = artifact_dir / "manifest.json" + assert manifest_path.exists() + manifest = json.loads(manifest_path.read_text()) + expected_manifest = { + "structure": { + "status": "complete", + "taskInput": {"someValue": "value"}, + "label": None, + } + } + assert manifest == expected_manifest + artifact_path = artifact_dir / "mocked-structure" assert artifact_path.exists() assert artifact_path.read_bytes() == artifact_bytes @@ -153,18 +176,59 @@ def test_write_artifact_with_existing_metadata(tmp_path: Path) -> None: from datashare_python.conftest import TEST_PROJECT # noqa: PLC0415 # Given + args = MockedArgs(some_value="value") + root_dir = Path(tmp_path) + artifact_bytes = b"artifacts" + manifest_entry = MockedManifestEntry.complete(args) + artifact = MockedArtifact( + project=TEST_PROJECT, + doc_id="doc_id", + artifact=artifact_bytes, + manifest_entry=manifest_entry, + ) + existing_manifest = {"some": "value"} + artifact_dir = root_dir / TEST_PROJECT / "do" / "c_" / "doc_id" + artifact_dir.mkdir(parents=True, exist_ok=True) + manifest_path = artifact_dir / "manifest.json" + manifest_path.write_text(json.dumps(existing_manifest)) + # When + write_artifact(root_dir, artifact) + # Then + artifact_dir = root_dir / TEST_PROJECT / "do" / "c_" / "doc_id" + assert artifact_dir.exists() + assert artifact_dir.is_dir() + assert manifest_path.exists() + manifest = json.loads(manifest_path.read_text()) + expected_manifest = { + "structure": { + "status": "complete", + "taskInput": {"someValue": "value"}, + "label": None, + }, + "some": "value", + } + assert manifest == expected_manifest + assert manifest == expected_manifest + artifact_path = artifact_dir / "mocked-structure" + assert artifact_path.exists() + assert artifact_path.read_bytes() == artifact_bytes + + +def test_write_artifact_with_existing_legacy_metadata(tmp_path: Path) -> None: + from datashare_python.conftest import TEST_PROJECT # noqa: PLC0415 + + # Given + args = MockedArgs(some_value="value") root_dir = Path(tmp_path) artifact_bytes = b"artifacts" - filename = "some_artifact" - metadata_key = "artifact_key" - artifact = DocArtifact( + manifest_entry = MockedManifestEntry.complete(args) + artifact = MockedArtifact( project=TEST_PROJECT, doc_id="doc_id", artifact=artifact_bytes, - filename=filename, - metadata_key=metadata_key, + manifest_entry=manifest_entry, ) - existing_metadata = {"some": "metadata"} + existing_metadata = {"structure": "existing-structure"} artifact_dir = root_dir / TEST_PROJECT / "do" / "c_" / "doc_id" artifact_dir.mkdir(parents=True, exist_ok=True) meta_path = artifact_dir / "metadata.json" @@ -178,9 +242,52 @@ def test_write_artifact_with_existing_metadata(tmp_path: Path) -> None: meta_path = artifact_dir / "metadata.json" assert meta_path.exists() meta = json.loads(meta_path.read_text()) - assert meta == {"artifact_key": filename, "some": "metadata"} - artifact_name = meta.get(metadata_key) + assert meta == {"structure": MockedArtifact.filename} + artifact_name = meta.get(ArtifactType.STRUCTURE) assert artifact_name is not None artifact_path = artifact_dir / artifact_name assert artifact_path.exists() assert artifact_path.read_bytes() == artifact_bytes + + +def test_overwrite_artifact(tmp_path: Path) -> None: + from datashare_python.conftest import TEST_PROJECT # noqa: PLC0415 + + # Given + args = MockedArgs(some_value="value") + root_dir = Path(tmp_path) + manifest_entry = MockedManifestEntry.complete(args) + first = MockedArtifact( + project=TEST_PROJECT, + doc_id="doc_id", + artifact=b"first", + manifest_entry=manifest_entry, + ) + second = MockedArtifact( + project=TEST_PROJECT, + doc_id="doc_id", + artifact=b"second", + manifest_entry=manifest_entry, + ) + write_artifact(root_dir, first) + # When + write_artifact(root_dir, second) + # Then + artifact_dir = root_dir / TEST_PROJECT / "do" / "c_" / "doc_id" + assert artifact_dir.exists() + assert artifact_dir.is_dir() + manifest_path = artifact_dir / "manifest.json" + assert manifest_path.exists() + manifest = json.loads(manifest_path.read_text()) + expected_manifest = { + "structure": { + "status": "complete", + "taskInput": {"someValue": "value"}, + "label": None, + }, + } + assert manifest == expected_manifest + assert manifest == expected_manifest + artifact_path = artifact_dir / "mocked-structure" + assert artifact_path.exists() + assert artifact_path.read_bytes() == b"second" diff --git a/worker-template/uv.dist.lock b/worker-template/uv.dist.lock index b5025ac4..c36f93ed 100644 --- a/worker-template/uv.dist.lock +++ b/worker-template/uv.dist.lock @@ -307,7 +307,7 @@ wheels = [ [[package]] name = "datashare-python" -version = "0.9.4" +version = "0.9.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -325,9 +325,9 @@ dependencies = [ { name = "tomlkit" }, { name = "typer" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b0/1e/b6dcacb07a29f526151c98003e1c97e0b8ed0a19692abb217f9477ee80ee/datashare_python-0.9.4.tar.gz", hash = "sha256:2ab6bd6b4237f8474e61b21335441b311e84d3d0d3b6a8441fd0f33558bc790a", size = 320672, upload-time = "2026-06-23T09:25:20.411Z" } +sdist = { url = "https://files.pythonhosted.org/packages/cc/6a/cb3c7c3073573c6ad0cf56cd6c6b6c97dbd1dbd6392ec0314e78865c9dee/datashare_python-0.9.5.tar.gz", hash = "sha256:9ff757584b46bf95af74a1c890734d3233f02f54724806ec9b346a04c39e311a", size = 320645, upload-time = "2026-06-23T12:11:01.145Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/29/5c/cf9c94901a93a275f366c1e64418dc7c5eb217baede67ce21ec273d0ddb1/datashare_python-0.9.4-py3-none-any.whl", hash = "sha256:1ade134e1584d5c32255418eeedf5f5c7c6383d931a593f7765fd26b17114ff7", size = 326668, upload-time = "2026-06-23T09:25:21.423Z" }, + { url = "https://files.pythonhosted.org/packages/1b/27/826ac48804b0bda7a35260fdc925ff25fda3fcc527da32c4c484461ed8d2/datashare_python-0.9.5-py3-none-any.whl", hash = "sha256:c1dce1b981342dc9a3e80d3bfd3246d4e774d55c116a37321e31ac2b39d4ae08", size = 326631, upload-time = "2026-06-23T12:10:59.749Z" }, ] [[package]] diff --git a/worker-template/uv.lock b/worker-template/uv.lock index f13444da..c4ca0b83 100644 --- a/worker-template/uv.lock +++ b/worker-template/uv.lock @@ -307,7 +307,7 @@ wheels = [ [[package]] name = "datashare-python" -version = "0.9.5" +version = "0.9.8" source = { editable = "../datashare-python" } dependencies = [ { name = "aiohttp" }, @@ -316,8 +316,6 @@ dependencies = [ { name = "icij-common", extra = ["elasticsearch"] }, { name = "langcodes" }, { name = "lru-dict" }, - { name = "nest-asyncio" }, - { name = "orjson" }, { name = "pydantic-extra-types", extra = ["pycountry"] }, { name = "python-json-logger" }, { name = "pyyaml" }, @@ -331,11 +329,9 @@ requires-dist = [ { name = "aiohttp", specifier = "~=3.11" }, { name = "alive-progress", specifier = "~=3.2" }, { name = "hatchling", specifier = "~=1.27" }, - { name = "icij-common", extras = ["elasticsearch"], specifier = "~=0.8.2" }, + { name = "icij-common", extras = ["elasticsearch"], specifier = "~=0.8.3" }, { name = "langcodes", specifier = "~=3.5" }, { name = "lru-dict", specifier = "~=1.4" }, - { name = "nest-asyncio", specifier = "~=1.6" }, - { name = "orjson", specifier = "~=3.11" }, { name = "pydantic-extra-types", extras = ["pycountry"], specifier = ">=2.11.1" }, { name = "python-json-logger", specifier = "~=4.0" }, { name = "pyyaml", specifier = "~=6.0" }, @@ -646,16 +642,16 @@ wheels = [ [[package]] name = "icij-common" -version = "0.8.2" +version = "0.8.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiostream" }, { name = "pydantic" }, { name = "pydantic-settings" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e0/51/91d85cc23e275bb51a5cae47873af9a7160f62797b25663d902cf38e6ab7/icij_common-0.8.2.tar.gz", hash = "sha256:7db68266d8facb43142131d81e998cd74c6ae73508456743e00df43a15cc2995", size = 15937, upload-time = "2026-04-07T12:13:39.774Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/8d/4b2b9bb7dcac24d8bf514ccf2da6465ca3cd9b4c4f4bde0a1c97e681d595/icij_common-0.8.3.tar.gz", hash = "sha256:009dda7c1d688ecf7705cf88517deb3d0abe02300b42c8c433eb36516462fe75", size = 16161, upload-time = "2026-07-03T09:43:50.837Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/15/5a/99d123230023124ba4ec67765feb2a04b656ae1f473a14f6b29cc54b02bf/icij_common-0.8.2-py3-none-any.whl", hash = "sha256:c5a1a3ccd54e7bf53acad293a0af50ab15156b225791b43383197b896bae955f", size = 20565, upload-time = "2026-04-07T12:13:38.709Z" }, + { url = "https://files.pythonhosted.org/packages/1c/8e/99879d623fdf4e2371e44135dcf690d7f43b921616e4c918633fe4cece48/icij_common-0.8.3-py3-none-any.whl", hash = "sha256:5214cbb73dca364cca16005f5fac3c2f3944acab6dea269a68d26f631e39a348", size = 21030, upload-time = "2026-07-03T09:43:49.934Z" }, ] [package.optional-dependencies] @@ -1154,74 +1150,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/de/e5/b7d20451657664b07986c2f6e3be564433f5dcaf3482d68eaecd79afaf03/numpy-2.4.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:be71bf1edb48ebbbf7f6337b5bfd2f895d1902f6335a5830b20141fc126ffba0", size = 12502577, upload-time = "2026-01-31T23:13:07.08Z" }, ] -[[package]] -name = "orjson" -version = "3.11.8" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9d/1b/2024d06792d0779f9dbc51531b61c24f76c75b9f4ce05e6f3377a1814cea/orjson-3.11.8.tar.gz", hash = "sha256:96163d9cdc5a202703e9ad1b9ae757d5f0ca62f4fa0cc93d1f27b0e180cc404e", size = 5603832, upload-time = "2026-03-31T16:16:27.878Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/67/41/5aa7fa3b0f4dc6b47dcafc3cea909299c37e40e9972feabc8b6a74e2730d/orjson-3.11.8-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:003646067cc48b7fcab2ae0c562491c9b5d2cbd43f1e5f16d98fd118c5522d34", size = 229229, upload-time = "2026-03-31T16:14:50.424Z" }, - { url = "https://files.pythonhosted.org/packages/0a/d7/57e7f2458e0a2c41694f39fc830030a13053a84f837a5b73423dca1f0938/orjson-3.11.8-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:ed193ce51d77a3830cad399a529cd4ef029968761f43ddc549e1bc62b40d88f8", size = 128871, upload-time = "2026-03-31T16:14:51.888Z" }, - { url = "https://files.pythonhosted.org/packages/53/4a/e0fdb9430983e6c46e0299559275025075568aad5d21dd606faee3703924/orjson-3.11.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f30491bc4f862aa15744b9738517454f1e46e56c972a2be87d70d727d5b2a8f8", size = 132104, upload-time = "2026-03-31T16:14:53.142Z" }, - { url = "https://files.pythonhosted.org/packages/08/4a/2025a60ff3f5c8522060cda46612d9b1efa653de66ed2908591d8d82f22d/orjson-3.11.8-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6eda5b8b6be91d3f26efb7dc6e5e68ee805bc5617f65a328587b35255f138bf4", size = 130483, upload-time = "2026-03-31T16:14:54.605Z" }, - { url = "https://files.pythonhosted.org/packages/2d/3c/b9cde05bdc7b2385c66014e0620627da638d3d04e4954416ab48c31196c5/orjson-3.11.8-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee8db7bfb6fe03581bbab54d7c4124a6dd6a7f4273a38f7267197890f094675f", size = 135481, upload-time = "2026-03-31T16:14:55.901Z" }, - { url = "https://files.pythonhosted.org/packages/ff/f2/a8238e7734de7cb589fed319857a8025d509c89dc52fdcc88f39c6d03d5a/orjson-3.11.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d8b5231de76c528a46b57010bbd83fb51e056aa0220a372fd5065e978406f1c", size = 146819, upload-time = "2026-03-31T16:14:57.548Z" }, - { url = "https://files.pythonhosted.org/packages/db/10/dbf1e2a3cafea673b1b4350e371877b759060d6018a998643b7040e5de48/orjson-3.11.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:58a4a208a6fbfdb7a7327b8f201c6014f189f721fd55d047cafc4157af1bc62a", size = 132846, upload-time = "2026-03-31T16:14:58.91Z" }, - { url = "https://files.pythonhosted.org/packages/f8/fc/55e667ec9c85694038fcff00573d221b085d50777368ee3d77f38668bf3c/orjson-3.11.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f8952d6d2505c003e8f0224ff7858d341fa4e33fef82b91c4ff0ef070f2393c", size = 133580, upload-time = "2026-03-31T16:15:00.519Z" }, - { url = "https://files.pythonhosted.org/packages/7e/a6/c08c589a9aad0cb46c4831d17de212a2b6901f9d976814321ff8e69e8785/orjson-3.11.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0022bb50f90da04b009ce32c512dc1885910daa7cb10b7b0cba4505b16db82a8", size = 142042, upload-time = "2026-03-31T16:15:01.906Z" }, - { url = "https://files.pythonhosted.org/packages/5c/cc/2f78ea241d52b717d2efc38878615fe80425bf2beb6e68c984dde257a766/orjson-3.11.8-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ff51f9d657d1afb6f410cb435792ce4e1fe427aab23d2fcd727a2876e21d4cb6", size = 423845, upload-time = "2026-03-31T16:15:03.703Z" }, - { url = "https://files.pythonhosted.org/packages/70/07/c17dcf05dd8045457538428a983bf1f1127928df5bf328cb24d2b7cddacb/orjson-3.11.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6dbe9a97bdb4d8d9d5367b52a7c32549bba70b2739c58ef74a6964a6d05ae054", size = 147729, upload-time = "2026-03-31T16:15:05.203Z" }, - { url = "https://files.pythonhosted.org/packages/90/6c/0fb6e8a24e682e0958d71711ae6f39110e4b9cd8cab1357e2a89cb8e1951/orjson-3.11.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a5c370674ebabe16c6ccac33ff80c62bf8a6e59439f5e9d40c1f5ab8fd2215b7", size = 136425, upload-time = "2026-03-31T16:15:07.052Z" }, - { url = "https://files.pythonhosted.org/packages/b2/35/4d3cc3a3d616035beb51b24a09bb872942dc452cf2df0c1d11ab35046d9f/orjson-3.11.8-cp311-cp311-win32.whl", hash = "sha256:0e32f7154299f42ae66f13488963269e5eccb8d588a65bc839ed986919fc9fac", size = 131870, upload-time = "2026-03-31T16:15:08.678Z" }, - { url = "https://files.pythonhosted.org/packages/13/26/9fe70f81d16b702f8c3a775e8731b50ad91d22dacd14c7599b60a0941cd1/orjson-3.11.8-cp311-cp311-win_amd64.whl", hash = "sha256:25e0c672a2e32348d2eb33057b41e754091f2835f87222e4675b796b92264f06", size = 127440, upload-time = "2026-03-31T16:15:09.994Z" }, - { url = "https://files.pythonhosted.org/packages/e8/c6/b038339f4145efd2859c1ca53097a52c0bb9cbdd24f947ebe146da1ad067/orjson-3.11.8-cp311-cp311-win_arm64.whl", hash = "sha256:9185589c1f2a944c17e26c9925dcdbc2df061cc4a145395c57f0c51f9b5dbfcd", size = 127399, upload-time = "2026-03-31T16:15:11.412Z" }, - { url = "https://files.pythonhosted.org/packages/01/f6/8d58b32ab32d9215973a1688aebd098252ee8af1766c0e4e36e7831f0295/orjson-3.11.8-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:1cd0b77e77c95758f8e1100139844e99f3ccc87e71e6fc8e1c027e55807c549f", size = 229233, upload-time = "2026-03-31T16:15:12.762Z" }, - { url = "https://files.pythonhosted.org/packages/a9/8b/2ffe35e71f6b92622e8ea4607bf33ecf7dfb51b3619dcfabfd36cbe2d0a5/orjson-3.11.8-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:6a3d159d5ffa0e3961f353c4b036540996bf8b9697ccc38261c0eac1fd3347a6", size = 128772, upload-time = "2026-03-31T16:15:14.237Z" }, - { url = "https://files.pythonhosted.org/packages/27/d2/1f8682ae50d5c6897a563cb96bc106da8c9cb5b7b6e81a52e4cc086679b9/orjson-3.11.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76070a76e9c5ae661e2d9848f216980d8d533e0f8143e6ed462807b242e3c5e8", size = 131946, upload-time = "2026-03-31T16:15:15.607Z" }, - { url = "https://files.pythonhosted.org/packages/52/4b/5500f76f0eece84226e0689cb48dcde081104c2fa6e2483d17ca13685ffb/orjson-3.11.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:54153d21520a71a4c82a0dbb4523e468941d549d221dc173de0f019678cf3813", size = 130368, upload-time = "2026-03-31T16:15:17.066Z" }, - { url = "https://files.pythonhosted.org/packages/da/4e/58b927e08fbe9840e6c920d9e299b051ea667463b1f39a56e668669f8508/orjson-3.11.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:469ac2125611b7c5741a0b3798cd9e5786cbad6345f9f400c77212be89563bec", size = 135540, upload-time = "2026-03-31T16:15:18.404Z" }, - { url = "https://files.pythonhosted.org/packages/56/7c/ba7cb871cba1bcd5cd02ee34f98d894c6cea96353ad87466e5aef2429c60/orjson-3.11.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:14778ffd0f6896aa613951a7fbf4690229aa7a543cb2bfbe9f358e08aafa9546", size = 146877, upload-time = "2026-03-31T16:15:19.833Z" }, - { url = "https://files.pythonhosted.org/packages/0b/5d/eb9c25fc1386696c6a342cd361c306452c75e0b55e86ad602dd4827a7fd7/orjson-3.11.8-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea56a955056a6d6c550cf18b3348656a9d9a4f02e2d0c02cabf3c73f1055d506", size = 132837, upload-time = "2026-03-31T16:15:21.282Z" }, - { url = "https://files.pythonhosted.org/packages/37/87/5ddeb7fc1fbd9004aeccab08426f34c81a5b4c25c7061281862b015fce2b/orjson-3.11.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53a0f57e59a530d18a142f4d4ba6dfc708dc5fdedce45e98ff06b44930a2a48f", size = 133624, upload-time = "2026-03-31T16:15:22.641Z" }, - { url = "https://files.pythonhosted.org/packages/22/09/90048793db94ee4b2fcec4ac8e5ddb077367637d6650be896b3494b79bb7/orjson-3.11.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9b48e274f8824567d74e2158199e269597edf00823a1b12b63d48462bbf5123e", size = 141904, upload-time = "2026-03-31T16:15:24.435Z" }, - { url = "https://files.pythonhosted.org/packages/c0/cf/eb284847487821a5d415e54149a6449ba9bfc5872ce63ab7be41b8ec401c/orjson-3.11.8-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3f262401086a3960586af06c054609365e98407151f5ea24a62893a40d80dbbb", size = 423742, upload-time = "2026-03-31T16:15:26.155Z" }, - { url = "https://files.pythonhosted.org/packages/44/09/e12423d327071c851c13e76936f144a96adacfc037394dec35ac3fc8d1e8/orjson-3.11.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8e8c6218b614badf8e229b697865df4301afa74b791b6c9ade01d19a9953a942", size = 147806, upload-time = "2026-03-31T16:15:27.909Z" }, - { url = "https://files.pythonhosted.org/packages/b3/6d/37c2589ba864e582ffe7611643314785c6afb1f83c701654ef05daa8fcc7/orjson-3.11.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:093d489fa039ddade2db541097dbb484999fcc65fc2b0ff9819141e2ab364f25", size = 136485, upload-time = "2026-03-31T16:15:29.749Z" }, - { url = "https://files.pythonhosted.org/packages/be/c9/135194a02ab76b04ed9a10f68624b7ebd238bbe55548878b11ff15a0f352/orjson-3.11.8-cp312-cp312-win32.whl", hash = "sha256:e0950ed1bcb9893f4293fd5c5a7ee10934fbf82c4101c70be360db23ce24b7d2", size = 131966, upload-time = "2026-03-31T16:15:31.687Z" }, - { url = "https://files.pythonhosted.org/packages/ed/9a/9796f8fbe3cf30ce9cb696748dbb535e5c87be4bf4fe2e9ca498ef1fa8cf/orjson-3.11.8-cp312-cp312-win_amd64.whl", hash = "sha256:3cf17c141617b88ced4536b2135c552490f07799f6ad565948ea07bef0dcb9a6", size = 127441, upload-time = "2026-03-31T16:15:33.333Z" }, - { url = "https://files.pythonhosted.org/packages/cc/47/5aaf54524a7a4a0dd09dd778f3fa65dd2108290615b652e23d944152bc8e/orjson-3.11.8-cp312-cp312-win_arm64.whl", hash = "sha256:48854463b0572cc87dac7d981aa72ed8bf6deedc0511853dc76b8bbd5482d36d", size = 127364, upload-time = "2026-03-31T16:15:34.748Z" }, - { url = "https://files.pythonhosted.org/packages/66/7f/95fba509bb2305fab0073558f1e8c3a2ec4b2afe58ed9fcb7d3b8beafe94/orjson-3.11.8-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:3f23426851d98478c8970da5991f84784a76682213cd50eb73a1da56b95239dc", size = 229180, upload-time = "2026-03-31T16:15:36.426Z" }, - { url = "https://files.pythonhosted.org/packages/f6/9d/b237215c743ca073697d759b5503abd2cb8a0d7b9c9e21f524bcf176ab66/orjson-3.11.8-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:ebaed4cef74a045b83e23537b52ef19a367c7e3f536751e355a2a394f8648559", size = 128754, upload-time = "2026-03-31T16:15:38.049Z" }, - { url = "https://files.pythonhosted.org/packages/42/3d/27d65b6d11e63f133781425f132807aef793ed25075fec686fc8e46dd528/orjson-3.11.8-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97c8f5d3b62380b70c36ffacb2a356b7c6becec86099b177f73851ba095ef623", size = 131877, upload-time = "2026-03-31T16:15:39.484Z" }, - { url = "https://files.pythonhosted.org/packages/dd/cc/faee30cd8f00421999e40ef0eba7332e3a625ce91a58200a2f52c7fef235/orjson-3.11.8-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:436c4922968a619fb7fef1ccd4b8b3a76c13b67d607073914d675026e911a65c", size = 130361, upload-time = "2026-03-31T16:15:41.274Z" }, - { url = "https://files.pythonhosted.org/packages/5c/bb/a6c55896197f97b6d4b4e7c7fd77e7235517c34f5d6ad5aadd43c54c6d7c/orjson-3.11.8-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1ab359aff0436d80bfe8a23b46b5fea69f1e18aaf1760a709b4787f1318b317f", size = 135521, upload-time = "2026-03-31T16:15:42.758Z" }, - { url = "https://files.pythonhosted.org/packages/9c/7c/ca3a3525aa32ff636ebb1778e77e3587b016ab2edb1b618b36ba96f8f2c0/orjson-3.11.8-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f89b6d0b3a8d81e1929d3ab3d92bbc225688bd80a770c49432543928fe09ac55", size = 146862, upload-time = "2026-03-31T16:15:44.341Z" }, - { url = "https://files.pythonhosted.org/packages/3c/0c/18a9d7f18b5edd37344d1fd5be17e94dc652c67826ab749c6e5948a78112/orjson-3.11.8-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:29c009e7a2ca9ad0ed1376ce20dd692146a5d9fe4310848904b6b4fee5c5c137", size = 132847, upload-time = "2026-03-31T16:15:46.368Z" }, - { url = "https://files.pythonhosted.org/packages/23/91/7e722f352ad67ca573cee44de2a58fb810d0f4eb4e33276c6a557979fd8a/orjson-3.11.8-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:705b895b781b3e395c067129d8551655642dfe9437273211d5404e87ac752b53", size = 133637, upload-time = "2026-03-31T16:15:48.123Z" }, - { url = "https://files.pythonhosted.org/packages/af/04/32845ce13ac5bd1046ddb02ac9432ba856cc35f6d74dde95864fe0ad5523/orjson-3.11.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:88006eda83858a9fdf73985ce3804e885c2befb2f506c9a3723cdeb5a2880e3e", size = 141906, upload-time = "2026-03-31T16:15:49.626Z" }, - { url = "https://files.pythonhosted.org/packages/02/5e/c551387ddf2d7106d9039369862245c85738b828844d13b99ccb8d61fd06/orjson-3.11.8-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:55120759e61309af7fcf9e961c6f6af3dde5921cdb3ee863ef63fd9db126cae6", size = 423722, upload-time = "2026-03-31T16:15:51.176Z" }, - { url = "https://files.pythonhosted.org/packages/00/a3/ecfe62434096f8a794d4976728cb59bcfc4a643977f21c2040545d37eb4c/orjson-3.11.8-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:98bdc6cb889d19bed01de46e67574a2eab61f5cc6b768ed50e8ac68e9d6ffab6", size = 147801, upload-time = "2026-03-31T16:15:52.939Z" }, - { url = "https://files.pythonhosted.org/packages/18/6d/0dce10b9f6643fdc59d99333871a38fa5a769d8e2fc34a18e5d2bfdee900/orjson-3.11.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:708c95f925a43ab9f34625e45dcdadf09ec8a6e7b664a938f2f8d5650f6c090b", size = 136460, upload-time = "2026-03-31T16:15:54.431Z" }, - { url = "https://files.pythonhosted.org/packages/01/d6/6dde4f31842d87099238f1f07b459d24edc1a774d20687187443ab044191/orjson-3.11.8-cp313-cp313-win32.whl", hash = "sha256:01c4e5a6695dc09098f2e6468a251bc4671c50922d4d745aff1a0a33a0cf5b8d", size = 131956, upload-time = "2026-03-31T16:15:56.081Z" }, - { url = "https://files.pythonhosted.org/packages/c1/f9/4e494a56e013db957fb77186b818b916d4695b8fa2aa612364974160e91b/orjson-3.11.8-cp313-cp313-win_amd64.whl", hash = "sha256:c154a35dd1330707450bb4d4e7dd1f17fa6f42267a40c1e8a1daa5e13719b4b8", size = 127410, upload-time = "2026-03-31T16:15:57.54Z" }, - { url = "https://files.pythonhosted.org/packages/57/7f/803203d00d6edb6e9e7eef421d4e1adbb5ea973e40b3533f3cfd9aeb374e/orjson-3.11.8-cp313-cp313-win_arm64.whl", hash = "sha256:4861bde57f4d253ab041e374f44023460e60e71efaa121f3c5f0ed457c3a701e", size = 127338, upload-time = "2026-03-31T16:15:59.106Z" }, - { url = "https://files.pythonhosted.org/packages/6d/35/b01910c3d6b85dc882442afe5060cbf719c7d1fc85749294beda23d17873/orjson-3.11.8-cp314-cp314-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:ec795530a73c269a55130498842aaa762e4a939f6ce481a7e986eeaa790e9da4", size = 229171, upload-time = "2026-03-31T16:16:00.651Z" }, - { url = "https://files.pythonhosted.org/packages/c2/56/c9ec97bd11240abef39b9e5d99a15462809c45f677420fd148a6c5e6295e/orjson-3.11.8-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:c492a0e011c0f9066e9ceaa896fbc5b068c54d365fea5f3444b697ee01bc8625", size = 128746, upload-time = "2026-03-31T16:16:02.673Z" }, - { url = "https://files.pythonhosted.org/packages/3b/e4/66d4f30a90de45e2f0cbd9623588e8ae71eef7679dbe2ae954ed6d66a41f/orjson-3.11.8-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:883206d55b1bd5f5679ad5e6ddd3d1a5e3cac5190482927fdb8c78fb699193b5", size = 131867, upload-time = "2026-03-31T16:16:04.342Z" }, - { url = "https://files.pythonhosted.org/packages/19/30/2a645fc9286b928675e43fa2a3a16fb7b6764aa78cc719dc82141e00f30b/orjson-3.11.8-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5774c1fdcc98b2259800b683b19599c133baeb11d60033e2095fd9d4667b82db", size = 124664, upload-time = "2026-03-31T16:16:05.837Z" }, - { url = "https://files.pythonhosted.org/packages/db/44/77b9a86d84a28d52ba3316d77737f6514e17118119ade3f91b639e859029/orjson-3.11.8-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ac7381c83dd3d4a6347e6635950aa448f54e7b8406a27c7ecb4a37e9f1ae08b", size = 129701, upload-time = "2026-03-31T16:16:07.407Z" }, - { url = "https://files.pythonhosted.org/packages/b3/ea/eff3d9bfe47e9bc6969c9181c58d9f71237f923f9c86a2d2f490cd898c82/orjson-3.11.8-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:14439063aebcb92401c11afc68ee4e407258d2752e62d748b6942dad20d2a70d", size = 141202, upload-time = "2026-03-31T16:16:09.48Z" }, - { url = "https://files.pythonhosted.org/packages/52/c8/90d4b4c60c84d62068d0cf9e4d8f0a4e05e76971d133ac0c60d818d4db20/orjson-3.11.8-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa72e71977bff96567b0f500fc5bfd2fdf915f34052c782a4c6ebbdaa97aa858", size = 127194, upload-time = "2026-03-31T16:16:11.02Z" }, - { url = "https://files.pythonhosted.org/packages/8d/c7/ea9e08d1f0ba981adffb629811148b44774d935171e7b3d780ae43c4c254/orjson-3.11.8-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7679bc2f01bb0d219758f1a5f87bb7c8a81c0a186824a393b366876b4948e14f", size = 133639, upload-time = "2026-03-31T16:16:13.434Z" }, - { url = "https://files.pythonhosted.org/packages/6c/8c/ddbbfd6ba59453c8fc7fe1d0e5983895864e264c37481b2a791db635f046/orjson-3.11.8-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:14f7b8fcb35ef403b42fa5ecfa4ed032332a91f3dc7368fbce4184d59e1eae0d", size = 141914, upload-time = "2026-03-31T16:16:14.955Z" }, - { url = "https://files.pythonhosted.org/packages/4e/31/dbfbefec9df060d34ef4962cd0afcb6fa7a9ec65884cb78f04a7859526c3/orjson-3.11.8-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:c2bdf7b2facc80b5e34f48a2d557727d5c5c57a8a450de122ae81fa26a81c1bc", size = 423800, upload-time = "2026-03-31T16:16:16.594Z" }, - { url = "https://files.pythonhosted.org/packages/87/cf/f74e9ae9803d4ab46b163494adba636c6d7ea955af5cc23b8aaa94cfd528/orjson-3.11.8-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ccd7ba1b0605813a0715171d39ec4c314cb97a9c85893c2c5c0c3a3729df38bf", size = 147837, upload-time = "2026-03-31T16:16:18.585Z" }, - { url = "https://files.pythonhosted.org/packages/64/e6/9214f017b5db85e84e68602792f742e5dc5249e963503d1b356bee611e01/orjson-3.11.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:cdbc8c9c02463fef4d3c53a9ba3336d05496ec8e1f1c53326a1e4acc11f5c600", size = 136441, upload-time = "2026-03-31T16:16:20.151Z" }, - { url = "https://files.pythonhosted.org/packages/24/dd/3590348818f58f837a75fb969b04cdf187ae197e14d60b5e5a794a38b79d/orjson-3.11.8-cp314-cp314-win32.whl", hash = "sha256:0b57f67710a8cd459e4e54eb96d5f77f3624eba0c661ba19a525807e42eccade", size = 131983, upload-time = "2026-03-31T16:16:21.823Z" }, - { url = "https://files.pythonhosted.org/packages/3f/0f/b6cb692116e05d058f31ceee819c70f097fa9167c82f67fabe7516289abc/orjson-3.11.8-cp314-cp314-win_amd64.whl", hash = "sha256:735e2262363dcbe05c35e3a8869898022af78f89dde9e256924dc02e99fe69ca", size = 127396, upload-time = "2026-03-31T16:16:23.685Z" }, - { url = "https://files.pythonhosted.org/packages/c0/d1/facb5b5051fabb0ef9d26c6544d87ef19a939a9a001198655d0d891062dd/orjson-3.11.8-cp314-cp314-win_arm64.whl", hash = "sha256:6ccdea2c213cf9f3d9490cbd5d427693c870753df41e6cb375bd79bcbafc8817", size = 127330, upload-time = "2026-03-31T16:16:25.496Z" }, -] - [[package]] name = "packaging" version = "26.0" diff --git a/worker-template/worker_template/objects.py b/worker-template/worker_template/objects.py index 63d4ac43..2b8bf7cd 100644 --- a/worker-template/worker_template/objects.py +++ b/worker-template/worker_template/objects.py @@ -1,5 +1,5 @@ import pycountry -from datashare_python.objects import DatashareLanguage, DatashareModel +from datashare_python.objects import DatashareLanguage, DatashareModel, TaskArgs from pydantic import Field @@ -29,7 +29,7 @@ class TranslateAndClassifyConfig(DatashareModel): classification: ClassificationConfig = ClassificationConfig() -class TranslateAndClassifyArgs(DatashareModel): +class TranslateAndClassifyArgs(TaskArgs): project: str language: DatashareLanguage config: TranslateAndClassifyConfig = TranslateAndClassifyConfig() diff --git a/workers/asr-worker/asr_worker/activities.py b/workers/asr-worker/asr_worker/activities.py index f15345be..ba149bb0 100644 --- a/workers/asr-worker/asr_worker/activities.py +++ b/workers/asr-worker/asr_worker/activities.py @@ -2,9 +2,10 @@ import logging from asyncio import AbstractEventLoop from collections.abc import AsyncGenerator, AsyncIterable, Iterable +from functools import partial from itertools import tee from pathlib import Path -from typing import Annotated, Any, cast +from typing import Annotated, Any, Protocol, cast from caul.objects import ASRResult, PreprocessedInput from caul.tasks import ( @@ -16,7 +17,6 @@ ) from datashare_python.dependencies import lifespan_worker_config from datashare_python.objects import ( - DocArtifact, Document, FilesystemDocument, ) @@ -62,11 +62,15 @@ RUN_INFERENCE_ACTIVITY, SEARCH_AUDIOS_ACTIVITY, SUPPORTED_CONTENT_TYPES, - TRANSCRIPTION_METADATA_KEY, - TRANSCRIPTION_METADATA_VALUE, ) from .dependencies import lifespan_es_client -from .objects import InferenceRunnerConfig, Transcription +from .objects import ( + ASRArgs, + InferenceRunnerConfig, + Transcription, + TranscriptionArtifact, + TranscriptionManifestEntry, +) logger = logging.getLogger(__name__) @@ -79,6 +83,10 @@ _INFERENCE_CONFIG_TYPE_ADAPTER = TypeAdapter(InferenceRunnerConfig) +class ArtifactFactory(Protocol): + def __call__(self, artifact: bytes) -> TranscriptionArtifact: ... + + class ASRActivities(ActivityWithProgress): @activity_defn(name=SEARCH_AUDIOS_ACTIVITY) async def search_audio_paths( @@ -183,7 +191,7 @@ def postprocess( inference_results: list[Path], audio_batch: Path, config: ParakeetPostprocessorConfig, - project: str, + args: ASRArgs, *, progress: Annotated[ # noqa: ARG002 SyncProgressRateHandler | None, Weight(value=_BASE_WEIGHT) @@ -212,7 +220,7 @@ def postprocess( postprocessor, inference_results, doc_ids, - project=project, + args, artifacts_root=artifacts_root, event_loop=self._event_loop, progress=progress, @@ -315,9 +323,9 @@ def postprocess_act( postprocessor: Postprocessor, inference_results: Iterable[ASRResult], doc_ids: Iterable[str], + args: ASRArgs, *, artifacts_root: Path, - project: str, event_loop: AbstractEventLoop | None = None, progress: SyncProgressRateHandler | None = None, ) -> int: @@ -326,9 +334,16 @@ def postprocess_act( n_docs = 0 for i, (doc_id, asr_result) in enumerate(zip(doc_ids, transcriptions, strict=True)): n_docs += 1 - t_path = write_transcription( - doc_id, asr_result, artifacts_root=artifacts_root, project=project + manifest_entry = TranscriptionManifestEntry.complete( + args, confidence=asr_result.score + ) + artifact_factory = partial( + TranscriptionArtifact, + project=args.project, + doc_id=doc_id, + manifest_entry=manifest_entry, ) + t_path = write_transcription(asr_result, artifact_factory, artifacts_root) logger.debug("wrote transcription for %s", t_path) if progress is not None and event_loop is not None: progress(i, event_loop) @@ -352,17 +367,11 @@ def _preprocess( def write_transcription( - doc_id: str, asr_result: ASRResult, *, artifacts_root: Path, project: str + asr_result: ASRResult, artifact_factory: ArtifactFactory, artifacts_root: Path ) -> Path: result = Transcription.from_asr_handler_result(asr_result) artifact_bytes = result.model_dump_json().encode() - artifact = DocArtifact( - project=project, - doc_id=doc_id, - filename=TRANSCRIPTION_METADATA_VALUE, - metadata_key=TRANSCRIPTION_METADATA_KEY, - artifact=artifact_bytes, - ) + artifact = artifact_factory(artifact=artifact_bytes) # TODO: if transcriptions are too large we could also serialize them # as jsonl rel_path = write_artifact(artifacts_root, artifact) diff --git a/workers/asr-worker/asr_worker/constants.py b/workers/asr-worker/asr_worker/constants.py index 7a05583b..1d621e4b 100644 --- a/workers/asr-worker/asr_worker/constants.py +++ b/workers/asr-worker/asr_worker/constants.py @@ -14,9 +14,6 @@ RESPONSE_ERROR = "error" -TRANSCRIPTION_METADATA_KEY = "transcription" -TRANSCRIPTION_METADATA_VALUE = "transcription.json" - ASR_WORKFLOW = "asr.transcription" GET_CONFIG_ACTIVITY = "asr.transcription.config" PREPROCESS_ACTIVITY = "asr.transcription.preprocess" diff --git a/workers/asr-worker/asr_worker/objects.py b/workers/asr-worker/asr_worker/objects.py index 25ba88dd..b12a8cf0 100644 --- a/workers/asr-worker/asr_worker/objects.py +++ b/workers/asr-worker/asr_worker/objects.py @@ -1,12 +1,18 @@ import math from collections import defaultdict from functools import cache -from typing import Annotated, Any, Self +from typing import Annotated, Any, ClassVar, Self from caul.asr_pipeline import ASRPipelineConfig from caul.config import InferenceRunnerConfig as CaulInferenceRunnerConfig from caul.objects import ASRLanguage, ASRModel, ASRResult -from datashare_python.objects import DatashareModel +from datashare_python.objects import ( + ArtifactType, + DatashareModel, + DocArtifact, + ManifestEntry, + TaskArgs, +) from icij_common.pydantic_utils import make_enum_discriminator, tagged_union from pydantic import Discriminator, Field, RootModel @@ -23,12 +29,26 @@ DocId = str -class ASRArgs(DatashareModel): +class TranscriptionManifestEntry(ManifestEntry): + confidence: float | None + + +class TranscriptionArtifact(DocArtifact): + filename: ClassVar[str] = "transcription.json" + type: ClassVar[ArtifactType] = ArtifactType.ASR_TRANSCRIPTION + + +class ASRArgs(TaskArgs): project: str docs: list[DocId] | DocumentSearchQuery config: ASRPipelineConfig = Field(default_factory=ASRPipelineConfig.parakeet) batch_size: int + def as_manifest_task_input(self) -> dict[str, Any]: + as_entry = super().as_manifest_task_input() + as_entry.pop("docs") + return as_entry + class ASRResponse(DatashareModel): n_transcribed: int diff --git a/workers/asr-worker/asr_worker/workflows.py b/workers/asr-worker/asr_worker/workflows.py index f6403a1d..aa572c89 100644 --- a/workers/asr-worker/asr_worker/workflows.py +++ b/workers/asr-worker/asr_worker/workflows.py @@ -89,7 +89,7 @@ async def run(self, args: ASRArgs) -> ASRResponse: inference_results, batch_paths, repeat(config.postprocessing), - repeat(args.project), + repeat(args), strict=False, ) ) diff --git a/workers/asr-worker/tests/test_activities.py b/workers/asr-worker/tests/test_activities.py index 3d569622..97d8d930 100644 --- a/workers/asr-worker/tests/test_activities.py +++ b/workers/asr-worker/tests/test_activities.py @@ -14,7 +14,7 @@ write_audio_batches, ) from asr_worker.config import ASRWorkerConfig -from asr_worker.objects import DocId, Transcription +from asr_worker.objects import ASRArgs, DocId, Transcription, TranscriptionManifestEntry from caul.objects import ASRResult, InputMetadata, PreprocessedInput, PreprocessorOutput from caul.tasks import InferenceRunner, Postprocessor, Preprocessor from datashare_python.conftest import TEST_PROJECT @@ -265,6 +265,7 @@ async def test_infer_act(tmpdir: Path) -> None: def test_postprocess_act(tmpdir: Path) -> None: # Given + args = ASRArgs(project=TEST_PROJECT, docs=[], batch_size=2) postprocessor = MockPostprocessor() project = TEST_PROJECT artifacts_root = Path(tmpdir) @@ -273,9 +274,9 @@ def test_postprocess_act(tmpdir: Path) -> None: postprocess_act( postprocessor, INFERENCE_RESULTS, - doc_ids=doc_ids, + doc_ids, + args, artifacts_root=artifacts_root, - project=project, ) # Then expected_artifact_dirs = [ @@ -285,11 +286,16 @@ def test_postprocess_act(tmpdir: Path) -> None: ] for res, d in zip(INFERENCE_RESULTS, expected_artifact_dirs, strict=True): assert d.exists() - metadata_path = d / "metadata.json" - assert metadata_path.exists() - metadata = json.loads(metadata_path.read_text()) - assert metadata["transcription"] == "transcription.json" - transcription_path = d / metadata["transcription"] + manifest_path = d / "manifest.json" + assert manifest_path.exists() + manifest = json.loads(manifest_path.read_text()) + assert "transcription" in manifest + manifest_entry = TranscriptionManifestEntry.model_validate( + manifest["transcription"] + ) + assert manifest_entry.confidence == 1 + assert manifest_entry.input + transcription_path = d / "transcription.json" assert transcription_path.exists() transcription = Transcription.model_validate_json( transcription_path.read_text() diff --git a/workers/asr-worker/tests/test_workflows.py b/workers/asr-worker/tests/test_workflows.py index cdad3a67..4cb2c0f5 100644 --- a/workers/asr-worker/tests/test_workflows.py +++ b/workers/asr-worker/tests/test_workflows.py @@ -14,11 +14,12 @@ Timestamp, Transcript, Transcription, + TranscriptionManifestEntry, ) from asr_worker.workflows import ASRWorkflow, TaskQueues from caul.objects import ASRResult from datashare_python.conftest import TEST_PROJECT -from datashare_python.objects import FilesystemDocument +from datashare_python.objects import FilesystemDocument, ManifestEntryStatus from datashare_python.types_ import TemporalClient from datashare_python.worker import worker_context from pydantic import TypeAdapter @@ -181,12 +182,14 @@ async def test_asr_workflow_e2e( for d in expected_artifact_dirs: assert d.exists() assert d.is_dir() - meta_path = d / "metadata.json" - assert meta_path.exists() - meta = json.loads(meta_path.read_text()) - transcription_name = meta.get("transcription") - assert transcription_name is not None - transcription_path = d / transcription_name + manifest_path = d / "manifest.json" + assert manifest_path.exists() + manifest = json.loads(manifest_path.read_text()) + asr_manifest_entry = TranscriptionManifestEntry.model_validate( + manifest["transcription"] + ) + assert asr_manifest_entry.status is ManifestEntryStatus.COMPLETE + transcription_path = d / "transcription.json" assert transcription_path.exists() transcription = Transcription.model_validate_json( transcription_path.read_text() diff --git a/workers/asr-worker/uv.dist.lock b/workers/asr-worker/uv.dist.lock index 3ec6a446..e34fee6b 100644 --- a/workers/asr-worker/uv.dist.lock +++ b/workers/asr-worker/uv.dist.lock @@ -563,7 +563,7 @@ dev = [ [[package]] name = "datashare-python" -version = "0.9.4" +version = "0.9.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, @@ -581,9 +581,9 @@ dependencies = [ { name = "tomlkit", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, { name = "typer", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b0/1e/b6dcacb07a29f526151c98003e1c97e0b8ed0a19692abb217f9477ee80ee/datashare_python-0.9.4.tar.gz", hash = "sha256:2ab6bd6b4237f8474e61b21335441b311e84d3d0d3b6a8441fd0f33558bc790a", size = 320672, upload-time = "2026-06-23T09:25:20.411Z" } +sdist = { url = "https://files.pythonhosted.org/packages/cc/6a/cb3c7c3073573c6ad0cf56cd6c6b6c97dbd1dbd6392ec0314e78865c9dee/datashare_python-0.9.5.tar.gz", hash = "sha256:9ff757584b46bf95af74a1c890734d3233f02f54724806ec9b346a04c39e311a", size = 320645, upload-time = "2026-06-23T12:11:01.145Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/29/5c/cf9c94901a93a275f366c1e64418dc7c5eb217baede67ce21ec273d0ddb1/datashare_python-0.9.4-py3-none-any.whl", hash = "sha256:1ade134e1584d5c32255418eeedf5f5c7c6383d931a593f7765fd26b17114ff7", size = 326668, upload-time = "2026-06-23T09:25:21.423Z" }, + { url = "https://files.pythonhosted.org/packages/1b/27/826ac48804b0bda7a35260fdc925ff25fda3fcc527da32c4c484461ed8d2/datashare_python-0.9.5-py3-none-any.whl", hash = "sha256:c1dce1b981342dc9a3e80d3bfd3246d4e774d55c116a37321e31ac2b39d4ae08", size = 326631, upload-time = "2026-06-23T12:10:59.749Z" }, ] [[package]] diff --git a/workers/asr-worker/uv.lock b/workers/asr-worker/uv.lock index 6fdbb449..e49ee42e 100644 --- a/workers/asr-worker/uv.lock +++ b/workers/asr-worker/uv.lock @@ -563,7 +563,7 @@ dev = [ [[package]] name = "datashare-python" -version = "0.9.5" +version = "0.9.8" source = { editable = "../../datashare-python" } dependencies = [ { name = "aiohttp", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, @@ -572,7 +572,6 @@ dependencies = [ { name = "icij-common", extra = ["elasticsearch"], marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, { name = "langcodes", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, { name = "lru-dict", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, - { name = "orjson", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, { name = "pydantic-extra-types", extra = ["pycountry"], marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, { name = "python-json-logger", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, { name = "pyyaml", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, @@ -586,10 +585,9 @@ requires-dist = [ { name = "aiohttp", specifier = "~=3.11" }, { name = "alive-progress", specifier = "~=3.2" }, { name = "hatchling", specifier = "~=1.27" }, - { name = "icij-common", extras = ["elasticsearch"], specifier = "~=0.8.2" }, + { name = "icij-common", extras = ["elasticsearch"], specifier = "~=0.8.3" }, { name = "langcodes", specifier = "~=3.5" }, { name = "lru-dict", specifier = "~=1.4" }, - { name = "orjson", specifier = "~=3.11" }, { name = "pydantic-extra-types", extras = ["pycountry"], specifier = ">=2.11.1" }, { name = "python-json-logger", specifier = "~=4.0" }, { name = "pyyaml", specifier = "~=6.0" }, @@ -870,16 +868,16 @@ wheels = [ [[package]] name = "icij-common" -version = "0.8.2" +version = "0.8.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiostream", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, { name = "pydantic", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, { name = "pydantic-settings", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-20-datashare-asr-worker-cpu' and extra == 'extra-20-datashare-asr-worker-gpu')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e0/51/91d85cc23e275bb51a5cae47873af9a7160f62797b25663d902cf38e6ab7/icij_common-0.8.2.tar.gz", hash = "sha256:7db68266d8facb43142131d81e998cd74c6ae73508456743e00df43a15cc2995", size = 15937, upload-time = "2026-04-07T12:13:39.774Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/8d/4b2b9bb7dcac24d8bf514ccf2da6465ca3cd9b4c4f4bde0a1c97e681d595/icij_common-0.8.3.tar.gz", hash = "sha256:009dda7c1d688ecf7705cf88517deb3d0abe02300b42c8c433eb36516462fe75", size = 16161, upload-time = "2026-07-03T09:43:50.837Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/15/5a/99d123230023124ba4ec67765feb2a04b656ae1f473a14f6b29cc54b02bf/icij_common-0.8.2-py3-none-any.whl", hash = "sha256:c5a1a3ccd54e7bf53acad293a0af50ab15156b225791b43383197b896bae955f", size = 20565, upload-time = "2026-04-07T12:13:38.709Z" }, + { url = "https://files.pythonhosted.org/packages/1c/8e/99879d623fdf4e2371e44135dcf690d7f43b921616e4c918633fe4cece48/icij_common-0.8.3-py3-none-any.whl", hash = "sha256:5214cbb73dca364cca16005f5fac3c2f3944acab6dea269a68d26f631e39a348", size = 21030, upload-time = "2026-07-03T09:43:49.934Z" }, ] [package.optional-dependencies] @@ -2012,22 +2010,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ac/24/7c731839566d30dc70556d9824ef17692d896c15e3df627bce8c16f753e1/optuna-4.8.0-py3-none-any.whl", hash = "sha256:c57a7682679c36bfc9bca0da430698179e513874074b71bebedb0334964ab930", size = 419456, upload-time = "2026-03-16T04:59:56.977Z" }, ] -[[package]] -name = "orjson" -version = "3.11.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7e/0c/964746fcafbd16f8ff53219ad9f6b412b34f345c75f384ad434ceaadb538/orjson-3.11.9.tar.gz", hash = "sha256:4fef17e1f8722c11587a6ef18e35902450221da0028e65dbaaa543619e68e48f", size = 5599163, upload-time = "2026-05-06T15:11:08.309Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/51/3fb9e65ae76ee97bd611869a503fa3fc0a6e81dd8b737cf3003f682df7ff/orjson-3.11.9-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:f01c4818b3fc9b0da8e096722a84318071eaa118df35f6ed2344da0e73a5444f", size = 228522, upload-time = "2026-05-06T15:09:35.362Z" }, - { url = "https://files.pythonhosted.org/packages/16/fa/9d54b07cb3f3b0bfd57841478e42d7a0ece4a9f49f9907eecf5a45461687/orjson-3.11.9-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:3ebca4179031ee716ed076ffadc29428e900512f6fccee8614c9983157fcf19c", size = 128463, upload-time = "2026-05-06T15:09:37.063Z" }, - { url = "https://files.pythonhosted.org/packages/49/bd/360686f39348aa88827cb6fbf7dc606fd41c831a35235e1abf1db8e3a9e6/orjson-3.11.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:147302878da387104b66bb4a8b0227d1d487e976ce41a8501916161072ed87b1", size = 133971, upload-time = "2026-05-06T15:09:45.239Z" }, - { url = "https://files.pythonhosted.org/packages/b6/8a/4081492586d75b073d60c5271a8d0f05a0955cabf1e34c8473f6fcd84235/orjson-3.11.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:63e0efbc991250c0b3143488fa57d95affcabbfc63c99c48d625dd37779aafe2", size = 136959, upload-time = "2026-05-06T15:09:51.311Z" }, - { url = "https://files.pythonhosted.org/packages/16/6d/11867a3ffa3a3608d84a4de51ef4dd0896d6b5cc9132fbe1daf593e677bc/orjson-3.11.9-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:9ef6fe90aadef185c7b128859f40beb24720b4ecea95379fc9000931179c3a49", size = 228515, upload-time = "2026-05-06T15:09:57.265Z" }, - { url = "https://files.pythonhosted.org/packages/24/75/05912954c8b288f34fcf5cd4b9b071cb4f6e77b9961e175e56ebb258089f/orjson-3.11.9-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:e5c9b8f28e726e97d97696c826bc7bea5d71cecd63576dba92924a32c1961291", size = 128409, upload-time = "2026-05-06T15:09:59.063Z" }, - { url = "https://files.pythonhosted.org/packages/0e/a4/82b7a2fe5d8a67a59ed831b24d59a3d46ea7d207b66e1602d376541d94a6/orjson-3.11.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be4fa4f0af7fa18951f7ab3fc2148e223af211bf03f59e1c6034ec3f97f21d61", size = 134014, upload-time = "2026-05-06T15:10:08.213Z" }, - { url = "https://files.pythonhosted.org/packages/df/e5/4d2d8af06f788329b4f78f8cc3679bb395392fcaa1e4d8d3c33e85308fa4/orjson-3.11.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:71e63adb0e1f1ed5d9e168f50a91ceb93ae6420731d222dc7da5c69409aa47aa", size = 136943, upload-time = "2026-05-06T15:10:14.405Z" }, -] - [[package]] name = "overrides" version = "7.7.0" diff --git a/workers/extract-worker/extract_worker/activities.py b/workers/extract-worker/extract_worker/activities.py index 86014a39..b0d78760 100644 --- a/workers/extract-worker/extract_worker/activities.py +++ b/workers/extract-worker/extract_worker/activities.py @@ -3,13 +3,17 @@ import mimetypes import os from collections.abc import AsyncIterable -from functools import cache +from functools import cache, partial from itertools import chain from pathlib import Path from typing import Any, cast from datashare_python.dependencies import lifespan_es_client, lifespan_worker_config -from datashare_python.objects import DocArtifact, Document, DocumentLocation +from datashare_python.objects import ( + ByteRangesPagination, + Document, + DocumentLocation, +) from datashare_python.types_ import AsyncProgressRateHandler from datashare_python.utils import ( ActivityWithProgress, @@ -46,15 +50,17 @@ from temporalio import activity from .config import ExtractWorkerConfig -from .constants import MARKDOWN_DIRNAME, MARKDOWN_METADATA_KEY from .mimetypes_ import types_map from .objects import ( DocId, DocumentSearchQuery, ErrorReport, + MarkdownExtractArgs, MarkdownExtractResponse, ProcessedDoc, ProcessingReport, + StructureArtifact, + StructureManifestEntry, ) logger = logging.getLogger(__name__) @@ -106,8 +112,7 @@ async def create_markdown_extract_batches( async def extract_markdown_content( self, batch: Path, - project: str, - config: PipelineConfig, + args: MarkdownExtractArgs, *, progress: AsyncProgressRateHandler | None = None, ) -> MarkdownExtractResponse: @@ -118,16 +123,17 @@ async def extract_markdown_content( MinerUPipeline, ) - pipeline = Pipeline.from_config(config) + pipeline = Pipeline.from_config(args.config) worker_config = cast(ExtractWorkerConfig, lifespan_worker_config()) workdir = worker_config.workdir - output_dir = activity_workdir(workdir, project) + output_dir = activity_workdir(workdir, args.project) output_dir.mkdir(parents=True, exist_ok=True) batch = workdir / batch logger.debug("extracting doc content as markdown...") res = await extract_markdown_content_act( pipeline, batch, + args, worker_config=worker_config, output_dir=output_dir, progress=progress, @@ -169,6 +175,7 @@ async def create_markdown_extract_batches_act( async def extract_markdown_content_act( pipeline: Pipeline, batch: Path, + args: MarkdownExtractArgs, *, worker_config: ExtractWorkerConfig, output_dir: Path, @@ -194,8 +201,9 @@ async def extract_markdown_content_act( docs = iter(docs) n_docs, n_pages, n_successes, n_successes_pages = 0, 0, 0, 0 errors = [] + manifest_entry_factory = partial(StructureManifestEntry.complete, args=args) async for extract_res in results: - # Heartbeat explicitely to avoid heartbeat timeout + # Heartbeat explicitly to avoid heartbeat timeout with contextlib.suppress(RuntimeError): activity.heartbeat() doc = next(docs) @@ -210,12 +218,16 @@ async def extract_markdown_content_act( n_successes += 1 n_successes_pages += doc.n_pages md_path = output_dir / extract_res.output.path - artifact = DocArtifact( + pages = extract_res.output.pages + pages = ByteRangesPagination( + total=pages.total, byte_ranges=pages.byte_ranges + ) + manifest_entry = manifest_entry_factory(pages=pages) + artifact = StructureArtifact( project=doc.index, doc_id=doc.id, artifact=md_path, - metadata_key=MARKDOWN_METADATA_KEY, - filename=MARKDOWN_DIRNAME, + manifest_entry=manifest_entry, ) write_artifact(artifacts_root, artifact) if progress is not None: diff --git a/workers/extract-worker/extract_worker/objects.py b/workers/extract-worker/extract_worker/objects.py index 844792be..c16a4e9f 100644 --- a/workers/extract-worker/extract_worker/objects.py +++ b/workers/extract-worker/extract_worker/objects.py @@ -1,6 +1,14 @@ -from typing import Any, Self - -from datashare_python.objects import DatashareModel, FilesystemDocument +from typing import Any, ClassVar, Self + +from datashare_python.objects import ( + ArtifactType, + ByteRangesPagination, + DatashareModel, + DocArtifact, + FilesystemDocument, + ManifestEntry, + TaskArgs, +) from extract_core import ( DoclingPipelineConfig, Error, @@ -21,11 +29,26 @@ pipeline_discriminator = make_enum_discriminator("pipeline", PipelineType) -class MarkdownExtractArgs(DatashareModel): +class MarkdownExtractArgs(TaskArgs): project: str docs: list[DocId] | DocumentSearchQuery | None config: PipelineConfig = Field(default_factory=DoclingPipelineConfig) + def as_manifest_task_input(self) -> dict[str, Any]: + as_entry = super().as_manifest_task_input() + as_entry.pop("docs") + return as_entry + + +class StructureArtifact(DocArtifact): + filename: ClassVar[str] = "structure" + type: ClassVar[ArtifactType] = ArtifactType.STRUCTURE + + +class StructureManifestEntry(ManifestEntry): + confidence: float | None = None + pages: ByteRangesPagination + class ProcessingReport(DatashareModel): n_docs: int = 0 diff --git a/workers/extract-worker/extract_worker/workflows.py b/workers/extract-worker/extract_worker/workflows.py index 53712a1c..19221898 100644 --- a/workers/extract-worker/extract_worker/workflows.py +++ b/workers/extract-worker/extract_worker/workflows.py @@ -44,7 +44,7 @@ async def run(self, args: MarkdownExtractArgs) -> MarkdownExtractResponse: # Extract Markdown content # Distribute batches docs with (more or less) constant number of page per batch, # across workers - extract_args = [(b, args.project, args.config) for b in extract_batches] + extract_args = [(b, args) for b in extract_batches] task_queue = worker_config.device.md_extract_queue(args.config.pipeline) extract_acts = ( execute_activity( diff --git a/workers/extract-worker/pyproject.toml b/workers/extract-worker/pyproject.toml index f709047e..733b1541 100644 --- a/workers/extract-worker/pyproject.toml +++ b/workers/extract-worker/pyproject.toml @@ -10,13 +10,13 @@ readme = "README.md" requires-python = ">=3.13,<3.15" dependencies = [ "datashare-python~=0.9.0", - "extract-core==0.6.0", + "extract-core==0.7.0", "temporalio==1.23.0", ] [project.optional-dependencies] base = [ - "extract-python[docling,marker]==0.6.0", + "extract-python[docling,marker]==0.7.2", "opencv-python-headless==4.11.0.86" ] cpu = [ @@ -32,7 +32,7 @@ flash-attn = [ "flash-attn==2.8.3.post1", ] mineru = [ - "extract-python[mineru]==0.6.0", + "extract-python[mineru]==0.7.2", "opencv-python-headless==4.11.0.86" ] @@ -50,7 +50,7 @@ worker_config_cls = "extract_worker.config:WORKER_CONFIG_CLS" [dependency-groups] dev = [ - "extract-python[docling]==0.6.0", + "extract-python[docling]==0.7.2", "black>=26.1.0", "nest-asyncio>=1.6.0", "pre-commit>=4.5.1", diff --git a/workers/extract-worker/tests/test_activities.py b/workers/extract-worker/tests/test_activities.py index ce9bf858..88f37175 100644 --- a/workers/extract-worker/tests/test_activities.py +++ b/workers/extract-worker/tests/test_activities.py @@ -6,10 +6,10 @@ import pytest from datashare_python.conftest import TEST_PROJECT -from datashare_python.objects import DocumentLocation +from datashare_python.objects import ArtifactType, DocumentLocation, ManifestEntryStatus from datashare_python.utils import read_jsonl from extract_core import InputDoc, OutputFormat, Pipeline, Result, Status -from extract_core.objects import ConversionOutput, Error, PageIndexes, SupportedExt +from extract_core.objects import ConversionOutput, Error, Pages, SupportedExt from extract_worker.activities import ( _build_doc_query, create_markdown_extract_batches_act, @@ -21,9 +21,11 @@ DocId, DocumentSearchQuery, ErrorReport, + MarkdownExtractArgs, MarkdownExtractResponse, ProcessedDoc, ProcessingReport, + StructureManifestEntry, ) from icij_common.es import ESClient, ids_query, match_all from icij_common.registrable import FromConfig, RegistrableConfig @@ -120,7 +122,9 @@ async def test_create_markdown_extraction_batches_act( _RES_0 = Result( input=InputDoc(ext=SupportedExt.PDF, path=Path("doc-0.pdf")), status=Status.SUCCESS, - output=ConversionOutput(path=Path("markdown"), pages=PageIndexes(root=[(0, 1)])), + output=ConversionOutput( + path=Path("markdown"), pages=Pages(total=2, byte_ranges=[(0, 1), (1, 2)]) + ), ) _RES_2_ERRORS = [Error(id="error-id", title="error-title", detail="error-detail")] @@ -136,6 +140,7 @@ async def test_extract_markdown_content_act( test_worker_config: ExtractWorkerConfig, ) -> None: # Given + args = MarkdownExtractArgs(project=TEST_PROJECT, docs=[]) batch = [PROCESSED_DOC_0, PROCESSED_DOC_2] extract_results = [_RES_0, _RES_2] pipeline = MockPipeline(extract_results) @@ -151,6 +156,7 @@ async def test_extract_markdown_content_act( # When res = await extract_markdown_content_act( pipeline, + args=args, batch=batch_path, worker_config=test_worker_config, output_dir=output_dir, @@ -169,12 +175,16 @@ async def test_extract_markdown_content_act( d = artifacts_root / TEST_PROJECT / "do" / "c-" / "doc-0" assert d.exists() assert d.is_dir() - meta_path = d / "metadata.json" + meta_path = d / "manifest.json" assert meta_path.exists() - meta = json.loads(meta_path.read_text()) - md_dir = meta.get("extract.markdown") - assert md_dir is not None - md_dir = d / md_dir + manifest = json.loads(meta_path.read_text()) + entry = StructureManifestEntry.model_validate( + manifest[ArtifactType.STRUCTURE.value] + ) + assert entry.status is ManifestEntryStatus.COMPLETE + assert entry.pages.byte_ranges + assert entry.pages.total == 2 + md_dir = d / "structure" assert md_dir.exists() assert md_dir.is_dir() diff --git a/workers/extract-worker/uv.dist.lock b/workers/extract-worker/uv.dist.lock index 0025d86b..d9f3fd1e 100644 --- a/workers/extract-worker/uv.dist.lock +++ b/workers/extract-worker/uv.dist.lock @@ -597,9 +597,9 @@ dev = [ requires-dist = [ { name = "cuda-bindings", marker = "sys_platform == 'linux' and extra == 'gpu'", specifier = "==12.9.4" }, { name = "datashare-python", specifier = "~=0.9.0" }, - { name = "extract-core", specifier = "==0.6.0" }, - { name = "extract-python", extras = ["docling", "marker"], marker = "extra == 'base'", specifier = "==0.6.0" }, - { name = "extract-python", extras = ["mineru"], marker = "extra == 'mineru'", specifier = "==0.6.0" }, + { name = "extract-core", specifier = "==0.7.0" }, + { name = "extract-python", extras = ["docling", "marker"], marker = "extra == 'base'", specifier = "==0.7.2" }, + { name = "extract-python", extras = ["mineru"], marker = "extra == 'mineru'", specifier = "==0.7.2" }, { name = "flash-attn", marker = "extra == 'flash-attn'", specifier = "==2.8.3.post1" }, { name = "opencv-python-headless", marker = "extra == 'base'", specifier = "==4.11.0.86" }, { name = "opencv-python-headless", marker = "extra == 'mineru'", specifier = "==4.11.0.86" }, @@ -614,7 +614,7 @@ provides-extras = ["base", "cpu", "flash-attn", "gpu", "mineru"] [package.metadata.requires-dev] dev = [ { name = "black", specifier = ">=26.1.0" }, - { name = "extract-python", extras = ["docling"], specifier = "==0.6.0" }, + { name = "extract-python", extras = ["docling"], specifier = "==0.7.2" }, { name = "nest-asyncio", specifier = ">=1.6.0" }, { name = "pre-commit", specifier = ">=4.5.1" }, { name = "psutil", specifier = ">=6.1.0" }, @@ -628,7 +628,7 @@ dev = [ [[package]] name = "datashare-python" -version = "0.9.7" +version = "0.9.8" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, @@ -644,9 +644,9 @@ dependencies = [ { name = "tomlkit", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, { name = "typer", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d4/54/d917201590f70bcae62557b5882513538e4f3a2545907cacf3d299c1d606/datashare_python-0.9.7.tar.gz", hash = "sha256:841b9cb62f7c1835be17f1d6d72ccd33bf20120b72dbd3f10c5d43d0d2dcb32e", size = 320939, upload-time = "2026-06-30T11:45:10.182Z" } +sdist = { url = "https://files.pythonhosted.org/packages/9f/3c/a8d28a86627a8967d65070500ed1999434aa556b918c32c0029e9d3a998a/datashare_python-0.9.8.tar.gz", hash = "sha256:8b8a56780446827b77032d4eed402c9733a8399df6f301322d2577fdb19255c9", size = 320930, upload-time = "2026-07-02T08:08:16.289Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/37/fa/7c3845a7f81fbce104f2b146dada4b8d00f81d3c29d07fe043b7527de3a7/datashare_python-0.9.7-py3-none-any.whl", hash = "sha256:f9329a269c62cd94e753cc6b255553aa2ad96f8b598537100a9e8cc4fff3929c", size = 327003, upload-time = "2026-06-30T11:45:08.685Z" }, + { url = "https://files.pythonhosted.org/packages/71/2f/7f7f6aa1f670b3ab7d6cd6bc862ced5a2e499a14fe35a8a6629f6e0b2c98/datashare_python-0.9.8-py3-none-any.whl", hash = "sha256:ec3595ffb45e7eb45f0556813be0f278a7c779584579dab435eb88ff05703734", size = 327002, upload-time = "2026-07-02T08:08:14.688Z" }, ] [[package]] @@ -900,7 +900,7 @@ wheels = [ [[package]] name = "extract-core" -version = "0.6.0" +version = "0.7.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "docling-slim", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, @@ -910,22 +910,22 @@ dependencies = [ { name = "pydantic", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, { name = "pydantic-extra-types", extra = ["pycountry"], marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/32/4c/554c22607bad87040e8df2bd433012578f92dae6431715bed05eb7198340/extract_core-0.6.0.tar.gz", hash = "sha256:2a177d910adbb249529a05591dc6bbf447fc17988c97f9de3893366707f074c6", size = 138321, upload-time = "2026-06-24T09:03:22.464Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e6/ea/1bbc3485d0570455ca874aa84b4f02c4b4d509123710fe1832c02acca08c/extract_core-0.7.0.tar.gz", hash = "sha256:09bfef043c84afb11dd0d50582044cee7c6a5d3cdf95788067d90904b737dd80", size = 138372, upload-time = "2026-06-25T09:25:17.672Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/fe/75c11eddf0b8bdec342cd067e3e4a69071e648c204f518902d0f62979e82/extract_core-0.6.0-py3-none-any.whl", hash = "sha256:488b7e08adf83e2390990bccb8b34e1339d17ab2178a5e18ddb48c6af1cc3130", size = 10349, upload-time = "2026-06-24T09:03:21.305Z" }, + { url = "https://files.pythonhosted.org/packages/d4/41/2eb44fcd9b33d9a4da7d61691f2eea3ebbad565288601d1b8169d2f169f7/extract_core-0.7.0-py3-none-any.whl", hash = "sha256:51d7cb28fcc8173a34e20862a236354f5da35e2bd0156d0001650e22d6e655d3", size = 10414, upload-time = "2026-06-25T09:25:18.496Z" }, ] [[package]] name = "extract-python" -version = "0.6.0" +version = "0.7.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "extract-core", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, { name = "icij-common", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e1/4b/15036758830b70486469b9624b8ace188ed5ae6b618f1af179821896f5b5/extract_python-0.6.0.tar.gz", hash = "sha256:e9771f945ef902a268714c28110b3815e14b7e6cbdb92724c4f8b03ee3042f15", size = 259177, upload-time = "2026-06-24T09:03:18.575Z" } +sdist = { url = "https://files.pythonhosted.org/packages/26/a0/b7431c5d00be20b3f6e4f9ed767bc570348a67228b81d0c0d3edea76f160/extract_python-0.7.2.tar.gz", hash = "sha256:e05186ac6a27abf47033458c1b395469c523fb91d4c60cc0ce7ee78119084c68", size = 259201, upload-time = "2026-06-29T14:08:42.131Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fd/a5/266ec0663ad62995f054f7e1a3a987ba875be835750a14004b856e4bd748/extract_python-0.6.0-py3-none-any.whl", hash = "sha256:45eda3595ab25d41b0916934fa3dfd27efa48d88ef49f8c39a333009df75a25b", size = 9207, upload-time = "2026-06-24T09:03:19.495Z" }, + { url = "https://files.pythonhosted.org/packages/60/1b/fc2a47c3919496ac5370150bddab5eb4c44006fc78c33ae90b8d6bb69866/extract_python-0.7.2-py3-none-any.whl", hash = "sha256:34ffc721a031b653c5c15841beb0ea07ee9ec34e1294004e23924bf0a4a7af2b", size = 9134, upload-time = "2026-06-29T14:08:41.132Z" }, ] [package.optional-dependencies] diff --git a/workers/extract-worker/uv.lock b/workers/extract-worker/uv.lock index a700f405..a4891f6f 100644 --- a/workers/extract-worker/uv.lock +++ b/workers/extract-worker/uv.lock @@ -597,9 +597,9 @@ dev = [ requires-dist = [ { name = "cuda-bindings", marker = "sys_platform == 'linux' and extra == 'gpu'", specifier = "==12.9.4" }, { name = "datashare-python", editable = "../../datashare-python" }, - { name = "extract-core", specifier = "==0.6.0" }, - { name = "extract-python", extras = ["docling", "marker"], marker = "extra == 'base'", specifier = "==0.6.0" }, - { name = "extract-python", extras = ["mineru"], marker = "extra == 'mineru'", specifier = "==0.6.0" }, + { name = "extract-core", specifier = "==0.7.0" }, + { name = "extract-python", extras = ["docling", "marker"], marker = "extra == 'base'", specifier = "==0.7.2" }, + { name = "extract-python", extras = ["mineru"], marker = "extra == 'mineru'", specifier = "==0.7.2" }, { name = "flash-attn", marker = "extra == 'flash-attn'", specifier = "==2.8.3.post1" }, { name = "opencv-python-headless", marker = "extra == 'base'", specifier = "==4.11.0.86" }, { name = "opencv-python-headless", marker = "extra == 'mineru'", specifier = "==4.11.0.86" }, @@ -614,7 +614,7 @@ provides-extras = ["base", "cpu", "flash-attn", "gpu", "mineru"] [package.metadata.requires-dev] dev = [ { name = "black", specifier = ">=26.1.0" }, - { name = "extract-python", extras = ["docling"], specifier = "==0.6.0" }, + { name = "extract-python", extras = ["docling"], specifier = "==0.7.2" }, { name = "nest-asyncio", specifier = ">=1.6.0" }, { name = "pre-commit", specifier = ">=4.5.1" }, { name = "psutil", specifier = ">=6.1.0" }, @@ -628,7 +628,7 @@ dev = [ [[package]] name = "datashare-python" -version = "0.9.7" +version = "0.9.8" source = { editable = "../../datashare-python" } dependencies = [ { name = "aiohttp", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, @@ -650,7 +650,7 @@ requires-dist = [ { name = "aiohttp", specifier = "~=3.11" }, { name = "alive-progress", specifier = "~=3.2" }, { name = "hatchling", specifier = "~=1.27" }, - { name = "icij-common", extras = ["elasticsearch"], specifier = "~=0.8.2" }, + { name = "icij-common", extras = ["elasticsearch"], specifier = "~=0.8.3" }, { name = "langcodes", specifier = "~=3.5" }, { name = "lru-dict", specifier = "~=1.4" }, { name = "pydantic-extra-types", extras = ["pycountry"], specifier = ">=2.11.1" }, @@ -927,7 +927,7 @@ wheels = [ [[package]] name = "extract-core" -version = "0.6.0" +version = "0.7.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "docling-slim", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, @@ -937,22 +937,22 @@ dependencies = [ { name = "pydantic", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, { name = "pydantic-extra-types", extra = ["pycountry"], marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/32/4c/554c22607bad87040e8df2bd433012578f92dae6431715bed05eb7198340/extract_core-0.6.0.tar.gz", hash = "sha256:2a177d910adbb249529a05591dc6bbf447fc17988c97f9de3893366707f074c6", size = 138321, upload-time = "2026-06-24T09:03:22.464Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e6/ea/1bbc3485d0570455ca874aa84b4f02c4b4d509123710fe1832c02acca08c/extract_core-0.7.0.tar.gz", hash = "sha256:09bfef043c84afb11dd0d50582044cee7c6a5d3cdf95788067d90904b737dd80", size = 138372, upload-time = "2026-06-25T09:25:17.672Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4b/fe/75c11eddf0b8bdec342cd067e3e4a69071e648c204f518902d0f62979e82/extract_core-0.6.0-py3-none-any.whl", hash = "sha256:488b7e08adf83e2390990bccb8b34e1339d17ab2178a5e18ddb48c6af1cc3130", size = 10349, upload-time = "2026-06-24T09:03:21.305Z" }, + { url = "https://files.pythonhosted.org/packages/d4/41/2eb44fcd9b33d9a4da7d61691f2eea3ebbad565288601d1b8169d2f169f7/extract_core-0.7.0-py3-none-any.whl", hash = "sha256:51d7cb28fcc8173a34e20862a236354f5da35e2bd0156d0001650e22d6e655d3", size = 10414, upload-time = "2026-06-25T09:25:18.496Z" }, ] [[package]] name = "extract-python" -version = "0.6.0" +version = "0.7.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "extract-core", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, { name = "icij-common", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e1/4b/15036758830b70486469b9624b8ace188ed5ae6b618f1af179821896f5b5/extract_python-0.6.0.tar.gz", hash = "sha256:e9771f945ef902a268714c28110b3815e14b7e6cbdb92724c4f8b03ee3042f15", size = 259177, upload-time = "2026-06-24T09:03:18.575Z" } +sdist = { url = "https://files.pythonhosted.org/packages/26/a0/b7431c5d00be20b3f6e4f9ed767bc570348a67228b81d0c0d3edea76f160/extract_python-0.7.2.tar.gz", hash = "sha256:e05186ac6a27abf47033458c1b395469c523fb91d4c60cc0ce7ee78119084c68", size = 259201, upload-time = "2026-06-29T14:08:42.131Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fd/a5/266ec0663ad62995f054f7e1a3a987ba875be835750a14004b856e4bd748/extract_python-0.6.0-py3-none-any.whl", hash = "sha256:45eda3595ab25d41b0916934fa3dfd27efa48d88ef49f8c39a333009df75a25b", size = 9207, upload-time = "2026-06-24T09:03:19.495Z" }, + { url = "https://files.pythonhosted.org/packages/60/1b/fc2a47c3919496ac5370150bddab5eb4c44006fc78c33ae90b8d6bb69866/extract_python-0.7.2-py3-none-any.whl", hash = "sha256:34ffc721a031b653c5c15841beb0ea07ee9ec34e1294004e23924bf0a4a7af2b", size = 9134, upload-time = "2026-06-29T14:08:41.132Z" }, ] [package.optional-dependencies] @@ -1272,16 +1272,16 @@ wheels = [ [[package]] name = "icij-common" -version = "0.8.2" +version = "0.8.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiostream", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, { name = "pydantic", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, { name = "pydantic-settings", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (platform_machine != 'x86_64' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu') or sys_platform == 'darwin' or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-base' and extra == 'extra-24-datashare-extract-worker-mineru') or (sys_platform != 'linux' and extra == 'extra-24-datashare-extract-worker-cpu' and extra == 'extra-24-datashare-extract-worker-gpu')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e0/51/91d85cc23e275bb51a5cae47873af9a7160f62797b25663d902cf38e6ab7/icij_common-0.8.2.tar.gz", hash = "sha256:7db68266d8facb43142131d81e998cd74c6ae73508456743e00df43a15cc2995", size = 15937, upload-time = "2026-04-07T12:13:39.774Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/8d/4b2b9bb7dcac24d8bf514ccf2da6465ca3cd9b4c4f4bde0a1c97e681d595/icij_common-0.8.3.tar.gz", hash = "sha256:009dda7c1d688ecf7705cf88517deb3d0abe02300b42c8c433eb36516462fe75", size = 16161, upload-time = "2026-07-03T09:43:50.837Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/15/5a/99d123230023124ba4ec67765feb2a04b656ae1f473a14f6b29cc54b02bf/icij_common-0.8.2-py3-none-any.whl", hash = "sha256:c5a1a3ccd54e7bf53acad293a0af50ab15156b225791b43383197b896bae955f", size = 20565, upload-time = "2026-04-07T12:13:38.709Z" }, + { url = "https://files.pythonhosted.org/packages/1c/8e/99879d623fdf4e2371e44135dcf690d7f43b921616e4c918633fe4cece48/icij_common-0.8.3-py3-none-any.whl", hash = "sha256:5214cbb73dca364cca16005f5fac3c2f3944acab6dea269a68d26f631e39a348", size = 21030, upload-time = "2026-07-03T09:43:49.934Z" }, ] [package.optional-dependencies] diff --git a/workers/translation-worker/translation_worker/objects.py b/workers/translation-worker/translation_worker/objects.py index 0716d830..72b6eced 100644 --- a/workers/translation-worker/translation_worker/objects.py +++ b/workers/translation-worker/translation_worker/objects.py @@ -4,6 +4,7 @@ from datashare_python.objects import ( DatashareModel, Language, + TaskArgs, ) from icij_common.es import ( DOC_CONTENT_TRANSLATED, @@ -71,7 +72,7 @@ def untranslated_query(target: Language) -> dict: return query[QUERY] -class TranslationArgs(DatashareModel): +class TranslationArgs(TaskArgs): project: str docs: list[DocId] | DocumentSearchQuery | None = None config: "TranslationConfig" diff --git a/workers/translation-worker/uv.lock b/workers/translation-worker/uv.lock index 2d1e684e..6db81e81 100644 --- a/workers/translation-worker/uv.lock +++ b/workers/translation-worker/uv.lock @@ -437,7 +437,7 @@ wheels = [ [[package]] name = "datashare-python" -version = "0.9.5" +version = "0.9.8" source = { editable = "../../datashare-python" } dependencies = [ { name = "aiohttp" }, @@ -446,8 +446,6 @@ dependencies = [ { name = "icij-common", extra = ["elasticsearch"] }, { name = "langcodes" }, { name = "lru-dict" }, - { name = "nest-asyncio" }, - { name = "orjson" }, { name = "pydantic-extra-types", extra = ["pycountry"] }, { name = "python-json-logger" }, { name = "pyyaml" }, @@ -461,11 +459,9 @@ requires-dist = [ { name = "aiohttp", specifier = "~=3.11" }, { name = "alive-progress", specifier = "~=3.2" }, { name = "hatchling", specifier = "~=1.27" }, - { name = "icij-common", extras = ["elasticsearch"], specifier = "~=0.8.2" }, + { name = "icij-common", extras = ["elasticsearch"], specifier = "~=0.8.3" }, { name = "langcodes", specifier = "~=3.5" }, { name = "lru-dict", specifier = "~=1.4" }, - { name = "nest-asyncio", specifier = "~=1.6" }, - { name = "orjson", specifier = "~=3.11" }, { name = "pydantic-extra-types", extras = ["pycountry"], specifier = ">=2.11.1" }, { name = "python-json-logger", specifier = "~=4.0" }, { name = "pyyaml", specifier = "~=6.0" }, @@ -784,16 +780,16 @@ wheels = [ [[package]] name = "icij-common" -version = "0.8.2" +version = "0.8.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiostream" }, { name = "pydantic" }, { name = "pydantic-settings" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/e0/51/91d85cc23e275bb51a5cae47873af9a7160f62797b25663d902cf38e6ab7/icij_common-0.8.2.tar.gz", hash = "sha256:7db68266d8facb43142131d81e998cd74c6ae73508456743e00df43a15cc2995", size = 15937, upload-time = "2026-04-07T12:13:39.774Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/8d/4b2b9bb7dcac24d8bf514ccf2da6465ca3cd9b4c4f4bde0a1c97e681d595/icij_common-0.8.3.tar.gz", hash = "sha256:009dda7c1d688ecf7705cf88517deb3d0abe02300b42c8c433eb36516462fe75", size = 16161, upload-time = "2026-07-03T09:43:50.837Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/15/5a/99d123230023124ba4ec67765feb2a04b656ae1f473a14f6b29cc54b02bf/icij_common-0.8.2-py3-none-any.whl", hash = "sha256:c5a1a3ccd54e7bf53acad293a0af50ab15156b225791b43383197b896bae955f", size = 20565, upload-time = "2026-04-07T12:13:38.709Z" }, + { url = "https://files.pythonhosted.org/packages/1c/8e/99879d623fdf4e2371e44135dcf690d7f43b921616e4c918633fe4cece48/icij_common-0.8.3-py3-none-any.whl", hash = "sha256:5214cbb73dca364cca16005f5fac3c2f3944acab6dea269a68d26f631e39a348", size = 21030, upload-time = "2026-07-03T09:43:49.934Z" }, ] [package.optional-dependencies] @@ -1323,44 +1319,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/fa/e5c43397632a399f542663ed3e3e37763ee203ba845b10b266cd2ede8925/onnxruntime-1.25.1-cp312-cp312-win_arm64.whl", hash = "sha256:b6c7aa5cae606d5c90a392679fac074b60f80025a2e83e1e90fdf882bd2a97f0", size = 12634433, upload-time = "2026-04-27T22:00:25.918Z" }, ] -[[package]] -name = "orjson" -version = "3.11.8" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9d/1b/2024d06792d0779f9dbc51531b61c24f76c75b9f4ce05e6f3377a1814cea/orjson-3.11.8.tar.gz", hash = "sha256:96163d9cdc5a202703e9ad1b9ae757d5f0ca62f4fa0cc93d1f27b0e180cc404e", size = 5603832, upload-time = "2026-03-31T16:16:27.878Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/67/41/5aa7fa3b0f4dc6b47dcafc3cea909299c37e40e9972feabc8b6a74e2730d/orjson-3.11.8-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:003646067cc48b7fcab2ae0c562491c9b5d2cbd43f1e5f16d98fd118c5522d34", size = 229229, upload-time = "2026-03-31T16:14:50.424Z" }, - { url = "https://files.pythonhosted.org/packages/0a/d7/57e7f2458e0a2c41694f39fc830030a13053a84f837a5b73423dca1f0938/orjson-3.11.8-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:ed193ce51d77a3830cad399a529cd4ef029968761f43ddc549e1bc62b40d88f8", size = 128871, upload-time = "2026-03-31T16:14:51.888Z" }, - { url = "https://files.pythonhosted.org/packages/53/4a/e0fdb9430983e6c46e0299559275025075568aad5d21dd606faee3703924/orjson-3.11.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f30491bc4f862aa15744b9738517454f1e46e56c972a2be87d70d727d5b2a8f8", size = 132104, upload-time = "2026-03-31T16:14:53.142Z" }, - { url = "https://files.pythonhosted.org/packages/08/4a/2025a60ff3f5c8522060cda46612d9b1efa653de66ed2908591d8d82f22d/orjson-3.11.8-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6eda5b8b6be91d3f26efb7dc6e5e68ee805bc5617f65a328587b35255f138bf4", size = 130483, upload-time = "2026-03-31T16:14:54.605Z" }, - { url = "https://files.pythonhosted.org/packages/2d/3c/b9cde05bdc7b2385c66014e0620627da638d3d04e4954416ab48c31196c5/orjson-3.11.8-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee8db7bfb6fe03581bbab54d7c4124a6dd6a7f4273a38f7267197890f094675f", size = 135481, upload-time = "2026-03-31T16:14:55.901Z" }, - { url = "https://files.pythonhosted.org/packages/ff/f2/a8238e7734de7cb589fed319857a8025d509c89dc52fdcc88f39c6d03d5a/orjson-3.11.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d8b5231de76c528a46b57010bbd83fb51e056aa0220a372fd5065e978406f1c", size = 146819, upload-time = "2026-03-31T16:14:57.548Z" }, - { url = "https://files.pythonhosted.org/packages/db/10/dbf1e2a3cafea673b1b4350e371877b759060d6018a998643b7040e5de48/orjson-3.11.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:58a4a208a6fbfdb7a7327b8f201c6014f189f721fd55d047cafc4157af1bc62a", size = 132846, upload-time = "2026-03-31T16:14:58.91Z" }, - { url = "https://files.pythonhosted.org/packages/f8/fc/55e667ec9c85694038fcff00573d221b085d50777368ee3d77f38668bf3c/orjson-3.11.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f8952d6d2505c003e8f0224ff7858d341fa4e33fef82b91c4ff0ef070f2393c", size = 133580, upload-time = "2026-03-31T16:15:00.519Z" }, - { url = "https://files.pythonhosted.org/packages/7e/a6/c08c589a9aad0cb46c4831d17de212a2b6901f9d976814321ff8e69e8785/orjson-3.11.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0022bb50f90da04b009ce32c512dc1885910daa7cb10b7b0cba4505b16db82a8", size = 142042, upload-time = "2026-03-31T16:15:01.906Z" }, - { url = "https://files.pythonhosted.org/packages/5c/cc/2f78ea241d52b717d2efc38878615fe80425bf2beb6e68c984dde257a766/orjson-3.11.8-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ff51f9d657d1afb6f410cb435792ce4e1fe427aab23d2fcd727a2876e21d4cb6", size = 423845, upload-time = "2026-03-31T16:15:03.703Z" }, - { url = "https://files.pythonhosted.org/packages/70/07/c17dcf05dd8045457538428a983bf1f1127928df5bf328cb24d2b7cddacb/orjson-3.11.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6dbe9a97bdb4d8d9d5367b52a7c32549bba70b2739c58ef74a6964a6d05ae054", size = 147729, upload-time = "2026-03-31T16:15:05.203Z" }, - { url = "https://files.pythonhosted.org/packages/90/6c/0fb6e8a24e682e0958d71711ae6f39110e4b9cd8cab1357e2a89cb8e1951/orjson-3.11.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a5c370674ebabe16c6ccac33ff80c62bf8a6e59439f5e9d40c1f5ab8fd2215b7", size = 136425, upload-time = "2026-03-31T16:15:07.052Z" }, - { url = "https://files.pythonhosted.org/packages/b2/35/4d3cc3a3d616035beb51b24a09bb872942dc452cf2df0c1d11ab35046d9f/orjson-3.11.8-cp311-cp311-win32.whl", hash = "sha256:0e32f7154299f42ae66f13488963269e5eccb8d588a65bc839ed986919fc9fac", size = 131870, upload-time = "2026-03-31T16:15:08.678Z" }, - { url = "https://files.pythonhosted.org/packages/13/26/9fe70f81d16b702f8c3a775e8731b50ad91d22dacd14c7599b60a0941cd1/orjson-3.11.8-cp311-cp311-win_amd64.whl", hash = "sha256:25e0c672a2e32348d2eb33057b41e754091f2835f87222e4675b796b92264f06", size = 127440, upload-time = "2026-03-31T16:15:09.994Z" }, - { url = "https://files.pythonhosted.org/packages/e8/c6/b038339f4145efd2859c1ca53097a52c0bb9cbdd24f947ebe146da1ad067/orjson-3.11.8-cp311-cp311-win_arm64.whl", hash = "sha256:9185589c1f2a944c17e26c9925dcdbc2df061cc4a145395c57f0c51f9b5dbfcd", size = 127399, upload-time = "2026-03-31T16:15:11.412Z" }, - { url = "https://files.pythonhosted.org/packages/01/f6/8d58b32ab32d9215973a1688aebd098252ee8af1766c0e4e36e7831f0295/orjson-3.11.8-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:1cd0b77e77c95758f8e1100139844e99f3ccc87e71e6fc8e1c027e55807c549f", size = 229233, upload-time = "2026-03-31T16:15:12.762Z" }, - { url = "https://files.pythonhosted.org/packages/a9/8b/2ffe35e71f6b92622e8ea4607bf33ecf7dfb51b3619dcfabfd36cbe2d0a5/orjson-3.11.8-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:6a3d159d5ffa0e3961f353c4b036540996bf8b9697ccc38261c0eac1fd3347a6", size = 128772, upload-time = "2026-03-31T16:15:14.237Z" }, - { url = "https://files.pythonhosted.org/packages/27/d2/1f8682ae50d5c6897a563cb96bc106da8c9cb5b7b6e81a52e4cc086679b9/orjson-3.11.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76070a76e9c5ae661e2d9848f216980d8d533e0f8143e6ed462807b242e3c5e8", size = 131946, upload-time = "2026-03-31T16:15:15.607Z" }, - { url = "https://files.pythonhosted.org/packages/52/4b/5500f76f0eece84226e0689cb48dcde081104c2fa6e2483d17ca13685ffb/orjson-3.11.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:54153d21520a71a4c82a0dbb4523e468941d549d221dc173de0f019678cf3813", size = 130368, upload-time = "2026-03-31T16:15:17.066Z" }, - { url = "https://files.pythonhosted.org/packages/da/4e/58b927e08fbe9840e6c920d9e299b051ea667463b1f39a56e668669f8508/orjson-3.11.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:469ac2125611b7c5741a0b3798cd9e5786cbad6345f9f400c77212be89563bec", size = 135540, upload-time = "2026-03-31T16:15:18.404Z" }, - { url = "https://files.pythonhosted.org/packages/56/7c/ba7cb871cba1bcd5cd02ee34f98d894c6cea96353ad87466e5aef2429c60/orjson-3.11.8-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:14778ffd0f6896aa613951a7fbf4690229aa7a543cb2bfbe9f358e08aafa9546", size = 146877, upload-time = "2026-03-31T16:15:19.833Z" }, - { url = "https://files.pythonhosted.org/packages/0b/5d/eb9c25fc1386696c6a342cd361c306452c75e0b55e86ad602dd4827a7fd7/orjson-3.11.8-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea56a955056a6d6c550cf18b3348656a9d9a4f02e2d0c02cabf3c73f1055d506", size = 132837, upload-time = "2026-03-31T16:15:21.282Z" }, - { url = "https://files.pythonhosted.org/packages/37/87/5ddeb7fc1fbd9004aeccab08426f34c81a5b4c25c7061281862b015fce2b/orjson-3.11.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53a0f57e59a530d18a142f4d4ba6dfc708dc5fdedce45e98ff06b44930a2a48f", size = 133624, upload-time = "2026-03-31T16:15:22.641Z" }, - { url = "https://files.pythonhosted.org/packages/22/09/90048793db94ee4b2fcec4ac8e5ddb077367637d6650be896b3494b79bb7/orjson-3.11.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9b48e274f8824567d74e2158199e269597edf00823a1b12b63d48462bbf5123e", size = 141904, upload-time = "2026-03-31T16:15:24.435Z" }, - { url = "https://files.pythonhosted.org/packages/c0/cf/eb284847487821a5d415e54149a6449ba9bfc5872ce63ab7be41b8ec401c/orjson-3.11.8-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3f262401086a3960586af06c054609365e98407151f5ea24a62893a40d80dbbb", size = 423742, upload-time = "2026-03-31T16:15:26.155Z" }, - { url = "https://files.pythonhosted.org/packages/44/09/e12423d327071c851c13e76936f144a96adacfc037394dec35ac3fc8d1e8/orjson-3.11.8-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8e8c6218b614badf8e229b697865df4301afa74b791b6c9ade01d19a9953a942", size = 147806, upload-time = "2026-03-31T16:15:27.909Z" }, - { url = "https://files.pythonhosted.org/packages/b3/6d/37c2589ba864e582ffe7611643314785c6afb1f83c701654ef05daa8fcc7/orjson-3.11.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:093d489fa039ddade2db541097dbb484999fcc65fc2b0ff9819141e2ab364f25", size = 136485, upload-time = "2026-03-31T16:15:29.749Z" }, - { url = "https://files.pythonhosted.org/packages/be/c9/135194a02ab76b04ed9a10f68624b7ebd238bbe55548878b11ff15a0f352/orjson-3.11.8-cp312-cp312-win32.whl", hash = "sha256:e0950ed1bcb9893f4293fd5c5a7ee10934fbf82c4101c70be360db23ce24b7d2", size = 131966, upload-time = "2026-03-31T16:15:31.687Z" }, - { url = "https://files.pythonhosted.org/packages/ed/9a/9796f8fbe3cf30ce9cb696748dbb535e5c87be4bf4fe2e9ca498ef1fa8cf/orjson-3.11.8-cp312-cp312-win_amd64.whl", hash = "sha256:3cf17c141617b88ced4536b2135c552490f07799f6ad565948ea07bef0dcb9a6", size = 127441, upload-time = "2026-03-31T16:15:33.333Z" }, - { url = "https://files.pythonhosted.org/packages/cc/47/5aaf54524a7a4a0dd09dd778f3fa65dd2108290615b652e23d944152bc8e/orjson-3.11.8-cp312-cp312-win_arm64.whl", hash = "sha256:48854463b0572cc87dac7d981aa72ed8bf6deedc0511853dc76b8bbd5482d36d", size = 127364, upload-time = "2026-03-31T16:15:34.748Z" }, -] - [[package]] name = "packaging" version = "26.0"