diff --git a/datashare-python/datashare_python/logging_.py b/datashare-python/datashare_python/logging_.py index 1afa8bf..2b70734 100644 --- a/datashare-python/datashare_python/logging_.py +++ b/datashare-python/datashare_python/logging_.py @@ -7,7 +7,7 @@ from icij_common.logging_utils import DATE_FMT, STREAM_HANDLER_FMT from pythonjsonlogger.core import BaseJsonFormatter -from pythonjsonlogger.orjson import OrjsonFormatter +from pythonjsonlogger.json import JsonFormatter from temporalio import activity, workflow from .config import LogFormat, LogLevel @@ -132,11 +132,9 @@ def _encode_value(value: Any) -> str: return "true" if value else "false" if isinstance(value, numbers.Number): return str(value) - return json.dumps(value).decode() + return json.dumps(value) def _json_formatter(datefmt: str) -> BaseJsonFormatter: - fmt = OrjsonFormatter( # let's keep logging as fast as possible - _LOGGED_ATTRIBUTES, datefmt=datefmt - ) + fmt = JsonFormatter(_LOGGED_ATTRIBUTES, datefmt=datefmt) return fmt diff --git a/workers/extract-worker/.dockerignore b/workers/extract-worker/.dockerignore index f1c3a14..9863aae 100644 --- a/workers/extract-worker/.dockerignore +++ b/workers/extract-worker/.dockerignore @@ -5,4 +5,5 @@ !README.md !uv.dist.lock !uv.lock -!io-worker-deps-exclusion.txt \ No newline at end of file +!io-worker-deps-exclusion.txt +!base-worker-deps-exclusion.txt diff --git a/workers/extract-worker/Dockerfile b/workers/extract-worker/Dockerfile index 5d2ec6a..0658384 100644 --- a/workers/extract-worker/Dockerfile +++ b/workers/extract-worker/Dockerfile @@ -30,6 +30,18 @@ RUN uv python install 3.14t WORKDIR /app +# Slim runtime base: same python:3.14-slim-trixie interpreter as extract-worker-builder +# (so the venv copied from the *-build stages resolves its absolute interpreter path), +# plus uv, but NO build toolchain. Workers are assembled by copying their /app on top. +FROM python:3.14-slim-trixie AS runtime-base + +ENV PYTHONUNBUFFERED=1 +ENV UV_NO_DEV=1 + +COPY --from=ghcr.io/astral-sh/uv:0.11.24 /uv /uvx /bin/ + +WORKDIR /app + # Flash attention libs has to be built on the hardware and is not available as a prebuild wheel. We don't want to do # it in the final image since it requires nvcc which is already embedded in Python deps and is super heavy. Sadly the # embedded version of nvcc is not usable for building @@ -40,7 +52,7 @@ COPY --from=ghcr.io/astral-sh/uv:0.11.6 /uv /bin/ RUN uv python install 3.14t # Install torch from the lockfile by install extras and skipping some packages -RUN --mount=type=cache,target=~/.cache/uv \ +RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=uv.dist.lock,target=uv.lock \ --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ uv sync --frozen --no-install-project --no-editable --extra base --extra gpu \ @@ -53,7 +65,7 @@ RUN --mount=type=bind,source=uv.dist.lock,target=uv.lock \ uv export --frozen --extra base-gpu --no-dev | \ sed -n 's/^flash-attn==\([^; ]*\).*/\1/p' > flash-attn-version.txt -RUN --mount=type=cache,target=~/.cache/uv \ +RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install pip && \ MAX_JOBS=8 uv run python -m pip wheel "flash-attn==$(cat flash-attn-version.txt)" \ --no-build-isolation --no-deps -w /flash-attn-wheel @@ -62,7 +74,7 @@ RUN --mount=type=cache,target=~/.cache/uv \ FROM extract-worker-builder AS io-worker ADD io-worker-deps-exclusion.txt ./ # Install deps first to optimize layer cache -RUN --mount=type=cache,target=~/.cache/uv \ +RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=uv.dist.lock,target=uv.lock \ --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ uv sync -v --frozen --no-editable --no-install-project $(sed 's/^/--no-install-package /' io-worker-deps-exclusion.txt) @@ -74,7 +86,7 @@ ADD extract_worker ./extract_worker/ ADD entrypoints/io_worker.sh ./entrypoints/io_worker.sh # Then install service -RUN --mount=type=cache,target=~/.cache/uv uv sync -v --frozen --no-editable $(sed 's/^/--no-install-package /' io-worker-deps-exclusion.txt) +RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable $(sed 's/^/--no-install-package /' io-worker-deps-exclusion.txt) RUN rm -rf ~/.cache/pip ENTRYPOINT ["entrypoints/io_worker.sh"] @@ -88,12 +100,15 @@ RUN apt update && \ rm -rf /var/lib/apt/lists/* -FROM extract-base-builder AS extract-cpu-worker +FROM extract-base-builder AS extract-cpu-build +# Skip the mineru-only subtree: extract-core pulls mineru unconditionally, but docling/marker don't use it (and it conflicts with marker) +ADD base-worker-deps-exclusion.txt ./ # Install deps first to optimize layer cache -RUN --mount=type=cache,target=~/.cache/uv \ +RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=uv.dist.lock,target=uv.lock \ --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ - uv sync -v --frozen --no-editable --no-install-project --extra base --extra cpu + uv sync -v --frozen --no-editable --no-install-project --extra base --extra cpu \ + $(sed 's/^/--no-install-package /' base-worker-deps-exclusion.txt) # Then copy code ADD uv.dist.lock ./uv.lock @@ -102,23 +117,34 @@ ADD extract_worker ./extract_worker/ ADD entrypoints/extract_cpu_worker.sh ./entrypoints/extract_cpu_worker.sh # Then install service -RUN --mount=type=cache,target=~/.cache/uv uv sync -v --frozen --no-editable --extra base --extra cpu +RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra base --extra cpu \ + $(sed 's/^/--no-install-package /' base-worker-deps-exclusion.txt) RUN rm -rf ~/.cache/pip +# Slim runtime: drop build-essential/g++ (only needed to build the venv), keep tesseract for OCR +FROM runtime-base AS extract-cpu-worker +RUN apt update && \ + apt install -y --no-install-recommends tesseract-ocr && \ + rm -rf /var/lib/apt/lists/* +COPY --from=extract-cpu-build /app /app + ENTRYPOINT ["entrypoints/extract_cpu_worker.sh"] -FROM extract-base-builder AS extract-gpu-worker +FROM extract-base-builder AS extract-gpu-build ## Copy the flash-attn wheel #COPY --from=flash-attn-builder /flash-attn-wheel /flash-attn-wheel ## Install the wheel #RUN uv pip install /flash-attn-wheel/*.whl --no-deps +# Skip the mineru-only subtree: extract-core pulls mineru unconditionally, but docling/marker don't use it (and it conflicts with marker). +ADD base-worker-deps-exclusion.txt ./ # Install deps first to optimize layer cache -RUN --mount=type=cache,target=~/.cache/uv \ +RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=uv.dist.lock,target=uv.lock \ --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ - uv sync -v --frozen --no-editable --no-install-project --extra base --extra base --extra gpu #--no-install-package flash-attn + uv sync -v --frozen --no-editable --no-install-project --extra base --extra gpu \ + $(sed 's/^/--no-install-package /' base-worker-deps-exclusion.txt) #--no-install-package flash-attn # Then copy code ADD uv.dist.lock ./uv.lock @@ -127,15 +153,23 @@ ADD extract_worker ./extract_worker/ ADD entrypoints/extract_gpu_worker.sh ./entrypoints/extract_gpu_worker.sh # Then install service -RUN --mount=type=cache,target=~/.cache/uv uv sync -v --frozen --no-editable --extra base --extra gpu +RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra base --extra gpu \ + $(sed 's/^/--no-install-package /' base-worker-deps-exclusion.txt) RUN rm -rf ~/.cache/pip +# Slim runtime (as extract-cpu-worker): no toolchain, keep tesseract +FROM runtime-base AS extract-gpu-worker +RUN apt update && \ + apt install -y --no-install-recommends tesseract-ocr && \ + rm -rf /var/lib/apt/lists/* +COPY --from=extract-gpu-build /app /app + ENTRYPOINT ["entrypoints/extract_gpu_worker.sh"] FROM extract-worker-builder AS extract-cpu-mineru-worker # Install deps first to optimize layer cache -RUN --mount=type=cache,target=~/.cache/uv \ +RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=uv.dist.lock,target=uv.lock \ --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ uv sync -v --frozen --no-editable --no-install-project --extra mineru --extra cpu @@ -147,7 +181,7 @@ ADD extract_worker ./extract_worker/ ADD entrypoints/extract_cpu_worker.sh ./entrypoints/extract_cpu_worker.sh # Then install service -RUN --mount=type=cache,target=~/.cache/uv uv sync -v --frozen --no-editable --extra mineru --extra cpu +RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra mineru --extra cpu RUN rm -rf ~/.cache/pip ENTRYPOINT ["entrypoints/extract_cpu_worker.sh"] @@ -155,10 +189,10 @@ ENTRYPOINT ["entrypoints/extract_cpu_worker.sh"] FROM extract-worker-builder AS extract-gpu-mineru-worker # Install deps first to optimize layer cache -RUN --mount=type=cache,target=~/.cache/uv \ +RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=uv.dist.lock,target=uv.lock \ --mount=type=bind,source=pyproject.toml,target=pyproject.toml \ - uv sync -v --frozen --no-editable --no-install-project --extra mineru --extra mineru --extra gpu + uv sync -v --frozen --no-editable --no-install-project --extra mineru --extra gpu # Then copy code ADD uv.dist.lock ./uv.lock @@ -167,7 +201,7 @@ ADD extract_worker ./extract_worker/ ADD entrypoints/extract_gpu_worker.sh ./entrypoints/extract_gpu_worker.sh # Then install service -RUN --mount=type=cache,target=~/.cache/uv uv sync -v --frozen --no-editable --extra mineru --extra mineru --extra gpu +RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra mineru --extra gpu RUN rm -rf ~/.cache/pip ENTRYPOINT ["entrypoints/extract_gpu_worker.sh"] diff --git a/workers/extract-worker/base-worker-deps-exclusion.txt b/workers/extract-worker/base-worker-deps-exclusion.txt new file mode 100644 index 0000000..3d68265 --- /dev/null +++ b/workers/extract-worker/base-worker-deps-exclusion.txt @@ -0,0 +1,15 @@ +mineru +mineru-vl-utils +modelscope +magika +reportlab +pypdf +fast-langdetect +fasttext-predict +json-repair +robust-downloader +loguru +colorlog +httpx-retries +aiofiles +xlsxwriter