Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions datashare-python/datashare_python/logging_.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from icij_common.logging_utils import DATE_FMT, STREAM_HANDLER_FMT
from pythonjsonlogger.core import BaseJsonFormatter
from pythonjsonlogger.orjson import OrjsonFormatter
from pythonjsonlogger.json import JsonFormatter
from temporalio import activity, workflow

from .config import LogFormat, LogLevel
Expand Down Expand Up @@ -132,11 +132,9 @@ def _encode_value(value: Any) -> str:
return "true" if value else "false"
if isinstance(value, numbers.Number):
return str(value)
return json.dumps(value).decode()
return json.dumps(value)


def _json_formatter(datefmt: str) -> BaseJsonFormatter:
fmt = OrjsonFormatter( # let's keep logging as fast as possible
_LOGGED_ATTRIBUTES, datefmt=datefmt
)
fmt = JsonFormatter(_LOGGED_ATTRIBUTES, datefmt=datefmt)
return fmt
3 changes: 2 additions & 1 deletion workers/extract-worker/.dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
!README.md
!uv.dist.lock
!uv.lock
!io-worker-deps-exclusion.txt
!io-worker-deps-exclusion.txt
!base-worker-deps-exclusion.txt
68 changes: 51 additions & 17 deletions workers/extract-worker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@ RUN uv python install 3.14t

WORKDIR /app

# Slim runtime base: same python:3.14-slim-trixie interpreter as extract-worker-builder
# (so the venv copied from the *-build stages resolves its absolute interpreter path),
# plus uv, but NO build toolchain. Workers are assembled by copying their /app on top.
FROM python:3.14-slim-trixie AS runtime-base

ENV PYTHONUNBUFFERED=1
ENV UV_NO_DEV=1

COPY --from=ghcr.io/astral-sh/uv:0.11.24 /uv /uvx /bin/

WORKDIR /app

# The flash-attention library has to be built on the target hardware and is not available as a prebuilt wheel. We don't
# want to build it in the final image since that requires nvcc, which is already embedded in the Python deps and is
# super heavy. Sadly, the embedded version of nvcc is not usable for building.
Expand All @@ -40,7 +52,7 @@ COPY --from=ghcr.io/astral-sh/uv:0.11.6 /uv /bin/
RUN uv python install 3.14t

# Install torch from the lockfile by installing extras and skipping some packages
RUN --mount=type=cache,target=~/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.dist.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --frozen --no-install-project --no-editable --extra base --extra gpu \
Expand All @@ -53,7 +65,7 @@ RUN --mount=type=bind,source=uv.dist.lock,target=uv.lock \
uv export --frozen --extra base-gpu --no-dev | \
sed -n 's/^flash-attn==\([^; ]*\).*/\1/p' > flash-attn-version.txt

RUN --mount=type=cache,target=~/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install pip && \
MAX_JOBS=8 uv run python -m pip wheel "flash-attn==$(cat flash-attn-version.txt)" \
--no-build-isolation --no-deps -w /flash-attn-wheel
Expand All @@ -62,7 +74,7 @@ RUN --mount=type=cache,target=~/.cache/uv \
FROM extract-worker-builder AS io-worker
ADD io-worker-deps-exclusion.txt ./
# Install deps first to optimize layer cache
RUN --mount=type=cache,target=~/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.dist.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync -v --frozen --no-editable --no-install-project $(sed 's/^/--no-install-package /' io-worker-deps-exclusion.txt)
Expand All @@ -74,7 +86,7 @@ ADD extract_worker ./extract_worker/
ADD entrypoints/io_worker.sh ./entrypoints/io_worker.sh

# Then install service
RUN --mount=type=cache,target=~/.cache/uv uv sync -v --frozen --no-editable $(sed 's/^/--no-install-package /' io-worker-deps-exclusion.txt)
RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable $(sed 's/^/--no-install-package /' io-worker-deps-exclusion.txt)
RUN rm -rf ~/.cache/pip

ENTRYPOINT ["entrypoints/io_worker.sh"]
Expand All @@ -88,12 +100,15 @@ RUN apt update && \
rm -rf /var/lib/apt/lists/*


FROM extract-base-builder AS extract-cpu-worker
FROM extract-base-builder AS extract-cpu-build
# Skip the mineru-only subtree: extract-core pulls mineru unconditionally, but docling/marker don't use it (and it conflicts with marker)
ADD base-worker-deps-exclusion.txt ./
# Install deps first to optimize layer cache
RUN --mount=type=cache,target=~/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.dist.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync -v --frozen --no-editable --no-install-project --extra base --extra cpu
uv sync -v --frozen --no-editable --no-install-project --extra base --extra cpu \
$(sed 's/^/--no-install-package /' base-worker-deps-exclusion.txt)

# Then copy code
ADD uv.dist.lock ./uv.lock
Expand All @@ -102,23 +117,34 @@ ADD extract_worker ./extract_worker/
ADD entrypoints/extract_cpu_worker.sh ./entrypoints/extract_cpu_worker.sh

# Then install service
RUN --mount=type=cache,target=~/.cache/uv uv sync -v --frozen --no-editable --extra base --extra cpu
RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra base --extra cpu \
$(sed 's/^/--no-install-package /' base-worker-deps-exclusion.txt)
RUN rm -rf ~/.cache/pip

# Slim runtime: drop build-essential/g++ (only needed to build the venv), keep tesseract for OCR
FROM runtime-base AS extract-cpu-worker
RUN apt update && \
apt install -y --no-install-recommends tesseract-ocr && \
rm -rf /var/lib/apt/lists/*
COPY --from=extract-cpu-build /app /app

ENTRYPOINT ["entrypoints/extract_cpu_worker.sh"]


FROM extract-base-builder AS extract-gpu-worker
FROM extract-base-builder AS extract-gpu-build
## Copy the flash-attn wheel
#COPY --from=flash-attn-builder /flash-attn-wheel /flash-attn-wheel
## Install the wheel
#RUN uv pip install /flash-attn-wheel/*.whl --no-deps

# Skip the mineru-only subtree: extract-core pulls mineru unconditionally, but docling/marker don't use it (and it conflicts with marker).
ADD base-worker-deps-exclusion.txt ./
# Install deps first to optimize layer cache
RUN --mount=type=cache,target=~/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.dist.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync -v --frozen --no-editable --no-install-project --extra base --extra base --extra gpu #--no-install-package flash-attn
uv sync -v --frozen --no-editable --no-install-project --extra base --extra gpu \
$(sed 's/^/--no-install-package /' base-worker-deps-exclusion.txt) #--no-install-package flash-attn

# Then copy code
ADD uv.dist.lock ./uv.lock
Expand All @@ -127,15 +153,23 @@ ADD extract_worker ./extract_worker/
ADD entrypoints/extract_gpu_worker.sh ./entrypoints/extract_gpu_worker.sh

# Then install service
RUN --mount=type=cache,target=~/.cache/uv uv sync -v --frozen --no-editable --extra base --extra gpu
RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra base --extra gpu \
$(sed 's/^/--no-install-package /' base-worker-deps-exclusion.txt)
RUN rm -rf ~/.cache/pip

# Slim runtime (same as extract-cpu-worker): no toolchain, keep tesseract
FROM runtime-base AS extract-gpu-worker
RUN apt update && \
apt install -y --no-install-recommends tesseract-ocr && \
rm -rf /var/lib/apt/lists/*
COPY --from=extract-gpu-build /app /app

ENTRYPOINT ["entrypoints/extract_gpu_worker.sh"]


FROM extract-worker-builder AS extract-cpu-mineru-worker
# Install deps first to optimize layer cache
RUN --mount=type=cache,target=~/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.dist.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync -v --frozen --no-editable --no-install-project --extra mineru --extra cpu
Expand All @@ -147,18 +181,18 @@ ADD extract_worker ./extract_worker/
ADD entrypoints/extract_cpu_worker.sh ./entrypoints/extract_cpu_worker.sh

# Then install service
RUN --mount=type=cache,target=~/.cache/uv uv sync -v --frozen --no-editable --extra mineru --extra cpu
RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra mineru --extra cpu
RUN rm -rf ~/.cache/pip

ENTRYPOINT ["entrypoints/extract_cpu_worker.sh"]


FROM extract-worker-builder AS extract-gpu-mineru-worker
# Install deps first to optimize layer cache
RUN --mount=type=cache,target=~/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.dist.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync -v --frozen --no-editable --no-install-project --extra mineru --extra mineru --extra gpu
uv sync -v --frozen --no-editable --no-install-project --extra mineru --extra gpu

# Then copy code
ADD uv.dist.lock ./uv.lock
Expand All @@ -167,7 +201,7 @@ ADD extract_worker ./extract_worker/
ADD entrypoints/extract_gpu_worker.sh ./entrypoints/extract_gpu_worker.sh

# Then install service
RUN --mount=type=cache,target=~/.cache/uv uv sync -v --frozen --no-editable --extra mineru --extra mineru --extra gpu
RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra mineru --extra gpu
RUN rm -rf ~/.cache/pip

ENTRYPOINT ["entrypoints/extract_gpu_worker.sh"]
15 changes: 15 additions & 0 deletions workers/extract-worker/base-worker-deps-exclusion.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
mineru
mineru-vl-utils
modelscope
magika
reportlab
pypdf
fast-langdetect
fasttext-predict
json-repair
robust-downloader
loguru
colorlog
httpx-retries
aiofiles
xlsxwriter
Loading