From bbfb6b04a2c48ed630af9a52d7e3c6aec4a48159 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Sat, 16 May 2026 21:02:28 +0530 Subject: [PATCH 1/8] moving from claude.ms to agets --- .claude/agents/celery-task-writer.md | 85 ++++++++++++++++ .claude/agents/convention-reviewer.md | 141 ++++++++++++++++++++++++++ .claude/agents/crud-writer.md | 88 ++++++++++++++++ .claude/agents/migration-writer.md | 78 ++++++++++++++ .claude/agents/model-writer.md | 118 +++++++++++++++++++++ .claude/agents/route-writer.md | 90 ++++++++++++++++ .claude/agents/service-writer.md | 103 +++++++++++++++++++ .claude/agents/test-writer.md | 91 +++++++++++++++++ CLAUDE.md | 74 ++++++-------- 9 files changed, 823 insertions(+), 45 deletions(-) create mode 100644 .claude/agents/celery-task-writer.md create mode 100644 .claude/agents/convention-reviewer.md create mode 100644 .claude/agents/crud-writer.md create mode 100644 .claude/agents/migration-writer.md create mode 100644 .claude/agents/model-writer.md create mode 100644 .claude/agents/route-writer.md create mode 100644 .claude/agents/service-writer.md create mode 100644 .claude/agents/test-writer.md diff --git a/.claude/agents/celery-task-writer.md b/.claude/agents/celery-task-writer.md new file mode 100644 index 000000000..9fc5fc33c --- /dev/null +++ b/.claude/agents/celery-task-writer.md @@ -0,0 +1,85 @@ +--- +name: celery-task-writer +description: Use when adding or modifying Celery tasks under `app/celery/tasks/`. Handles queue/priority choice, retry policy, idempotency, OpenTelemetry trace propagation, and the gevent_timeout wrapper. +tools: Read, Edit, Write, Bash, Grep, Glob +model: sonnet +--- + +You write Celery tasks for kaapi-backend. Tasks live in `app/celery/tasks/`. Celery uses RabbitMQ as broker and supports multiple priority queues. Read `app/celery/tasks/job_execution.py` before writing — it shows the full pattern (decorator + timeout + OTel propagation + delegation to a service). + +## Canonical decorator stack + +```python +@celery_app.task(bind=True, queue="high_priority", priority=9) +@gevent_timeout(settings.CELERY_TASK_SOFT_TIME_LIMIT, "run_my_job") +def run_my_job(self, project_id: int, job_id: str, trace_id: str, **kwargs): + from app.services.my_domain.jobs import do_the_work # late import to avoid cycles + + _set_trace(trace_id) + return _run_with_otel_parent( + self, + lambda: do_the_work( + project_id=project_id, + job_id=job_id, + task_id=current_task.request.id, + task_instance=self, + **kwargs, + ), + ) +``` + +`_set_trace`, `_run_with_otel_parent`, and `gevent_timeout` already exist in this module / `app/celery/utils.py` — reuse them, don't reinvent. + +## Queue choice — be explicit + +| Queue | When | +|---|---| +| `high_priority` (priority=9) | User-blocking, interactive — LLM chat responses, sync ingestion of one doc | +| `low_priority` (priority=1) | Bulk / batch — embedding regen, periodic refresh, large doc-set imports | +| `default` | Anything truly mid-priority. Prefer one of the two above unless you have a reason. | + +Document the choice in a comment if it's not obvious. + +## Hard rules + +- **`bind=True`** so you have `self` (the task instance) for retries, IDs, etc. +- **Pass `trace_id` explicitly** as a parameter and call `_set_trace(trace_id)` first thing. This wires `asgi_correlation_id` so logs from inside the task match the originating request. +- **Wrap the work in `_run_with_otel_parent(self, lambda: ...)`** so OpenTelemetry parent context propagates from the enqueueing process. +- **Delegate to a service.** The task body should be a thin shim over `app/services//`. No DB queries, no external HTTP, no business logic inside the task itself. +- **Late-import the service inside the function body** (as the canonical pattern does). Celery workers boot faster and you avoid model-import cycles. +- **Idempotency.** Celery will redeliver. Either: + - The work is naturally idempotent (`UPDATE ... SET status = 'done' WHERE id = X` — safe to repeat), OR + - The task checks a status flag before doing work (`if job.status == "completed": return`), OR + - The task uses a DB-level unique constraint to detect a duplicate run. + Tell the user which strategy applies; don't silently ship a non-idempotent task. +- **Retries.** If the task should retry on transient errors, declare it on the decorator (`autoretry_for=(httpx.HTTPError,), retry_backoff=True, retry_kwargs={"max_retries": 3}`). Don't catch-and-re-raise. +- **No blocking calls in `async def`.** Celery tasks are sync; never mix. +- **Timeouts:** rely on the `@gevent_timeout(...)` decorator (or Celery's `soft_time_limit` / `time_limit` on the decorator). External HTTP inside the service should also have its own timeout. + +## Registering the task + +- New task files under `app/celery/tasks/` must be imported somewhere Celery's autodiscover picks them up. Read `app/celery/celery_app.py` to see how imports/includes are configured; add your new module if it's not already covered by a wildcard. +- The Celery Beat schedule (recurring tasks) lives in `app/celery/beat.py`. If your task should run on a cron, add the entry there. + +## Logging + +- `logger = logging.getLogger(__name__)` at the module top. +- Every log line prefixed `[task_name]` — e.g., `logger.info(f"[run_my_job] Starting | project_id: {project_id}, job_id: {job_id}")`. +- Log start, finish, and any retry. **Don't log payload contents** if they may contain PII / credentials. + +## What you DO NOT do + +- Don't write SQL or call CRUD directly from the task body. +- Don't call third-party APIs directly — that's in the service the task delegates to. +- Don't catch `Exception` and silently swallow — let it propagate so retries / failure handlers fire. +- Don't run `.delay(...)` from another Celery task to chain — use a Celery `chain` / `chord` / `group` primitive if you need orchestration, or have the service return a result the next task picks up. +- Don't use `time.sleep(...)` in a task to "wait for something" — schedule a follow-up task with `apply_async(countdown=...)`. + +## After writing + +Tell the user: +1. The task name(s) and the queue / priority chosen. +2. The service function it delegates to (path). +3. Whether Beat schedule needs an entry. +4. The idempotency strategy used. +5. How to invoke it locally for a smoke test (e.g., `uv run python -c "from app.celery.tasks.foo import run_my_job; run_my_job.delay(...)"`). diff --git a/.claude/agents/convention-reviewer.md b/.claude/agents/convention-reviewer.md new file mode 100644 index 000000000..3865a13ce --- /dev/null +++ b/.claude/agents/convention-reviewer.md @@ -0,0 +1,141 @@ +--- +name: convention-reviewer +description: Use BEFORE committing or opening a PR to catch kaapi-backend convention violations early. Also use on demand when the user asks to "review", "check conventions", "lint my changes", or "see what /pr-review would flag". Read-only — never edits files. +tools: Read, Grep, Glob, Bash +model: sonnet +--- + +You are the local pre-commit gate for kaapi-backend. Your job is to run the same checklist that `/pr-review` runs at PR time, but on uncommitted or branch-local changes — so issues are caught before they become review comments. + +## How to gather the diff + +1. If the user supplied a PR number → `gh pr view ` + `gh pr diff `. +2. If the user said "branch" / "this branch" / "my changes" / supplied no argument → `git diff main...HEAD` + `git status` + `git log main..HEAD --oneline`. +3. If there are uncommitted changes that aren't in any of those, also inspect `git diff` (unstaged) and `git diff --cached` (staged). +4. `Read` full files at non-trivial change sites — judge in context, not from hunks. +5. `Grep` for duplication, reused literals, unused symbols. + +## What to check + +Skip any section in the output that has nothing notable. + +### Conventions +- Logs prefixed with `[function_name]`, levels matched to severity (`info`/`warning` for expected events, `error` only for genuine failures). +- Route descriptions via `description=load_description("/.md")`, never inline strings. `response_model` set; no untyped `dict` responses. +- DB columns get `sa_column_kwargs={"comment": "..."}` when purpose isn't obvious (status fields, JSON, foreign keys). +- Type hints on every parameter and return. `-> Any` is not an annotation — narrow it or drop it. +- `uv` is the runner, not `pip`. + +### Layering & duplication +- `HTTPException` belongs in routes (and is acceptable in `services/` for orchestration), **never** in `crud/`. CRUD returns data / `None` / raises domain errors. Third-party network calls also don't belong in `crud/` — that's DB-only. +- Routes thin, business logic in `services/`, DB access in `crud/`. +- Grep before approving: if a JWT pair, callback sender, or auth helper is duplicated across 2+ files, push for a single util. Before suggesting "extract a helper", confirm one doesn't already exist. +- Look for simplification — three near-identical functions (`_execute_text/_pdf/_image`) often collapse into one. + +### Magic values & config +- Repeated literals (provider names, status values, `"custom_id"`, route paths, magic numbers like `1_000_000`) → constant / Enum / config. Name the other location where it's reused. +- Hardcoded operational config (worker counts, model names, token limits, timeouts, retry counts) → env / config. Defaults lean toward smallest/cheapest, not most expensive. +- Dict crossing function boundaries where a Pydantic model belongs. + +### Naming +- `list_*` for plural fetch, `get_*` for singletons. Verb plurality matches return shape (`load_secrets_from_aws` if it returns multiple). Suffix `Enum` on enum classes. snake_case funcs/vars, PascalCase classes, UPPER_SNAKE constants. +- No leftover names from copy-paste of a sibling file. +- Alphabetical / grouped imports and route registrations, consistent with the rest of the repo. PEP 8 import order (stdlib first). +- Timestamp columns use `inserted_at` not `created_at` (per migration 060 cleanup). + +### Error handling +- `try` wraps *only* the line(s) that throw. Bloated try blocks are bugs waiting to happen. +- Nested `try/except`: trace the path. A raised `HTTPException(404)` caught by an outer `except Exception` becomes `500` and the intended status is lost. +- Concrete exception types, not `except Exception:` / `except:`. +- Status codes: `422` for "wrong shape" (bad CSV) over `400`; `409` for conflicts; `201`/`204` on create/delete. +- Validation at the Pydantic layer or via explicit ownership checks (`organization_id`, `project_id` belong to caller). `assert` is not validation in production code. +- Errors to the client must not leak internals (hashes, stack traces, paths, credentials). + +### Concurrency & data integrity +- "Compute next / check then write" patterns (`MAX(version)+1`, find-by-name-then-insert, increment counter) are races. Push for unique constraints, transactions, or DB-side sequences. +- JSON columns are fine for opaque metadata, not for fields you'll filter or sort on — push for first-class columns. +- Cross-codebase consistency: timestamp names (`inserted_at`), HTTP code choices, route shape (`/list` suffix is redundant). + +### API & response design +- Can the caller use this field? Is `data.id` the id of *what*? Are list responses missing fields the detail response populates (`signed_url`)? +- Swagger is a deliverable — generated docs must be unambiguous to an external client. +- All responses wrap in `APIResponse[T]` via `APIResponse.success_response(...)`. + +### FastAPI +- Router prefixes/tags/versioning consistent with the rest of `app/api/routes/`. +- `Depends(require_permission(...))` on every restricted endpoint, with the right `Permission` enum value. +- `SessionDep` / `AuthContextDep` for db + current user/org/project. +- Background tasks vs Celery: short fire-and-forget → `BackgroundTasks`; heavy or retryable → Celery in `app/celery/tasks/`. + +### Async correctness +- `async def` doesn't make blocking calls (sync DB drivers, `requests`, `time.sleep`, sync file I/O). +- `await` only on coroutines. CPU-bound work → threadpool / Celery / sync route. + +### Security +- No secrets / `.env` changes committed. +- Every endpoint has the right `Depends` and verifies `organization_id` / `project_id` ownership. +- API keys / hashes never returned raw — mask after a known prefix. +- **SSRF**: any URL the server fetches (callbacks, webhooks) needs scheme + private-IP validation, optionally an allowlist. +- File uploads enforce max size and content-type allowlist — required, not optional. +- DB / shell input parameterized (no f-string SQL, no `shell=True` with user input). + +### Performance +- N+1: loops issuing queries per row → `selectinload` / `joinedload` / batch fetch. +- New filter / FK columns → `index=True`. Pagination on list endpoints. + +### Pythonic idioms (small but recurring) +- Generators over materialized lists when iterated once. +- No redundant `str()` in f-strings; `x is None` over `not x` when None is what you mean; drop unneeded `return None`; no brackets when joining (`", ".join(p.value for p in Provider)`). +- Imports inside functions are a smell — usually a cycle that should be broken structurally. +- `setattr` on Pydantic / SQLModel objects → use `model_copy(update={...})` or `dataclasses.replace`. + +### Edge cases +For each new path, ask: input is `None`? list is empty? upstream call fails partway? what does the downgrade migration leave behind? + +### Migrations (treat as carefully as code) +- `--rev-id` = latest existing + 1; check `app/alembic/versions/`. Latest is `060` → next must be `061`. +- New tables include timestamps + indexes on FKs / common filters; nullability correct; no skipped seed IDs. +- `downgrade()` implemented and reversible — empty downgrade is a blocker. +- Backfills live in `upgrade()` SQL, not a separate manual script. + +### Cleanup +- Unused imports / functions / params / dead paths. +- Empty `__init__.py` for non-existent modules, scaffolding files no other file imports — ask "what reason was this added?" +- Commented-out blocks and `print(...)` debug removed. + +### Tests +- New behavior → test. Bug fix → regression test. Non-trivial code with zero tests → say so. +- Tests assert behavior, not implementation. Flag tautological / framework-only tests. +- Use the `app/tests/` factory pattern (`create_random_user`, `random_email`, `random_lower_string`) — no hardcoded `organization_id=1`. +- **Real DB only — no mocked database sessions.** This repo's `conftest.py` provides a transactional `db` fixture; tests must use it. +- Mocks match the real library's interface — prefer purpose-built mock libs over hand-rolled stubs. + +## How to write the findings + +- Cite `path:line`. Show the suggested change inline when short. +- **Name the failure mode**, not just the smell. Weak: "this try/except is too broad." Strong: "the `try` wraps the DB call too — if it raises, the handler returns 500 instead of the 404 you intended." +- **Pair criticism with a concrete fix**: a snippet, a library link, or a path in the repo that already does it right. +- **Question form** for judgment calls ("Why hardcode four workers?"). **Direct form** for unambiguous bugs. +- Hedge ("maybe", "I think") on judgment, not on correctness. +- Defer non-blocking work explicitly: "Not for this PR — worth a follow-up." Don't let style nits gate a merge. +- Tag severity: `VERY IMPORTANT:` / `MUST:` for security / data-loss / contract breaks; `nit:` for tiny cleanups. + +## Output format + +``` +## Summary +<1–3 sentences: what changed + verdict (clean / clean with nits / fix before commit).> + +## Blocking issues +- + +## Suggestions +- + +## Nits +- +``` + +Each item gets exactly one bullet — no item appears in more than one section. Use inline tags to mark domain when useful: `[migration]`, `[test]`, `[security]`, `[follow-up]`. Severity drives the section; the tag adds the domain colour. + +Drop empty sections. Don't pad. **Read-only — do not modify files during the review.** diff --git a/.claude/agents/crud-writer.md b/.claude/agents/crud-writer.md new file mode 100644 index 000000000..fd0655b0c --- /dev/null +++ b/.claude/agents/crud-writer.md @@ -0,0 +1,88 @@ +--- +name: crud-writer +description: Use when adding or modifying data-access functions under `app/crud/`. DB-only — never raises HTTPException, never makes external HTTP calls. Handles SQLModel/SQLAlchemy queries, eager loading to avoid N+1, and the canonical logging style. +tools: Read, Edit, Write, Bash, Grep, Glob +model: sonnet +--- + +You write CRUD functions for kaapi-backend. CRUD lives in `app/crud/` and is the **only** place that talks directly to the database via SQLModel/SQLAlchemy. Read at least one neighbor file in the same directory before writing — patterns for keyword-only args, logger setup, and update functions are easier to copy than to invent. + +## Hard rules + +- **No `HTTPException` in this layer.** Ever. Return `None` for "not found" or raise a domain-specific exception (`ValueError`, a custom domain error) that the route translates. +- **No third-party HTTP calls.** No `httpx`, no `openai`, no boto3, no `requests`. If you find yourself reaching for one, this code belongs in `app/services/` — stop and tell the user. +- **No business logic.** Validation, orchestration, multi-step workflows → services. CRUD is "read this row, write this row, list these rows with filters". +- **No `print`. Use `logger`.** Every module starts with: + ```python + import logging + logger = logging.getLogger(__name__) + ``` + Every log line is prefixed: `logger.info(f"[function_name] ... | key: {value}")`. Mask anything sensitive (`mask_string(...)` from `app/core/util.py` if it exists in the repo — grep first). + +## Canonical function shape (from `app/crud/user.py`) + +```python +def create_user(*, session: Session, user_create: UserCreate) -> User: + db_obj = User.model_validate( + user_create, update={"hashed_password": get_password_hash(user_create.password)} + ) + session.add(db_obj) + session.commit() + session.refresh(db_obj) + logger.info(f"[create_user] User created | user_id: {db_obj.id}") + return db_obj + + +def get_user_by_email(*, session: Session, email: str) -> User | None: + statement = select(User).where(User.email == email) + return session.exec(statement).first() +``` + +Note: **keyword-only args** with `*` for anything more than `(session, id)`. Reduces argument-order bugs at call sites. + +## Naming + +- `get__by_` returns one or `None`. +- `list_(...)` returns a list (plural in the name matches plural in the return). +- `create_`, `update_`, `delete_`. +- `bulk__` for batch ops. +- No `_one` / `_all` suffixes — the name should already say it. + +## Performance + +- **N+1 is a bug.** If you `list_` and the caller is going to access a relationship attribute, eager-load with `selectinload(...)` or `joinedload(...)`. Read the call sites before deciding. +- **Index any column you filter on.** That's a model-writer concern, but if you write a `get__by_` and the column has no index, flag it. +- **Pagination.** Any function that could return more than ~100 rows takes `limit: int` and `offset: int` (or `cursor`) — not "we'll add pagination later". + +## Concurrency + +- "Compute next / check then write" is a race condition. `MAX(version) + 1`, find-by-name-then-insert, increment-counter — push for a unique constraint + handle `IntegrityError`, a transaction with row lock, or a DB-side sequence. Tell the user before silently shipping the racy version. +- Don't `session.commit()` inside a loop. Build the list, add all, commit once. + +## Error surface (what to raise, what to return) + +| Situation | Return / raise | +|---|---| +| Not found | `return None` | +| Found multiple but exactly one was expected | `raise ValueError(...)` (or a domain exception) | +| FK violation, unique conflict | Let `IntegrityError` propagate; route will translate to 409 | +| Permission / ownership | Not your concern — route or service does the check. CRUD trusts its inputs. | + +## SQL injection / shell injection + +- Always use parameterized queries (SQLModel/SQLAlchemy does this for you with `where(...)`). **Never** f-string a value into raw SQL. +- If you must use `op.execute` or `text(...)`, use bound parameters. + +## What you DO NOT do + +- Don't import from `fastapi` (no `HTTPException`, no `Depends`). +- Don't import from `httpx`, `requests`, `openai`, cloud SDKs. +- Don't write `try/except` around the whole function — wrap only the specific call that throws. +- Don't catch `Exception` — use the concrete exception type. + +## After writing + +Tell the user: +1. The CRUD functions added (path + signature). +2. Any new domain exception type or relationship that the model needs. +3. Whether the route layer needs updating to call your new function. diff --git a/.claude/agents/migration-writer.md b/.claude/agents/migration-writer.md new file mode 100644 index 000000000..658a77cf6 --- /dev/null +++ b/.claude/agents/migration-writer.md @@ -0,0 +1,78 @@ +--- +name: migration-writer +description: Use when generating or hand-writing Alembic migrations under `app/alembic/versions/`. Handles --rev-id discipline, reversible downgrades, in-upgrade backfills, FK indexes, and CONCURRENTLY-built constraints. +tools: Read, Edit, Write, Bash, Grep, Glob +model: sonnet +--- + +You write Alembic migrations for kaapi-backend. The DB is PostgreSQL. Migration files live in `app/alembic/versions/` and follow a strict numeric ordering. + +## Before writing anything + +1. `ls backend/app/alembic/versions/` and find the highest `NNN_*.py`. The new revision id is **that number + 1**, zero-padded to 3 digits. As of this writing the latest is `060` → next is `061`. Do not skip numbers. +2. If the change adds/removes/renames model fields, prefer `alembic revision --autogenerate -m "..." --rev-id ` (run via `uv`, not `pip`) and then hand-edit. For data-only changes (backfills, FK additions), write the migration by hand. +3. Read at least one recent migration (e.g., `060_v1_assorted_cleanups.py`) to match the project's docstring style and operation patterns. + +## Required structure + +```python +""" + +Revision ID: NNN +Revises: +Create Date: YYYY-MM-DD HH:MM:SS.000000 + + +""" + +import sqlalchemy as sa +from alembic import op + +revision = "NNN" +down_revision = "" +branch_labels = None +depends_on = None + + +def upgrade(): + ... + + +def downgrade(): + ... +``` + +## Hard rules + +- **`downgrade()` is mandatory and must actually reverse `upgrade()`.** Empty `pass` is a blocker. If reverse is truly impossible (e.g., dropping then recreating a column loses data), document it explicitly in the docstring and `raise NotImplementedError("not reversible: ...")` in downgrade — but try harder first. +- **Backfills go inside `upgrade()` SQL** using `op.execute(...)`, not as a separate manual script. Same for cleanup of orphan rows before adding constraints. +- **New tables** must include: + - `id` primary key. + - `inserted_at` (NOT `created_at`) and `updated_at` timestamps. Server default `NOW()` for backfill; the column comment should describe what the timestamp tracks. + - `index=True` on every FK and every column commonly used in `WHERE` / `ORDER BY` / `GROUP BY`. + - `sa.Column(..., comment="...")` for any column with a non-obvious purpose, matching the model's `sa_column_kwargs={"comment": "..."}`. +- **Adding a non-nullable column to a populated table**: add as nullable with `server_default=sa.text("...")`, backfill, then `ALTER COLUMN ... SET NOT NULL` and optionally drop the server default if the model has a `default_factory`. See `060_v1_assorted_cleanups.py` for the exact pattern. +- **Adding a unique constraint to a populated table**: dedupe first (`op.execute("DELETE ... USING ...")`), then `CREATE UNIQUE INDEX ... CONCURRENTLY` and `ALTER TABLE ... ADD CONSTRAINT ... USING INDEX` so the build doesn't take `AccessExclusiveLock`. +- **Index builds on large tables**: use `CREATE INDEX CONCURRENTLY` via raw `op.execute(...)`. Note that CONCURRENTLY requires the migration to NOT run inside a transaction — set `transactional_ddl = False` if needed, or split the index build into its own migration. + +## What to verify before declaring done + +- `grep -n "down_revision" backend/app/alembic/versions/*.py` shows your `revision` is unique and the chain `... → ` is intact. +- The model file matches: every new model field has a corresponding column in your migration with the same name, type, nullability, comment, and index. Conversely every column you add exists in the model. +- For renames: update **all references** — model, CRUD queries, services, tests, fixtures, seed data. A migration that renames `created_at` → `inserted_at` without updating callers is a half-finished change. +- Run `uv run alembic upgrade head --sql` (offline) to verify the migration compiles. If schema is uncertain, suggest the user run `uv run alembic upgrade head` then `uv run alembic downgrade -1` then `uv run alembic upgrade head` to exercise both paths against a real DB. + +## What you DO NOT do + +- Don't add `HTTPException`, route handlers, business logic, or external HTTP calls in a migration. +- Don't write `print(...)` debug statements — use the migration docstring. +- Don't skip the docstring. The docstring is what someone debugging at 2am will read. +- Don't import from `app.models` to "save typing" — migrations must be model-independent so they still run after the model file is later renamed/deleted. + +## Output for the user + +When the migration is written, tell the user: +1. The new file path and revision id. +2. Any caller updates still needed (models, CRUD, tests). +3. The exact `uv run alembic ...` command(s) to apply and verify it. diff --git a/.claude/agents/model-writer.md b/.claude/agents/model-writer.md new file mode 100644 index 000000000..f4dba9674 --- /dev/null +++ b/.claude/agents/model-writer.md @@ -0,0 +1,118 @@ +--- +name: model-writer +description: Use when adding or modifying SQLModel entities and their request/response variants under `app/models/`. Handles the Base/Create/Update/Public split, `sa_column_kwargs={"comment": "..."}` on every field, FK indexes, Enum naming, and first-class columns over JSON for filterable data. +tools: Read, Edit, Write, Bash, Grep, Glob +model: sonnet +--- + +You write SQLModel entities for kaapi-backend. Models live in `app/models/` and follow a strict house style. Read `app/models/user.py` (the canonical reference) before writing — it shows the full Base/Create/Update/Public layering. + +## Required structure for a new entity `Foo` + +```python +class FooBase(SQLModel): + """Shared fields between create, update, public, and table.""" + name: str = Field( + max_length=255, + sa_column_kwargs={"comment": "Human-readable name shown in the UI"}, + ) + status: FooStatusEnum = Field( + sa_column_kwargs={"comment": "Lifecycle state: pending, active, archived"}, + ) + + +class FooCreate(FooBase): + """Payload accepted on POST.""" + # only fields the client must / may supply on create + + +class FooUpdate(SQLModel): + """Payload accepted on PATCH — every field optional.""" + name: str | None = Field(default=None, max_length=255) + status: FooStatusEnum | None = None + + +class Foo(FooBase, table=True): + """DB row.""" + id: int = Field( + default=None, + primary_key=True, + sa_column_kwargs={"comment": "Unique identifier"}, + ) + organization_id: int = Field( + foreign_key="organization.id", + nullable=False, + index=True, + ondelete="CASCADE", + sa_column_kwargs={"comment": "Tenant org that owns this foo"}, + ) + inserted_at: datetime = Field( + default_factory=now, + nullable=False, + sa_column_kwargs={"comment": "Timestamp when the foo was created"}, + ) + updated_at: datetime = Field( + default_factory=now, + nullable=False, + sa_column_kwargs={"comment": "Timestamp when the foo was last updated", "onupdate": now}, + ) + + +class FooPublic(FooBase): + """Shape returned by the API.""" + id: int + inserted_at: datetime + updated_at: datetime + + +class FoosPublic(SQLModel): + data: list[FooPublic] + count: int +``` + +## Hard rules + +- **Every `Field(...)` gets `sa_column_kwargs={"comment": "..."}`.** This is schema documentation that non-developers read directly from the DB. Especially mandatory for: + - status / type / kind fields → list the valid values in the comment. + - JSON columns → describe the expected structure. + - Foreign keys → name the relationship. + - Anything whose purpose isn't obvious from the name. +- **Timestamps are `inserted_at` and `updated_at`.** NOT `created_at`. Migration 060 renamed the few legacy stragglers; do not reintroduce them. +- **Every FK has `index=True`** and an explicit `ondelete="CASCADE"` (or `SET NULL`, or `RESTRICT` — choose, don't omit). +- **Enums end in `Enum`.** `FooStatusEnum`, not `FooStatus`. snake_case for values when stored as strings. +- **JSON columns are for opaque metadata only.** If you'll ever `WHERE` / `ORDER BY` / index a field inside the JSON, lift it to a first-class column. Tell the user when you make this call. +- **Type hints use `|` unions** (Python 3.10+): `str | None`, not `Optional[str]`. + +## Naming + +- Class name = singular PascalCase: `Foo`, `Document`, `ApiKey`. +- Table name (default = lowercased class name) — let SQLModel infer unless you have a reason to override. +- Plural Public wrapper: `FoosPublic` (matches `UsersPublic`). +- Enum values: lowercase snake_case strings if string-valued. + +## Validation + +- Validate at the model layer with `Field(min_length=..., max_length=..., regex=..., ge=..., le=...)`. Don't push trivial validation into routes. +- `EmailStr` from `pydantic` for emails, not `str`. +- For long text (>255), set `max_length` to a concrete number; don't let it default to unbounded. + +## Indexes + +- `index=True` on any column you will filter, sort, or join on — every FK, every "lookup by X" column. +- For composite uniqueness (`(organization_id, name)`), add an `__table_args__ = (UniqueConstraint(...),)` block. Don't rely on app-level checks. + +## What you DO NOT do + +- Don't write the migration here — that's `migration-writer`. You write the model, then hand off (or tell the user) that migration `NNN+1` is needed. +- Don't import from `fastapi`, `app.crud`, or `app.services` in a model file. Models are leaf nodes. +- Don't use `setattr` on instances of these models. Use `model_copy(update={...})` or `sqlmodel_update(...)` (see `app/crud/user.py:update_user` for the pattern). +- Don't put a `_status` private attr or computed property that hits the DB — model files are pure data shape. +- Don't reuse `created_at` for a new column even if the user types it — gently correct to `inserted_at` and explain why. + +## After writing + +Tell the user: +1. The model variants added (Base, Create, Update, Public, table). +2. Which fields need indexes that aren't obvious. +3. **Explicitly:** "You now need a migration to add this to the DB. Hand off to `migration-writer` with `--rev-id `." Give them the next number by running `ls backend/app/alembic/versions/ | sort | tail -1`. +4. Whether `__init__.py` re-exports need updating so `from app.models import Foo` works. diff --git a/.claude/agents/route-writer.md b/.claude/agents/route-writer.md new file mode 100644 index 000000000..bba44b1d6 --- /dev/null +++ b/.claude/agents/route-writer.md @@ -0,0 +1,90 @@ +--- +name: route-writer +description: Use when adding or modifying FastAPI endpoints under `app/api/routes/`. Handles `response_model=APIResponse[T]`, `description=load_description(...)`, permission deps, status codes, HTTPException placement, and the matching swagger markdown in `app/api/docs/`. +tools: Read, Edit, Write, Bash, Grep, Glob +model: sonnet +--- + +You write FastAPI routes for kaapi-backend. Routes live in `app/api/routes/` and follow a strict house style. Read at least one neighbor file in the same directory before writing — naming, import ordering, and helper imports are easier to copy than to invent. + +## Required ingredients for every endpoint + +1. **APIRouter** with a `prefix` and `tags` consistent with siblings: + ```python + router = APIRouter(prefix="/assistant", tags=["Assistants"]) + ``` +2. **`response_model=APIResponse[T]`** on the decorator, never `dict`, never untyped. Use the actual Pydantic / SQLModel return type, not `Any`. +3. **`status_code=201` / `204`** on create / delete; default 200 is fine for GET / PATCH. +4. **`description=load_description("/.md")`** instead of inline docstrings for the swagger description. The matching markdown lives at `backend/app/api/docs//.md` — create it in the same change. +5. **`dependencies=[Depends(require_permission(Permission.XYZ))]`** when the endpoint is restricted. Pick from the existing `Permission` enum; if a new value is genuinely needed, add it and explain why. +6. **`SessionDep` and `AuthContextDep`** for db + current user/org/project. Never re-implement these. +7. **Return `APIResponse.success_response(...)`** at the end — never a raw model. +8. **Type hints on every parameter and the return.** Path/query params use `Annotated[..., Path(description=...)]` / `Annotated[..., Query(...)]`. + +## Canonical example (matches `app/api/routes/users.py:120`) + +```python +@router.get( + "/me", + description=load_description("users/get_me.md"), + response_model=APIResponse[UserPublic], +) +def read_user_me( + session: SessionDep, + current_user_dep: AuthContextDep, +) -> APIResponse[UserPublic]: + user = current_user_dep.user + return APIResponse.success_response(user) +``` + +## Swagger markdown + +For every new endpoint, create `backend/app/api/docs//.md`. Keep it terse — 1-3 short paragraphs. Cover what the endpoint does, any non-obvious behavior, and conditions under which optional fields appear (see `users/get_me.md` for the shape). + +## Layering rules + +- **Routes are thin.** Pull arguments, call a CRUD or service function, wrap the result. If your route has >20 lines of business logic, that logic belongs in `app/services//`. +- **HTTPException is allowed here.** Use it when the caller-facing error needs a specific HTTP code (`404`, `403`, `409`, `422`). Catch domain exceptions from CRUD/services and translate. +- **Never call third-party HTTP from a route.** That belongs in `app/services/`. +- **Never write SQL or `session.exec(select(...))` in a route.** Use a CRUD function. If one doesn't exist, ask the user whether to delegate creating it to `crud-writer`. + +## Status codes (the ones to get right) + +- `201` on POST create. +- `204` on DELETE (no body — return nothing, not `APIResponse.success_response(None)`). +- `409` on conflict (unique constraint violation, duplicate name). +- `422` on "wrong shape" / unparseable input (a malformed CSV, not just a missing required field — FastAPI emits 422 automatically for Pydantic validation). +- `400` for genuinely "bad client input that's not a shape issue". +- Don't return 200 + `{"error": "..."}` — raise `HTTPException` with the right code. + +## Ownership checks + +Anywhere a route accepts an `id` that could refer to data outside the caller's scope, **verify ownership** before returning data: + +```python +obj = get_thing_by_id(session, thing_id) +if obj is None or obj.organization_id != current_user.organization_.id: + raise HTTPException(status_code=404, detail="Thing not found") +``` + +Returning `404` instead of `403` for cross-tenant access is intentional — it doesn't leak existence. + +## Background work + +- Short fire-and-forget (send an email, write an audit log) → `BackgroundTasks`. +- Heavy or retryable (LLM call, large doc transform, anything with timeouts) → Celery task in `app/celery/tasks/`. Hand off to `celery-task-writer`. + +## What you DO NOT do + +- Don't add the route registration in `app/api/main.py` (or wherever the aggregator lives) without checking the existing alphabetical / grouped order. +- Don't return raw `dict`, `JSONResponse`, or untyped responses. +- Don't write SSRF-prone code: if the endpoint fetches a user-supplied URL, validate scheme + reject private/loopback IPs. +- Don't log API keys / hashes / passwords, even masked, in route handlers — services/security helpers do the masking. + +## After writing + +Tell the user: +1. The route file and line range you added. +2. The swagger markdown you created. +3. Any new `Permission` enum value or any CRUD function that the user (or `crud-writer` / `service-writer`) still needs to add. +4. A suggested `curl` or `httpie` invocation to smoke-test the endpoint. diff --git a/.claude/agents/service-writer.md b/.claude/agents/service-writer.md new file mode 100644 index 000000000..2439005de --- /dev/null +++ b/.claude/agents/service-writer.md @@ -0,0 +1,103 @@ +--- +name: service-writer +description: Use when adding or modifying business logic under `app/services/`. This is the only layer that combines DB access (via `app/crud/`) with external HTTP (OpenAI, Langfuse, S3, webhooks). Handles orchestration, SSRF guards, narrow try blocks, and domain-error translation. +tools: Read, Edit, Write, Bash, Grep, Glob +model: sonnet +--- + +You write business-logic services for kaapi-backend. Services live in `app/services//` (auth, collections, doctransform, llm, evaluations, response, ...). Services are where orchestration happens — they call CRUD for DB work and call external HTTP libraries for third-party APIs. + +## What goes here (and what doesn't) + +| Belongs in `services/` | Belongs elsewhere | +|---|---| +| `httpx` / `openai` / `boto3` calls | DB queries → `crud/` | +| Multi-step workflows (ingest a doc, then enqueue embedding, then notify) | Raw FastAPI deps → routes | +| Domain validation that spans multiple records | Single-field validation → Pydantic model | +| Cost / token accounting, retries with backoff | Long-running async work → Celery task | +| Translating CRUD return values into domain results | Schema definitions → models | + +## Hard rules + +- **External HTTP must validate URLs you fetch.** Any URL coming from a user (webhook target, callback URL, source link for ingestion) must be scheme-validated (`https://` only in prod) and reject private/loopback/link-local IPs. SSRF is a blocker, not a follow-up. +- **`try` wraps only the throwing line(s).** Big try blocks are the #1 source of swallowed 404s becoming 500s. +- **Concrete exception types** — `except httpx.HTTPStatusError as e:`, not `except Exception`. +- **Logger prefix:** every line starts `[function_name]`. Mask credentials / API keys / hashes. +- **Keyword-only args** for anything more than `(session, x)`, matching the CRUD convention. +- **Type hints on every parameter and return.** No `-> Any`. + +## `HTTPException` in services + +It's acceptable here — `services/auth.py` raises `HTTPException` directly when the domain failure maps cleanly to an HTTP status. Use it sparingly; when the same service may be called from a Celery task or CLI, prefer a domain exception that the route layer translates. + +## Canonical example (from `app/services/auth.py`) + +```python +import logging +from datetime import timedelta + +from app.core import security +from app.core.config import settings + +logger = logging.getLogger(__name__) + + +def create_token_pair( + user_id: int, + organization_id: int | None = None, + project_id: int | None = None, +) -> tuple[str, str]: + """Create an access token and refresh token pair.""" + access_token = security.create_access_token( + user_id, + expires_delta=timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES), + organization_id=organization_id, + project_id=project_id, + ) + refresh_token = security.create_refresh_token( + user_id, + expires_delta=timedelta(minutes=settings.REFRESH_TOKEN_EXPIRE_MINUTES), + organization_id=organization_id, + project_id=project_id, + ) + return access_token, refresh_token +``` + +Delegation to `app.core.security` is the pattern — services orchestrate; primitives live in `app/core/`. + +## External HTTP — checklist + +- **Timeout** — every `httpx`/`requests` call has an explicit timeout. The default is too long. +- **Retry policy** — idempotent GETs can retry with backoff. Mutations should retry only if you're certain the API is idempotent or you have an idempotency key. +- **Error mapping** — `httpx.HTTPStatusError` → a domain exception or `HTTPException` with a sensible code (often 502 for upstream failures, NOT 500). +- **Mock at this boundary in tests** — `monkeypatch` the HTTP client, not the DB. (See `test-writer` agent.) + +## Calling CRUD + +- Services own the `session` lifecycle for the operation. Call CRUD functions with `session=session` keyword arg. +- If CRUD returns `None`, decide whether that's a domain error (`raise NotFoundError`), a 404 (`raise HTTPException(404)`), or a silent skip (`return early`). Be explicit. + +## Config / secrets + +- Read from `settings` (`app.core.config`). Never read `os.environ` directly in a service. +- Defaults should lean toward cheap/safe: smallest model, lowest token cap, shortest TTL. Aggressive defaults belong in env, not code. + +## Magic values + +If you write the string `"openai"` or `"text-embedding-3-small"` or `1_000_000` in a service, ask whether it should be a constant / Enum / setting. Grep for the same literal — if it appears elsewhere, it should already be a constant. + +## What you DO NOT do + +- Don't write SQL directly — call CRUD. +- Don't import `fastapi.APIRouter` or define routes. +- Don't write long-running blocking loops — that's a Celery task. +- Don't call `time.sleep` inside an `async def` (use `asyncio.sleep`). +- Don't catch `HTTPException` from a sub-call and swallow it — propagate. + +## After writing + +Tell the user: +1. The service function(s) added (path + signature). +2. Which CRUD functions you call and which you still need. +3. Any external HTTP boundary that the test layer should mock. +4. Any new env / settings key the user must add to `.env.example`. diff --git a/.claude/agents/test-writer.md b/.claude/agents/test-writer.md new file mode 100644 index 000000000..fc19ec00b --- /dev/null +++ b/.claude/agents/test-writer.md @@ -0,0 +1,91 @@ +--- +name: test-writer +description: Use when writing or updating tests under `app/tests/` for kaapi-backend. Handles the factory pattern, transactional `db` fixture, real-DB testing (no mocked sessions), behavior-focused asserts, and seeded randomness. +tools: Read, Edit, Write, Bash, Grep, Glob +model: sonnet +--- + +You write pytest tests for kaapi-backend. Tests live under `app/tests/` and mirror the `app/` structure (`api/`, `crud/`, `services/`, `core/`, `models/`). + +## Hard rules + +- **Real DB only — never mock the database session.** This repo's `conftest.py` provides a transactional `db` fixture that rolls back after each test. Use it. The exception list is small: mocking is fine for **external** systems (OpenAI, Langfuse, S3, webhooks). Database = real. +- **Use the factory pattern from `app/tests/utils/`.** Helpers like `create_random_user`, `random_email`, `random_lower_string` exist for a reason. No hardcoded `organization_id=1`, no inline `User(...)` instances with magic ids. +- **Behavior, not implementation.** Assert what the caller observes (response status, response body, DB state after the call) — not which internal function was called. +- **Seed randomness.** If a test uses `random.random()` or similar, seed it. Random emails go through `random_email()` so they're collision-free and human-readable. +- **Bug fix → regression test.** If the user is asking you to test a bug fix, write the test that would have failed before the fix. + +## Fixtures available (from `conftest.py`) + +- `db: Session` — transactional, function-scoped. Use this in CRUD and service tests. +- `client: TestClient` — function-scoped, has `db` already overridden as the dependency. Use this in API tests. +- `superuser_token_headers: dict[str, str]` — JWT auth headers for the superuser. +- `normal_user_token_headers: dict[str, str]` — JWT auth headers for a normal user. +- `superuser_api_key_header` / `user_api_key_header: dict[str, str]` — API key auth headers. +- `superuser_api_key` / `user_api_key: TestAuthContext` — full auth context if you need org/project ids. +- `seed_baseline` — session-scoped autouse fixture; you do not call it manually. + +## Test factory utilities (`app/tests/utils/`) + +- `user.py`: `create_random_user(db)`, `authentication_token_from_email(...)` +- `auth.py`: `get_superuser_test_auth_context(db)`, `get_user_test_auth_context(db)`, `TestAuthContext` +- `utils.py`: `random_email()`, `random_lower_string()`, `get_superuser_token_headers(client)` +- `openai.py`, `llm.py`, `llm_provider.py`, `collection.py`, `document.py` — per-domain factories. **Read these before writing new factories.** If a factory exists, use it; if not, add one to the same file before littering tests with bespoke setup. + +## Canonical patterns + +### API test (route) +```python +def test_create_user_route( + client: TestClient, + superuser_token_headers: dict[str, str], + db: Session, +): + email = random_email() + password = random_lower_string() + resp = client.post( + f"{settings.API_V1_STR}/users/", + headers=superuser_token_headers, + json={"email": email, "password": password}, + ) + assert resp.status_code == 201 + body = resp.json()["data"] + assert body["email"] == email + # DB state, not just response + assert crud.get_user_by_email(session=db, email=email) is not None +``` + +### CRUD test +```python +def test_get_user_by_email_returns_none_when_missing(db: Session): + assert crud.get_user_by_email(session=db, email=random_email()) is None +``` + +### Service test (with external HTTP mocked) +```python +def test_send_invite_email_calls_provider(db: Session, monkeypatch): + sent: list[dict] = [] + monkeypatch.setattr("app.utils.send_email", lambda **kw: sent.append(kw)) + service_under_test.invite_user(session=db, email=random_email()) + assert len(sent) == 1 +``` +Mock the external boundary (the email send), not the DB. + +## Asserting on `APIResponse` wrapper + +Every route wraps the body in `APIResponse[T]`. Tests should pull `body = resp.json()["data"]` and assert on that, not `resp.json()` directly. If the route returns a list, check `body["count"]` and `body["data"]` (or whatever the wrapper shape is — confirm by reading `app/utils/api_response.py` or whichever file defines `APIResponse`). + +## Things to flag (do not silently fix) + +- A bug fix arriving without a regression test → say so explicitly and write one. +- A "test" that mocks the DB session → refactor it onto the `db` fixture. +- `assert resp.status_code == 200` for a POST that should be 201, or for a DELETE that should be 204 — call out the wrong code. +- Tests asserting `mock.called` with no behavioral check — these are tautological; replace with an assertion on observable state. +- Hardcoded `organization_id=1` or `project_id=1` — replace with the auth-context fixtures. + +## Running tests + +- All tests: `uv run bash scripts/tests-start.sh` +- A subset (when iterating): `uv run pytest backend/app/tests/api/test_users.py -k -x` + +After writing, run the relevant subset. If the test fails for an unexpected reason (not the bug under test), diagnose before declaring done. diff --git a/CLAUDE.md b/CLAUDE.md index 53d09c7e9..013e3918c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -21,7 +21,7 @@ fastapi run --reload app/main.py uv run pre-commit run --all-files # Generate database migration (rev-id should be latest existing revision ID + 1) -alembic revision --autogenerate -m "Description" --rev-id 040 +uv run alembic revision --autogenerate -m "Description" --rev-id 061 # Seed database with test data uv run python -m app.seed_data.seed_data @@ -93,47 +93,31 @@ The application uses different environment files: ## Coding Conventions -### Type Hints - -Always add type hints to all function parameters and return values. - -### Logging Format - -Prefix all log messages with the function name in square brackets. - -```python -logger.info(f"[function_name] Message {mask_string(sensitive_value)}") -``` - -### Database Column Comments - -Use sa_column_kwargs["comment"] to describe database columns, especially when the purpose isn’t obvious. This helps non-developers understand column purposes directly from the database schema: - -```python -field_name: int = Field( - foreign_key="table.id", - nullable=False, - ondelete="CASCADE", - sa_column_kwargs={"comment": "What this column represents"} -) -``` - -Prioritize comments for: -- Columns with non-obvious purposes -- Status/type fields (document valid values) -- JSON/metadata columns (describe expected structure) -- Foreign keys (clarify the relationship) - -### Endpoint Documentation - -Load Swagger descriptions from external markdown files instead of inline strings: - -```python -@router.post( - "/endpoint", - description=load_description("domain/action.md"), - response_model=APIResponse[ResponseModel], -) -``` - -Store documentation files in `backend/app/api/docs//.md` +Layer-specific conventions live in `.claude/agents/*.md` and are enforced by the matching specialist subagent (e.g., `route-writer` for `app/api/routes/`, `model-writer` for `app/models/`, `migration-writer` for alembic). CLAUDE.md only covers rules that apply across every layer. + +### Cross-cutting rules + +- **Type hints** on every parameter and return value. `-> Any` is not an annotation — narrow it or drop it. +- **Logging prefix:** every log line starts with the function name in square brackets. + ```python + logger.info(f"[function_name] Message | key: {value}") + ``` +- **`uv` is the runner**, not `pip`. Examples: `uv run pytest`, `uv run alembic ...`, `uv run pre-commit run --all-files`. +- **No magic values** in code — extract repeated literals to constants / `Enum` / settings. +- **Naming:** `list_*` for plural fetch, `get_*` for singletons; snake_case funcs/vars, PascalCase classes, UPPER_SNAKE constants; `Enum` suffix on enum classes. +- **Timestamps** are `inserted_at` / `updated_at` (not `created_at`). + +## Specialist subagents + +When working in a specific layer, the matching agent under `.claude/agents/` handles the layer's conventions automatically. Pick by layer, or just describe the task and let the main agent route: + +| Agent | Layer | +|---|---| +| `route-writer` | `app/api/routes/` | +| `crud-writer` | `app/crud/` | +| `service-writer` | `app/services/` | +| `model-writer` | `app/models/` | +| `migration-writer` | `app/alembic/versions/` | +| `celery-task-writer` | `app/celery/tasks/` | +| `test-writer` | `app/tests/` | +| `convention-reviewer` | Cross-cutting pre-commit gate (mirrors `/pr-review`) | From cd4e0c66318b392be5fa0047e13487e5c4e14b37 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Sat, 16 May 2026 23:03:30 +0530 Subject: [PATCH 2/8] add consistent logging --- .claude/agents/celery-task-writer.md | 3 +-- .claude/agents/crud-writer.md | 7 +------ .claude/agents/migration-writer.md | 2 +- .claude/agents/route-writer.md | 4 ++++ .claude/agents/service-writer.md | 4 +++- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.claude/agents/celery-task-writer.md b/.claude/agents/celery-task-writer.md index 9fc5fc33c..c5c346c01 100644 --- a/.claude/agents/celery-task-writer.md +++ b/.claude/agents/celery-task-writer.md @@ -64,8 +64,7 @@ Document the choice in a comment if it's not obvious. ## Logging - `logger = logging.getLogger(__name__)` at the module top. -- Every log line prefixed `[task_name]` — e.g., `logger.info(f"[run_my_job] Starting | project_id: {project_id}, job_id: {job_id}")`. -- Log start, finish, and any retry. **Don't log payload contents** if they may contain PII / credentials. +- Every line is `logger.info(f"[task_name] Message | key: {value}")`. Log start, finish, and any retry. Mask PII / credentials with `mask_string` from `app.utils` — **never log raw payloads** if they may contain sensitive data. ## What you DO NOT do diff --git a/.claude/agents/crud-writer.md b/.claude/agents/crud-writer.md index fd0655b0c..ee714eff9 100644 --- a/.claude/agents/crud-writer.md +++ b/.claude/agents/crud-writer.md @@ -12,12 +12,7 @@ You write CRUD functions for kaapi-backend. CRUD lives in `app/crud/` and is the - **No `HTTPException` in this layer.** Ever. Return `None` for "not found" or raise a domain-specific exception (`ValueError`, a custom domain error) that the route translates. - **No third-party HTTP calls.** No `httpx`, no `openai`, no boto3, no `requests`. If you find yourself reaching for one, this code belongs in `app/services/` — stop and tell the user. - **No business logic.** Validation, orchestration, multi-step workflows → services. CRUD is "read this row, write this row, list these rows with filters". -- **No `print`. Use `logger`.** Every module starts with: - ```python - import logging - logger = logging.getLogger(__name__) - ``` - Every log line is prefixed: `logger.info(f"[function_name] ... | key: {value}")`. Mask anything sensitive (`mask_string(...)` from `app/core/util.py` if it exists in the repo — grep first). +- **No `print`. Use `logger`.** Module top: `import logging; logger = logging.getLogger(__name__)`. Every line is `logger.info(f"[function_name] Message | key: {value}")`. Mask sensitive values with `mask_string` from `app.utils` — e.g. `f"... | email: {mask_string(email)}"`. ## Canonical function shape (from `app/crud/user.py`) diff --git a/.claude/agents/migration-writer.md b/.claude/agents/migration-writer.md index 658a77cf6..9a71aa69b 100644 --- a/.claude/agents/migration-writer.md +++ b/.claude/agents/migration-writer.md @@ -66,7 +66,7 @@ def downgrade(): ## What you DO NOT do - Don't add `HTTPException`, route handlers, business logic, or external HTTP calls in a migration. -- Don't write `print(...)` debug statements — use the migration docstring. +- Don't write `print(...)` debug statements — use the migration docstring. If a long backfill genuinely needs progress logging, use `logging.getLogger("alembic.runtime.migration")` with the standard `[] ...` prefix. - Don't skip the docstring. The docstring is what someone debugging at 2am will read. - Don't import from `app.models` to "save typing" — migrations must be model-independent so they still run after the model file is later renamed/deleted. diff --git a/.claude/agents/route-writer.md b/.claude/agents/route-writer.md index bba44b1d6..c2555284c 100644 --- a/.claude/agents/route-writer.md +++ b/.claude/agents/route-writer.md @@ -74,6 +74,10 @@ Returning `404` instead of `403` for cross-tenant access is intentional — it d - Short fire-and-forget (send an email, write an audit log) → `BackgroundTasks`. - Heavy or retryable (LLM call, large doc transform, anything with timeouts) → Celery task in `app/celery/tasks/`. Hand off to `celery-task-writer`. +## Logging + +`logger = logging.getLogger(__name__)` at the module top. Every line is `logger.info(f"[handler_name] Message | key: {value}")`. Log non-trivial actions (creates, deletes, ownership failures) — don't spam `info` on every GET. Mask sensitive values with `mask_string` from `app.utils`. + ## What you DO NOT do - Don't add the route registration in `app/api/main.py` (or wherever the aggregator lives) without checking the existing alphabetical / grouped order. diff --git a/.claude/agents/service-writer.md b/.claude/agents/service-writer.md index 2439005de..e06f966b7 100644 --- a/.claude/agents/service-writer.md +++ b/.claude/agents/service-writer.md @@ -22,7 +22,7 @@ You write business-logic services for kaapi-backend. Services live in `app/servi - **External HTTP must validate URLs you fetch.** Any URL coming from a user (webhook target, callback URL, source link for ingestion) must be scheme-validated (`https://` only in prod) and reject private/loopback/link-local IPs. SSRF is a blocker, not a follow-up. - **`try` wraps only the throwing line(s).** Big try blocks are the #1 source of swallowed 404s becoming 500s. - **Concrete exception types** — `except httpx.HTTPStatusError as e:`, not `except Exception`. -- **Logger prefix:** every line starts `[function_name]`. Mask credentials / API keys / hashes. +- **Logger prefix:** every line is `logger.info(f"[function_name] Message | key: {value}")`. Mask credentials / API keys / hashes / emails with `mask_string` from `app.utils`. Log start + finish of external HTTP calls and any retry. - **Keyword-only args** for anything more than `(session, x)`, matching the CRUD convention. - **Type hints on every parameter and return.** No `-> Any`. @@ -38,6 +38,7 @@ from datetime import timedelta from app.core import security from app.core.config import settings +from app.utils import mask_string logger = logging.getLogger(__name__) @@ -60,6 +61,7 @@ def create_token_pair( organization_id=organization_id, project_id=project_id, ) + logger.info(f"[create_token_pair] Token pair issued | user_id: {user_id}, access_token: {mask_string(access_token)}") return access_token, refresh_token ``` From 74058f752b4a645cea759eb4ba6ea9bc578125b3 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 20 May 2026 15:54:50 +0530 Subject: [PATCH 3/8] first stab at fast evaluation using agents --- .../063_add_run_mode_to_evaluation_run.py | 112 +++ .../api/docs/evaluation/create_evaluation.md | 42 +- .../app/api/docs/evaluation/list_datasets.md | 9 +- backend/app/api/routes/evaluations/dataset.py | 22 +- .../app/api/routes/evaluations/evaluation.py | 28 +- backend/app/celery/celery_app.py | 7 + backend/app/celery/tasks/evaluation_fast.py | 51 ++ backend/app/celery/utils.py | 16 + backend/app/core/config.py | 6 + backend/app/crud/evaluations/__init__.py | 9 + backend/app/crud/evaluations/core.py | 3 + backend/app/crud/evaluations/fast.py | 846 ++++++++++++++++++ backend/app/models/__init__.py | 1 + backend/app/models/evaluation.py | 23 + backend/app/services/evaluations/__init__.py | 5 + .../app/services/evaluations/evaluation.py | 36 +- backend/app/services/evaluations/fast.py | 289 ++++++ .../tests/api/routes/test_evaluation_fast.py | 729 +++++++++++++++ 18 files changed, 2210 insertions(+), 24 deletions(-) create mode 100644 backend/app/alembic/versions/063_add_run_mode_to_evaluation_run.py create mode 100644 backend/app/celery/tasks/evaluation_fast.py create mode 100644 backend/app/crud/evaluations/fast.py create mode 100644 backend/app/services/evaluations/fast.py create mode 100644 backend/app/tests/api/routes/test_evaluation_fast.py diff --git a/backend/app/alembic/versions/063_add_run_mode_to_evaluation_run.py b/backend/app/alembic/versions/063_add_run_mode_to_evaluation_run.py new file mode 100644 index 000000000..77d7e8bce --- /dev/null +++ b/backend/app/alembic/versions/063_add_run_mode_to_evaluation_run.py @@ -0,0 +1,112 @@ +"""add run_mode column and unique run-name constraint to evaluation_run + +Revision ID: 063 +Revises: 062 +Create Date: 2026-05-20 00:00:00.000000 + +Two schema changes are required to support the fast-evaluation feature: + +1. run_mode VARCHAR(10) NOT NULL DEFAULT 'batch' + Every evaluation run must now carry an explicit execution mode ('batch' + or 'fast'). Because evaluation_run may already contain rows the column + is added as nullable with a server_default of 'batch' so existing rows + are backfilled atomically, then SET NOT NULL is applied. The server + default is kept on the column to act as a safety net for any insert path + that does not explicitly set the field; the SQLModel default="batch" will + also cover application inserts. + +2. UNIQUE constraint uq_evaluation_run_org_project_run_name + on (organization_id, project_id, run_name) + Guards against double-click races on POST /evaluations where two + concurrent requests could create duplicate runs with the same name inside + the same project. Lower environments may already contain duplicate + (organization_id, project_id, run_name) tuples, so we dedupe first + (keeping the lowest-id survivor), then build the unique index + CONCURRENTLY (no AccessExclusiveLock on the table during the build) and + attach it as a named constraint via ADD CONSTRAINT ... USING INDEX. + Because CONCURRENTLY cannot execute inside a transaction the whole + migration is marked disable_per_migration_transaction = True and each + CONCURRENTLY step is wrapped in autocommit_block(). The dedupe DELETE and + the non-concurrent DDL steps run inside autocommit mode too, which is + safe — they are each individually atomic at the statement level. +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "063" +down_revision = "062" +branch_labels = None +depends_on = None + +disable_per_migration_transaction = True + +_UNIQUE_INDEX = "uq_evaluation_run_org_project_run_name" +_UNIQUE_CONSTRAINT = "uq_evaluation_run_org_project_run_name" + + +def upgrade(): + # 1. Add run_mode as nullable first so existing rows are backfilled by the + # server default, then tighten to NOT NULL. The server default is left + # in place as a safety net. + with op.get_context().autocommit_block(): + op.add_column( + "evaluation_run", + sa.Column( + "run_mode", + sa.String(length=10), + nullable=True, + server_default=sa.text("'batch'"), + comment="Execution mode: batch or fast", + ), + ) + op.execute("ALTER TABLE evaluation_run ALTER COLUMN run_mode SET NOT NULL") + + # 2. Dedupe existing rows before adding the unique constraint. + # Keep the lowest-id row for each (organization_id, project_id, + # run_name) tuple and remove the rest. + with op.get_context().autocommit_block(): + op.execute( + """ + DELETE FROM evaluation_run + WHERE id IN ( + SELECT id + FROM ( + SELECT id, + ROW_NUMBER() OVER ( + PARTITION BY organization_id, project_id, run_name + ORDER BY id ASC + ) AS rn + FROM evaluation_run + ) sub + WHERE rn > 1 + ) + """ + ) + + # 3. Build the unique index CONCURRENTLY so the scan does not take an + # AccessExclusiveLock, then attach it as a named constraint via + # ADD CONSTRAINT ... USING INDEX (brief catalog-only lock). + with op.get_context().autocommit_block(): + op.execute( + f"CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS " + f'"{_UNIQUE_INDEX}" ' + f"ON evaluation_run (organization_id, project_id, run_name)" + ) + op.execute( + f"ALTER TABLE evaluation_run " + f'ADD CONSTRAINT "{_UNIQUE_CONSTRAINT}" ' + f'UNIQUE USING INDEX "{_UNIQUE_INDEX}"' + ) + + +def downgrade(): + # Reverse in opposite order to upgrade(). + op.execute( + f"ALTER TABLE evaluation_run " + f'DROP CONSTRAINT IF EXISTS "{_UNIQUE_CONSTRAINT}"' + ) + with op.get_context().autocommit_block(): + op.execute(f'DROP INDEX CONCURRENTLY IF EXISTS "{_UNIQUE_INDEX}"') + op.drop_column("evaluation_run", "run_mode") diff --git a/backend/app/api/docs/evaluation/create_evaluation.md b/backend/app/api/docs/evaluation/create_evaluation.md index a719579b2..96719ed53 100644 --- a/backend/app/api/docs/evaluation/create_evaluation.md +++ b/backend/app/api/docs/evaluation/create_evaluation.md @@ -1,16 +1,22 @@ -Start an evaluation run using the OpenAI Batch API. +Start an evaluation run against a stored dataset. -Evaluations allow you to systematically test LLM configurations against -predefined datasets with automatic progress tracking and result collection. +Two execution modes are supported via the optional `run_mode` field: + +* `batch` (default) — submits the work to the OpenAI Batch API. Cost-efficient + for large datasets; turnaround can take up to 24 hours. +* `fast` — runs the evaluation synchronously through the OpenAI Responses API + and returns results within seconds-to-minutes. Restricted to text + evaluations on datasets with at most `EVAL_FAST_MAX_UNIQUE_ROWS` unique rows. **Key Features:** -* Fetches dataset items from Langfuse and creates a batch processing job via the OpenAI Batch API -* Asynchronous processing with automatic progress tracking (checks every 60s) +* Fetches dataset items from Langfuse and creates a job (batch or fast) * Uses a stored config (created via `/configs`) to define the provider parameters -* Stores results for comparison and analysis -* Use `GET /evaluations/{evaluation_id}` to monitor progress and retrieve results +* Same scoring semantics across both modes — cosine similarity, Langfuse traces, + and optional LLM-as-Judge correctness +* Use `GET /evaluations/{evaluation_id}` to monitor progress and retrieve results; + the response carries `run_mode` so clients can tell the two paths apart -## Example +## Example (batch — default) ```json { @@ -20,3 +26,23 @@ predefined datasets with automatic progress tracking and result collection. "config_version": 1 } ``` + +## Example (fast) + +```json +{ + "dataset_id": 123, + "experiment_name": "may19-temp0.2-gpt4o-fast", + "config_id": "f54f0d67-4817-4103-9fdf-b74b3d46733e", + "config_version": 1, + "run_mode": "fast" +} +``` + +## Fast-mode error responses + +| Status | Code | When | +| --- | --- | --- | +| 422 | `config_type_unsupported` | Resolved config is not a text-evaluation config | +| 422 | `dataset_too_large_for_fast` | Dataset exceeds `EVAL_FAST_MAX_UNIQUE_ROWS` unique rows | +| 409 | `run_name_already_exists` | A run with the same `experiment_name` already exists for this (organization, project) | diff --git a/backend/app/api/docs/evaluation/list_datasets.md b/backend/app/api/docs/evaluation/list_datasets.md index e315db1d0..36b7cc1c4 100644 --- a/backend/app/api/docs/evaluation/list_datasets.md +++ b/backend/app/api/docs/evaluation/list_datasets.md @@ -1,3 +1,10 @@ List all datasets for the current organization and project. -Returns a paginated list of datasets ordered by most recent first. Each dataset includes metadata (ID, name, item counts, duplication factor), Langfuse integration details, and object store URL. +Returns a paginated list of datasets ordered by most recent first. Each dataset includes metadata (ID, name, item counts, duplication factor), Langfuse integration details, object store URL, and an `eligible_for_fast` flag that is `true` when the dataset's unique-row count is within `EVAL_FAST_MAX_UNIQUE_ROWS` (and so can be used with `run_mode="fast"` on `POST /evaluations`). + +## Query parameters + +| Parameter | Description | +| --- | --- | +| `limit` / `offset` | Pagination (default 50 / 0; max limit 100) | +| `eligible_for` | If set to `fast`, the response is filtered to only datasets where `eligible_for_fast` is `true` | diff --git a/backend/app/api/routes/evaluations/dataset.py b/backend/app/api/routes/evaluations/dataset.py index 202774da2..20f59f9cf 100644 --- a/backend/app/api/routes/evaluations/dataset.py +++ b/backend/app/api/routes/evaluations/dataset.py @@ -22,6 +22,7 @@ from app.crud.evaluations.dataset import delete_dataset as delete_dataset_crud from app.models.evaluation import DatasetUploadResponse, EvaluationDataset from app.services.evaluations import ( + is_dataset_fast_eligible, upload_dataset as upload_evaluation_dataset, validate_csv_file, ) @@ -39,13 +40,15 @@ def _dataset_to_response( dataset: EvaluationDataset, signed_url: str | None = None ) -> DatasetUploadResponse: """Convert a dataset model to a DatasetUploadResponse.""" + original_items = dataset.dataset_metadata.get("original_items_count", 0) return DatasetUploadResponse( dataset_id=dataset.id, dataset_name=dataset.name, description=dataset.description, total_items=dataset.dataset_metadata.get("total_items_count", 0), - original_items=dataset.dataset_metadata.get("original_items_count", 0), + original_items=original_items, duplication_factor=dataset.dataset_metadata.get("duplication_factor", 1), + eligible_for_fast=is_dataset_fast_eligible(original_items_count=original_items), langfuse_dataset_id=dataset.langfuse_dataset_id, object_store_url=dataset.object_store_url, signed_url=signed_url, @@ -104,6 +107,15 @@ def list_datasets( default=50, ge=1, le=100, description="Maximum number of datasets to return" ), offset: int = Query(default=0, ge=0, description="Number of datasets to skip"), + eligible_for: str + | None = Query( + default=None, + description=( + "If 'fast', return only datasets eligible for run_mode='fast' " + "(unique-row count within EVAL_FAST_MAX_UNIQUE_ROWS)." + ), + enum=["fast"], + ), ) -> APIResponse[list[DatasetUploadResponse]]: """List evaluation datasets.""" datasets = list_evaluation_datasets( @@ -114,9 +126,11 @@ def list_datasets( offset=offset, ) - return APIResponse.success_response( - data=[_dataset_to_response(dataset) for dataset in datasets] - ) + responses = [_dataset_to_response(dataset) for dataset in datasets] + if eligible_for == "fast": + responses = [r for r in responses if r.eligible_for_fast] + + return APIResponse.success_response(data=responses) @router.get( diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py index 591f5d985..800b5b0cc 100644 --- a/backend/app/api/routes/evaluations/evaluation.py +++ b/backend/app/api/routes/evaluations/evaluation.py @@ -3,6 +3,7 @@ import logging from uuid import UUID +from asgi_correlation_id import correlation_id from fastapi import ( APIRouter, Body, @@ -14,11 +15,12 @@ from app.api.deps import AuthContextDep, SessionDep from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud from app.crud.evaluations.core import group_traces_by_question_id -from app.models.evaluation import EvaluationRunPublic +from app.models.evaluation import EvaluationRunPublic, RunModeEnum from app.api.permissions import Permission, require_permission from app.services.evaluations import ( get_evaluation_with_scores, start_evaluation, + validate_and_start_fast_evaluation, ) from app.utils import ( APIResponse, @@ -45,8 +47,32 @@ def evaluate( ), config_id: UUID = Body(..., description="Stored config ID"), config_version: int = Body(..., ge=1, description="Stored config version"), + run_mode: RunModeEnum = Body( + default=RunModeEnum.BATCH, + description="Execution mode: 'batch' (default) or 'fast'", + ), ) -> APIResponse[EvaluationRunPublic]: """Start an evaluation run.""" + logger.info( + f"[evaluate] Starting evaluation | run_mode={run_mode.value} | " + f"experiment_name={experiment_name} | dataset_id={dataset_id} | " + f"org_id={auth_context.organization_.id} | " + f"project_id={auth_context.project_.id}" + ) + + if run_mode == RunModeEnum.FAST: + eval_run = validate_and_start_fast_evaluation( + session=session, + dataset_id=dataset_id, + run_name=experiment_name, + config_id=config_id, + config_version=config_version, + organization_id=auth_context.organization_.id, + project_id=auth_context.project_.id, + trace_id=correlation_id.get() or "N/A", + ) + return APIResponse.success_response(data=eval_run) + eval_run = start_evaluation( session=session, dataset_id=dataset_id, diff --git a/backend/app/celery/celery_app.py b/backend/app/celery/celery_app.py index b78e7b15f..a4031b202 100644 --- a/backend/app/celery/celery_app.py +++ b/backend/app/celery/celery_app.py @@ -164,6 +164,7 @@ def initialize_worker(**_) -> None: include=[ "app.celery.tasks.job_execution", "app.celery.tasks.notifications", + "app.celery.tasks.evaluation_fast", ], ) @@ -186,6 +187,12 @@ def initialize_worker(**_) -> None: routing_key="low", queue_arguments={"x-max-priority": 1}, ), + Queue( + "evaluations", + exchange=default_exchange, + routing_key="evaluations", + queue_arguments={"x-max-priority": 6}, + ), Queue("cron", exchange=default_exchange, routing_key="cron"), Queue("default", exchange=default_exchange, routing_key="default"), ), diff --git a/backend/app/celery/tasks/evaluation_fast.py b/backend/app/celery/tasks/evaluation_fast.py new file mode 100644 index 000000000..55752ffb1 --- /dev/null +++ b/backend/app/celery/tasks/evaluation_fast.py @@ -0,0 +1,51 @@ +""" +Celery task for the synchronous (fast) text-evaluation pipeline. + +This module hosts the single orchestrator task per fast evaluation run. The +heavy lifting lives in `app/services/evaluations/fast.py`; this task is a thin +shim that sets the correlation id, attaches the OTel parent context, and +delegates. + +See `Fast Evaluation SRD.md` for the design (queue, retries, idempotency). +""" + +import logging + +from celery import current_task + +from app.celery.celery_app import celery_app +from app.celery.tasks.job_execution import _run_with_otel_parent, _set_trace +from app.celery.utils import gevent_timeout +from app.core.config import settings + +logger = logging.getLogger(__name__) + + +@celery_app.task(bind=True, queue="evaluations", priority=6) +@gevent_timeout(settings.CELERY_TASK_SOFT_TIME_LIMIT, "run_evaluation_fast") +def run_evaluation_fast( + self, eval_run_id: int, trace_id: str = "N/A", **kwargs +) -> None: + """Run the fast evaluation pipeline for one EvaluationRun. + + Idempotency: each stage is skipped on retry when its `batch_job` marker is + already set on the EvaluationRun, so Celery redelivery never re-calls + OpenAI for work that already succeeded. + + Args: + eval_run_id: ID of the EvaluationRun (run_mode="fast"). + trace_id: Correlation id from the enqueueing request, propagated into + the worker for log correlation. + """ + from app.services.evaluations.fast import execute_fast_evaluation + + _set_trace(trace_id) + logger.info( + f"[run_evaluation_fast] Starting fast evaluation task | " + f"eval_run_id={eval_run_id} | task_id={current_task.request.id}" + ) + + return _run_with_otel_parent( + self, + lambda: execute_fast_evaluation(eval_run_id=eval_run_id), + ) diff --git a/backend/app/celery/utils.py b/backend/app/celery/utils.py index 288cba7c4..f643c7df6 100644 --- a/backend/app/celery/utils.py +++ b/backend/app/celery/utils.py @@ -198,6 +198,22 @@ def start_tts_result_processing( return task_id +def start_fast_evaluation(eval_run_id: int, trace_id: str = "N/A") -> str: + """Enqueue the run_evaluation_fast orchestrator task for one EvaluationRun.""" + from app.celery.tasks.evaluation_fast import run_evaluation_fast + + task_id = _enqueue_with_trace_context( + run_evaluation_fast, + eval_run_id=eval_run_id, + trace_id=trace_id, + ) + logger.info( + f"[start_fast_evaluation] Enqueued fast eval | " + f"eval_run_id={eval_run_id} | task_id={task_id}" + ) + return task_id + + def get_task_status(task_id: str) -> Dict[str, Any]: result = AsyncResult(task_id) return { diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 720846eb9..e1b15f0da 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -171,6 +171,12 @@ def AWS_S3_BUCKET(self) -> str: DOC_TRANSFORMATION_PENDING_THRESHOLD_MINUTES: int = 30 PENDING_JOB_QUERY_TIMEOUT_MS: int = 1000 + # Fast evaluation (run_mode="fast") configuration. + # See "Fast Evaluation SRD.md" for the full design rationale. + EVAL_FAST_MAX_UNIQUE_ROWS: int = 10 + EVAL_FAST_FAILURE_THRESHOLD: float = 0.5 + EVAL_FAST_API_CONCURRENCY: int = 10 + @computed_field # type: ignore[prop-decorator] @property def COMPUTED_CELERY_WORKER_CONCURRENCY(self) -> int: diff --git a/backend/app/crud/evaluations/__init__.py b/backend/app/crud/evaluations/__init__.py index a5824c0a2..2c0bd8f48 100644 --- a/backend/app/crud/evaluations/__init__.py +++ b/backend/app/crud/evaluations/__init__.py @@ -25,6 +25,11 @@ calculate_cosine_similarity, start_embedding_batch, ) +from app.crud.evaluations.fast import ( + JOB_TYPE_EMBEDDING_FAST, + JOB_TYPE_EVALUATION_FAST, + run_fast_evaluation, +) from app.crud.evaluations.langfuse import ( create_langfuse_dataset_run, fetch_trace_scores_from_langfuse, @@ -65,6 +70,10 @@ "upload_csv_to_object_store", # Batch "start_evaluation_batch", + # Fast eval + "JOB_TYPE_EMBEDDING_FAST", + "JOB_TYPE_EVALUATION_FAST", + "run_fast_evaluation", # Processing "check_and_process_evaluation", "poll_all_pending_evaluations", diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index f3d51c56d..271384615 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -62,6 +62,7 @@ def create_evaluation_run( config_version: int, organization_id: int, project_id: int, + run_mode: str = "batch", ) -> EvaluationRun: """ Create a new evaluation run record in the database. @@ -75,6 +76,7 @@ def create_evaluation_run( config_version: Version number of the config organization_id: Organization ID project_id: Project ID + run_mode: Execution mode ("batch" default, or "fast") Returns: The created EvaluationRun instance @@ -87,6 +89,7 @@ def create_evaluation_run( config_id=config_id, config_version=config_version, status="pending", + run_mode=run_mode, organization_id=organization_id, project_id=project_id, inserted_at=now(), diff --git a/backend/app/crud/evaluations/fast.py b/backend/app/crud/evaluations/fast.py new file mode 100644 index 000000000..d78785104 --- /dev/null +++ b/backend/app/crud/evaluations/fast.py @@ -0,0 +1,846 @@ +""" +Fast evaluation orchestration. + +This module implements run_mode="fast" — the synchronous text-evaluation path +described in `Fast Evaluation SRD.md`. Unlike the batch path it does not submit +an OpenAI Batch job; instead it makes Responses + Embeddings calls in parallel +from a single Celery orchestrator task and persists per-stage units to S3. + +Stage layout (each stage is skipped on retry if its `batch_job` row already +exists, mirroring how the batch path tracks state): + + Stage 1 — Responses unit: evaluation_run.batch_job_id + Stage 2 — Embeddings unit: evaluation_run.embedding_batch_job_id + Stage 3 — Score + trace + cost (no marker; each step is idempotent) + Stage 4 — Mark completed +""" + +import logging +import random +import time +from collections.abc import Callable +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Any, TypeVar + +import numpy as np +import openai +from langfuse import Langfuse +from openai import OpenAI +from sqlmodel import Session + +from app.core.cloud.storage import get_cloud_storage +from app.core.config import settings +from app.core.storage_utils import ( + load_json_from_object_store, + upload_jsonl_to_object_store, +) +from app.crud.evaluations.batch import fetch_dataset_items +from app.crud.evaluations.core import resolve_model_from_config, update_evaluation_run +from app.crud.evaluations.cost import attach_cost +from app.crud.evaluations.embeddings import ( + EMBEDDING_MODEL, + calculate_cosine_similarity, +) +from app.crud.evaluations.langfuse import ( + create_langfuse_dataset_run, + update_traces_with_cosine_scores, +) +from app.crud.job import create_batch_job, get_batch_job +from app.models import EvaluationRun, EvaluationRunUpdate +from app.models.batch_job import BatchJobCreate +from app.models.evaluation import RunModeEnum +from app.models.llm.request import TextLLMParams + +logger = logging.getLogger(__name__) + +_T = TypeVar("_T") + + +# job_type discriminators on batch_job for the two stages of the fast path. +# The OpenAI Batch concepts (provider_batch_id, provider_file_id, …) are NULL +# on these rows; the presence of the row + its raw_output_url is what tells the +# orchestrator a stage is already done. +JOB_TYPE_EVALUATION_FAST = "evaluation_fast" +JOB_TYPE_EMBEDDING_FAST = "embedding_fast" + + +# Retry policy for individual OpenAI calls inside Stage 1 / Stage 2. +# Retries on rate-limit, timeout, and connection errors; permanent errors +# (auth, validation, model not found) fail the item immediately. +_RETRY_MAX_ATTEMPTS = 5 +_RETRY_BASE_DELAY_SECONDS = 1.0 +_RETRY_MAX_DELAY_SECONDS = 30.0 + +# These OpenAI exception classes are retryable transient failures. +_RETRYABLE_OPENAI_ERRORS: tuple[type[Exception], ...] = ( + openai.RateLimitError, + openai.APITimeoutError, + openai.APIConnectionError, + openai.InternalServerError, +) + + +def _sleep_with_backoff(attempt: int) -> None: + """Exponential backoff with full jitter, capped at _RETRY_MAX_DELAY_SECONDS.""" + base = min( + _RETRY_BASE_DELAY_SECONDS * (2 ** (attempt - 1)), _RETRY_MAX_DELAY_SECONDS + ) + delay = random.uniform(0, base) + time.sleep(delay) + + +def _call_with_retry(label: str, fn: Callable[[], _T]) -> _T: + """Call `fn()` with retry on transient OpenAI errors. + + Returns the function's result on success. Raises the last exception when + retries are exhausted or when the error is non-retryable. The loop always + either returns or raises on the final attempt; the explicit + `RuntimeError` at the tail keeps mypy happy about the function's return + being reachable. + """ + for attempt in range(1, _RETRY_MAX_ATTEMPTS + 1): + try: + return fn() + except _RETRYABLE_OPENAI_ERRORS as exc: + if attempt == _RETRY_MAX_ATTEMPTS: + logger.warning( + f"[_call_with_retry] Exhausted retries | label={label} | " + f"attempt={attempt} | error={exc}" + ) + raise + logger.info( + f"[_call_with_retry] Transient error, retrying | label={label} | " + f"attempt={attempt} | error={exc}" + ) + _sleep_with_backoff(attempt) + except openai.OpenAIError as exc: + # Permanent OpenAI errors (auth, validation, model not found) — fail fast. + logger.warning( + f"[_call_with_retry] Permanent error, no retry | label={label} | " + f"error={exc}" + ) + raise + raise RuntimeError(f"_call_with_retry exited without result for {label}") + + +def _build_responses_params( + *, + config: TextLLMParams, + question: str, +) -> dict[str, Any]: + """Build the parameter dict for one OpenAI Responses API call. + + Mirrors the body shape `build_evaluation_jsonl` uses for the batch path so + fast eval and batch eval generate equivalent outputs for the same config. + """ + params: dict[str, Any] = { + "model": config.model, + "instructions": config.instructions, + "input": question, + } + + if "temperature" in config.model_fields_set: + params["temperature"] = config.temperature + + if config.reasoning: + params["reasoning"] = {"effort": config.reasoning} + + if config.knowledge_base_ids: + params["tools"] = [ + { + "type": "file_search", + "vector_store_ids": config.knowledge_base_ids, + "max_num_results": config.max_num_results or 20, + } + ] + + return params + + +def _field(obj: Any, name: str, default: Any = None) -> Any: + """Read a field from an object or dict, returning default when missing. + + OpenAI SDK objects expose fields as attributes; older response shapes and + tests sometimes pass dicts. This unifies access so call sites don't have + to branch on the runtime type. + """ + if obj is None: + return default + if isinstance(obj, dict): + return obj.get(name, default) + return getattr(obj, name, default) + + +def _extract_response_text(response: Any) -> str: + """Extract the model-generated text from an OpenAI Responses object. + + Prefers `response.output_text` (the SDK's flat helper) and falls back to a + structured walk of `response.output` if the helper is empty. + """ + output_text = _field(response, "output_text") + if output_text: + return output_text + + output = _field(response, "output") + if not output: + return "" + + for item in output: + if _field(item, "type") != "message": + continue + for content in _field(item, "content") or []: + if _field(content, "type") == "output_text": + text = _field(content, "text") + if text: + return text + return "" + + +def _usage_to_dict(usage: Any) -> dict[str, int]: + """Normalize an OpenAI usage object into the dict shape kaapi's cost layer expects.""" + return { + "input_tokens": int(_field(usage, "input_tokens", 0) or 0), + "output_tokens": int(_field(usage, "output_tokens", 0) or 0), + "total_tokens": int(_field(usage, "total_tokens", 0) or 0), + } + + +def _responses_call_for_item( + *, + openai_client: OpenAI, + config: TextLLMParams, + item: dict[str, Any], +) -> dict[str, Any]: + """Run one Responses API call for a single dataset item. + + Returns the same per-item shape as `parse_evaluation_output` from the batch + path so downstream code (Langfuse trace creation, embeddings, cost) does + not need to branch on run_mode. + """ + item_id = item["id"] + question = item["input"].get("question", "") if item.get("input") else "" + ground_truth = ( + item["expected_output"].get("answer", "") if item.get("expected_output") else "" + ) + question_id = (item.get("metadata") or {}).get("question_id") + + if not question: + return { + "item_id": item_id, + "question": "", + "generated_output": "ERROR: missing question in dataset item", + "ground_truth": ground_truth, + "response_id": None, + "usage": None, + "question_id": question_id, + "failed": True, + } + + params = _build_responses_params(config=config, question=question) + + try: + response = _call_with_retry( + label=f"responses.create:{item_id}", + fn=lambda: openai_client.responses.create(**params), + ) + except openai.OpenAIError as exc: + logger.warning( + f"[_responses_call_for_item] Item failed | item_id={item_id} | error={exc}" + ) + return { + "item_id": item_id, + "question": question, + "generated_output": f"ERROR: {exc}", + "ground_truth": ground_truth, + "response_id": None, + "usage": None, + "question_id": question_id, + "failed": True, + } + + return { + "item_id": item_id, + "question": question, + "generated_output": _extract_response_text(response), + "ground_truth": ground_truth, + "response_id": getattr(response, "id", None), + "usage": _usage_to_dict(getattr(response, "usage", None)), + "question_id": question_id, + "failed": False, + } + + +def _embedding_call_for_pair( + *, + openai_client: OpenAI, + embedding_model: str, + item_id: str, + output_text: str, + ground_truth: str, +) -> dict[str, Any]: + """Run one Embeddings API call for an (output, ground_truth) pair. + + Returns a dict with both embeddings and the prompt_tokens usage so the + cost layer can price the call. `failed=True` indicates a permanent or + retry-exhausted failure for the pair. + """ + if not output_text or not ground_truth: + return { + "item_id": item_id, + "output_embedding": None, + "ground_truth_embedding": None, + "usage": None, + "failed": True, + "error": "empty output or ground_truth", + } + + try: + response = _call_with_retry( + label=f"embeddings.create:{item_id}", + fn=lambda: openai_client.embeddings.create( + model=embedding_model, + input=[output_text, ground_truth], + encoding_format="float", + ), + ) + except openai.OpenAIError as exc: + logger.warning( + f"[_embedding_call_for_pair] Item failed | item_id={item_id} | error={exc}" + ) + return { + "item_id": item_id, + "output_embedding": None, + "ground_truth_embedding": None, + "usage": None, + "failed": True, + "error": str(exc), + } + + data = _field(response, "data") or [] + if len(data) < 2: + return { + "item_id": item_id, + "output_embedding": None, + "ground_truth_embedding": None, + "usage": None, + "failed": True, + "error": f"expected 2 embeddings, got {len(data)}", + } + + output_embedding: list[float] | None = None + ground_truth_embedding: list[float] | None = None + for emb in data: + index = _field(emb, "index") + vector = _field(emb, "embedding") + if index == 0: + output_embedding = vector + elif index == 1: + ground_truth_embedding = vector + + usage_obj = _field(response, "usage") + usage_dict: dict[str, int] = { + "prompt_tokens": int(_field(usage_obj, "prompt_tokens", 0) or 0), + "total_tokens": int(_field(usage_obj, "total_tokens", 0) or 0), + } + + return { + "item_id": item_id, + "output_embedding": output_embedding, + "ground_truth_embedding": ground_truth_embedding, + "usage": usage_dict, + "failed": output_embedding is None or ground_truth_embedding is None, + } + + +def _is_failure_threshold_breached(*, failed_rows: int, total_rows: int) -> bool: + """True if the failed-row fraction exceeds EVAL_FAST_FAILURE_THRESHOLD.""" + if total_rows == 0: + return False + return (failed_rows / total_rows) > settings.EVAL_FAST_FAILURE_THRESHOLD + + +def _upload_unit_to_s3( + *, + session: Session, + project_id: int, + eval_run_id: int, + filename: str, + results: list[dict[str, Any]], +) -> str | None: + """Upload a stage unit (responses or embeddings) as JSON to S3.""" + storage = get_cloud_storage(session=session, project_id=project_id) + return upload_jsonl_to_object_store( + storage=storage, + results=results, + filename=filename, + subdirectory=f"evaluations/fast/{eval_run_id}", + format="json", + ) + + +def _load_unit_from_s3( + *, session: Session, project_id: int, url: str +) -> list[dict[str, Any]]: + """Load a stage unit back from S3. Raises if the unit cannot be loaded.""" + storage = get_cloud_storage(session=session, project_id=project_id) + data = load_json_from_object_store(storage=storage, url=url) + if data is None: + raise RuntimeError(f"Failed to load fast eval unit from S3 | url={url}") + if not isinstance(data, list): + raise RuntimeError( + f"Fast eval unit at {url} is not a list | type={type(data).__name__}" + ) + return data + + +def _stage1_responses( + *, + session: Session, + openai_client: OpenAI, + eval_run: EvaluationRun, + config: TextLLMParams, + dataset_items: list[dict[str, Any]], + log_prefix: str, +) -> tuple[EvaluationRun, list[dict[str, Any]]]: + """Stage 1 — generate one Responses API call per dataset item. + + Skipped on retry if `eval_run.batch_job_id` is already set; in that case we + reload the persisted unit from S3 so downstream stages still have results. + """ + if eval_run.batch_job_id: + existing = get_batch_job(session=session, batch_job_id=eval_run.batch_job_id) + if existing and existing.raw_output_url: + logger.info( + f"[_stage1_responses] {log_prefix} Skipping stage 1 (already done) | " + f"batch_job_id={existing.id}" + ) + results = _load_unit_from_s3( + session=session, + project_id=eval_run.project_id, + url=existing.raw_output_url, + ) + return eval_run, results + + logger.info( + f"[_stage1_responses] {log_prefix} Running stage 1 | " + f"items={len(dataset_items)} | model={config.model} | " + f"concurrency={settings.EVAL_FAST_API_CONCURRENCY}" + ) + + results: list[dict[str, Any]] = [] + max_workers = max(1, min(settings.EVAL_FAST_API_CONCURRENCY, len(dataset_items))) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit( + _responses_call_for_item, + openai_client=openai_client, + config=config, + item=item, + ): item["id"] + for item in dataset_items + } + for future in as_completed(futures): + results.append(future.result()) + + failed_count = sum(1 for r in results if r.get("failed")) + logger.info( + f"[_stage1_responses] {log_prefix} Stage 1 finished | " + f"total={len(results)} | failed={failed_count}" + ) + + if _is_failure_threshold_breached( + failed_rows=failed_count, total_rows=len(results) + ): + raise RuntimeError( + f"Fast eval Stage 1 exceeded failure threshold | " + f"failed={failed_count}/{len(results)} | " + f"threshold={settings.EVAL_FAST_FAILURE_THRESHOLD}" + ) + + raw_output_url = _upload_unit_to_s3( + session=session, + project_id=eval_run.project_id, + eval_run_id=eval_run.id, + filename=f"responses_{eval_run.id}.json", + results=results, + ) + + # Aggregate usage for the batch_job summary so the cost layer can price the + # stage from raw_output_url alone if it needs to re-derive later. + summed_usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0} + for r in results: + usage = r.get("usage") or {} + for k in summed_usage: + summed_usage[k] += int(usage.get(k, 0) or 0) + + batch_job = create_batch_job( + session=session, + batch_job_create=BatchJobCreate( + provider="openai", + job_type=JOB_TYPE_EVALUATION_FAST, + config={ + "endpoint": "/v1/responses", + "run_mode": RunModeEnum.FAST.value, + "model": config.model, + "usage": summed_usage, + }, + raw_output_url=raw_output_url, + total_items=len(results), + organization_id=eval_run.organization_id, + project_id=eval_run.project_id, + ), + ) + + # `batch_job_id` and `total_items` aren't on EvaluationRunUpdate (the + # update model only covers fields that change during the lifecycle), so + # set them directly and let `update_evaluation_run` bump `updated_at`. + eval_run.batch_job_id = batch_job.id + eval_run.total_items = len(results) + eval_run = update_evaluation_run( + session=session, + eval_run=eval_run, + update=EvaluationRunUpdate(), + ) + + return eval_run, results + + +def _stage2_embeddings( + *, + session: Session, + openai_client: OpenAI, + eval_run: EvaluationRun, + response_results: list[dict[str, Any]], + log_prefix: str, +) -> tuple[EvaluationRun, list[dict[str, Any]]]: + """Stage 2 — embed each (output, ground_truth) pair for cosine similarity. + + Skipped on retry if `eval_run.embedding_batch_job_id` is already set. + """ + if eval_run.embedding_batch_job_id: + existing = get_batch_job( + session=session, batch_job_id=eval_run.embedding_batch_job_id + ) + if existing and existing.raw_output_url: + logger.info( + f"[_stage2_embeddings] {log_prefix} Skipping stage 2 (already done) | " + f"batch_job_id={existing.id}" + ) + embeddings = _load_unit_from_s3( + session=session, + project_id=eval_run.project_id, + url=existing.raw_output_url, + ) + return eval_run, embeddings + + # Only embed items that succeeded in Stage 1. + embed_candidates = [r for r in response_results if not r.get("failed")] + logger.info( + f"[_stage2_embeddings] {log_prefix} Running stage 2 | " + f"items={len(embed_candidates)} | model={EMBEDDING_MODEL} | " + f"concurrency={settings.EVAL_FAST_API_CONCURRENCY}" + ) + + embedding_results: list[dict[str, Any]] = [] + max_workers = max( + 1, min(settings.EVAL_FAST_API_CONCURRENCY, len(embed_candidates) or 1) + ) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit( + _embedding_call_for_pair, + openai_client=openai_client, + embedding_model=EMBEDDING_MODEL, + item_id=r["item_id"], + output_text=r.get("generated_output", ""), + ground_truth=r.get("ground_truth", ""), + ): r["item_id"] + for r in embed_candidates + } + for future in as_completed(futures): + embedding_results.append(future.result()) + + failed_count = sum(1 for r in embedding_results if r.get("failed")) + # Failure threshold for embeddings is computed over the whole dataset, not + # over the candidate subset — items that failed Stage 1 already count as + # failures from the user's perspective. + total_failures = failed_count + sum(1 for r in response_results if r.get("failed")) + logger.info( + f"[_stage2_embeddings] {log_prefix} Stage 2 finished | " + f"total={len(embedding_results)} | failed={failed_count}" + ) + + if _is_failure_threshold_breached( + failed_rows=total_failures, total_rows=len(response_results) + ): + raise RuntimeError( + f"Fast eval Stage 2 exceeded failure threshold | " + f"failed={total_failures}/{len(response_results)} | " + f"threshold={settings.EVAL_FAST_FAILURE_THRESHOLD}" + ) + + raw_output_url = _upload_unit_to_s3( + session=session, + project_id=eval_run.project_id, + eval_run_id=eval_run.id, + filename=f"embeddings_{eval_run.id}.json", + results=embedding_results, + ) + + summed_usage = {"prompt_tokens": 0, "total_tokens": 0} + for r in embedding_results: + usage = r.get("usage") or {} + for k in summed_usage: + summed_usage[k] += int(usage.get(k, 0) or 0) + + batch_job = create_batch_job( + session=session, + batch_job_create=BatchJobCreate( + provider="openai", + job_type=JOB_TYPE_EMBEDDING_FAST, + config={ + "endpoint": "/v1/embeddings", + "run_mode": RunModeEnum.FAST.value, + "embedding_model": EMBEDDING_MODEL, + "usage": summed_usage, + }, + raw_output_url=raw_output_url, + total_items=len(embedding_results), + organization_id=eval_run.organization_id, + project_id=eval_run.project_id, + ), + ) + + eval_run = update_evaluation_run( + session=session, + eval_run=eval_run, + update=EvaluationRunUpdate(embedding_batch_job_id=batch_job.id), + ) + + return eval_run, embedding_results + + +def _stage3_score_and_trace( + *, + session: Session, + eval_run: EvaluationRun, + langfuse: Langfuse, + response_results: list[dict[str, Any]], + embedding_results: list[dict[str, Any]], + log_prefix: str, +) -> EvaluationRun: + """Stage 3 — compute cosine, create Langfuse traces, attach costs. + + No stage marker; each step is idempotent or near-idempotent: + - Cosine recomputation is deterministic (same vectors → same numbers). + - Langfuse dedupes traces on the dataset_item.observe key. + - attach_cost is idempotent on (eval_run_id, batch_job_id, usage_type). + """ + logger.info( + f"[_stage3_score_and_trace] {log_prefix} Computing cosine + creating traces" + ) + + # 1. Compute cosine similarity per pair, in the same shape as the batch + # path's `parse_embedding_results` + `calculate_average_similarity`. + item_id_to_pair = { + r["item_id"]: r for r in embedding_results if not r.get("failed") + } + + # 2. Create Langfuse traces and get item_id -> trace_id mapping. The model + # name comes from the stored config (same source as batch path) so + # Langfuse's per-generation cost calc stays consistent. + model = resolve_model_from_config(session=session, eval_run=eval_run) + trace_id_mapping = create_langfuse_dataset_run( + langfuse=langfuse, + dataset_name=eval_run.dataset_name, + run_name=eval_run.run_name, + results=response_results, + model=model, + ) + + # 3. Build the per-item score list keyed on Langfuse trace_id, then update + # traces with the cosine score. + per_item_scores: list[dict[str, Any]] = [] + similarities: list[float] = [] + for r in response_results: + item_id = r["item_id"] + pair = item_id_to_pair.get(item_id) + trace_id = trace_id_mapping.get(item_id) + if not pair or not trace_id: + continue + if ( + pair.get("output_embedding") is None + or pair.get("ground_truth_embedding") is None + ): + continue + score = calculate_cosine_similarity( + pair["output_embedding"], pair["ground_truth_embedding"] + ) + similarities.append(score) + per_item_scores.append({"trace_id": trace_id, "cosine_similarity": score}) + + if per_item_scores: + try: + update_traces_with_cosine_scores( + langfuse=langfuse, per_item_scores=per_item_scores + ) + except Exception as exc: + # Mirror the batch path: Langfuse score-update failures don't fail + # the run; they get logged and the score still lives in eval_run.score. + logger.warning( + f"[_stage3_score_and_trace] {log_prefix} " + f"Failed to update Langfuse traces with scores | error={exc}", + exc_info=True, + ) + + # 4. Aggregate similarity stats — same shape as the batch path so + # summary_scores rendering on GET /evaluations/{id} stays identical. + if similarities: + sim_array = np.array(similarities) + avg = float(np.mean(sim_array)) + std = float(np.std(sim_array)) + else: + avg = 0.0 + std = 0.0 + + score_payload = { + "summary_scores": [ + { + "name": "Cosine Similarity", + "avg": round(avg, 2), + "std": round(std, 2), + "total_pairs": len(similarities), + "data_type": "NUMERIC", + } + ] + } + + # 5. Attach costs (response stage + embedding stage). attach_cost is + # idempotent on its natural key — see "Known Limitations" in the SRD. + if response_results: + attach_cost( + session=session, + eval_run=eval_run, + log_prefix=log_prefix, + response_model=model, + response_results=response_results, + ) + + # Embedding cost expects the raw OpenAI batch shape; rebuild it from our + # in-memory embedding_results so attach_cost can price the stage uniformly. + if embedding_results: + embedding_raw = [ + { + "response": { + "body": { + "usage": r.get("usage") + or {"prompt_tokens": 0, "total_tokens": 0} + } + } + } + for r in embedding_results + if not r.get("failed") + ] + if embedding_raw: + attach_cost( + session=session, + eval_run=eval_run, + log_prefix=log_prefix, + embedding_model=EMBEDDING_MODEL, + embedding_raw_results=embedding_raw, + ) + + eval_run = update_evaluation_run( + session=session, + eval_run=eval_run, + update=EvaluationRunUpdate( + score=score_payload, + cost=eval_run.cost, + ), + ) + + return eval_run + + +def run_fast_evaluation( + *, + session: Session, + openai_client: OpenAI, + langfuse: Langfuse, + eval_run: EvaluationRun, + config: TextLLMParams, +) -> EvaluationRun: + """Run the full fast-eval pipeline for one evaluation_run. + + Called from the `run_evaluation_fast` Celery task. Each stage is skipped on + retry if its batch_job marker is already set, so re-invocation does not + re-call OpenAI for work that already succeeded. + + Raises: + Exception: If any stage fails terminally; the orchestrator marks the + run failed. + """ + log_prefix = ( + f"[org={eval_run.organization_id}]" + f"[project={eval_run.project_id}]" + f"[eval={eval_run.id}]" + ) + logger.info(f"[run_fast_evaluation] {log_prefix} Starting fast evaluation pipeline") + + # Mark as processing immediately so the GET endpoint reflects state. + if eval_run.status == "pending": + eval_run = update_evaluation_run( + session=session, + eval_run=eval_run, + update=EvaluationRunUpdate(status="processing"), + ) + + dataset_items = fetch_dataset_items( + langfuse=langfuse, dataset_name=eval_run.dataset_name + ) + if not dataset_items: + raise ValueError( + f"Dataset '{eval_run.dataset_name}' returned no items for fast eval" + ) + + # Stage 1 + eval_run, response_results = _stage1_responses( + session=session, + openai_client=openai_client, + eval_run=eval_run, + config=config, + dataset_items=dataset_items, + log_prefix=log_prefix, + ) + + # Stage 2 + eval_run, embedding_results = _stage2_embeddings( + session=session, + openai_client=openai_client, + eval_run=eval_run, + response_results=response_results, + log_prefix=log_prefix, + ) + + # Stage 3 + eval_run = _stage3_score_and_trace( + session=session, + eval_run=eval_run, + langfuse=langfuse, + response_results=response_results, + embedding_results=embedding_results, + log_prefix=log_prefix, + ) + + # Stage 4 + eval_run = update_evaluation_run( + session=session, + eval_run=eval_run, + update=EvaluationRunUpdate(status="completed"), + ) + + logger.info( + f"[run_fast_evaluation] {log_prefix} Fast evaluation completed | " + f"total_items={eval_run.total_items}" + ) + return eval_run diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index b8eec6cde..b84af7482 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -90,6 +90,7 @@ EvaluationRunCreate, EvaluationRunPublic, EvaluationRunUpdate, + RunModeEnum, ) from .feature_flag import ( FeatureFlag, diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 8a95ff054..49a47871d 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -1,4 +1,5 @@ from datetime import datetime +from enum import Enum from typing import TYPE_CHECKING, Any, Optional from uuid import UUID @@ -16,6 +17,11 @@ from .project import Project +class RunModeEnum(str, Enum): + BATCH = "batch" + FAST = "fast" + + class DatasetItem(BaseModel): """Model for a single dataset item (Q&A pair).""" @@ -47,6 +53,10 @@ class DatasetUploadResponse(BaseModel): signed_url: str | None = Field( None, description="A signed URL for downloading the dataset" ) + eligible_for_fast: bool = Field( + False, + description="True if dataset has ≤10 unique rows and is eligible for run_mode=fast", + ) class EvaluationResult(BaseModel): @@ -192,6 +202,12 @@ class EvaluationRun(SQLModel, table=True): __table_args__ = ( Index("idx_eval_run_status_org", "status", "organization_id"), Index("idx_eval_run_status_project", "status", "project_id"), + UniqueConstraint( + "organization_id", + "project_id", + "run_name", + name="uq_evaluation_run_org_project_run_name", + ), ) id: int = SQLField( @@ -284,6 +300,12 @@ class EvaluationRun(SQLModel, table=True): "comment": "Evaluation status (pending, processing, completed, failed)" }, ) + run_mode: str = SQLField( + default="batch", + max_length=10, + nullable=False, + sa_column_kwargs={"comment": "Execution mode: batch or fast"}, + ) object_store_url: str | None = SQLField( default=None, description="Object store URL of processed evaluation results for future reference", @@ -421,6 +443,7 @@ class EvaluationRunPublic(SQLModel): batch_job_id: int | None embedding_batch_job_id: int | None status: str + run_mode: str object_store_url: str | None score_trace_url: str | None total_items: int diff --git a/backend/app/services/evaluations/__init__.py b/backend/app/services/evaluations/__init__.py index 92d88fe0b..3776e60c4 100644 --- a/backend/app/services/evaluations/__init__.py +++ b/backend/app/services/evaluations/__init__.py @@ -5,6 +5,11 @@ get_evaluation_with_scores, start_evaluation, ) +from app.services.evaluations.fast import ( + execute_fast_evaluation, + is_dataset_fast_eligible, + validate_and_start_fast_evaluation, +) from app.services.evaluations.validators import ( ALLOWED_EXTENSIONS, ALLOWED_MIME_TYPES, diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 9f78b9e37..5e4abd0dc 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -4,6 +4,7 @@ from uuid import UUID from fastapi import HTTPException +from sqlalchemy.exc import IntegrityError from sqlmodel import Session from app.crud.evaluations import ( @@ -129,16 +130,31 @@ def start_evaluation( ) # Step 3: Create EvaluationRun record with config references - eval_run = create_evaluation_run( - session=session, - run_name=experiment_name, - dataset_name=dataset.name, - dataset_id=dataset_id, - config_id=config_id, - config_version=config_version, - organization_id=organization_id, - project_id=project_id, - ) + try: + eval_run = create_evaluation_run( + session=session, + run_name=experiment_name, + dataset_name=dataset.name, + dataset_id=dataset_id, + config_id=config_id, + config_version=config_version, + organization_id=organization_id, + project_id=project_id, + ) + except IntegrityError: + session.rollback() + logger.warning( + f"[start_evaluation] Duplicate run_name | run_name={experiment_name} | " + f"org_id={organization_id} | project_id={project_id}" + ) + raise HTTPException( + status_code=409, + detail=( + f"run_name_already_exists: a run with name '{experiment_name}' " + "already exists for this organization and project. Pick a new " + "run_name or fetch the existing run via GET /evaluations." + ), + ) # Step 4: Start the batch evaluation try: diff --git a/backend/app/services/evaluations/fast.py b/backend/app/services/evaluations/fast.py new file mode 100644 index 000000000..d3e316b4b --- /dev/null +++ b/backend/app/services/evaluations/fast.py @@ -0,0 +1,289 @@ +"""Fast evaluation orchestration service. + +This is the only place that decides whether a /evaluations request enters the +fast-eval path. It also hosts the worker-side entry point invoked by the +`run_evaluation_fast` Celery task. + +See `Fast Evaluation SRD.md` for the full design. +""" + +import logging +from uuid import UUID + +from fastapi import HTTPException +from sqlalchemy.exc import IntegrityError +from sqlmodel import Session + +from app.celery.utils import start_fast_evaluation as enqueue_fast_evaluation +from app.core.config import settings +from app.core.db import engine +from app.crud.evaluations import ( + create_evaluation_run, + get_dataset_by_id, + resolve_evaluation_config, + run_fast_evaluation, +) +from app.crud.evaluations.core import update_evaluation_run +from app.models.evaluation import EvaluationRun, EvaluationRunUpdate, RunModeEnum +from app.models.llm.request import TextLLMParams +from app.services.llm.providers import LLMProvider +from app.utils import get_langfuse_client, get_openai_client + +logger = logging.getLogger(__name__) + + +# Error codes surfaced in HTTPException.detail so the UI can localize/branch. +ERR_CONFIG_TYPE_UNSUPPORTED = "config_type_unsupported" +ERR_DATASET_TOO_LARGE_FOR_FAST = "dataset_too_large_for_fast" +ERR_RUN_NAME_ALREADY_EXISTS = "run_name_already_exists" + + +def is_dataset_fast_eligible(*, original_items_count: int) -> bool: + """A dataset is eligible for fast mode when its unique-row count is within cap.""" + return original_items_count <= settings.EVAL_FAST_MAX_UNIQUE_ROWS + + +def validate_and_start_fast_evaluation( + *, + session: Session, + dataset_id: int, + run_name: str, + config_id: UUID, + config_version: int, + organization_id: int, + project_id: int, + trace_id: str = "N/A", +) -> EvaluationRun: + """Validate + create + dispatch a fast evaluation run. + + Validation (in order): + 1. Dataset exists and has a Langfuse id. + 2. Config resolves to a text-type OpenAI config. + 3. Dataset's original_items_count <= EVAL_FAST_MAX_UNIQUE_ROWS. + 4. (organization_id, project_id, run_name) is unique — enforced by the DB + constraint added in migration 063. We translate IntegrityError to 409. + + On success the function creates the EvaluationRun row with + `run_mode="fast"`, `status="processing"`, and enqueues the orchestrator + task. The caller (route) returns the row immediately. + """ + logger.info( + f"[validate_and_start_fast_evaluation] Starting fast eval | " + f"run_name={run_name} | dataset_id={dataset_id} | " + f"org_id={organization_id} | project_id={project_id}" + ) + + # 1. Dataset must exist + have a Langfuse id (same as batch path). + dataset = get_dataset_by_id( + session=session, + dataset_id=dataset_id, + organization_id=organization_id, + project_id=project_id, + ) + if not dataset: + raise HTTPException( + status_code=404, + detail=( + f"Dataset {dataset_id} not found or not accessible to this " + "organization/project" + ), + ) + if not dataset.langfuse_dataset_id: + raise HTTPException( + status_code=400, + detail=( + f"Dataset {dataset_id} has no Langfuse dataset id; cannot run " + "evaluation." + ), + ) + + # 2. Config must resolve and be a text OpenAI config. + config_blob, error = resolve_evaluation_config( + session=session, + config_id=config_id, + config_version=config_version, + project_id=project_id, + ) + if error or config_blob is None: + raise HTTPException( + status_code=400, + detail=f"Failed to resolve config: {error}", + ) + if config_blob.completion.provider != LLMProvider.OPENAI: + raise HTTPException( + status_code=422, + detail="Only 'openai' provider is supported for evaluation configs", + ) + if config_blob.completion.type != "text": + raise HTTPException( + status_code=422, + detail=ERR_CONFIG_TYPE_UNSUPPORTED, + ) + + # 3. Dataset must be small enough for fast eval. + original_items_count = (dataset.dataset_metadata or {}).get( + "original_items_count" + ) or 0 + if not is_dataset_fast_eligible(original_items_count=original_items_count): + raise HTTPException( + status_code=422, + detail=( + f"{ERR_DATASET_TOO_LARGE_FOR_FAST}: dataset has " + f"{original_items_count} unique rows; fast mode requires at most " + f"{settings.EVAL_FAST_MAX_UNIQUE_ROWS}." + ), + ) + + # 4. Create the run; rely on the DB unique constraint to catch double-clicks. + try: + eval_run = create_evaluation_run( + session=session, + run_name=run_name, + dataset_name=dataset.name, + dataset_id=dataset_id, + config_id=config_id, + config_version=config_version, + organization_id=organization_id, + project_id=project_id, + run_mode=RunModeEnum.FAST.value, + ) + except IntegrityError: + session.rollback() + logger.warning( + f"[validate_and_start_fast_evaluation] Duplicate run_name | " + f"run_name={run_name} | org_id={organization_id} | " + f"project_id={project_id}" + ) + raise HTTPException( + status_code=409, + detail=( + f"{ERR_RUN_NAME_ALREADY_EXISTS}: a run with name '{run_name}' " + "already exists for this organization and project. Pick a new " + "run_name or fetch the existing run via GET /evaluations." + ), + ) + + # Flip to processing before dispatching the task so the GET endpoint + # reflects the correct state immediately. + eval_run = update_evaluation_run( + session=session, + eval_run=eval_run, + update=EvaluationRunUpdate(status="processing"), + ) + + # Dispatch the orchestrator. If enqueue fails, mark the run as failed so it + # doesn't linger in `processing` forever. + try: + enqueue_fast_evaluation(eval_run_id=eval_run.id, trace_id=trace_id) + except Exception as exc: + logger.error( + f"[validate_and_start_fast_evaluation] Failed to enqueue task | " + f"eval_run_id={eval_run.id} | error={exc}", + exc_info=True, + ) + update_evaluation_run( + session=session, + eval_run=eval_run, + update=EvaluationRunUpdate( + status="failed", + error_message=f"Failed to enqueue fast eval task: {exc}", + ), + ) + raise HTTPException( + status_code=500, + detail="Failed to enqueue fast evaluation task", + ) + + return eval_run + + +def execute_fast_evaluation(*, eval_run_id: int) -> None: + """Worker-side entry point: run the full fast-eval pipeline. + + Called from the `run_evaluation_fast` Celery task. Opens its own DB + session so the task is self-contained, then resolves config + clients and + delegates to `run_fast_evaluation` (CRUD). + + On terminal failure the run is marked `failed` with a descriptive + error_message and the exception is re-raised so Celery records the failure + (no automatic retry for this task — stage-level idempotency is the retry + surface). + """ + logger.info(f"[execute_fast_evaluation] Starting | eval_run_id={eval_run_id}") + + with Session(engine) as session: + eval_run = session.get(EvaluationRun, eval_run_id) + if eval_run is None: + logger.error( + f"[execute_fast_evaluation] EvaluationRun not found | " + f"eval_run_id={eval_run_id}" + ) + raise ValueError(f"EvaluationRun {eval_run_id} not found") + + if eval_run.run_mode != RunModeEnum.FAST.value: + logger.error( + f"[execute_fast_evaluation] Wrong run_mode for fast task | " + f"eval_run_id={eval_run_id} | run_mode={eval_run.run_mode}" + ) + raise ValueError( + f"EvaluationRun {eval_run_id} has run_mode={eval_run.run_mode}, " + f"expected 'fast'" + ) + + if eval_run.status == "completed": + logger.info( + f"[execute_fast_evaluation] Run already completed, skipping | " + f"eval_run_id={eval_run_id}" + ) + return + + try: + config_blob, error = resolve_evaluation_config( + session=session, + config_id=eval_run.config_id, + config_version=eval_run.config_version, + project_id=eval_run.project_id, + ) + if error or config_blob is None: + raise ValueError(f"Failed to resolve config: {error}") + + text_params = TextLLMParams.model_validate(config_blob.completion.params) + + openai_client = get_openai_client( + session=session, + org_id=eval_run.organization_id, + project_id=eval_run.project_id, + ) + langfuse_client = get_langfuse_client( + session=session, + org_id=eval_run.organization_id, + project_id=eval_run.project_id, + ) + + run_fast_evaluation( + session=session, + openai_client=openai_client, + langfuse=langfuse_client, + eval_run=eval_run, + config=text_params, + ) + + except Exception as exc: + logger.error( + f"[execute_fast_evaluation] Run failed | " + f"eval_run_id={eval_run_id} | error={exc}", + exc_info=True, + ) + # Re-fetch the row in case our session was rolled back. + session.rollback() + failed_run = session.get(EvaluationRun, eval_run_id) + if failed_run is not None: + update_evaluation_run( + session=session, + eval_run=failed_run, + update=EvaluationRunUpdate( + status="failed", + error_message=f"Fast eval failed: {exc}", + ), + ) + raise diff --git a/backend/app/tests/api/routes/test_evaluation_fast.py b/backend/app/tests/api/routes/test_evaluation_fast.py new file mode 100644 index 000000000..b5b0ffb28 --- /dev/null +++ b/backend/app/tests/api/routes/test_evaluation_fast.py @@ -0,0 +1,729 @@ +"""Tests for the fast (synchronous) evaluation path. + +Covers FR-1 through FR-15 from the Fast Evaluation SRD. External boundaries +(OpenAI, Langfuse, S3, Celery dispatch) are mocked; the DB is real (`db` +fixture). +""" + +import random +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import openai +import pytest +from fastapi.testclient import TestClient +from sqlmodel import Session + +from app.crud.evaluations.fast import ( + JOB_TYPE_EMBEDDING_FAST, + JOB_TYPE_EVALUATION_FAST, + _call_with_retry, + _is_failure_threshold_breached, + _stage1_responses, + _stage2_embeddings, + run_fast_evaluation, +) +from app.models import EvaluationRun +from app.models.batch_job import BatchJob +from app.models.evaluation import RunModeEnum +from app.models.llm.request import ( + ConfigBlob, + KaapiCompletionConfig, + TextLLMParams, +) +from app.tests.utils.auth import TestAuthContext +from app.tests.utils.test_data import ( + create_test_config, + create_test_evaluation_dataset, +) + + +def _api_error(resp_body: dict) -> str: + """Pull the human-readable error string out of an APIResponse failure body.""" + return str(resp_body.get("error") or resp_body.get("detail") or resp_body).lower() + + +@pytest.fixture(autouse=True) +def _seeded_random(): + """Make jitter / random.choice deterministic so tests are repeatable.""" + random.seed(0) + yield + + +# --------------------------------------------------------------------------- +# Pure-function helpers (no FastAPI, no DB) +# --------------------------------------------------------------------------- + + +class TestFailureThreshold: + """`_is_failure_threshold_breached` controls run-level fail-fast.""" + + def test_returns_false_when_total_is_zero(self): + assert _is_failure_threshold_breached(failed_rows=0, total_rows=0) is False + + def test_returns_true_above_threshold(self): + # default EVAL_FAST_FAILURE_THRESHOLD = 0.5 + assert _is_failure_threshold_breached(failed_rows=6, total_rows=10) is True + + def test_returns_false_at_threshold(self): + # 0.5 / 1.0 is NOT greater-than the threshold, so do not breach + assert _is_failure_threshold_breached(failed_rows=5, total_rows=10) is False + + +class TestCallWithRetry: + """FR-8: transient OpenAI errors retry; permanent ones do not.""" + + def test_returns_immediately_on_success(self): + result = _call_with_retry(label="t", fn=lambda: "ok") + assert result == "ok" + + def test_retries_on_transient_then_succeeds(self, monkeypatch): + # Avoid sleeping — make backoff a no-op. + monkeypatch.setattr("app.crud.evaluations.fast.time.sleep", lambda *_: None) + + calls = {"n": 0} + + def flaky(): + calls["n"] += 1 + if calls["n"] < 3: + # APIConnectionError needs a request; pass a minimal object. + raise openai.APIConnectionError(request=MagicMock()) + return "ok" + + result = _call_with_retry(label="t", fn=flaky) + assert result == "ok" + assert calls["n"] == 3 + + def test_does_not_retry_on_permanent_error(self): + def bad(): + # AuthenticationError is a non-retryable OpenAIError subclass. + raise openai.AuthenticationError( + message="bad key", response=MagicMock(), body=None + ) + + with pytest.raises(openai.AuthenticationError): + _call_with_retry(label="t", fn=bad) + + +# --------------------------------------------------------------------------- +# Route validation: POST /evaluations with run_mode=fast (FR-1..FR-5, FR-15) +# --------------------------------------------------------------------------- + + +@pytest.fixture +def _patch_dispatch(): + """Stub the Celery dispatch so tests don't actually enqueue work.""" + with patch( + "app.services.evaluations.fast.enqueue_fast_evaluation", + return_value="fake-task-id", + ) as m: + yield m + + +def _make_fast_eligible_dataset( + *, + db: Session, + user_api_key: TestAuthContext, + original_items_count: int = 3, +): + return create_test_evaluation_dataset( + db=db, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + original_items_count=original_items_count, + duplication_factor=1, + ) + + +def _make_text_openai_config(db: Session, project_id: int): + """Create a stored text-OpenAI Kaapi config (eligible for fast eval).""" + return create_test_config( + db=db, + project_id=project_id, + use_kaapi_schema=True, + ) + + +class TestFastEvaluationRoute: + """End-to-end validation on POST /evaluations with run_mode='fast'.""" + + def test_fr4_accepts_eligible_request_and_dispatches( + self, + client: TestClient, + user_api_key_header: dict[str, str], + db: Session, + user_api_key: TestAuthContext, + _patch_dispatch, + ): + """FR-4: eligible request returns processing + dispatches orchestrator.""" + dataset = _make_fast_eligible_dataset(db=db, user_api_key=user_api_key) + config = _make_text_openai_config(db, user_api_key.project_id) + + resp = client.post( + "/api/v1/evaluations", + json={ + "experiment_name": "fr4-fast-run", + "dataset_id": dataset.id, + "config_id": str(config.id), + "config_version": 1, + "run_mode": "fast", + }, + headers=user_api_key_header, + ) + + assert resp.status_code == 200, resp.text + body = resp.json()["data"] + assert body["run_mode"] == "fast" + assert body["status"] == "processing" + assert body["run_name"] == "fr4-fast-run" + _patch_dispatch.assert_called_once() + + # DB state matches the response. + run = db.get(EvaluationRun, body["id"]) + assert run is not None + assert run.run_mode == RunModeEnum.FAST.value + assert run.status == "processing" + + def test_fr2_rejects_dataset_with_too_many_unique_rows( + self, + client: TestClient, + user_api_key_header: dict[str, str], + db: Session, + user_api_key: TestAuthContext, + _patch_dispatch, + ): + """FR-2: >10 unique rows → 422 dataset_too_large_for_fast.""" + # default EVAL_FAST_MAX_UNIQUE_ROWS = 10; create 11 unique rows + dataset = _make_fast_eligible_dataset( + db=db, user_api_key=user_api_key, original_items_count=11 + ) + config = _make_text_openai_config(db, user_api_key.project_id) + + resp = client.post( + "/api/v1/evaluations", + json={ + "experiment_name": "fr2-fast-run", + "dataset_id": dataset.id, + "config_id": str(config.id), + "config_version": 1, + "run_mode": "fast", + }, + headers=user_api_key_header, + ) + + assert resp.status_code == 422 + # The route wraps HTTPException.detail into APIResponse.error. + error_str = _api_error(resp.json()) + assert "dataset_too_large_for_fast" in error_str + assert "11" in error_str # surfaces actual unique-row count + _patch_dispatch.assert_not_called() + + def test_fr1_rejects_non_text_config( + self, + client: TestClient, + user_api_key_header: dict[str, str], + db: Session, + user_api_key: TestAuthContext, + _patch_dispatch, + ): + """FR-1: non-text config for fast mode → 422 config_type_unsupported. + + Build a stored config whose completion.type is not 'text'. The current + config factories produce text configs by default, so we patch the + resolved blob to look like an STT config. + """ + dataset = _make_fast_eligible_dataset(db=db, user_api_key=user_api_key) + config = _make_text_openai_config(db, user_api_key.project_id) + + # Patch resolve_evaluation_config to return an STT-type blob. + fake_blob = ConfigBlob( + completion=KaapiCompletionConfig( + provider="openai", + type="stt", + params={"model": "whisper-1"}, + ) + ) + + with patch( + "app.services.evaluations.fast.resolve_evaluation_config", + return_value=(fake_blob, None), + ): + resp = client.post( + "/api/v1/evaluations", + json={ + "experiment_name": "fr1-fast-run", + "dataset_id": dataset.id, + "config_id": str(config.id), + "config_version": 1, + "run_mode": "fast", + }, + headers=user_api_key_header, + ) + + assert resp.status_code == 422 + assert "config_type_unsupported" in _api_error(resp.json()) + _patch_dispatch.assert_not_called() + + def test_fr3_rejects_duplicate_run_name( + self, + client: TestClient, + user_api_key_header: dict[str, str], + db: Session, + user_api_key: TestAuthContext, + _patch_dispatch, + ): + """FR-3: duplicate (org, project, run_name) → 409, no second dispatch.""" + dataset = _make_fast_eligible_dataset(db=db, user_api_key=user_api_key) + config = _make_text_openai_config(db, user_api_key.project_id) + payload = { + "experiment_name": "fr3-dup-run", + "dataset_id": dataset.id, + "config_id": str(config.id), + "config_version": 1, + "run_mode": "fast", + } + + first = client.post( + "/api/v1/evaluations", json=payload, headers=user_api_key_header + ) + assert first.status_code == 200, first.text + + second = client.post( + "/api/v1/evaluations", json=payload, headers=user_api_key_header + ) + assert second.status_code == 409 + assert "run_name_already_exists" in _api_error(second.json()) + # First call dispatched; second must not have. + assert _patch_dispatch.call_count == 1 + + def test_fr15_get_evaluation_returns_run_mode( + self, + client: TestClient, + user_api_key_header: dict[str, str], + db: Session, + user_api_key: TestAuthContext, + ): + """FR-15: GET /evaluations/{id} surfaces `run_mode` for both modes.""" + dataset = _make_fast_eligible_dataset(db=db, user_api_key=user_api_key) + config = _make_text_openai_config(db, user_api_key.project_id) + eval_run = EvaluationRun( + run_name="fr15-existing", + dataset_name=dataset.name, + dataset_id=dataset.id, + config_id=config.id, + config_version=1, + status="completed", + run_mode=RunModeEnum.FAST.value, + total_items=3, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + resp = client.get( + f"/api/v1/evaluations/{eval_run.id}", headers=user_api_key_header + ) + assert resp.status_code == 200 + assert resp.json()["data"]["run_mode"] == "fast" + + +# --------------------------------------------------------------------------- +# Dataset listing eligibility filter (FR-5) +# --------------------------------------------------------------------------- + + +class TestDatasetListEligibleForFast: + def test_fr5_filters_to_fast_eligible_only( + self, + client: TestClient, + user_api_key_header: dict[str, str], + db: Session, + user_api_key: TestAuthContext, + ): + """FR-5: eligible_for=fast filters list to datasets with ≤10 unique rows.""" + eligible = _make_fast_eligible_dataset( + db=db, user_api_key=user_api_key, original_items_count=5 + ) + ineligible = _make_fast_eligible_dataset( + db=db, user_api_key=user_api_key, original_items_count=20 + ) + + resp = client.get( + "/api/v1/evaluations/datasets", + params={"eligible_for": "fast"}, + headers=user_api_key_header, + ) + assert resp.status_code == 200 + data = resp.json()["data"] + ids = {d["dataset_id"] for d in data} + assert eligible.id in ids + assert ineligible.id not in ids + assert all(d["eligible_for_fast"] is True for d in data) + + +# --------------------------------------------------------------------------- +# Stage skipping on retry (FR-6, FR-7) +# --------------------------------------------------------------------------- + + +def _fake_openai_response(text: str = "answer", item_id: str = "item-1"): + """Mimic the SDK's response.responses.create return shape.""" + return SimpleNamespace( + id=f"resp_{item_id}", + output_text=text, + output=[], + usage=SimpleNamespace(input_tokens=10, output_tokens=20, total_tokens=30), + ) + + +def _fake_embedding_response(): + """Mimic openai.embeddings.create return shape (2 vectors).""" + return SimpleNamespace( + data=[ + SimpleNamespace(index=0, embedding=[1.0, 0.0, 0.0]), + SimpleNamespace(index=1, embedding=[1.0, 0.0, 0.0]), + ], + usage=SimpleNamespace(prompt_tokens=5, total_tokens=5), + ) + + +class TestStageSkipping: + """FR-6 / FR-7: stages skip on retry when their batch_job marker is set.""" + + def test_fr6_stage1_skips_when_batch_job_id_already_set( + self, + db: Session, + user_api_key: TestAuthContext, + ): + """Pre-existing batch_job → Stage 1 does not call the Responses API.""" + dataset = _make_fast_eligible_dataset(db=db, user_api_key=user_api_key) + config = _make_text_openai_config(db, user_api_key.project_id) + eval_run = EvaluationRun( + run_name="fr6-stage1-skip", + dataset_name=dataset.name, + dataset_id=dataset.id, + config_id=config.id, + config_version=1, + status="processing", + run_mode=RunModeEnum.FAST.value, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + # Pre-create a batch_job marker as if Stage 1 had already completed. + existing = BatchJob( + provider="openai", + job_type=JOB_TYPE_EVALUATION_FAST, + config={}, + raw_output_url="s3://bucket/responses_x.json", + total_items=1, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(existing) + db.commit() + db.refresh(existing) + + eval_run.batch_job_id = existing.id + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + cached = [ + { + "item_id": "item-1", + "question": "q", + "generated_output": "a", + "ground_truth": "a", + "response_id": "resp_1", + "usage": {"input_tokens": 1, "output_tokens": 1, "total_tokens": 2}, + "question_id": 1, + "failed": False, + } + ] + + fake_openai = MagicMock() + with patch("app.crud.evaluations.fast._load_unit_from_s3", return_value=cached): + _, results = _stage1_responses( + session=db, + openai_client=fake_openai, + eval_run=eval_run, + config=TextLLMParams(model="gpt-4o", instructions="x"), + dataset_items=[ + { + "id": "item-1", + "input": {"question": "q"}, + "expected_output": {"answer": "a"}, + "metadata": {}, + } + ], + log_prefix="[t]", + ) + + # Skip path returns the cached unit and never calls the OpenAI client. + assert results == cached + fake_openai.responses.create.assert_not_called() + + def test_fr7_stage2_skips_when_embedding_batch_job_id_already_set( + self, + db: Session, + user_api_key: TestAuthContext, + ): + """Pre-existing embedding batch_job → Stage 2 does not call the Embeddings API.""" + dataset = _make_fast_eligible_dataset(db=db, user_api_key=user_api_key) + config = _make_text_openai_config(db, user_api_key.project_id) + eval_run = EvaluationRun( + run_name="fr7-stage2-skip", + dataset_name=dataset.name, + dataset_id=dataset.id, + config_id=config.id, + config_version=1, + status="processing", + run_mode=RunModeEnum.FAST.value, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + marker = BatchJob( + provider="openai", + job_type=JOB_TYPE_EMBEDDING_FAST, + config={}, + raw_output_url="s3://bucket/embeddings_x.json", + total_items=1, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(marker) + db.commit() + db.refresh(marker) + + eval_run.embedding_batch_job_id = marker.id + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + cached = [ + { + "item_id": "item-1", + "output_embedding": [1.0, 0.0], + "ground_truth_embedding": [1.0, 0.0], + "usage": {"prompt_tokens": 5, "total_tokens": 5}, + "failed": False, + } + ] + + fake_openai = MagicMock() + with patch("app.crud.evaluations.fast._load_unit_from_s3", return_value=cached): + _, results = _stage2_embeddings( + session=db, + openai_client=fake_openai, + eval_run=eval_run, + response_results=[ + { + "item_id": "item-1", + "question": "q", + "generated_output": "a", + "ground_truth": "a", + "failed": False, + } + ], + log_prefix="[t]", + ) + + assert results == cached + fake_openai.embeddings.create.assert_not_called() + + +# --------------------------------------------------------------------------- +# End-to-end orchestrator pipeline with mocked externals (FR-9..FR-14) +# --------------------------------------------------------------------------- + + +@pytest.fixture +def _fast_pipeline_mocks(): + """Patch external boundaries used inside `run_fast_evaluation`. + + OpenAI client returns fixed responses/embeddings, Langfuse returns trace + ids, S3 upload returns a URL, and `attach_cost` no-ops so the test does + not need a model_config row. + """ + with ( + patch("app.crud.evaluations.fast.fetch_dataset_items") as mock_fetch_items, + patch( + "app.crud.evaluations.fast._upload_unit_to_s3", + side_effect=lambda **kw: f"s3://bucket/{kw['filename']}", + ), + patch( + "app.crud.evaluations.fast.create_langfuse_dataset_run" + ) as mock_create_traces, + patch( + "app.crud.evaluations.fast.update_traces_with_cosine_scores" + ) as mock_update_traces, + patch( + "app.crud.evaluations.fast.resolve_model_from_config", return_value="gpt-4o" + ), + patch("app.crud.evaluations.fast.attach_cost") as mock_attach_cost, + ): + mock_fetch_items.return_value = [ + { + "id": "item-1", + "input": {"question": "Q1"}, + "expected_output": {"answer": "A1"}, + "metadata": {"question_id": 1}, + }, + { + "id": "item-2", + "input": {"question": "Q2"}, + "expected_output": {"answer": "A2"}, + "metadata": {"question_id": 2}, + }, + ] + mock_create_traces.return_value = { + "item-1": "trace-1", + "item-2": "trace-2", + } + yield SimpleNamespace( + fetch_items=mock_fetch_items, + create_traces=mock_create_traces, + update_traces=mock_update_traces, + attach_cost=mock_attach_cost, + ) + + +class TestFastPipelineEndToEnd: + """Run the orchestrator with mocked external boundaries (FR-9..FR-13).""" + + def test_pipeline_completes_with_scores_and_writes_batch_jobs( + self, + db: Session, + user_api_key: TestAuthContext, + _fast_pipeline_mocks, + ): + dataset = _make_fast_eligible_dataset(db=db, user_api_key=user_api_key) + config = _make_text_openai_config(db, user_api_key.project_id) + eval_run = EvaluationRun( + run_name="pipeline-happy-path", + dataset_name=dataset.name, + dataset_id=dataset.id, + config_id=config.id, + config_version=1, + status="pending", + run_mode=RunModeEnum.FAST.value, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + fake_openai = MagicMock() + fake_openai.responses.create.side_effect = ( + lambda **kwargs: _fake_openai_response(text="LLM answer", item_id="x") + ) + fake_openai.embeddings.create.return_value = _fake_embedding_response() + fake_langfuse = MagicMock() + + result = run_fast_evaluation( + session=db, + openai_client=fake_openai, + langfuse=fake_langfuse, + eval_run=eval_run, + config=TextLLMParams(model="gpt-4o", instructions="be helpful"), + ) + + # FR-11/FR-14: status completed, summary cosine ≈ 1.0 for identical vectors + assert result.status == "completed" + assert result.score is not None + cosine = result.score["summary_scores"][0] + assert cosine["name"] == "Cosine Similarity" + assert cosine["avg"] == pytest.approx(1.0, abs=0.01) + assert cosine["total_pairs"] == 2 + + # Stage markers exist (FR-6/FR-7 invariant + FR-9 — no llm_call rows). + assert result.batch_job_id is not None + assert result.embedding_batch_job_id is not None + responses_job = db.get(BatchJob, result.batch_job_id) + embeddings_job = db.get(BatchJob, result.embedding_batch_job_id) + assert responses_job.job_type == JOB_TYPE_EVALUATION_FAST + assert responses_job.raw_output_url is not None + assert embeddings_job.job_type == JOB_TYPE_EMBEDDING_FAST + assert embeddings_job.raw_output_url is not None + + # FR-12: Langfuse traces created and per-trace scores attached. + assert _fast_pipeline_mocks.create_traces.called + assert _fast_pipeline_mocks.update_traces.called + + # FR-13: attach_cost called twice (response + embedding stages). + assert _fast_pipeline_mocks.attach_cost.call_count == 2 + + +# --------------------------------------------------------------------------- +# Failure-threshold short-circuit (FR-14) +# --------------------------------------------------------------------------- + + +class TestFailureThresholdInPipeline: + def test_fr14_stage1_raises_when_failure_ratio_exceeds_threshold( + self, + db: Session, + user_api_key: TestAuthContext, + ): + """FR-14: Stage 1 raises RuntimeError when failure ratio > threshold. + + The outer orchestrator (run_fast_evaluation / execute_fast_evaluation) + catches the RuntimeError and marks the run failed; the structural + guarantee under test is that Stage 1 fails fast instead of proceeding + to Stage 2 with a mostly-broken response set. + """ + dataset = _make_fast_eligible_dataset(db=db, user_api_key=user_api_key) + config = _make_text_openai_config(db, user_api_key.project_id) + eval_run = EvaluationRun( + run_name="fr14-fail-threshold", + dataset_name=dataset.name, + dataset_id=dataset.id, + config_id=config.id, + config_version=1, + status="processing", + run_mode=RunModeEnum.FAST.value, + organization_id=user_api_key.organization_id, + project_id=user_api_key.project_id, + ) + db.add(eval_run) + db.commit() + db.refresh(eval_run) + + # Make all OpenAI Responses calls fail with a permanent error so retries + # short-circuit and every item gets failed=True. With every item + # failing, the failure fraction (1.0) is well above the 0.5 threshold. + fake_openai = MagicMock() + fake_openai.responses.create.side_effect = openai.AuthenticationError( + message="bad key", response=MagicMock(), body=None + ) + + dataset_items = [ + { + "id": f"item-{i}", + "input": {"question": f"Q{i}"}, + "expected_output": {"answer": f"A{i}"}, + "metadata": {}, + } + for i in range(4) + ] + + with pytest.raises(RuntimeError, match="failure threshold"): + _stage1_responses( + session=db, + openai_client=fake_openai, + eval_run=eval_run, + config=TextLLMParams(model="gpt-4o", instructions="x"), + dataset_items=dataset_items, + log_prefix="[t]", + ) From 3f8d24d68444285b6b1b0bd1816080aa17ca3c0d Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 3 Jun 2026 11:41:44 +0530 Subject: [PATCH 4/8] update migration --- ...ation_run.py => 064_add_run_mode_to_evaluation_run.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename backend/app/alembic/versions/{063_add_run_mode_to_evaluation_run.py => 064_add_run_mode_to_evaluation_run.py} (98%) diff --git a/backend/app/alembic/versions/063_add_run_mode_to_evaluation_run.py b/backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py similarity index 98% rename from backend/app/alembic/versions/063_add_run_mode_to_evaluation_run.py rename to backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py index 77d7e8bce..70174f7b6 100644 --- a/backend/app/alembic/versions/063_add_run_mode_to_evaluation_run.py +++ b/backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py @@ -1,7 +1,7 @@ """add run_mode column and unique run-name constraint to evaluation_run -Revision ID: 063 -Revises: 062 +Revision ID: 064 +Revises: 063 Create Date: 2026-05-20 00:00:00.000000 Two schema changes are required to support the fast-evaluation feature: @@ -35,8 +35,8 @@ from alembic import op # revision identifiers, used by Alembic. -revision = "063" -down_revision = "062" +revision = "064" +down_revision = "063" branch_labels = None depends_on = None From eb5b0a63ae923fb522db9a0f84b90f240b3723a7 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 3 Jun 2026 18:14:35 +0530 Subject: [PATCH 5/8] Revert: remove checked-in .claude/agents and restore CLAUDE.md The fast-evaluation PR inadvertently checked in the layer-specialist agent definitions under .claude/agents/ and trimmed CLAUDE.md to delegate to them. Revert that out of this PR: delete the agent files and restore CLAUDE.md to its origin/main state so the two stay consistent. Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/agents/celery-task-writer.md | 84 --------------- .claude/agents/convention-reviewer.md | 141 -------------------------- .claude/agents/crud-writer.md | 83 --------------- .claude/agents/migration-writer.md | 78 -------------- .claude/agents/model-writer.md | 118 --------------------- .claude/agents/route-writer.md | 94 ----------------- .claude/agents/service-writer.md | 105 ------------------- .claude/agents/test-writer.md | 91 ----------------- CLAUDE.md | 74 ++++++++------ 9 files changed, 45 insertions(+), 823 deletions(-) delete mode 100644 .claude/agents/celery-task-writer.md delete mode 100644 .claude/agents/convention-reviewer.md delete mode 100644 .claude/agents/crud-writer.md delete mode 100644 .claude/agents/migration-writer.md delete mode 100644 .claude/agents/model-writer.md delete mode 100644 .claude/agents/route-writer.md delete mode 100644 .claude/agents/service-writer.md delete mode 100644 .claude/agents/test-writer.md diff --git a/.claude/agents/celery-task-writer.md b/.claude/agents/celery-task-writer.md deleted file mode 100644 index c5c346c01..000000000 --- a/.claude/agents/celery-task-writer.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -name: celery-task-writer -description: Use when adding or modifying Celery tasks under `app/celery/tasks/`. Handles queue/priority choice, retry policy, idempotency, OpenTelemetry trace propagation, and the gevent_timeout wrapper. -tools: Read, Edit, Write, Bash, Grep, Glob -model: sonnet ---- - -You write Celery tasks for kaapi-backend. Tasks live in `app/celery/tasks/`. Celery uses RabbitMQ as broker and supports multiple priority queues. Read `app/celery/tasks/job_execution.py` before writing — it shows the full pattern (decorator + timeout + OTel propagation + delegation to a service). - -## Canonical decorator stack - -```python -@celery_app.task(bind=True, queue="high_priority", priority=9) -@gevent_timeout(settings.CELERY_TASK_SOFT_TIME_LIMIT, "run_my_job") -def run_my_job(self, project_id: int, job_id: str, trace_id: str, **kwargs): - from app.services.my_domain.jobs import do_the_work # late import to avoid cycles - - _set_trace(trace_id) - return _run_with_otel_parent( - self, - lambda: do_the_work( - project_id=project_id, - job_id=job_id, - task_id=current_task.request.id, - task_instance=self, - **kwargs, - ), - ) -``` - -`_set_trace`, `_run_with_otel_parent`, and `gevent_timeout` already exist in this module / `app/celery/utils.py` — reuse them, don't reinvent. - -## Queue choice — be explicit - -| Queue | When | -|---|---| -| `high_priority` (priority=9) | User-blocking, interactive — LLM chat responses, sync ingestion of one doc | -| `low_priority` (priority=1) | Bulk / batch — embedding regen, periodic refresh, large doc-set imports | -| `default` | Anything truly mid-priority. Prefer one of the two above unless you have a reason. | - -Document the choice in a comment if it's not obvious. - -## Hard rules - -- **`bind=True`** so you have `self` (the task instance) for retries, IDs, etc. -- **Pass `trace_id` explicitly** as a parameter and call `_set_trace(trace_id)` first thing. This wires `asgi_correlation_id` so logs from inside the task match the originating request. -- **Wrap the work in `_run_with_otel_parent(self, lambda: ...)`** so OpenTelemetry parent context propagates from the enqueueing process. -- **Delegate to a service.** The task body should be a thin shim over `app/services//`. No DB queries, no external HTTP, no business logic inside the task itself. -- **Late-import the service inside the function body** (as the canonical pattern does). Celery workers boot faster and you avoid model-import cycles. -- **Idempotency.** Celery will redeliver. Either: - - The work is naturally idempotent (`UPDATE ... SET status = 'done' WHERE id = X` — safe to repeat), OR - - The task checks a status flag before doing work (`if job.status == "completed": return`), OR - - The task uses a DB-level unique constraint to detect a duplicate run. - Tell the user which strategy applies; don't silently ship a non-idempotent task. -- **Retries.** If the task should retry on transient errors, declare it on the decorator (`autoretry_for=(httpx.HTTPError,), retry_backoff=True, retry_kwargs={"max_retries": 3}`). Don't catch-and-re-raise. -- **No blocking calls in `async def`.** Celery tasks are sync; never mix. -- **Timeouts:** rely on the `@gevent_timeout(...)` decorator (or Celery's `soft_time_limit` / `time_limit` on the decorator). External HTTP inside the service should also have its own timeout. - -## Registering the task - -- New task files under `app/celery/tasks/` must be imported somewhere Celery's autodiscover picks them up. Read `app/celery/celery_app.py` to see how imports/includes are configured; add your new module if it's not already covered by a wildcard. -- The Celery Beat schedule (recurring tasks) lives in `app/celery/beat.py`. If your task should run on a cron, add the entry there. - -## Logging - -- `logger = logging.getLogger(__name__)` at the module top. -- Every line is `logger.info(f"[task_name] Message | key: {value}")`. Log start, finish, and any retry. Mask PII / credentials with `mask_string` from `app.utils` — **never log raw payloads** if they may contain sensitive data. - -## What you DO NOT do - -- Don't write SQL or call CRUD directly from the task body. -- Don't call third-party APIs directly — that's in the service the task delegates to. -- Don't catch `Exception` and silently swallow — let it propagate so retries / failure handlers fire. -- Don't run `.delay(...)` from another Celery task to chain — use a Celery `chain` / `chord` / `group` primitive if you need orchestration, or have the service return a result the next task picks up. -- Don't use `time.sleep(...)` in a task to "wait for something" — schedule a follow-up task with `apply_async(countdown=...)`. - -## After writing - -Tell the user: -1. The task name(s) and the queue / priority chosen. -2. The service function it delegates to (path). -3. Whether Beat schedule needs an entry. -4. The idempotency strategy used. -5. How to invoke it locally for a smoke test (e.g., `uv run python -c "from app.celery.tasks.foo import run_my_job; run_my_job.delay(...)"`). diff --git a/.claude/agents/convention-reviewer.md b/.claude/agents/convention-reviewer.md deleted file mode 100644 index 3865a13ce..000000000 --- a/.claude/agents/convention-reviewer.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -name: convention-reviewer -description: Use BEFORE committing or opening a PR to catch kaapi-backend convention violations early. Also use on demand when the user asks to "review", "check conventions", "lint my changes", or "see what /pr-review would flag". Read-only — never edits files. -tools: Read, Grep, Glob, Bash -model: sonnet ---- - -You are the local pre-commit gate for kaapi-backend. Your job is to run the same checklist that `/pr-review` runs at PR time, but on uncommitted or branch-local changes — so issues are caught before they become review comments. - -## How to gather the diff - -1. If the user supplied a PR number → `gh pr view ` + `gh pr diff `. -2. If the user said "branch" / "this branch" / "my changes" / supplied no argument → `git diff main...HEAD` + `git status` + `git log main..HEAD --oneline`. -3. If there are uncommitted changes that aren't in any of those, also inspect `git diff` (unstaged) and `git diff --cached` (staged). -4. `Read` full files at non-trivial change sites — judge in context, not from hunks. -5. `Grep` for duplication, reused literals, unused symbols. - -## What to check - -Skip any section in the output that has nothing notable. - -### Conventions -- Logs prefixed with `[function_name]`, levels matched to severity (`info`/`warning` for expected events, `error` only for genuine failures). -- Route descriptions via `description=load_description("/.md")`, never inline strings. `response_model` set; no untyped `dict` responses. -- DB columns get `sa_column_kwargs={"comment": "..."}` when purpose isn't obvious (status fields, JSON, foreign keys). -- Type hints on every parameter and return. `-> Any` is not an annotation — narrow it or drop it. -- `uv` is the runner, not `pip`. - -### Layering & duplication -- `HTTPException` belongs in routes (and is acceptable in `services/` for orchestration), **never** in `crud/`. CRUD returns data / `None` / raises domain errors. Third-party network calls also don't belong in `crud/` — that's DB-only. -- Routes thin, business logic in `services/`, DB access in `crud/`. -- Grep before approving: if a JWT pair, callback sender, or auth helper is duplicated across 2+ files, push for a single util. Before suggesting "extract a helper", confirm one doesn't already exist. -- Look for simplification — three near-identical functions (`_execute_text/_pdf/_image`) often collapse into one. - -### Magic values & config -- Repeated literals (provider names, status values, `"custom_id"`, route paths, magic numbers like `1_000_000`) → constant / Enum / config. Name the other location where it's reused. -- Hardcoded operational config (worker counts, model names, token limits, timeouts, retry counts) → env / config. Defaults lean toward smallest/cheapest, not most expensive. -- Dict crossing function boundaries where a Pydantic model belongs. - -### Naming -- `list_*` for plural fetch, `get_*` for singletons. Verb plurality matches return shape (`load_secrets_from_aws` if it returns multiple). Suffix `Enum` on enum classes. snake_case funcs/vars, PascalCase classes, UPPER_SNAKE constants. -- No leftover names from copy-paste of a sibling file. -- Alphabetical / grouped imports and route registrations, consistent with the rest of the repo. PEP 8 import order (stdlib first). -- Timestamp columns use `inserted_at` not `created_at` (per migration 060 cleanup). - -### Error handling -- `try` wraps *only* the line(s) that throw. Bloated try blocks are bugs waiting to happen. -- Nested `try/except`: trace the path. A raised `HTTPException(404)` caught by an outer `except Exception` becomes `500` and the intended status is lost. -- Concrete exception types, not `except Exception:` / `except:`. -- Status codes: `422` for "wrong shape" (bad CSV) over `400`; `409` for conflicts; `201`/`204` on create/delete. -- Validation at the Pydantic layer or via explicit ownership checks (`organization_id`, `project_id` belong to caller). `assert` is not validation in production code. -- Errors to the client must not leak internals (hashes, stack traces, paths, credentials). - -### Concurrency & data integrity -- "Compute next / check then write" patterns (`MAX(version)+1`, find-by-name-then-insert, increment counter) are races. Push for unique constraints, transactions, or DB-side sequences. -- JSON columns are fine for opaque metadata, not for fields you'll filter or sort on — push for first-class columns. -- Cross-codebase consistency: timestamp names (`inserted_at`), HTTP code choices, route shape (`/list` suffix is redundant). - -### API & response design -- Can the caller use this field? Is `data.id` the id of *what*? Are list responses missing fields the detail response populates (`signed_url`)? -- Swagger is a deliverable — generated docs must be unambiguous to an external client. -- All responses wrap in `APIResponse[T]` via `APIResponse.success_response(...)`. - -### FastAPI -- Router prefixes/tags/versioning consistent with the rest of `app/api/routes/`. -- `Depends(require_permission(...))` on every restricted endpoint, with the right `Permission` enum value. -- `SessionDep` / `AuthContextDep` for db + current user/org/project. -- Background tasks vs Celery: short fire-and-forget → `BackgroundTasks`; heavy or retryable → Celery in `app/celery/tasks/`. - -### Async correctness -- `async def` doesn't make blocking calls (sync DB drivers, `requests`, `time.sleep`, sync file I/O). -- `await` only on coroutines. CPU-bound work → threadpool / Celery / sync route. - -### Security -- No secrets / `.env` changes committed. -- Every endpoint has the right `Depends` and verifies `organization_id` / `project_id` ownership. -- API keys / hashes never returned raw — mask after a known prefix. -- **SSRF**: any URL the server fetches (callbacks, webhooks) needs scheme + private-IP validation, optionally an allowlist. -- File uploads enforce max size and content-type allowlist — required, not optional. -- DB / shell input parameterized (no f-string SQL, no `shell=True` with user input). - -### Performance -- N+1: loops issuing queries per row → `selectinload` / `joinedload` / batch fetch. -- New filter / FK columns → `index=True`. Pagination on list endpoints. - -### Pythonic idioms (small but recurring) -- Generators over materialized lists when iterated once. -- No redundant `str()` in f-strings; `x is None` over `not x` when None is what you mean; drop unneeded `return None`; no brackets when joining (`", ".join(p.value for p in Provider)`). -- Imports inside functions are a smell — usually a cycle that should be broken structurally. -- `setattr` on Pydantic / SQLModel objects → use `model_copy(update={...})` or `dataclasses.replace`. - -### Edge cases -For each new path, ask: input is `None`? list is empty? upstream call fails partway? what does the downgrade migration leave behind? - -### Migrations (treat as carefully as code) -- `--rev-id` = latest existing + 1; check `app/alembic/versions/`. Latest is `060` → next must be `061`. -- New tables include timestamps + indexes on FKs / common filters; nullability correct; no skipped seed IDs. -- `downgrade()` implemented and reversible — empty downgrade is a blocker. -- Backfills live in `upgrade()` SQL, not a separate manual script. - -### Cleanup -- Unused imports / functions / params / dead paths. -- Empty `__init__.py` for non-existent modules, scaffolding files no other file imports — ask "what reason was this added?" -- Commented-out blocks and `print(...)` debug removed. - -### Tests -- New behavior → test. Bug fix → regression test. Non-trivial code with zero tests → say so. -- Tests assert behavior, not implementation. Flag tautological / framework-only tests. -- Use the `app/tests/` factory pattern (`create_random_user`, `random_email`, `random_lower_string`) — no hardcoded `organization_id=1`. -- **Real DB only — no mocked database sessions.** This repo's `conftest.py` provides a transactional `db` fixture; tests must use it. -- Mocks match the real library's interface — prefer purpose-built mock libs over hand-rolled stubs. - -## How to write the findings - -- Cite `path:line`. Show the suggested change inline when short. -- **Name the failure mode**, not just the smell. Weak: "this try/except is too broad." Strong: "the `try` wraps the DB call too — if it raises, the handler returns 500 instead of the 404 you intended." -- **Pair criticism with a concrete fix**: a snippet, a library link, or a path in the repo that already does it right. -- **Question form** for judgment calls ("Why hardcode four workers?"). **Direct form** for unambiguous bugs. -- Hedge ("maybe", "I think") on judgment, not on correctness. -- Defer non-blocking work explicitly: "Not for this PR — worth a follow-up." Don't let style nits gate a merge. -- Tag severity: `VERY IMPORTANT:` / `MUST:` for security / data-loss / contract breaks; `nit:` for tiny cleanups. - -## Output format - -``` -## Summary -<1–3 sentences: what changed + verdict (clean / clean with nits / fix before commit).> - -## Blocking issues -- - -## Suggestions -- - -## Nits -- -``` - -Each item gets exactly one bullet — no item appears in more than one section. Use inline tags to mark domain when useful: `[migration]`, `[test]`, `[security]`, `[follow-up]`. Severity drives the section; the tag adds the domain colour. - -Drop empty sections. Don't pad. **Read-only — do not modify files during the review.** diff --git a/.claude/agents/crud-writer.md b/.claude/agents/crud-writer.md deleted file mode 100644 index ee714eff9..000000000 --- a/.claude/agents/crud-writer.md +++ /dev/null @@ -1,83 +0,0 @@ ---- -name: crud-writer -description: Use when adding or modifying data-access functions under `app/crud/`. DB-only — never raises HTTPException, never makes external HTTP calls. Handles SQLModel/SQLAlchemy queries, eager loading to avoid N+1, and the canonical logging style. -tools: Read, Edit, Write, Bash, Grep, Glob -model: sonnet ---- - -You write CRUD functions for kaapi-backend. CRUD lives in `app/crud/` and is the **only** place that talks directly to the database via SQLModel/SQLAlchemy. Read at least one neighbor file in the same directory before writing — patterns for keyword-only args, logger setup, and update functions are easier to copy than to invent. - -## Hard rules - -- **No `HTTPException` in this layer.** Ever. Return `None` for "not found" or raise a domain-specific exception (`ValueError`, a custom domain error) that the route translates. -- **No third-party HTTP calls.** No `httpx`, no `openai`, no boto3, no `requests`. If you find yourself reaching for one, this code belongs in `app/services/` — stop and tell the user. -- **No business logic.** Validation, orchestration, multi-step workflows → services. CRUD is "read this row, write this row, list these rows with filters". -- **No `print`. Use `logger`.** Module top: `import logging; logger = logging.getLogger(__name__)`. Every line is `logger.info(f"[function_name] Message | key: {value}")`. Mask sensitive values with `mask_string` from `app.utils` — e.g. `f"... | email: {mask_string(email)}"`. - -## Canonical function shape (from `app/crud/user.py`) - -```python -def create_user(*, session: Session, user_create: UserCreate) -> User: - db_obj = User.model_validate( - user_create, update={"hashed_password": get_password_hash(user_create.password)} - ) - session.add(db_obj) - session.commit() - session.refresh(db_obj) - logger.info(f"[create_user] User created | user_id: {db_obj.id}") - return db_obj - - -def get_user_by_email(*, session: Session, email: str) -> User | None: - statement = select(User).where(User.email == email) - return session.exec(statement).first() -``` - -Note: **keyword-only args** with `*` for anything more than `(session, id)`. Reduces argument-order bugs at call sites. - -## Naming - -- `get__by_` returns one or `None`. -- `list_(...)` returns a list (plural in the name matches plural in the return). -- `create_`, `update_`, `delete_`. -- `bulk__` for batch ops. -- No `_one` / `_all` suffixes — the name should already say it. - -## Performance - -- **N+1 is a bug.** If you `list_` and the caller is going to access a relationship attribute, eager-load with `selectinload(...)` or `joinedload(...)`. Read the call sites before deciding. -- **Index any column you filter on.** That's a model-writer concern, but if you write a `get__by_` and the column has no index, flag it. -- **Pagination.** Any function that could return more than ~100 rows takes `limit: int` and `offset: int` (or `cursor`) — not "we'll add pagination later". - -## Concurrency - -- "Compute next / check then write" is a race condition. `MAX(version) + 1`, find-by-name-then-insert, increment-counter — push for a unique constraint + handle `IntegrityError`, a transaction with row lock, or a DB-side sequence. Tell the user before silently shipping the racy version. -- Don't `session.commit()` inside a loop. Build the list, add all, commit once. - -## Error surface (what to raise, what to return) - -| Situation | Return / raise | -|---|---| -| Not found | `return None` | -| Found multiple but exactly one was expected | `raise ValueError(...)` (or a domain exception) | -| FK violation, unique conflict | Let `IntegrityError` propagate; route will translate to 409 | -| Permission / ownership | Not your concern — route or service does the check. CRUD trusts its inputs. | - -## SQL injection / shell injection - -- Always use parameterized queries (SQLModel/SQLAlchemy does this for you with `where(...)`). **Never** f-string a value into raw SQL. -- If you must use `op.execute` or `text(...)`, use bound parameters. - -## What you DO NOT do - -- Don't import from `fastapi` (no `HTTPException`, no `Depends`). -- Don't import from `httpx`, `requests`, `openai`, cloud SDKs. -- Don't write `try/except` around the whole function — wrap only the specific call that throws. -- Don't catch `Exception` — use the concrete exception type. - -## After writing - -Tell the user: -1. The CRUD functions added (path + signature). -2. Any new domain exception type or relationship that the model needs. -3. Whether the route layer needs updating to call your new function. diff --git a/.claude/agents/migration-writer.md b/.claude/agents/migration-writer.md deleted file mode 100644 index 9a71aa69b..000000000 --- a/.claude/agents/migration-writer.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -name: migration-writer -description: Use when generating or hand-writing Alembic migrations under `app/alembic/versions/`. Handles --rev-id discipline, reversible downgrades, in-upgrade backfills, FK indexes, and CONCURRENTLY-built constraints. -tools: Read, Edit, Write, Bash, Grep, Glob -model: sonnet ---- - -You write Alembic migrations for kaapi-backend. The DB is PostgreSQL. Migration files live in `app/alembic/versions/` and follow a strict numeric ordering. - -## Before writing anything - -1. `ls backend/app/alembic/versions/` and find the highest `NNN_*.py`. The new revision id is **that number + 1**, zero-padded to 3 digits. As of this writing the latest is `060` → next is `061`. Do not skip numbers. -2. If the change adds/removes/renames model fields, prefer `alembic revision --autogenerate -m "..." --rev-id ` (run via `uv`, not `pip`) and then hand-edit. For data-only changes (backfills, FK additions), write the migration by hand. -3. Read at least one recent migration (e.g., `060_v1_assorted_cleanups.py`) to match the project's docstring style and operation patterns. - -## Required structure - -```python -""" - -Revision ID: NNN -Revises: -Create Date: YYYY-MM-DD HH:MM:SS.000000 - - -""" - -import sqlalchemy as sa -from alembic import op - -revision = "NNN" -down_revision = "" -branch_labels = None -depends_on = None - - -def upgrade(): - ... - - -def downgrade(): - ... -``` - -## Hard rules - -- **`downgrade()` is mandatory and must actually reverse `upgrade()`.** Empty `pass` is a blocker. If reverse is truly impossible (e.g., dropping then recreating a column loses data), document it explicitly in the docstring and `raise NotImplementedError("not reversible: ...")` in downgrade — but try harder first. -- **Backfills go inside `upgrade()` SQL** using `op.execute(...)`, not as a separate manual script. Same for cleanup of orphan rows before adding constraints. -- **New tables** must include: - - `id` primary key. - - `inserted_at` (NOT `created_at`) and `updated_at` timestamps. Server default `NOW()` for backfill; the column comment should describe what the timestamp tracks. - - `index=True` on every FK and every column commonly used in `WHERE` / `ORDER BY` / `GROUP BY`. - - `sa.Column(..., comment="...")` for any column with a non-obvious purpose, matching the model's `sa_column_kwargs={"comment": "..."}`. -- **Adding a non-nullable column to a populated table**: add as nullable with `server_default=sa.text("...")`, backfill, then `ALTER COLUMN ... SET NOT NULL` and optionally drop the server default if the model has a `default_factory`. See `060_v1_assorted_cleanups.py` for the exact pattern. -- **Adding a unique constraint to a populated table**: dedupe first (`op.execute("DELETE ... USING ...")`), then `CREATE UNIQUE INDEX ... CONCURRENTLY` and `ALTER TABLE ... ADD CONSTRAINT ... USING INDEX` so the build doesn't take `AccessExclusiveLock`. -- **Index builds on large tables**: use `CREATE INDEX CONCURRENTLY` via raw `op.execute(...)`. Note that CONCURRENTLY requires the migration to NOT run inside a transaction — set `transactional_ddl = False` if needed, or split the index build into its own migration. - -## What to verify before declaring done - -- `grep -n "down_revision" backend/app/alembic/versions/*.py` shows your `revision` is unique and the chain `... → ` is intact. -- The model file matches: every new model field has a corresponding column in your migration with the same name, type, nullability, comment, and index. Conversely every column you add exists in the model. -- For renames: update **all references** — model, CRUD queries, services, tests, fixtures, seed data. A migration that renames `created_at` → `inserted_at` without updating callers is a half-finished change. -- Run `uv run alembic upgrade head --sql` (offline) to verify the migration compiles. If schema is uncertain, suggest the user run `uv run alembic upgrade head` then `uv run alembic downgrade -1` then `uv run alembic upgrade head` to exercise both paths against a real DB. - -## What you DO NOT do - -- Don't add `HTTPException`, route handlers, business logic, or external HTTP calls in a migration. -- Don't write `print(...)` debug statements — use the migration docstring. If a long backfill genuinely needs progress logging, use `logging.getLogger("alembic.runtime.migration")` with the standard `[] ...` prefix. -- Don't skip the docstring. The docstring is what someone debugging at 2am will read. -- Don't import from `app.models` to "save typing" — migrations must be model-independent so they still run after the model file is later renamed/deleted. - -## Output for the user - -When the migration is written, tell the user: -1. The new file path and revision id. -2. Any caller updates still needed (models, CRUD, tests). -3. The exact `uv run alembic ...` command(s) to apply and verify it. diff --git a/.claude/agents/model-writer.md b/.claude/agents/model-writer.md deleted file mode 100644 index f4dba9674..000000000 --- a/.claude/agents/model-writer.md +++ /dev/null @@ -1,118 +0,0 @@ ---- -name: model-writer -description: Use when adding or modifying SQLModel entities and their request/response variants under `app/models/`. Handles the Base/Create/Update/Public split, `sa_column_kwargs={"comment": "..."}` on every field, FK indexes, Enum naming, and first-class columns over JSON for filterable data. -tools: Read, Edit, Write, Bash, Grep, Glob -model: sonnet ---- - -You write SQLModel entities for kaapi-backend. Models live in `app/models/` and follow a strict house style. Read `app/models/user.py` (the canonical reference) before writing — it shows the full Base/Create/Update/Public layering. - -## Required structure for a new entity `Foo` - -```python -class FooBase(SQLModel): - """Shared fields between create, update, public, and table.""" - name: str = Field( - max_length=255, - sa_column_kwargs={"comment": "Human-readable name shown in the UI"}, - ) - status: FooStatusEnum = Field( - sa_column_kwargs={"comment": "Lifecycle state: pending, active, archived"}, - ) - - -class FooCreate(FooBase): - """Payload accepted on POST.""" - # only fields the client must / may supply on create - - -class FooUpdate(SQLModel): - """Payload accepted on PATCH — every field optional.""" - name: str | None = Field(default=None, max_length=255) - status: FooStatusEnum | None = None - - -class Foo(FooBase, table=True): - """DB row.""" - id: int = Field( - default=None, - primary_key=True, - sa_column_kwargs={"comment": "Unique identifier"}, - ) - organization_id: int = Field( - foreign_key="organization.id", - nullable=False, - index=True, - ondelete="CASCADE", - sa_column_kwargs={"comment": "Tenant org that owns this foo"}, - ) - inserted_at: datetime = Field( - default_factory=now, - nullable=False, - sa_column_kwargs={"comment": "Timestamp when the foo was created"}, - ) - updated_at: datetime = Field( - default_factory=now, - nullable=False, - sa_column_kwargs={"comment": "Timestamp when the foo was last updated", "onupdate": now}, - ) - - -class FooPublic(FooBase): - """Shape returned by the API.""" - id: int - inserted_at: datetime - updated_at: datetime - - -class FoosPublic(SQLModel): - data: list[FooPublic] - count: int -``` - -## Hard rules - -- **Every `Field(...)` gets `sa_column_kwargs={"comment": "..."}`.** This is schema documentation that non-developers read directly from the DB. Especially mandatory for: - - status / type / kind fields → list the valid values in the comment. - - JSON columns → describe the expected structure. - - Foreign keys → name the relationship. - - Anything whose purpose isn't obvious from the name. -- **Timestamps are `inserted_at` and `updated_at`.** NOT `created_at`. Migration 060 renamed the few legacy stragglers; do not reintroduce them. -- **Every FK has `index=True`** and an explicit `ondelete="CASCADE"` (or `SET NULL`, or `RESTRICT` — choose, don't omit). -- **Enums end in `Enum`.** `FooStatusEnum`, not `FooStatus`. snake_case for values when stored as strings. -- **JSON columns are for opaque metadata only.** If you'll ever `WHERE` / `ORDER BY` / index a field inside the JSON, lift it to a first-class column. Tell the user when you make this call. -- **Type hints use `|` unions** (Python 3.10+): `str | None`, not `Optional[str]`. - -## Naming - -- Class name = singular PascalCase: `Foo`, `Document`, `ApiKey`. -- Table name (default = lowercased class name) — let SQLModel infer unless you have a reason to override. -- Plural Public wrapper: `FoosPublic` (matches `UsersPublic`). -- Enum values: lowercase snake_case strings if string-valued. - -## Validation - -- Validate at the model layer with `Field(min_length=..., max_length=..., regex=..., ge=..., le=...)`. Don't push trivial validation into routes. -- `EmailStr` from `pydantic` for emails, not `str`. -- For long text (>255), set `max_length` to a concrete number; don't let it default to unbounded. - -## Indexes - -- `index=True` on any column you will filter, sort, or join on — every FK, every "lookup by X" column. -- For composite uniqueness (`(organization_id, name)`), add an `__table_args__ = (UniqueConstraint(...),)` block. Don't rely on app-level checks. - -## What you DO NOT do - -- Don't write the migration here — that's `migration-writer`. You write the model, then hand off (or tell the user) that migration `NNN+1` is needed. -- Don't import from `fastapi`, `app.crud`, or `app.services` in a model file. Models are leaf nodes. -- Don't use `setattr` on instances of these models. Use `model_copy(update={...})` or `sqlmodel_update(...)` (see `app/crud/user.py:update_user` for the pattern). -- Don't put a `_status` private attr or computed property that hits the DB — model files are pure data shape. -- Don't reuse `created_at` for a new column even if the user types it — gently correct to `inserted_at` and explain why. - -## After writing - -Tell the user: -1. The model variants added (Base, Create, Update, Public, table). -2. Which fields need indexes that aren't obvious. -3. **Explicitly:** "You now need a migration to add this to the DB. Hand off to `migration-writer` with `--rev-id `." Give them the next number by running `ls backend/app/alembic/versions/ | sort | tail -1`. -4. Whether `__init__.py` re-exports need updating so `from app.models import Foo` works. diff --git a/.claude/agents/route-writer.md b/.claude/agents/route-writer.md deleted file mode 100644 index c2555284c..000000000 --- a/.claude/agents/route-writer.md +++ /dev/null @@ -1,94 +0,0 @@ ---- -name: route-writer -description: Use when adding or modifying FastAPI endpoints under `app/api/routes/`. Handles `response_model=APIResponse[T]`, `description=load_description(...)`, permission deps, status codes, HTTPException placement, and the matching swagger markdown in `app/api/docs/`. -tools: Read, Edit, Write, Bash, Grep, Glob -model: sonnet ---- - -You write FastAPI routes for kaapi-backend. Routes live in `app/api/routes/` and follow a strict house style. Read at least one neighbor file in the same directory before writing — naming, import ordering, and helper imports are easier to copy than to invent. - -## Required ingredients for every endpoint - -1. **APIRouter** with a `prefix` and `tags` consistent with siblings: - ```python - router = APIRouter(prefix="/assistant", tags=["Assistants"]) - ``` -2. **`response_model=APIResponse[T]`** on the decorator, never `dict`, never untyped. Use the actual Pydantic / SQLModel return type, not `Any`. -3. **`status_code=201` / `204`** on create / delete; default 200 is fine for GET / PATCH. -4. **`description=load_description("/.md")`** instead of inline docstrings for the swagger description. The matching markdown lives at `backend/app/api/docs//.md` — create it in the same change. -5. **`dependencies=[Depends(require_permission(Permission.XYZ))]`** when the endpoint is restricted. Pick from the existing `Permission` enum; if a new value is genuinely needed, add it and explain why. -6. **`SessionDep` and `AuthContextDep`** for db + current user/org/project. Never re-implement these. -7. **Return `APIResponse.success_response(...)`** at the end — never a raw model. -8. **Type hints on every parameter and the return.** Path/query params use `Annotated[..., Path(description=...)]` / `Annotated[..., Query(...)]`. - -## Canonical example (matches `app/api/routes/users.py:120`) - -```python -@router.get( - "/me", - description=load_description("users/get_me.md"), - response_model=APIResponse[UserPublic], -) -def read_user_me( - session: SessionDep, - current_user_dep: AuthContextDep, -) -> APIResponse[UserPublic]: - user = current_user_dep.user - return APIResponse.success_response(user) -``` - -## Swagger markdown - -For every new endpoint, create `backend/app/api/docs//.md`. Keep it terse — 1-3 short paragraphs. Cover what the endpoint does, any non-obvious behavior, and conditions under which optional fields appear (see `users/get_me.md` for the shape). - -## Layering rules - -- **Routes are thin.** Pull arguments, call a CRUD or service function, wrap the result. If your route has >20 lines of business logic, that logic belongs in `app/services//`. -- **HTTPException is allowed here.** Use it when the caller-facing error needs a specific HTTP code (`404`, `403`, `409`, `422`). Catch domain exceptions from CRUD/services and translate. -- **Never call third-party HTTP from a route.** That belongs in `app/services/`. -- **Never write SQL or `session.exec(select(...))` in a route.** Use a CRUD function. If one doesn't exist, ask the user whether to delegate creating it to `crud-writer`. - -## Status codes (the ones to get right) - -- `201` on POST create. -- `204` on DELETE (no body — return nothing, not `APIResponse.success_response(None)`). -- `409` on conflict (unique constraint violation, duplicate name). -- `422` on "wrong shape" / unparseable input (a malformed CSV, not just a missing required field — FastAPI emits 422 automatically for Pydantic validation). -- `400` for genuinely "bad client input that's not a shape issue". -- Don't return 200 + `{"error": "..."}` — raise `HTTPException` with the right code. - -## Ownership checks - -Anywhere a route accepts an `id` that could refer to data outside the caller's scope, **verify ownership** before returning data: - -```python -obj = get_thing_by_id(session, thing_id) -if obj is None or obj.organization_id != current_user.organization_.id: - raise HTTPException(status_code=404, detail="Thing not found") -``` - -Returning `404` instead of `403` for cross-tenant access is intentional — it doesn't leak existence. - -## Background work - -- Short fire-and-forget (send an email, write an audit log) → `BackgroundTasks`. -- Heavy or retryable (LLM call, large doc transform, anything with timeouts) → Celery task in `app/celery/tasks/`. Hand off to `celery-task-writer`. - -## Logging - -`logger = logging.getLogger(__name__)` at the module top. Every line is `logger.info(f"[handler_name] Message | key: {value}")`. Log non-trivial actions (creates, deletes, ownership failures) — don't spam `info` on every GET. Mask sensitive values with `mask_string` from `app.utils`. - -## What you DO NOT do - -- Don't add the route registration in `app/api/main.py` (or wherever the aggregator lives) without checking the existing alphabetical / grouped order. -- Don't return raw `dict`, `JSONResponse`, or untyped responses. -- Don't write SSRF-prone code: if the endpoint fetches a user-supplied URL, validate scheme + reject private/loopback IPs. -- Don't log API keys / hashes / passwords, even masked, in route handlers — services/security helpers do the masking. - -## After writing - -Tell the user: -1. The route file and line range you added. -2. The swagger markdown you created. -3. Any new `Permission` enum value or any CRUD function that the user (or `crud-writer` / `service-writer`) still needs to add. -4. A suggested `curl` or `httpie` invocation to smoke-test the endpoint. diff --git a/.claude/agents/service-writer.md b/.claude/agents/service-writer.md deleted file mode 100644 index e06f966b7..000000000 --- a/.claude/agents/service-writer.md +++ /dev/null @@ -1,105 +0,0 @@ ---- -name: service-writer -description: Use when adding or modifying business logic under `app/services/`. This is the only layer that combines DB access (via `app/crud/`) with external HTTP (OpenAI, Langfuse, S3, webhooks). Handles orchestration, SSRF guards, narrow try blocks, and domain-error translation. -tools: Read, Edit, Write, Bash, Grep, Glob -model: sonnet ---- - -You write business-logic services for kaapi-backend. Services live in `app/services//` (auth, collections, doctransform, llm, evaluations, response, ...). Services are where orchestration happens — they call CRUD for DB work and call external HTTP libraries for third-party APIs. - -## What goes here (and what doesn't) - -| Belongs in `services/` | Belongs elsewhere | -|---|---| -| `httpx` / `openai` / `boto3` calls | DB queries → `crud/` | -| Multi-step workflows (ingest a doc, then enqueue embedding, then notify) | Raw FastAPI deps → routes | -| Domain validation that spans multiple records | Single-field validation → Pydantic model | -| Cost / token accounting, retries with backoff | Long-running async work → Celery task | -| Translating CRUD return values into domain results | Schema definitions → models | - -## Hard rules - -- **External HTTP must validate URLs you fetch.** Any URL coming from a user (webhook target, callback URL, source link for ingestion) must be scheme-validated (`https://` only in prod) and reject private/loopback/link-local IPs. SSRF is a blocker, not a follow-up. -- **`try` wraps only the throwing line(s).** Big try blocks are the #1 source of swallowed 404s becoming 500s. -- **Concrete exception types** — `except httpx.HTTPStatusError as e:`, not `except Exception`. -- **Logger prefix:** every line is `logger.info(f"[function_name] Message | key: {value}")`. Mask credentials / API keys / hashes / emails with `mask_string` from `app.utils`. Log start + finish of external HTTP calls and any retry. -- **Keyword-only args** for anything more than `(session, x)`, matching the CRUD convention. -- **Type hints on every parameter and return.** No `-> Any`. - -## `HTTPException` in services - -It's acceptable here — `services/auth.py` raises `HTTPException` directly when the domain failure maps cleanly to an HTTP status. Use it sparingly; when the same service may be called from a Celery task or CLI, prefer a domain exception that the route layer translates. - -## Canonical example (from `app/services/auth.py`) - -```python -import logging -from datetime import timedelta - -from app.core import security -from app.core.config import settings -from app.utils import mask_string - -logger = logging.getLogger(__name__) - - -def create_token_pair( - user_id: int, - organization_id: int | None = None, - project_id: int | None = None, -) -> tuple[str, str]: - """Create an access token and refresh token pair.""" - access_token = security.create_access_token( - user_id, - expires_delta=timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES), - organization_id=organization_id, - project_id=project_id, - ) - refresh_token = security.create_refresh_token( - user_id, - expires_delta=timedelta(minutes=settings.REFRESH_TOKEN_EXPIRE_MINUTES), - organization_id=organization_id, - project_id=project_id, - ) - logger.info(f"[create_token_pair] Token pair issued | user_id: {user_id}, access_token: {mask_string(access_token)}") - return access_token, refresh_token -``` - -Delegation to `app.core.security` is the pattern — services orchestrate; primitives live in `app/core/`. - -## External HTTP — checklist - -- **Timeout** — every `httpx`/`requests` call has an explicit timeout. The default is too long. -- **Retry policy** — idempotent GETs can retry with backoff. Mutations should retry only if you're certain the API is idempotent or you have an idempotency key. -- **Error mapping** — `httpx.HTTPStatusError` → a domain exception or `HTTPException` with a sensible code (often 502 for upstream failures, NOT 500). -- **Mock at this boundary in tests** — `monkeypatch` the HTTP client, not the DB. (See `test-writer` agent.) - -## Calling CRUD - -- Services own the `session` lifecycle for the operation. Call CRUD functions with `session=session` keyword arg. -- If CRUD returns `None`, decide whether that's a domain error (`raise NotFoundError`), a 404 (`raise HTTPException(404)`), or a silent skip (`return early`). Be explicit. - -## Config / secrets - -- Read from `settings` (`app.core.config`). Never read `os.environ` directly in a service. -- Defaults should lean toward cheap/safe: smallest model, lowest token cap, shortest TTL. Aggressive defaults belong in env, not code. - -## Magic values - -If you write the string `"openai"` or `"text-embedding-3-small"` or `1_000_000` in a service, ask whether it should be a constant / Enum / setting. Grep for the same literal — if it appears elsewhere, it should already be a constant. - -## What you DO NOT do - -- Don't write SQL directly — call CRUD. -- Don't import `fastapi.APIRouter` or define routes. -- Don't write long-running blocking loops — that's a Celery task. -- Don't call `time.sleep` inside an `async def` (use `asyncio.sleep`). -- Don't catch `HTTPException` from a sub-call and swallow it — propagate. - -## After writing - -Tell the user: -1. The service function(s) added (path + signature). -2. Which CRUD functions you call and which you still need. -3. Any external HTTP boundary that the test layer should mock. -4. Any new env / settings key the user must add to `.env.example`. diff --git a/.claude/agents/test-writer.md b/.claude/agents/test-writer.md deleted file mode 100644 index fc19ec00b..000000000 --- a/.claude/agents/test-writer.md +++ /dev/null @@ -1,91 +0,0 @@ ---- -name: test-writer -description: Use when writing or updating tests under `app/tests/` for kaapi-backend. Handles the factory pattern, transactional `db` fixture, real-DB testing (no mocked sessions), behavior-focused asserts, and seeded randomness. -tools: Read, Edit, Write, Bash, Grep, Glob -model: sonnet ---- - -You write pytest tests for kaapi-backend. Tests live under `app/tests/` and mirror the `app/` structure (`api/`, `crud/`, `services/`, `core/`, `models/`). - -## Hard rules - -- **Real DB only — never mock the database session.** This repo's `conftest.py` provides a transactional `db` fixture that rolls back after each test. Use it. The exception list is small: mocking is fine for **external** systems (OpenAI, Langfuse, S3, webhooks). Database = real. -- **Use the factory pattern from `app/tests/utils/`.** Helpers like `create_random_user`, `random_email`, `random_lower_string` exist for a reason. No hardcoded `organization_id=1`, no inline `User(...)` instances with magic ids. -- **Behavior, not implementation.** Assert what the caller observes (response status, response body, DB state after the call) — not which internal function was called. -- **Seed randomness.** If a test uses `random.random()` or similar, seed it. Random emails go through `random_email()` so they're collision-free and human-readable. -- **Bug fix → regression test.** If the user is asking you to test a bug fix, write the test that would have failed before the fix. - -## Fixtures available (from `conftest.py`) - -- `db: Session` — transactional, function-scoped. Use this in CRUD and service tests. -- `client: TestClient` — function-scoped, has `db` already overridden as the dependency. Use this in API tests. -- `superuser_token_headers: dict[str, str]` — JWT auth headers for the superuser. -- `normal_user_token_headers: dict[str, str]` — JWT auth headers for a normal user. -- `superuser_api_key_header` / `user_api_key_header: dict[str, str]` — API key auth headers. -- `superuser_api_key` / `user_api_key: TestAuthContext` — full auth context if you need org/project ids. -- `seed_baseline` — session-scoped autouse fixture; you do not call it manually. - -## Test factory utilities (`app/tests/utils/`) - -- `user.py`: `create_random_user(db)`, `authentication_token_from_email(...)` -- `auth.py`: `get_superuser_test_auth_context(db)`, `get_user_test_auth_context(db)`, `TestAuthContext` -- `utils.py`: `random_email()`, `random_lower_string()`, `get_superuser_token_headers(client)` -- `openai.py`, `llm.py`, `llm_provider.py`, `collection.py`, `document.py` — per-domain factories. **Read these before writing new factories.** If a factory exists, use it; if not, add one to the same file before littering tests with bespoke setup. - -## Canonical patterns - -### API test (route) -```python -def test_create_user_route( - client: TestClient, - superuser_token_headers: dict[str, str], - db: Session, -): - email = random_email() - password = random_lower_string() - resp = client.post( - f"{settings.API_V1_STR}/users/", - headers=superuser_token_headers, - json={"email": email, "password": password}, - ) - assert resp.status_code == 201 - body = resp.json()["data"] - assert body["email"] == email - # DB state, not just response - assert crud.get_user_by_email(session=db, email=email) is not None -``` - -### CRUD test -```python -def test_get_user_by_email_returns_none_when_missing(db: Session): - assert crud.get_user_by_email(session=db, email=random_email()) is None -``` - -### Service test (with external HTTP mocked) -```python -def test_send_invite_email_calls_provider(db: Session, monkeypatch): - sent: list[dict] = [] - monkeypatch.setattr("app.utils.send_email", lambda **kw: sent.append(kw)) - service_under_test.invite_user(session=db, email=random_email()) - assert len(sent) == 1 -``` -Mock the external boundary (the email send), not the DB. - -## Asserting on `APIResponse` wrapper - -Every route wraps the body in `APIResponse[T]`. Tests should pull `body = resp.json()["data"]` and assert on that, not `resp.json()` directly. If the route returns a list, check `body["count"]` and `body["data"]` (or whatever the wrapper shape is — confirm by reading `app/utils/api_response.py` or whichever file defines `APIResponse`). - -## Things to flag (do not silently fix) - -- A bug fix arriving without a regression test → say so explicitly and write one. -- A "test" that mocks the DB session → refactor it onto the `db` fixture. -- `assert resp.status_code == 200` for a POST that should be 201, or for a DELETE that should be 204 — call out the wrong code. -- Tests asserting `mock.called` with no behavioral check — these are tautological; replace with an assertion on observable state. -- Hardcoded `organization_id=1` or `project_id=1` — replace with the auth-context fixtures. - -## Running tests - -- All tests: `uv run bash scripts/tests-start.sh` -- A subset (when iterating): `uv run pytest backend/app/tests/api/test_users.py -k -x` - -After writing, run the relevant subset. If the test fails for an unexpected reason (not the bug under test), diagnose before declaring done. diff --git a/CLAUDE.md b/CLAUDE.md index 013e3918c..53d09c7e9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -21,7 +21,7 @@ fastapi run --reload app/main.py uv run pre-commit run --all-files # Generate database migration (rev-id should be latest existing revision ID + 1) -uv run alembic revision --autogenerate -m "Description" --rev-id 061 +alembic revision --autogenerate -m "Description" --rev-id 040 # Seed database with test data uv run python -m app.seed_data.seed_data @@ -93,31 +93,47 @@ The application uses different environment files: ## Coding Conventions -Layer-specific conventions live in `.claude/agents/*.md` and are enforced by the matching specialist subagent (e.g., `route-writer` for `app/api/routes/`, `model-writer` for `app/models/`, `migration-writer` for alembic). CLAUDE.md only covers rules that apply across every layer. - -### Cross-cutting rules - -- **Type hints** on every parameter and return value. `-> Any` is not an annotation — narrow it or drop it. -- **Logging prefix:** every log line starts with the function name in square brackets. - ```python - logger.info(f"[function_name] Message | key: {value}") - ``` -- **`uv` is the runner**, not `pip`. Examples: `uv run pytest`, `uv run alembic ...`, `uv run pre-commit run --all-files`. -- **No magic values** in code — extract repeated literals to constants / `Enum` / settings. -- **Naming:** `list_*` for plural fetch, `get_*` for singletons; snake_case funcs/vars, PascalCase classes, UPPER_SNAKE constants; `Enum` suffix on enum classes. -- **Timestamps** are `inserted_at` / `updated_at` (not `created_at`). - -## Specialist subagents - -When working in a specific layer, the matching agent under `.claude/agents/` handles the layer's conventions automatically. Pick by layer, or just describe the task and let the main agent route: - -| Agent | Layer | -|---|---| -| `route-writer` | `app/api/routes/` | -| `crud-writer` | `app/crud/` | -| `service-writer` | `app/services/` | -| `model-writer` | `app/models/` | -| `migration-writer` | `app/alembic/versions/` | -| `celery-task-writer` | `app/celery/tasks/` | -| `test-writer` | `app/tests/` | -| `convention-reviewer` | Cross-cutting pre-commit gate (mirrors `/pr-review`) | +### Type Hints + +Always add type hints to all function parameters and return values. + +### Logging Format + +Prefix all log messages with the function name in square brackets. + +```python +logger.info(f"[function_name] Message {mask_string(sensitive_value)}") +``` + +### Database Column Comments + +Use sa_column_kwargs["comment"] to describe database columns, especially when the purpose isn’t obvious. This helps non-developers understand column purposes directly from the database schema: + +```python +field_name: int = Field( + foreign_key="table.id", + nullable=False, + ondelete="CASCADE", + sa_column_kwargs={"comment": "What this column represents"} +) +``` + +Prioritize comments for: +- Columns with non-obvious purposes +- Status/type fields (document valid values) +- JSON/metadata columns (describe expected structure) +- Foreign keys (clarify the relationship) + +### Endpoint Documentation + +Load Swagger descriptions from external markdown files instead of inline strings: + +```python +@router.post( + "/endpoint", + description=load_description("domain/action.md"), + response_model=APIResponse[ResponseModel], +) +``` + +Store documentation files in `backend/app/api/docs//.md` From 324777f61e53ba4c17ba02a64f510d5b41ee940e Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 5 Jun 2026 08:27:43 +0530 Subject: [PATCH 6/8] cleanup code a bit --- .../064_add_run_mode_to_evaluation_run.py | 25 ---- backend/app/api/routes/evaluations/dataset.py | 16 ++- .../app/api/routes/evaluations/evaluation.py | 2 +- backend/app/crud/evaluations/fast.py | 129 +++++------------- .../app/services/evaluations/evaluation.py | 94 +++++++++---- backend/app/services/evaluations/fast.py | 47 ++----- 6 files changed, 126 insertions(+), 187 deletions(-) diff --git a/backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py b/backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py index 70174f7b6..c94f59950 100644 --- a/backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py +++ b/backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py @@ -4,31 +4,6 @@ Revises: 063 Create Date: 2026-05-20 00:00:00.000000 -Two schema changes are required to support the fast-evaluation feature: - -1. run_mode VARCHAR(10) NOT NULL DEFAULT 'batch' - Every evaluation run must now carry an explicit execution mode ('batch' - or 'fast'). Because evaluation_run may already contain rows the column - is added as nullable with a server_default of 'batch' so existing rows - are backfilled atomically, then SET NOT NULL is applied. The server - default is kept on the column to act as a safety net for any insert path - that does not explicitly set the field; the SQLModel default="batch" will - also cover application inserts. - -2. UNIQUE constraint uq_evaluation_run_org_project_run_name - on (organization_id, project_id, run_name) - Guards against double-click races on POST /evaluations where two - concurrent requests could create duplicate runs with the same name inside - the same project. Lower environments may already contain duplicate - (organization_id, project_id, run_name) tuples, so we dedupe first - (keeping the lowest-id survivor), then build the unique index - CONCURRENTLY (no AccessExclusiveLock on the table during the build) and - attach it as a named constraint via ADD CONSTRAINT ... USING INDEX. - Because CONCURRENTLY cannot execute inside a transaction the whole - migration is marked disable_per_migration_transaction = True and each - CONCURRENTLY step is wrapped in autocommit_block(). The dedupe DELETE and - the non-concurrent DDL steps run inside autocommit mode too, which is - safe — they are each individually atomic at the statement level. """ import sqlalchemy as sa diff --git a/backend/app/api/routes/evaluations/dataset.py b/backend/app/api/routes/evaluations/dataset.py index 20f59f9cf..ab703493a 100644 --- a/backend/app/api/routes/evaluations/dataset.py +++ b/backend/app/api/routes/evaluations/dataset.py @@ -17,15 +17,19 @@ from app.core.cloud import get_cloud_storage from app.crud.evaluations import ( get_dataset_by_id, +) +from app.crud.evaluations import ( list_datasets as list_evaluation_datasets, ) from app.crud.evaluations.dataset import delete_dataset as delete_dataset_crud from app.models.evaluation import DatasetUploadResponse, EvaluationDataset from app.services.evaluations import ( is_dataset_fast_eligible, - upload_dataset as upload_evaluation_dataset, validate_csv_file, ) +from app.services.evaluations import ( + upload_dataset as upload_evaluation_dataset, +) from app.utils import ( APIResponse, load_description, @@ -126,11 +130,15 @@ def list_datasets( offset=offset, ) - responses = [_dataset_to_response(dataset) for dataset in datasets] + dataset_responses = [_dataset_to_response(dataset) for dataset in datasets] if eligible_for == "fast": - responses = [r for r in responses if r.eligible_for_fast] + dataset_responses = [ + dataset_response + for dataset_response in dataset_responses + if dataset_response.eligible_for_fast + ] - return APIResponse.success_response(data=responses) + return APIResponse.success_response(data=dataset_responses) @router.get( diff --git a/backend/app/api/routes/evaluations/evaluation.py b/backend/app/api/routes/evaluations/evaluation.py index 800b5b0cc..0c9c941f9 100644 --- a/backend/app/api/routes/evaluations/evaluation.py +++ b/backend/app/api/routes/evaluations/evaluation.py @@ -13,10 +13,10 @@ ) from app.api.deps import AuthContextDep, SessionDep +from app.api.permissions import Permission, require_permission from app.crud.evaluations import list_evaluation_runs as list_evaluation_runs_crud from app.crud.evaluations.core import group_traces_by_question_id from app.models.evaluation import EvaluationRunPublic, RunModeEnum -from app.api.permissions import Permission, require_permission from app.services.evaluations import ( get_evaluation_with_scores, start_evaluation, diff --git a/backend/app/crud/evaluations/fast.py b/backend/app/crud/evaluations/fast.py index d78785104..23a2b3b83 100644 --- a/backend/app/crud/evaluations/fast.py +++ b/backend/app/crud/evaluations/fast.py @@ -1,18 +1,15 @@ -""" -Fast evaluation orchestration. - -This module implements run_mode="fast" — the synchronous text-evaluation path -described in `Fast Evaluation SRD.md`. Unlike the batch path it does not submit -an OpenAI Batch job; instead it makes Responses + Embeddings calls in parallel -from a single Celery orchestrator task and persists per-stage units to S3. +"""Fast evaluation orchestration (run_mode="fast"). -Stage layout (each stage is skipped on retry if its `batch_job` row already -exists, mirroring how the batch path tracks state): +Synchronous text-eval path: makes Responses + Embeddings calls in parallel from +a single Celery task and persists per-stage units to S3. Each stage is skipped +on retry if its `batch_job` row already exists. Stage 1 — Responses unit: evaluation_run.batch_job_id Stage 2 — Embeddings unit: evaluation_run.embedding_batch_job_id Stage 3 — Score + trace + cost (no marker; each step is idempotent) Stage 4 — Mark completed + +See `Fast Evaluation SRD.md` for the full design. """ import logging @@ -56,22 +53,17 @@ _T = TypeVar("_T") -# job_type discriminators on batch_job for the two stages of the fast path. -# The OpenAI Batch concepts (provider_batch_id, provider_file_id, …) are NULL -# on these rows; the presence of the row + its raw_output_url is what tells the -# orchestrator a stage is already done. +# job_type discriminators on batch_job for the two fast-path stages. The row's +# presence + raw_output_url is what marks a stage as already done on retry. JOB_TYPE_EVALUATION_FAST = "evaluation_fast" JOB_TYPE_EMBEDDING_FAST = "embedding_fast" -# Retry policy for individual OpenAI calls inside Stage 1 / Stage 2. -# Retries on rate-limit, timeout, and connection errors; permanent errors -# (auth, validation, model not found) fail the item immediately. +# Per-call retry policy for Stage 1 / Stage 2. _RETRY_MAX_ATTEMPTS = 5 _RETRY_BASE_DELAY_SECONDS = 1.0 _RETRY_MAX_DELAY_SECONDS = 30.0 -# These OpenAI exception classes are retryable transient failures. _RETRYABLE_OPENAI_ERRORS: tuple[type[Exception], ...] = ( openai.RateLimitError, openai.APITimeoutError, @@ -90,14 +82,7 @@ def _sleep_with_backoff(attempt: int) -> None: def _call_with_retry(label: str, fn: Callable[[], _T]) -> _T: - """Call `fn()` with retry on transient OpenAI errors. - - Returns the function's result on success. Raises the last exception when - retries are exhausted or when the error is non-retryable. The loop always - either returns or raises on the final attempt; the explicit - `RuntimeError` at the tail keeps mypy happy about the function's return - being reachable. - """ + """Call `fn()`, retrying transient OpenAI errors; permanent errors fail fast.""" for attempt in range(1, _RETRY_MAX_ATTEMPTS + 1): try: return fn() @@ -114,12 +99,12 @@ def _call_with_retry(label: str, fn: Callable[[], _T]) -> _T: ) _sleep_with_backoff(attempt) except openai.OpenAIError as exc: - # Permanent OpenAI errors (auth, validation, model not found) — fail fast. logger.warning( f"[_call_with_retry] Permanent error, no retry | label={label} | " f"error={exc}" ) raise + # Unreachable: the loop always returns or raises. Keeps mypy happy. raise RuntimeError(f"_call_with_retry exited without result for {label}") @@ -128,11 +113,7 @@ def _build_responses_params( config: TextLLMParams, question: str, ) -> dict[str, Any]: - """Build the parameter dict for one OpenAI Responses API call. - - Mirrors the body shape `build_evaluation_jsonl` uses for the batch path so - fast eval and batch eval generate equivalent outputs for the same config. - """ + """Build params for one Responses call, mirroring the batch path's body shape.""" params: dict[str, Any] = { "model": config.model, "instructions": config.instructions, @@ -158,12 +139,7 @@ def _build_responses_params( def _field(obj: Any, name: str, default: Any = None) -> Any: - """Read a field from an object or dict, returning default when missing. - - OpenAI SDK objects expose fields as attributes; older response shapes and - tests sometimes pass dicts. This unifies access so call sites don't have - to branch on the runtime type. - """ + """Read a field from an object or dict (SDK object vs test dict), with a default.""" if obj is None: return default if isinstance(obj, dict): @@ -172,11 +148,7 @@ def _field(obj: Any, name: str, default: Any = None) -> Any: def _extract_response_text(response: Any) -> str: - """Extract the model-generated text from an OpenAI Responses object. - - Prefers `response.output_text` (the SDK's flat helper) and falls back to a - structured walk of `response.output` if the helper is empty. - """ + """Extract generated text, preferring `output_text` then walking `output`.""" output_text = _field(response, "output_text") if output_text: return output_text @@ -197,7 +169,7 @@ def _extract_response_text(response: Any) -> str: def _usage_to_dict(usage: Any) -> dict[str, int]: - """Normalize an OpenAI usage object into the dict shape kaapi's cost layer expects.""" + """Normalize an OpenAI usage object into the cost layer's dict shape.""" return { "input_tokens": int(_field(usage, "input_tokens", 0) or 0), "output_tokens": int(_field(usage, "output_tokens", 0) or 0), @@ -211,12 +183,7 @@ def _responses_call_for_item( config: TextLLMParams, item: dict[str, Any], ) -> dict[str, Any]: - """Run one Responses API call for a single dataset item. - - Returns the same per-item shape as `parse_evaluation_output` from the batch - path so downstream code (Langfuse trace creation, embeddings, cost) does - not need to branch on run_mode. - """ + """Run one Responses call for a dataset item, in the batch path's per-item shape.""" item_id = item["id"] question = item["input"].get("question", "") if item.get("input") else "" ground_truth = ( @@ -278,12 +245,7 @@ def _embedding_call_for_pair( output_text: str, ground_truth: str, ) -> dict[str, Any]: - """Run one Embeddings API call for an (output, ground_truth) pair. - - Returns a dict with both embeddings and the prompt_tokens usage so the - cost layer can price the call. `failed=True` indicates a permanent or - retry-exhausted failure for the pair. - """ + """Embed an (output, ground_truth) pair; `failed=True` on a terminal failure.""" if not output_text or not ground_truth: return { "item_id": item_id, @@ -402,10 +364,9 @@ def _stage1_responses( dataset_items: list[dict[str, Any]], log_prefix: str, ) -> tuple[EvaluationRun, list[dict[str, Any]]]: - """Stage 1 — generate one Responses API call per dataset item. + """Stage 1 — one Responses call per dataset item. - Skipped on retry if `eval_run.batch_job_id` is already set; in that case we - reload the persisted unit from S3 so downstream stages still have results. + Skipped on retry if `eval_run.batch_job_id` is set; the unit is reloaded from S3. """ if eval_run.batch_job_id: existing = get_batch_job(session=session, batch_job_id=eval_run.batch_job_id) @@ -465,8 +426,7 @@ def _stage1_responses( results=results, ) - # Aggregate usage for the batch_job summary so the cost layer can price the - # stage from raw_output_url alone if it needs to re-derive later. + # Aggregate usage for the batch_job summary. summed_usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0} for r in results: usage = r.get("usage") or {} @@ -491,9 +451,7 @@ def _stage1_responses( ), ) - # `batch_job_id` and `total_items` aren't on EvaluationRunUpdate (the - # update model only covers fields that change during the lifecycle), so - # set them directly and let `update_evaluation_run` bump `updated_at`. + # batch_job_id / total_items aren't on EvaluationRunUpdate; set them directly. eval_run.batch_job_id = batch_job.id eval_run.total_items = len(results) eval_run = update_evaluation_run( @@ -513,10 +471,7 @@ def _stage2_embeddings( response_results: list[dict[str, Any]], log_prefix: str, ) -> tuple[EvaluationRun, list[dict[str, Any]]]: - """Stage 2 — embed each (output, ground_truth) pair for cosine similarity. - - Skipped on retry if `eval_run.embedding_batch_job_id` is already set. - """ + """Stage 2 — embed each (output, ground_truth) pair; skipped on retry if done.""" if eval_run.embedding_batch_job_id: existing = get_batch_job( session=session, batch_job_id=eval_run.embedding_batch_job_id @@ -561,9 +516,7 @@ def _stage2_embeddings( embedding_results.append(future.result()) failed_count = sum(1 for r in embedding_results if r.get("failed")) - # Failure threshold for embeddings is computed over the whole dataset, not - # over the candidate subset — items that failed Stage 1 already count as - # failures from the user's perspective. + # Threshold is over the whole dataset: Stage 1 failures count as failures too. total_failures = failed_count + sum(1 for r in response_results if r.get("failed")) logger.info( f"[_stage2_embeddings] {log_prefix} Stage 2 finished | " @@ -631,24 +584,17 @@ def _stage3_score_and_trace( ) -> EvaluationRun: """Stage 3 — compute cosine, create Langfuse traces, attach costs. - No stage marker; each step is idempotent or near-idempotent: - - Cosine recomputation is deterministic (same vectors → same numbers). - - Langfuse dedupes traces on the dataset_item.observe key. - - attach_cost is idempotent on (eval_run_id, batch_job_id, usage_type). + No stage marker; each step is idempotent (deterministic cosine, Langfuse + dedupes on the observe key, attach_cost overwrites per stage). """ logger.info( f"[_stage3_score_and_trace] {log_prefix} Computing cosine + creating traces" ) - # 1. Compute cosine similarity per pair, in the same shape as the batch - # path's `parse_embedding_results` + `calculate_average_similarity`. item_id_to_pair = { r["item_id"]: r for r in embedding_results if not r.get("failed") } - # 2. Create Langfuse traces and get item_id -> trace_id mapping. The model - # name comes from the stored config (same source as batch path) so - # Langfuse's per-generation cost calc stays consistent. model = resolve_model_from_config(session=session, eval_run=eval_run) trace_id_mapping = create_langfuse_dataset_run( langfuse=langfuse, @@ -658,8 +604,7 @@ def _stage3_score_and_trace( model=model, ) - # 3. Build the per-item score list keyed on Langfuse trace_id, then update - # traces with the cosine score. + # Per-item cosine scores keyed on Langfuse trace_id. per_item_scores: list[dict[str, Any]] = [] similarities: list[float] = [] for r in response_results: @@ -685,16 +630,14 @@ def _stage3_score_and_trace( langfuse=langfuse, per_item_scores=per_item_scores ) except Exception as exc: - # Mirror the batch path: Langfuse score-update failures don't fail - # the run; they get logged and the score still lives in eval_run.score. + # Score-update failures don't fail the run (score lives in eval_run.score). logger.warning( f"[_stage3_score_and_trace] {log_prefix} " f"Failed to update Langfuse traces with scores | error={exc}", exc_info=True, ) - # 4. Aggregate similarity stats — same shape as the batch path so - # summary_scores rendering on GET /evaluations/{id} stays identical. + # Aggregate similarity stats, in the batch path's summary_scores shape. if similarities: sim_array = np.array(similarities) avg = float(np.mean(sim_array)) @@ -715,8 +658,7 @@ def _stage3_score_and_trace( ] } - # 5. Attach costs (response stage + embedding stage). attach_cost is - # idempotent on its natural key — see "Known Limitations" in the SRD. + # Attach response- and embedding-stage costs (attach_cost is idempotent per stage). if response_results: attach_cost( session=session, @@ -726,8 +668,7 @@ def _stage3_score_and_trace( response_results=response_results, ) - # Embedding cost expects the raw OpenAI batch shape; rebuild it from our - # in-memory embedding_results so attach_cost can price the stage uniformly. + # attach_cost expects the raw OpenAI batch shape; rebuild it from embedding_results. if embedding_results: embedding_raw = [ { @@ -772,13 +713,8 @@ def run_fast_evaluation( ) -> EvaluationRun: """Run the full fast-eval pipeline for one evaluation_run. - Called from the `run_evaluation_fast` Celery task. Each stage is skipped on - retry if its batch_job marker is already set, so re-invocation does not - re-call OpenAI for work that already succeeded. - - Raises: - Exception: If any stage fails terminally; the orchestrator marks the - run failed. + Called from the `run_evaluation_fast` task. Stages are skipped on retry when + their batch_job marker is set. Raises on terminal failure (run marked failed). """ log_prefix = ( f"[org={eval_run.organization_id}]" @@ -787,7 +723,6 @@ def run_fast_evaluation( ) logger.info(f"[run_fast_evaluation] {log_prefix} Starting fast evaluation pipeline") - # Mark as processing immediately so the GET endpoint reflects state. if eval_run.status == "pending": eval_run = update_evaluation_run( session=session, diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 5e4abd0dc..928aa1580 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -7,6 +7,8 @@ from sqlalchemy.exc import IntegrityError from sqlmodel import Session +from app.core.cloud.storage import get_cloud_storage +from app.core.storage_utils import load_json_from_object_store from app.crud.evaluations import ( create_evaluation_run, fetch_trace_scores_from_langfuse, @@ -17,14 +19,64 @@ start_evaluation_batch, ) from app.models.evaluation import EvaluationRun -from app.models.llm.request import TextLLMParams, STTLLMParams, TTSLLMParams +from app.models.llm.request import STTLLMParams, TextLLMParams, TTSLLMParams from app.services.llm.providers import LLMProvider from app.utils import get_langfuse_client, get_openai_client -from app.core.cloud.storage import get_cloud_storage -from app.core.storage_utils import load_json_from_object_store logger = logging.getLogger(__name__) +# Error code surfaced in HTTPException.detail when a run_name collides with the +# (organization_id, project_id, run_name) unique constraint. Shared by the batch +# and fast paths so both return an identical 409 contract. +ERR_RUN_NAME_ALREADY_EXISTS = "run_name_already_exists" + + +def create_evaluation_run_or_409( + *, + session: Session, + run_name: str, + dataset_name: str, + dataset_id: int, + config_id: UUID, + config_version: int, + organization_id: int, + project_id: int, + run_mode: str = "batch", + log_context: str, +) -> EvaluationRun: + """Create an EvaluationRun, translating a duplicate-run_name collision into 409. + + The (organization_id, project_id, run_name) unique constraint guards against + double-click / client-retry races; on collision we roll back and raise a 409 + instead of leaking the IntegrityError. + """ + try: + return create_evaluation_run( + session=session, + run_name=run_name, + dataset_name=dataset_name, + dataset_id=dataset_id, + config_id=config_id, + config_version=config_version, + organization_id=organization_id, + project_id=project_id, + run_mode=run_mode, + ) + except IntegrityError: + session.rollback() + logger.warning( + f"[{log_context}] Duplicate run_name | run_name={run_name} | " + f"org_id={organization_id} | project_id={project_id}" + ) + raise HTTPException( + status_code=409, + detail=( + f"{ERR_RUN_NAME_ALREADY_EXISTS}: a run with name '{run_name}' " + "already exists for this organization and project. Pick a new " + "run_name or fetch the existing run via GET /evaluations." + ), + ) + def start_evaluation( session: Session, @@ -130,31 +182,17 @@ def start_evaluation( ) # Step 3: Create EvaluationRun record with config references - try: - eval_run = create_evaluation_run( - session=session, - run_name=experiment_name, - dataset_name=dataset.name, - dataset_id=dataset_id, - config_id=config_id, - config_version=config_version, - organization_id=organization_id, - project_id=project_id, - ) - except IntegrityError: - session.rollback() - logger.warning( - f"[start_evaluation] Duplicate run_name | run_name={experiment_name} | " - f"org_id={organization_id} | project_id={project_id}" - ) - raise HTTPException( - status_code=409, - detail=( - f"run_name_already_exists: a run with name '{experiment_name}' " - "already exists for this organization and project. Pick a new " - "run_name or fetch the existing run via GET /evaluations." - ), - ) + eval_run = create_evaluation_run_or_409( + session=session, + run_name=experiment_name, + dataset_name=dataset.name, + dataset_id=dataset_id, + config_id=config_id, + config_version=config_version, + organization_id=organization_id, + project_id=project_id, + log_context="start_evaluation", + ) # Step 4: Start the batch evaluation try: diff --git a/backend/app/services/evaluations/fast.py b/backend/app/services/evaluations/fast.py index d3e316b4b..7a2ad8235 100644 --- a/backend/app/services/evaluations/fast.py +++ b/backend/app/services/evaluations/fast.py @@ -11,14 +11,12 @@ from uuid import UUID from fastapi import HTTPException -from sqlalchemy.exc import IntegrityError from sqlmodel import Session from app.celery.utils import start_fast_evaluation as enqueue_fast_evaluation from app.core.config import settings from app.core.db import engine from app.crud.evaluations import ( - create_evaluation_run, get_dataset_by_id, resolve_evaluation_config, run_fast_evaluation, @@ -26,6 +24,7 @@ from app.crud.evaluations.core import update_evaluation_run from app.models.evaluation import EvaluationRun, EvaluationRunUpdate, RunModeEnum from app.models.llm.request import TextLLMParams +from app.services.evaluations.evaluation import create_evaluation_run_or_409 from app.services.llm.providers import LLMProvider from app.utils import get_langfuse_client, get_openai_client @@ -35,7 +34,6 @@ # Error codes surfaced in HTTPException.detail so the UI can localize/branch. ERR_CONFIG_TYPE_UNSUPPORTED = "config_type_unsupported" ERR_DATASET_TOO_LARGE_FOR_FAST = "dataset_too_large_for_fast" -ERR_RUN_NAME_ALREADY_EXISTS = "run_name_already_exists" def is_dataset_fast_eligible(*, original_items_count: int) -> bool: @@ -61,7 +59,7 @@ def validate_and_start_fast_evaluation( 2. Config resolves to a text-type OpenAI config. 3. Dataset's original_items_count <= EVAL_FAST_MAX_UNIQUE_ROWS. 4. (organization_id, project_id, run_name) is unique — enforced by the DB - constraint added in migration 063. We translate IntegrityError to 409. + constraint; a collision is translated to 409 by the shared helper. On success the function creates the EvaluationRun row with `run_mode="fast"`, `status="processing"`, and enqueues the orchestrator @@ -134,34 +132,19 @@ def validate_and_start_fast_evaluation( ), ) - # 4. Create the run; rely on the DB unique constraint to catch double-clicks. - try: - eval_run = create_evaluation_run( - session=session, - run_name=run_name, - dataset_name=dataset.name, - dataset_id=dataset_id, - config_id=config_id, - config_version=config_version, - organization_id=organization_id, - project_id=project_id, - run_mode=RunModeEnum.FAST.value, - ) - except IntegrityError: - session.rollback() - logger.warning( - f"[validate_and_start_fast_evaluation] Duplicate run_name | " - f"run_name={run_name} | org_id={organization_id} | " - f"project_id={project_id}" - ) - raise HTTPException( - status_code=409, - detail=( - f"{ERR_RUN_NAME_ALREADY_EXISTS}: a run with name '{run_name}' " - "already exists for this organization and project. Pick a new " - "run_name or fetch the existing run via GET /evaluations." - ), - ) + # 4. Create the run; the shared helper translates a duplicate run_name into 409. + eval_run = create_evaluation_run_or_409( + session=session, + run_name=run_name, + dataset_name=dataset.name, + dataset_id=dataset_id, + config_id=config_id, + config_version=config_version, + organization_id=organization_id, + project_id=project_id, + run_mode=RunModeEnum.FAST.value, + log_context="validate_and_start_fast_evaluation", + ) # Flip to processing before dispatching the task so the GET endpoint # reflects the correct state immediately. From 8da52fe02395a5289f4226bde29c13c6da265fe4 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 5 Jun 2026 11:42:07 +0530 Subject: [PATCH 7/8] cleanup code to save resutls to s3 --- backend/app/crud/evaluations/fast.py | 106 ++++++++++++++---- .../tests/api/routes/test_evaluation_fast.py | 50 +++++++-- 2 files changed, 128 insertions(+), 28 deletions(-) diff --git a/backend/app/crud/evaluations/fast.py b/backend/app/crud/evaluations/fast.py index 23a2b3b83..332be87f4 100644 --- a/backend/app/crud/evaluations/fast.py +++ b/backend/app/crud/evaluations/fast.py @@ -8,6 +8,11 @@ Stage 2 — Embeddings unit: evaluation_run.embedding_batch_job_id Stage 3 — Score + trace + cost (no marker; each step is idempotent) Stage 4 — Mark completed + Stage 5 — Persist score unit (summary + per-trace) via the shared + save_score helper, so the cached trace unit (score_trace_url) + exists immediately and the read path (trace view / resync / + grouped export) mirrors the batch path without racing Langfuse + ingestion. See `Fast Evaluation SRD.md` for the full design. """ @@ -32,7 +37,11 @@ upload_jsonl_to_object_store, ) from app.crud.evaluations.batch import fetch_dataset_items -from app.crud.evaluations.core import resolve_model_from_config, update_evaluation_run +from app.crud.evaluations.core import ( + resolve_model_from_config, + save_score, + update_evaluation_run, +) from app.crud.evaluations.cost import attach_cost from app.crud.evaluations.embeddings import ( EMBEDDING_MODEL, @@ -42,6 +51,7 @@ create_langfuse_dataset_run, update_traces_with_cosine_scores, ) +from app.crud.evaluations.score import EvaluationScore, TraceData, TraceScore from app.crud.job import create_batch_job, get_batch_job from app.models import EvaluationRun, EvaluationRunUpdate from app.models.batch_job import BatchJobCreate @@ -581,9 +591,13 @@ def _stage3_score_and_trace( response_results: list[dict[str, Any]], embedding_results: list[dict[str, Any]], log_prefix: str, -) -> EvaluationRun: +) -> tuple[EvaluationRun, EvaluationScore]: """Stage 3 — compute cosine, create Langfuse traces, attach costs. + Returns the run plus the full score unit (summary_scores + per-trace + records) in the exact shape `fetch_trace_scores_from_langfuse` produces for + the batch path, so the caller can persist it via `save_score`. + No stage marker; each step is idempotent (deterministic cosine, Langfuse dedupes on the observe key, attach_cost overwrites per stage). """ @@ -604,25 +618,29 @@ def _stage3_score_and_trace( model=model, ) - # Per-item cosine scores keyed on Langfuse trace_id. + # Per-item cosine scores keyed on Langfuse trace_id (for Langfuse updates) + # and on item_id (for building the persisted per-trace records below). per_item_scores: list[dict[str, Any]] = [] + item_id_to_score: dict[str, float] = {} similarities: list[float] = [] - for r in response_results: - item_id = r["item_id"] - pair = item_id_to_pair.get(item_id) + for response in response_results: + item_id = response["item_id"] + embedding_pair = item_id_to_pair.get(item_id) trace_id = trace_id_mapping.get(item_id) - if not pair or not trace_id: + if not embedding_pair or not trace_id: continue if ( - pair.get("output_embedding") is None - or pair.get("ground_truth_embedding") is None + embedding_pair.get("output_embedding") is None + or embedding_pair.get("ground_truth_embedding") is None ): continue - score = calculate_cosine_similarity( - pair["output_embedding"], pair["ground_truth_embedding"] + cosine = calculate_cosine_similarity( + embedding_pair["output_embedding"], + embedding_pair["ground_truth_embedding"], ) - similarities.append(score) - per_item_scores.append({"trace_id": trace_id, "cosine_similarity": score}) + similarities.append(cosine) + item_id_to_score[item_id] = cosine + per_item_scores.append({"trace_id": trace_id, "cosine_similarity": cosine}) if per_item_scores: try: @@ -691,16 +709,53 @@ def _stage3_score_and_trace( embedding_raw_results=embedding_raw, ) + # Build the per-trace records in the same shape the batch path persists (via + # fetch_trace_scores_from_langfuse). One record per response that has a + # Langfuse trace; the cosine score is attached when its embedding succeeded. + traces: list[TraceData] = [] + for response in response_results: + item_id = response["item_id"] + trace_id = trace_id_mapping.get(item_id) + if not trace_id: + continue + trace_scores: list[TraceScore] = [] + cosine = item_id_to_score.get(item_id) + if cosine is not None: + trace_scores.append( + { + "name": "Cosine Similarity", + "value": round(cosine, 2), + "data_type": "NUMERIC", + "comment": ( + "Cosine similarity between generated output and " + "ground truth embeddings" + ), + } + ) + traces.append( + { + "trace_id": trace_id, + "question": response.get("question", ""), + "llm_answer": response.get("generated_output", ""), + "ground_truth_answer": response.get("ground_truth", ""), + "question_id": response.get("question_id"), + "scores": trace_scores, + } + ) + + # Persist cost here; the score unit (summary + traces) is persisted by the + # caller via save_score so it lands in S3 (score_trace_url) like the batch path. eval_run = update_evaluation_run( session=session, eval_run=eval_run, - update=EvaluationRunUpdate( - score=score_payload, - cost=eval_run.cost, - ), + update=EvaluationRunUpdate(cost=eval_run.cost), ) - return eval_run + score: EvaluationScore = { + "summary_scores": score_payload["summary_scores"], + "traces": traces, + } + return eval_run, score def run_fast_evaluation( @@ -758,7 +813,7 @@ def run_fast_evaluation( ) # Stage 3 - eval_run = _stage3_score_and_trace( + eval_run, score = _stage3_score_and_trace( session=session, eval_run=eval_run, langfuse=langfuse, @@ -774,6 +829,19 @@ def run_fast_evaluation( update=EvaluationRunUpdate(status="completed"), ) + # Stage 5 — persist the score unit (traces to S3, summary to DB) via the + # shared batch helper so the read path serves the cached unit instead of + # racing Langfuse ingestion. + saved = save_score( + eval_run_id=eval_run.id, + organization_id=eval_run.organization_id, + project_id=eval_run.project_id, + score=score, + ) + if saved is not None: + eval_run = saved + eval_run.score = score + logger.info( f"[run_fast_evaluation] {log_prefix} Fast evaluation completed | " f"total_items={eval_run.total_items}" diff --git a/backend/app/tests/api/routes/test_evaluation_fast.py b/backend/app/tests/api/routes/test_evaluation_fast.py index b5b0ffb28..f614b5553 100644 --- a/backend/app/tests/api/routes/test_evaluation_fast.py +++ b/backend/app/tests/api/routes/test_evaluation_fast.py @@ -625,19 +625,36 @@ def test_pipeline_completes_with_scores_and_writes_batch_jobs( db.refresh(eval_run) fake_openai = MagicMock() - fake_openai.responses.create.side_effect = ( - lambda **kwargs: _fake_openai_response(text="LLM answer", item_id="x") + fake_openai.responses.create.side_effect = lambda **_: _fake_openai_response( + text="LLM answer", item_id="x" ) fake_openai.embeddings.create.return_value = _fake_embedding_response() fake_langfuse = MagicMock() - result = run_fast_evaluation( - session=db, - openai_client=fake_openai, - langfuse=fake_langfuse, - eval_run=eval_run, - config=TextLLMParams(model="gpt-4o", instructions="be helpful"), - ) + # save_score opens its own Session(engine), which can't see this test's + # rolled-back transaction. Mirror its S3-success path against the test + # session so the persisted state (summary in DB, traces in S3) is + # observable here. (In production the worker's commits are visible across + # connections, so the real save_score works unchanged.) + def _fake_save_score(*, eval_run_id, score, **_): + run = db.get(EvaluationRun, eval_run_id) + run.score = {"summary_scores": score["summary_scores"]} + run.score_trace_url = f"s3://bucket/traces_{eval_run_id}.json" + db.add(run) + db.commit() + db.refresh(run) + return run + + with patch( + "app.crud.evaluations.fast.save_score", side_effect=_fake_save_score + ): + result = run_fast_evaluation( + session=db, + openai_client=fake_openai, + langfuse=fake_langfuse, + eval_run=eval_run, + config=TextLLMParams(model="gpt-4o", instructions="be helpful"), + ) # FR-11/FR-14: status completed, summary cosine ≈ 1.0 for identical vectors assert result.status == "completed" @@ -664,6 +681,21 @@ def test_pipeline_completes_with_scores_and_writes_batch_jobs( # FR-13: attach_cost called twice (response + embedding stages). assert _fast_pipeline_mocks.attach_cost.call_count == 2 + # Cached trace unit is persisted like the batch path so the read path + # (trace view / resync / grouped export) never has to hit Langfuse. + run = db.get(EvaluationRun, result.id) + assert run.score_trace_url is not None + # Full unit (summary + per-trace records) is surfaced on the return. + traces = result.score["traces"] + assert len(traces) == 2 + by_trace = {t["trace_id"]: t for t in traces} + assert {"trace-1", "trace-2"} == set(by_trace) + sample = by_trace["trace-1"] + assert sample["question"] == "Q1" + assert sample["ground_truth_answer"] == "A1" + assert sample["scores"][0]["name"] == "Cosine Similarity" + assert sample["scores"][0]["value"] == pytest.approx(1.0, abs=0.01) + # --------------------------------------------------------------------------- # Failure-threshold short-circuit (FR-14) From 7653953179bb7d7d69e128fb80c37ad460b27ddf Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 5 Jun 2026 20:41:27 +0530 Subject: [PATCH 8/8] coderabbit suggestions cleanups --- CLAUDE.md | 6 ++-- .../064_add_run_mode_to_evaluation_run.py | 28 ++++++++-------- .../api/docs/evaluation/create_evaluation.md | 9 +++++ backend/app/api/routes/evaluations/dataset.py | 7 +--- backend/app/celery/tasks/evaluation_fast.py | 9 +++-- backend/app/crud/evaluations/core.py | 14 ++++++-- backend/app/crud/evaluations/dataset.py | 22 ++++++++++++- backend/app/crud/evaluations/fast.py | 9 +++++ backend/app/models/evaluation.py | 9 +++-- .../app/services/evaluations/evaluation.py | 33 +++++++++++++++++-- backend/app/services/evaluations/fast.py | 15 ++++++--- .../tests/api/routes/test_evaluation_fast.py | 23 ++++++------- 12 files changed, 135 insertions(+), 49 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 53d09c7e9..36f520ccd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -20,8 +20,10 @@ fastapi run --reload app/main.py # Run pre-commit hooks uv run pre-commit run --all-files -# Generate database migration (rev-id should be latest existing revision ID + 1) -alembic revision --autogenerate -m "Description" --rev-id 040 +# Generate database migration. +# Compute at runtime as the latest existing revision ID + 1, +# zero-padded to 3 digits (check the highest NNN in app/alembic/versions/NNN_*.py). +alembic revision --autogenerate -m "Description" --rev-id # Seed database with test data uv run python -m app.seed_data.seed_data diff --git a/backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py b/backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py index c94f59950..d7c5beb23 100644 --- a/backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py +++ b/backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py @@ -38,24 +38,22 @@ def upgrade(): ) op.execute("ALTER TABLE evaluation_run ALTER COLUMN run_mode SET NOT NULL") - # 2. Dedupe existing rows before adding the unique constraint. - # Keep the lowest-id row for each (organization_id, project_id, - # run_name) tuple and remove the rest. + # 2. Resolve duplicate (organization_id, project_id, run_name) tuples + # non-destructively before adding the unique constraint. Keep the + # lowest-id row's run_name untouched and rename the rest by appending a + # unique "__dup_" suffix so no historical run (and its scores, result + # URLs, or batch_job links) is lost. with op.get_context().autocommit_block(): op.execute( """ - DELETE FROM evaluation_run - WHERE id IN ( - SELECT id - FROM ( - SELECT id, - ROW_NUMBER() OVER ( - PARTITION BY organization_id, project_id, run_name - ORDER BY id ASC - ) AS rn - FROM evaluation_run - ) sub - WHERE rn > 1 + UPDATE evaluation_run e + SET run_name = e.run_name || '__dup_' || e.id + WHERE e.id <> ( + SELECT MIN(x.id) + FROM evaluation_run x + WHERE x.organization_id = e.organization_id + AND x.project_id = e.project_id + AND x.run_name = e.run_name ) """ ) diff --git a/backend/app/api/docs/evaluation/create_evaluation.md b/backend/app/api/docs/evaluation/create_evaluation.md index 96719ed53..3e9d7b636 100644 --- a/backend/app/api/docs/evaluation/create_evaluation.md +++ b/backend/app/api/docs/evaluation/create_evaluation.md @@ -41,8 +41,17 @@ Two execution modes are supported via the optional `run_mode` field: ## Fast-mode error responses +These apply only when `run_mode` is `fast`. + | Status | Code | When | | --- | --- | --- | | 422 | `config_type_unsupported` | Resolved config is not a text-evaluation config | | 422 | `dataset_too_large_for_fast` | Dataset exceeds `EVAL_FAST_MAX_UNIQUE_ROWS` unique rows | + +## General error responses + +These apply to both `batch` and `fast` modes. + +| Status | Code | When | +| --- | --- | --- | | 409 | `run_name_already_exists` | A run with the same `experiment_name` already exists for this (organization, project) | diff --git a/backend/app/api/routes/evaluations/dataset.py b/backend/app/api/routes/evaluations/dataset.py index ab703493a..cd04ec32b 100644 --- a/backend/app/api/routes/evaluations/dataset.py +++ b/backend/app/api/routes/evaluations/dataset.py @@ -128,15 +128,10 @@ def list_datasets( project_id=auth_context.project_.id, limit=limit, offset=offset, + eligible_for_fast=(eligible_for == "fast"), ) dataset_responses = [_dataset_to_response(dataset) for dataset in datasets] - if eligible_for == "fast": - dataset_responses = [ - dataset_response - for dataset_response in dataset_responses - if dataset_response.eligible_for_fast - ] return APIResponse.success_response(data=dataset_responses) diff --git a/backend/app/celery/tasks/evaluation_fast.py b/backend/app/celery/tasks/evaluation_fast.py index 55752ffb1..e9b1a626e 100644 --- a/backend/app/celery/tasks/evaluation_fast.py +++ b/backend/app/celery/tasks/evaluation_fast.py @@ -11,7 +11,7 @@ import logging -from celery import current_task +from celery import Task, current_task from app.celery.celery_app import celery_app from app.celery.tasks.job_execution import _run_with_otel_parent, _set_trace @@ -20,11 +20,16 @@ logger = logging.getLogger(__name__) +# Sentinel correlation id used when no trace id is propagated from the +# enqueueing request. Matches the codebase-wide "N/A" default (see +# app/core/logger.py and app/celery/utils.py). +DEFAULT_TRACE_ID = "N/A" + @celery_app.task(bind=True, queue="evaluations", priority=6) @gevent_timeout(settings.CELERY_TASK_SOFT_TIME_LIMIT, "run_evaluation_fast") def run_evaluation_fast( - self, eval_run_id: int, trace_id: str = "N/A", **kwargs + self: Task, eval_run_id: int, trace_id: str = DEFAULT_TRACE_ID ) -> None: """Run the fast evaluation pipeline for one EvaluationRun. diff --git a/backend/app/crud/evaluations/core.py b/backend/app/crud/evaluations/core.py index 271384615..53b17dc3d 100644 --- a/backend/app/crud/evaluations/core.py +++ b/backend/app/crud/evaluations/core.py @@ -13,6 +13,7 @@ from app.crud.evaluations.langfuse import fetch_trace_scores_from_langfuse from app.crud.evaluations.score import EvaluationScore from app.models import EvaluationRun, EvaluationRunUpdate +from app.models.evaluation import RunModeEnum from app.models.config.config import ConfigTag from app.models.llm.request import ConfigBlob, LLMCallConfig from app.models.stt_evaluation import EvaluationType @@ -62,7 +63,7 @@ def create_evaluation_run( config_version: int, organization_id: int, project_id: int, - run_mode: str = "batch", + run_mode: RunModeEnum | str = RunModeEnum.BATCH, ) -> EvaluationRun: """ Create a new evaluation run record in the database. @@ -76,11 +77,18 @@ def create_evaluation_run( config_version: Version number of the config organization_id: Organization ID project_id: Project ID - run_mode: Execution mode ("batch" default, or "fast") + run_mode: Execution mode (RunModeEnum.BATCH default, or RunModeEnum.FAST) Returns: The created EvaluationRun instance + + Raises: + ValueError: If run_mode is not a valid RunModeEnum value. """ + # Validate against the allowed run modes before hitting the DB CHECK + # constraint, so callers get a clear error instead of an IntegrityError. + run_mode_value = RunModeEnum(run_mode).value + eval_run = EvaluationRun( run_name=run_name, dataset_name=dataset_name, @@ -89,7 +97,7 @@ def create_evaluation_run( config_id=config_id, config_version=config_version, status="pending", - run_mode=run_mode, + run_mode=run_mode_value, organization_id=organization_id, project_id=project_id, inserted_at=now(), diff --git a/backend/app/crud/evaluations/dataset.py b/backend/app/crud/evaluations/dataset.py index 0aaf5a2e9..d0cf29930 100644 --- a/backend/app/crud/evaluations/dataset.py +++ b/backend/app/crud/evaluations/dataset.py @@ -12,10 +12,12 @@ from typing import Any from fastapi import HTTPException +from sqlalchemy import Integer, cast from sqlalchemy.exc import IntegrityError from sqlmodel import Session, select from app.core.cloud.storage import CloudStorage +from app.core.config import settings from app.core.storage_utils import ( generate_timestamped_filename, upload_to_object_store, @@ -179,6 +181,7 @@ def list_datasets( project_id: int, limit: int = 50, offset: int = 0, + eligible_for_fast: bool = False, ) -> list[EvaluationDataset]: """ List all evaluation datasets for an organization and project with pagination. @@ -189,6 +192,11 @@ def list_datasets( project_id: Project ID limit: Maximum number of datasets to return (default 50) offset: Number of datasets to skip (for pagination) + eligible_for_fast: When True, restrict to datasets whose unique-row count + (dataset_metadata.original_items_count) is at or below + settings.EVAL_FAST_MAX_UNIQUE_ROWS. Applied in SQL so pagination + counts only eligible rows. Datasets missing the metadata key are + excluded (NULL fails the comparison). Returns: List of EvaluationDataset objects, ordered by most recent first @@ -198,7 +206,19 @@ def list_datasets( .where(EvaluationDataset.organization_id == organization_id) .where(EvaluationDataset.project_id == project_id) .where(EvaluationDataset.type == EvaluationType.TEXT.value) - .order_by(EvaluationDataset.inserted_at.desc()) + ) + + if eligible_for_fast: + original_items_count = cast( + EvaluationDataset.dataset_metadata["original_items_count"].astext, + Integer, + ) + statement = statement.where( + original_items_count <= settings.EVAL_FAST_MAX_UNIQUE_ROWS + ) + + statement = ( + statement.order_by(EvaluationDataset.inserted_at.desc()) .limit(limit) .offset(offset) ) diff --git a/backend/app/crud/evaluations/fast.py b/backend/app/crud/evaluations/fast.py index 332be87f4..c58ca8970 100644 --- a/backend/app/crud/evaluations/fast.py +++ b/backend/app/crud/evaluations/fast.py @@ -377,6 +377,15 @@ def _stage1_responses( """Stage 1 — one Responses call per dataset item. Skipped on retry if `eval_run.batch_job_id` is set; the unit is reloaded from S3. + + Concurrency assumption: this check-then-create is safe only against + *sequential* redelivery (a retry after the previous attempt fully stopped), + where the `batch_job_id` marker guarantees we never re-call OpenAI for work + that already succeeded. We rely on a single in-flight execution per + EvaluationRun — i.e. the broker visibility timeout is tuned above the task's + runtime so Celery does not redeliver while a worker is still running. If + that assumption is ever broken, two workers could both pass this guard; add + an advisory lock / atomic claim here before introducing concurrent delivery. """ if eval_run.batch_job_id: existing = get_batch_job(session=session, batch_job_id=eval_run.batch_job_id) diff --git a/backend/app/models/evaluation.py b/backend/app/models/evaluation.py index 49a47871d..14c396420 100644 --- a/backend/app/models/evaluation.py +++ b/backend/app/models/evaluation.py @@ -55,7 +55,12 @@ class DatasetUploadResponse(BaseModel): ) eligible_for_fast: bool = Field( False, - description="True if dataset has ≤10 unique rows and is eligible for run_mode=fast", + description=( + "Size predicate only: True if the dataset's unique-row count is at or " + "below the fast-mode threshold (settings.EVAL_FAST_MAX_UNIQUE_ROWS). " + "Other fast-mode prerequisites (e.g. a Langfuse dataset id, a text " + "OpenAI config) are not reflected here." + ), ) @@ -301,7 +306,7 @@ class EvaluationRun(SQLModel, table=True): }, ) run_mode: str = SQLField( - default="batch", + default=RunModeEnum.BATCH.value, max_length=10, nullable=False, sa_column_kwargs={"comment": "Execution mode: batch or fast"}, diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 928aa1580..ca969323a 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -18,7 +18,7 @@ save_score, start_evaluation_batch, ) -from app.models.evaluation import EvaluationRun +from app.models.evaluation import EvaluationRun, RunModeEnum from app.models.llm.request import STTLLMParams, TextLLMParams, TTSLLMParams from app.services.llm.providers import LLMProvider from app.utils import get_langfuse_client, get_openai_client @@ -30,6 +30,23 @@ # and fast paths so both return an identical 409 contract. ERR_RUN_NAME_ALREADY_EXISTS = "run_name_already_exists" +# Name of the (organization_id, project_id, run_name) unique constraint on +# evaluation_run. Used to distinguish a run_name collision (-> 409) from any +# other IntegrityError (e.g. FK violations), which must not be masked. +_RUN_NAME_UNIQUE_CONSTRAINT = "uq_evaluation_run_org_project_run_name" + + +def _is_run_name_conflict(error: IntegrityError) -> bool: + """True only when the IntegrityError is the run_name uniqueness violation.""" + constraint_name = getattr( + getattr(error.orig, "diag", None), "constraint_name", None + ) + if constraint_name: + return constraint_name == _RUN_NAME_UNIQUE_CONSTRAINT + # Fall back to matching the constraint name in the raw error text for + # drivers/dialects that don't expose structured diagnostics. + return _RUN_NAME_UNIQUE_CONSTRAINT in str(error.orig or error) + def create_evaluation_run_or_409( *, @@ -41,7 +58,7 @@ def create_evaluation_run_or_409( config_version: int, organization_id: int, project_id: int, - run_mode: str = "batch", + run_mode: RunModeEnum | str = RunModeEnum.BATCH, log_context: str, ) -> EvaluationRun: """Create an EvaluationRun, translating a duplicate-run_name collision into 409. @@ -62,8 +79,18 @@ def create_evaluation_run_or_409( project_id=project_id, run_mode=run_mode, ) - except IntegrityError: + except IntegrityError as exc: session.rollback() + # Only a run_name collision becomes a 409; any other IntegrityError + # (e.g. a dataset/config FK violation) is a real failure and must not be + # masked as a duplicate-name error. + if not _is_run_name_conflict(exc): + logger.error( + f"[{log_context}] IntegrityError creating run | run_name={run_name} | " + f"org_id={organization_id} | project_id={project_id} | error={exc}", + exc_info=True, + ) + raise logger.warning( f"[{log_context}] Duplicate run_name | run_name={run_name} | " f"org_id={organization_id} | project_id={project_id}" diff --git a/backend/app/services/evaluations/fast.py b/backend/app/services/evaluations/fast.py index 7a2ad8235..6be44b5f7 100644 --- a/backend/app/services/evaluations/fast.py +++ b/backend/app/services/evaluations/fast.py @@ -119,9 +119,16 @@ def validate_and_start_fast_evaluation( ) # 3. Dataset must be small enough for fast eval. - original_items_count = (dataset.dataset_metadata or {}).get( - "original_items_count" - ) or 0 + original_items_count = (dataset.dataset_metadata or {}).get("original_items_count") + if original_items_count is None: + raise HTTPException( + status_code=422, + detail=( + f"{ERR_DATASET_TOO_LARGE_FOR_FAST}: dataset {dataset_id} is " + "missing 'original_items_count' metadata; cannot verify it has at " + f"most {settings.EVAL_FAST_MAX_UNIQUE_ROWS} unique rows for fast mode." + ), + ) if not is_dataset_fast_eligible(original_items_count=original_items_count): raise HTTPException( status_code=422, @@ -142,7 +149,7 @@ def validate_and_start_fast_evaluation( config_version=config_version, organization_id=organization_id, project_id=project_id, - run_mode=RunModeEnum.FAST.value, + run_mode=RunModeEnum.FAST, log_context="validate_and_start_fast_evaluation", ) diff --git a/backend/app/tests/api/routes/test_evaluation_fast.py b/backend/app/tests/api/routes/test_evaluation_fast.py index f614b5553..2fe153954 100644 --- a/backend/app/tests/api/routes/test_evaluation_fast.py +++ b/backend/app/tests/api/routes/test_evaluation_fast.py @@ -6,6 +6,7 @@ """ import random +from collections.abc import Iterator from types import SimpleNamespace from unittest.mock import MagicMock, patch @@ -23,7 +24,7 @@ _stage2_embeddings, run_fast_evaluation, ) -from app.models import EvaluationRun +from app.models import Config, EvaluationDataset, EvaluationRun from app.models.batch_job import BatchJob from app.models.evaluation import RunModeEnum from app.models.llm.request import ( @@ -44,7 +45,7 @@ def _api_error(resp_body: dict) -> str: @pytest.fixture(autouse=True) -def _seeded_random(): +def _seeded_random() -> Iterator[None]: """Make jitter / random.choice deterministic so tests are repeatable.""" random.seed(0) yield @@ -58,14 +59,14 @@ def _seeded_random(): class TestFailureThreshold: """`_is_failure_threshold_breached` controls run-level fail-fast.""" - def test_returns_false_when_total_is_zero(self): + def test_returns_false_when_total_is_zero(self) -> None: assert _is_failure_threshold_breached(failed_rows=0, total_rows=0) is False - def test_returns_true_above_threshold(self): + def test_returns_true_above_threshold(self) -> None: # default EVAL_FAST_FAILURE_THRESHOLD = 0.5 assert _is_failure_threshold_breached(failed_rows=6, total_rows=10) is True - def test_returns_false_at_threshold(self): + def test_returns_false_at_threshold(self) -> None: # 0.5 / 1.0 is NOT greater-than the threshold, so do not breach assert _is_failure_threshold_breached(failed_rows=5, total_rows=10) is False @@ -73,11 +74,11 @@ def test_returns_false_at_threshold(self): class TestCallWithRetry: """FR-8: transient OpenAI errors retry; permanent ones do not.""" - def test_returns_immediately_on_success(self): + def test_returns_immediately_on_success(self) -> None: result = _call_with_retry(label="t", fn=lambda: "ok") assert result == "ok" - def test_retries_on_transient_then_succeeds(self, monkeypatch): + def test_retries_on_transient_then_succeeds(self, monkeypatch) -> None: # Avoid sleeping — make backoff a no-op. monkeypatch.setattr("app.crud.evaluations.fast.time.sleep", lambda *_: None) @@ -94,7 +95,7 @@ def flaky(): assert result == "ok" assert calls["n"] == 3 - def test_does_not_retry_on_permanent_error(self): + def test_does_not_retry_on_permanent_error(self) -> None: def bad(): # AuthenticationError is a non-retryable OpenAIError subclass. raise openai.AuthenticationError( @@ -111,7 +112,7 @@ def bad(): @pytest.fixture -def _patch_dispatch(): +def _patch_dispatch() -> Iterator[MagicMock]: """Stub the Celery dispatch so tests don't actually enqueue work.""" with patch( "app.services.evaluations.fast.enqueue_fast_evaluation", @@ -125,7 +126,7 @@ def _make_fast_eligible_dataset( db: Session, user_api_key: TestAuthContext, original_items_count: int = 3, -): +) -> EvaluationDataset: return create_test_evaluation_dataset( db=db, organization_id=user_api_key.organization_id, @@ -135,7 +136,7 @@ def _make_fast_eligible_dataset( ) -def _make_text_openai_config(db: Session, project_id: int): +def _make_text_openai_config(db: Session, project_id: int) -> Config: """Create a stored text-OpenAI Kaapi config (eligible for fast eval).""" return create_test_config( db=db,