-
Notifications
You must be signed in to change notification settings - Fork 10
Enhancement/evaluation fast #870
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
AkhileshNegi
wants to merge
11
commits into
main
Choose a base branch
from
enhancement/evaluation-fast
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+2,335
−24
Open
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
bbfb6b0
moving from claude.md to agents
AkhileshNegi cd4e0c6
add consistent logging
AkhileshNegi ccf281b
Merge branch 'main' of github.com:ProjectTech4DevAI/kaapi-backend int…
AkhileshNegi 74058f7
first stab at fast evaluation using agents
AkhileshNegi d13ab1c
Merge branch 'main' into enhancement/evaluation-fast
AkhileshNegi 3f8d24d
update migration
AkhileshNegi eb5b0a6
Revert: remove checked-in .claude/agents and restore CLAUDE.md
AkhileshNegi 324777f
cleanup code a bit
AkhileshNegi 8da52fe
cleanup code to save results to s3
AkhileshNegi f6c9567
Merge branch 'main' of github.com:ProjectTech4DevAI/kaapi-backend int…
AkhileshNegi 7653953
coderabbit suggestions cleanups
AkhileshNegi File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
85 changes: 85 additions & 0 deletions
85
backend/app/alembic/versions/064_add_run_mode_to_evaluation_run.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| """add run_mode column and unique run-name constraint to evaluation_run | ||
|
|
||
| Revision ID: 064 | ||
| Revises: 063 | ||
| Create Date: 2026-05-20 00:00:00.000000 | ||
|
|
||
| """ | ||
|
|
||
| import sqlalchemy as sa | ||
| from alembic import op | ||
|
|
||
# Alembic revision identifiers: this revision and the one it follows.
revision = "064"
down_revision = "063"
branch_labels = None
depends_on = None

# Module-level flag; presumably read by the project's Alembic env.py so this
# migration is not wrapped in a single transaction (the functions below use
# autocommit blocks and CONCURRENTLY index operations) -- TODO confirm.
disable_per_migration_transaction = True

# The unique index and the constraint attached to it share one name.
_UNIQUE_INDEX = "uq_evaluation_run_org_project_run_name"
_UNIQUE_CONSTRAINT = _UNIQUE_INDEX
|
|
||
|
|
||
def upgrade():
    """Add ``run_mode`` to ``evaluation_run`` and make run names unique.

    Every step below runs inside its own autocommit block, so a failure
    part-way through leaves the earlier steps committed. Each step is
    therefore written so that it can be executed again when the migration is
    retried after such a partial failure.
    """
    # 1. Add run_mode as nullable first so existing rows are backfilled by the
    #    server default, then tighten to NOT NULL. The server default is left
    #    in place as a safety net. IF NOT EXISTS keeps the step re-runnable
    #    when a later step failed after this one was already committed.
    with op.get_context().autocommit_block():
        op.execute(
            "ALTER TABLE evaluation_run "
            "ADD COLUMN IF NOT EXISTS run_mode VARCHAR(10) DEFAULT 'batch'"
        )
        op.execute(
            "COMMENT ON COLUMN evaluation_run.run_mode IS "
            "'Execution mode: batch or fast'"
        )
        op.execute("ALTER TABLE evaluation_run ALTER COLUMN run_mode SET NOT NULL")

    # 2. Resolve duplicate (organization_id, project_id, run_name) tuples
    #    non-destructively before adding the unique constraint. Keep the
    #    lowest-id row's run_name untouched and rename the rest by appending a
    #    unique "__dup_<id>" suffix so no historical run (and its scores, result
    #    URLs, or batch_job links) is lost. Once the duplicates are renamed a
    #    second execution matches no rows, so this step is naturally re-runnable.
    with op.get_context().autocommit_block():
        op.execute(
            """
            UPDATE evaluation_run e
            SET run_name = e.run_name || '__dup_' || e.id
            WHERE e.id <> (
                SELECT MIN(x.id)
                FROM evaluation_run x
                WHERE x.organization_id = e.organization_id
                  AND x.project_id = e.project_id
                  AND x.run_name = e.run_name
            )
            """
        )

    # 3. Build the unique index CONCURRENTLY so the table stays writable during
    #    the scan, then attach it as a named constraint via
    #    ADD CONSTRAINT ... USING INDEX (brief catalog-only lock).
    #
    #    A failed concurrent build leaves an INVALID index behind, and
    #    CREATE INDEX ... IF NOT EXISTS would silently keep it, making the
    #    ADD CONSTRAINT below fail on every retry. Drop any leftover index
    #    first and always build a fresh one.
    with op.get_context().autocommit_block():
        op.execute(f'DROP INDEX CONCURRENTLY IF EXISTS "{_UNIQUE_INDEX}"')
        op.execute(
            f'CREATE UNIQUE INDEX CONCURRENTLY "{_UNIQUE_INDEX}" '
            f"ON evaluation_run (organization_id, project_id, run_name)"
        )
        op.execute(
            f"ALTER TABLE evaluation_run "
            f'ADD CONSTRAINT "{_UNIQUE_CONSTRAINT}" '
            f'UNIQUE USING INDEX "{_UNIQUE_INDEX}"'
        )
|
|
||
|
|
||
def downgrade():
    """Undo ``upgrade()``: remove the unique constraint, its index, and ``run_mode``.

    The statements run in the reverse order of ``upgrade()``. Run names that
    ``upgrade()`` rewrote with a ``__dup_<id>`` suffix are not restored here.
    """
    drop_constraint_sql = (
        "ALTER TABLE evaluation_run "
        f'DROP CONSTRAINT IF EXISTS "{_UNIQUE_CONSTRAINT}"'
    )
    drop_index_sql = f'DROP INDEX CONCURRENTLY IF EXISTS "{_UNIQUE_INDEX}"'

    op.execute(drop_constraint_sql)

    # The CONCURRENTLY form is issued from an autocommit block; IF EXISTS makes
    # it a no-op when the index is already gone.
    with op.get_context().autocommit_block():
        op.execute(drop_index_sql)

    op.drop_column("evaluation_run", "run_mode")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,10 @@ | ||
| List all datasets for the current organization and project. | ||
|
|
||
| Returns a paginated list of datasets ordered by most recent first. Each dataset includes metadata (ID, name, item counts, duplication factor), Langfuse integration details, and object store URL. | ||
| Returns a paginated list of datasets ordered by most recent first. Each dataset includes metadata (ID, name, item counts, duplication factor), Langfuse integration details, object store URL, and an `eligible_for_fast` flag that is `true` when the dataset's unique-row count is within `EVAL_FAST_MAX_UNIQUE_ROWS` (and so can be used with `run_mode="fast"` on `POST /evaluations`). | ||
|
|
||
| ## Query parameters | ||
|
|
||
| | Parameter | Description | | ||
| | --- | --- | | ||
| | `limit` / `offset` | Pagination (default 50 / 0; max limit 100) | | ||
| | `eligible_for` | If set to `fast`, the response is filtered to only datasets where `eligible_for_fast` is `true` | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| """ | ||
| Celery task for the synchronous (fast) text-evaluation pipeline. | ||
|
|
||
| This module hosts the single orchestrator task per fast evaluation run. The | ||
| heavy lifting lives in `app/services/evaluations/fast.py`; this task is a thin | ||
| shim that sets the correlation id, attaches the OTel parent context, and | ||
| delegates. | ||
|
|
||
| See `Fast Evaluation SRD.md` for the design (queue, retries, idempotency). | ||
| """ | ||
|
|
||
| import logging | ||
|
|
||
| from celery import Task, current_task | ||
|
|
||
| from app.celery.celery_app import celery_app | ||
| from app.celery.tasks.job_execution import _run_with_otel_parent, _set_trace | ||
| from app.celery.utils import gevent_timeout | ||
| from app.core.config import settings | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
| # Sentinel correlation id used when no trace id is propagated from the | ||
| # enqueueing request. Matches the codebase-wide "N/A" default (see | ||
| # app/core/logger.py and app/celery/utils.py). | ||
| DEFAULT_TRACE_ID = "N/A" | ||
|
|
||
|
|
||
@celery_app.task(bind=True, queue="evaluations", priority=6)
@gevent_timeout(settings.CELERY_TASK_SOFT_TIME_LIMIT, "run_evaluation_fast")
def run_evaluation_fast(
    self: Task, eval_run_id: int, trace_id: str = DEFAULT_TRACE_ID
) -> None:
    """Celery entry point that runs the fast evaluation pipeline for one run.

    This task only sets the correlation id, logs the start, and hands off to
    ``execute_fast_evaluation`` under the OTel parent context; the pipeline
    itself lives in ``app.services.evaluations.fast``.

    Idempotency (as designed for the pipeline): each stage is skipped on retry
    when its `batch_job` marker is already set on the EvaluationRun, so Celery
    redelivery never re-calls OpenAI for work that already succeeded.

    Args:
        eval_run_id: ID of the EvaluationRun (run_mode="fast").
        trace_id: Correlation id from the enqueueing request, propagated into
            the worker for log correlation. Defaults to ``DEFAULT_TRACE_ID``.

    Returns:
        Whatever ``_run_with_otel_parent`` returns for the delegated call.
    """
    # Imported at call time rather than at module import.
    from app.services.evaluations.fast import execute_fast_evaluation

    _set_trace(trace_id)

    task_id = current_task.request.id
    logger.info(
        f"[run_evaluation_fast] Starting fast evaluation task | "
        f"eval_run_id={eval_run_id} | task_id={task_id}"
    )

    def _pipeline():
        # Zero-argument callable handed to the OTel wrapper.
        return execute_fast_evaluation(eval_run_id=eval_run_id)

    return _run_with_otel_parent(self, _pipeline)
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.