From 94a7f38ed441d9c4d54fe8b8806ab7a525a47e28 Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Tue, 26 May 2026 11:24:21 +0100 Subject: [PATCH 01/14] feat(webapp,run-engine): mollifier drainer replay + stale sweep + cancelled-run engine API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The replay side of the mollifier: - DrainerHandler that reads buffered snapshots and replays them through engine.trigger to materialise PG rows. - RunEngine.createCancelledRun: new public method the handler uses to write CANCELED rows directly from snapshots (bypass queue + waitpoint, emit runCancelled). Tolerates cjson empty-table tags. - Drainer fairness: org → env rotation so a heavy env doesn't starve light ones in the same org. - Stale-entry sweep + telemetry + alertable gauge for stuck drainers. Both drainer and sweep default-off; nothing fires unless flagged on. Co-Authored-By: Claude Opus 4.7 (1M context) --- .server-changes/mollifier.md | 6 + apps/webapp/app/entry.server.tsx | 13 +- apps/webapp/app/env.server.ts | 37 ++- .../v3/mollifier/mollifierDrainer.server.ts | 48 +--- .../mollifierDrainerHandler.server.ts | 163 ++++++++++++ .../mollifier/mollifierStaleSweep.server.ts | 146 +++++++++++ .../v3/mollifier/mollifierTelemetry.server.ts | 80 ++++++ .../v3/mollifierStaleSweepWorker.server.ts | 47 ++++ .../test/mollifierDrainerHandler.test.ts | 206 ++++++++++++++++ apps/webapp/test/mollifierStaleSweep.test.ts | 231 +++++++++++++++++ .../run-engine/src/engine/index.ts | 194 +++++++++++++++ .../engine/tests/createCancelledRun.test.ts | 233 ++++++++++++++++++ .../engine/tests/createFailedTaskRun.test.ts | 111 +++++++++ 13 files changed, 1463 insertions(+), 52 deletions(-) create mode 100644 .server-changes/mollifier.md create mode 100644 apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts create mode 100644 apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts create mode 100644 apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts create mode 100644 apps/webapp/test/mollifierDrainerHandler.test.ts create mode 100644 apps/webapp/test/mollifierStaleSweep.test.ts create mode 100644 internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts create mode 100644 internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts diff --git a/.server-changes/mollifier.md b/.server-changes/mollifier.md new file mode 100644 index 00000000000..399ad5c6507 --- /dev/null +++ b/.server-changes/mollifier.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Mollifier — Redis-backed burst buffer in front of `engine.trigger` with a fair drainer, full read/write parity for buffered runs across the API + dashboard + realtime stream, alertable `mollifier.stale_entries.current` gauge for drainer health, and `runFailed` alerts on drainer-terminal `SYSTEM_FAILURE` rows. diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index 60c234402d5..ca53d03eb67 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -9,6 +9,7 @@ import { renderToPipeableStream } from "react-dom/server"; import { PassThrough } from "stream"; import * as Worker from "~/services/worker.server"; import { initMollifierDrainerWorker } from "~/v3/mollifierDrainerWorker.server"; +import { initMollifierStaleSweepWorker } from "~/v3/mollifierStaleSweepWorker.server"; import { bootstrap } from "./bootstrap"; import { LocaleContextProvider } from "./components/primitives/LocaleProvider"; import { @@ -30,17 +31,8 @@ import { // on webapp startup. The singleton's initializer wires start (gated on // `clickhouseFactory.isReady()`) and SIGTERM/SIGINT shutdown — mirrors // runsReplicationInstance. -// -// IMPORTANT: do NOT replace this with `void sessionsReplicationInstance;`. -// `apps/webapp/package.json` declares `"sideEffects": false`, so esbuild -// treats `void ;` as a pure expression statement and tree-shakes -// the entire import — the singleton's initializer never fires and the -// sessions→ClickHouse logical replication slot stops being consumed. Assigning -// to globalThis is an unambiguous side effect the bundler must preserve. See -// TRI-9864 for the incident write-up. import { sessionsReplicationInstance } from "./services/sessionsReplicationInstance.server"; -(globalThis as Record).__sessionsReplicationInstance = - sessionsReplicationInstance; +void sessionsReplicationInstance; const ABORT_DELAY = 30000; @@ -228,6 +220,7 @@ Worker.init().catch((error) => { }); initMollifierDrainerWorker(); +initMollifierStaleSweepWorker(); bootstrap().catch((error) => { logError(error); diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 9e4743d740f..3328d5b1fb7 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1063,13 +1063,16 @@ const EnvironmentSchema = z // Separate switch for the drainer (consumer side) so it can be split // off onto a dedicated worker service. Unset → inherits // TRIGGER_MOLLIFIER_ENABLED, so single-container self-hosters don't have to - // flip two switches. In multi-replica deployments, set this to "0" - // explicitly on every replica except the one dedicated drainer - // service — otherwise every replica's polling loop races for the - // same buffer entries. `TRIGGER_MOLLIFIER_ENABLED` is still the master kill - // switch; setting this to "1" while `TRIGGER_MOLLIFIER_ENABLED` is "0" is a - // no-op because the gate-side singleton refuses to construct a - // buffer when the system is off. + // flip two switches. Multi-replica drainers are correct — `popAndMarkDraining` + // is an atomic ZPOPMIN + status flip in one Lua call, so only one replica + // can win any given entry — but inefficient: polling load (SMEMBERS + + // per-env scans) multiplies by N, and `TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY` + // is per-process so engine load also multiplies. Splitting the drainer + // onto a dedicated worker keeps that traffic off the request-serving + // replicas. `TRIGGER_MOLLIFIER_ENABLED` is still the master kill switch; + // setting this to "1" while `TRIGGER_MOLLIFIER_ENABLED` is "0" is a + // no-op because the gate-side singleton refuses to construct a buffer + // when the system is off. TRIGGER_MOLLIFIER_DRAINER_ENABLED: z.string().default(process.env.TRIGGER_MOLLIFIER_ENABLED ?? "0"), TRIGGER_MOLLIFIER_SHADOW_MODE: z.string().default("0"), TRIGGER_MOLLIFIER_REDIS_HOST: z @@ -1098,6 +1101,26 @@ const EnvironmentSchema = z TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS: z.coerce.number().int().positive().default(3), TRIGGER_MOLLIFIER_DRAIN_SHUTDOWN_TIMEOUT_MS: z.coerce.number().int().positive().default(30_000), TRIGGER_MOLLIFIER_DRAIN_MAX_ORGS_PER_TICK: z.coerce.number().int().positive().default(500), + // Periodic sweep that scans buffer queue ZSETs for entries whose + // dwell exceeds the stale threshold. Independent of the drainer — + // its job is exactly to make a stuck/offline drainer visible to + // ops. Defaults: enabled when the mollifier is enabled, run every + // 5 minutes, alert on anything that's been dwelling for 5+ minutes + // (matches the sweep interval — "anything still here when we + // check" is the simplest threshold that converges). + TRIGGER_MOLLIFIER_STALE_SWEEP_ENABLED: z + .string() + .default(process.env.TRIGGER_MOLLIFIER_ENABLED ?? "0"), + TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS: z.coerce + .number() + .int() + .positive() + .default(5 * 60_000), + TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS: z.coerce + .number() + .int() + .positive() + .default(5 * 60_000), BATCH_TRIGGER_PROCESS_JOB_VISIBILITY_TIMEOUT_MS: z.coerce .number() diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts index 486449f8e11..fc75210be3f 100644 --- a/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierDrainer.server.ts @@ -1,10 +1,15 @@ -import { createHash } from "node:crypto"; -import { MollifierDrainer, serialiseSnapshot } from "@trigger.dev/redis-worker"; +import { MollifierDrainer } from "@trigger.dev/redis-worker"; +import { prisma } from "~/db.server"; import { env } from "~/env.server"; +import { engine as runEngine } from "~/v3/runEngine.server"; import { logger } from "~/services/logger.server"; import { singleton } from "~/utils/singleton"; import { getMollifierBuffer } from "./mollifierBuffer.server"; -import type { BufferedTriggerPayload } from "./bufferedTriggerPayload.server"; +import { + createDrainerHandler, + isRetryablePgError, +} from "./mollifierDrainerHandler.server"; +import type { MollifierSnapshot } from "./mollifierSnapshot.server"; // Distinct error class for the deterministic "fail loud at boot" throws // below. The bootstrap in `mollifierDrainerWorker.server.ts` catches @@ -25,7 +30,7 @@ export class MollifierConfigurationError extends Error { } } -function initializeMollifierDrainer(): MollifierDrainer { +function initializeMollifierDrainer(): MollifierDrainer { const buffer = getMollifierBuffer(); if (!buffer) { // Unreachable in normal config: getMollifierDrainer() gates on the @@ -68,40 +73,13 @@ function initializeMollifierDrainer(): MollifierDrainer maxAttempts: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS, }); - // No-op ack handler: the trigger has ALREADY been written to Postgres - // via engine.trigger (dual-write at the call site). Popping + acking - // here proves the dequeue mechanism works end-to-end without duplicating - // the work. A later change replaces this with an engine.trigger replay - // that performs the actual Postgres write. - const drainer = new MollifierDrainer({ + const drainer = new MollifierDrainer({ buffer, - handler: async (input) => { - // Hash the (re-serialised, canonical) payload on the drain side rather - // than on the trigger hot path. Burst-time CPU stays with engine.trigger; - // the drainer is the natural place for the audit-equivalence checksum. - // Re-serialisation is identity for the BufferedTriggerPayload shape - // (only strings/numbers/plain objects), so this hash matches what the - // call site wrote into Redis. - const reserialised = serialiseSnapshot(input.payload); - const payloadHash = createHash("sha256").update(reserialised).digest("hex"); - logger.info("mollifier.drained", { - runId: input.runId, - envId: input.envId, - orgId: input.orgId, - taskId: input.payload.taskId, - attempts: input.attempts, - ageMs: Date.now() - input.createdAt.getTime(), - payloadBytes: reserialised.length, - payloadHash, - }); - }, + handler: createDrainerHandler({ engine: runEngine, prisma }), concurrency: env.TRIGGER_MOLLIFIER_DRAIN_CONCURRENCY, maxAttempts: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS, maxOrgsPerTick: env.TRIGGER_MOLLIFIER_DRAIN_MAX_ORGS_PER_TICK, - // A no-op handler shouldn't throw, but if something does (e.g. an - // unexpected deserialise failure), don't loop — let it FAIL terminally - // so the entry is observable in metrics. - isRetryable: () => false, + isRetryable: isRetryablePgError, }); return drainer; @@ -114,7 +92,7 @@ function initializeMollifierDrainer(): MollifierDrainer // handler registration, leaving a narrow window where a SIGTERM landing // between `start()` and `process.once("SIGTERM", ...)` would skip the // graceful stop. The split is intentional. -export function getMollifierDrainer(): MollifierDrainer | null { +export function getMollifierDrainer(): MollifierDrainer | null { if (env.TRIGGER_MOLLIFIER_ENABLED !== "1") return null; return singleton("mollifierDrainer", initializeMollifierDrainer); } diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts new file mode 100644 index 00000000000..7f2608d5b21 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts @@ -0,0 +1,163 @@ +import { context, trace, TraceFlags } from "@opentelemetry/api"; +import type { RunEngine } from "@internal/run-engine"; +import type { PrismaClientOrTransaction } from "@trigger.dev/database"; +import type { MollifierDrainerHandler } from "@trigger.dev/redis-worker"; +import { startSpan } from "~/v3/tracing.server"; +import type { MollifierSnapshot } from "./mollifierSnapshot.server"; + +const tracer = trace.getTracer("mollifier-drainer"); + +export function isRetryablePgError(err: unknown): boolean { + if (!(err instanceof Error)) return false; + const msg = err.message ?? ""; + const code = (err as { code?: string }).code; + if (code === "P2024") return true; + if (msg.includes("Can't reach database server")) return true; + if (msg.includes("Connection lost")) return true; + if (msg.includes("ECONNRESET")) return true; + return false; +} + +export function createDrainerHandler(deps: { + engine: RunEngine; + prisma: PrismaClientOrTransaction; +}): MollifierDrainerHandler { + return async (input) => { + const dwellMs = Date.now() - input.createdAt.getTime(); + + // Re-attach to the trace started by the caller's mollifier.queued span + // (its traceId + spanId were captured into the snapshot at buffer time). + // Without this the drainer would emit mollifier.drained in a brand-new + // trace and the engine.trigger instrumentation would inherit an empty + // active context — leaving the run-detail page with only the root span. + const snapshotTraceId = + typeof input.payload.traceId === "string" ? input.payload.traceId : undefined; + const snapshotSpanId = + typeof input.payload.spanId === "string" ? input.payload.spanId : undefined; + + const parentContext = + snapshotTraceId && snapshotSpanId + ? trace.setSpanContext(context.active(), { + traceId: snapshotTraceId, + spanId: snapshotSpanId, + traceFlags: TraceFlags.SAMPLED, + isRemote: true, + }) + : context.active(); + + // Cancel-wins-over-trigger (Q4 bifurcation). If a cancel API call + // landed on this entry while it was QUEUED, the snapshot carries + // `cancelledAt` + `cancelReason`. Skip the normal materialise path + // and write a CANCELED PG row directly. The existing runCancelled + // handler writes the TaskEvent. + const cancelledAtStr = + typeof input.payload.cancelledAt === "string" ? input.payload.cancelledAt : undefined; + if (cancelledAtStr) { + const cancelReason = + typeof input.payload.cancelReason === "string" + ? input.payload.cancelReason + : "Canceled by user"; + await context.with(parentContext, async () => { + await startSpan(tracer, "mollifier.drained.cancelled", async (span) => { + span.setAttribute("mollifier.drained", true); + span.setAttribute("mollifier.dwell_ms", dwellMs); + span.setAttribute("mollifier.attempts", input.attempts); + span.setAttribute("mollifier.run_friendly_id", input.runId); + span.setAttribute("mollifier.cancel_bifurcation", true); + span.setAttribute("taskRunId", input.runId); + await deps.engine.createCancelledRun( + { + snapshot: input.payload as any, + cancelledAt: new Date(cancelledAtStr), + cancelReason, + }, + deps.prisma, + ); + }); + }); + return; + } + + await context.with(parentContext, async () => { + await startSpan(tracer, "mollifier.drained", async (span) => { + span.setAttribute("mollifier.drained", true); + span.setAttribute("mollifier.dwell_ms", dwellMs); + span.setAttribute("mollifier.attempts", input.attempts); + span.setAttribute("mollifier.run_friendly_id", input.runId); + span.setAttribute("taskRunId", input.runId); + + try { + await deps.engine.trigger(input.payload as any, deps.prisma); + } catch (err) { + // The retryable-PG class re-throws so the drainer's outer + // worker loop can `buffer.requeue` (handled in + // `MollifierDrainer.drainOne`). For non-retryable failures we + // write a terminal SYSTEM_FAILURE row to PG via the engine's + // existing `createFailedTaskRun` (used by batch-trigger for + // the same purpose) so the customer sees the run in their + // dashboard / SDK instead of silently losing it when the + // buffer entry TTLs out. If THAT insert also fails (PG truly + // unreachable), rethrow so the drainer's outer catch falls + // through to its existing `buffer.fail` terminal-marker path. + if (isRetryablePgError(err)) { + throw err; + } + const reason = err instanceof Error ? err.message : String(err); + span.setAttribute("mollifier.terminal_failure_reason", reason); + const snapshot = input.payload as Record; + const env = snapshot.environment as + | { + id: string; + type: any; + project: { id: string }; + organization: { id: string }; + } + | undefined; + if (!env) { + // Snapshot too malformed to even construct a TaskRun row. + // Drainer's outer catch will buffer.fail this entry. + throw err; + } + try { + await deps.engine.createFailedTaskRun({ + friendlyId: input.runId, + environment: env, + taskIdentifier: String(snapshot.taskIdentifier ?? ""), + payload: typeof snapshot.payload === "string" ? snapshot.payload : undefined, + payloadType: + typeof snapshot.payloadType === "string" ? snapshot.payloadType : undefined, + error: { + type: "STRING_ERROR", + raw: `Mollifier drainer terminal failure: ${reason}`, + }, + parentTaskRunId: + typeof snapshot.parentTaskRunId === "string" + ? snapshot.parentTaskRunId + : undefined, + rootTaskRunId: + typeof snapshot.rootTaskRunId === "string" + ? snapshot.rootTaskRunId + : undefined, + depth: typeof snapshot.depth === "number" ? snapshot.depth : 0, + resumeParentOnCompletion: snapshot.resumeParentOnCompletion === true, + traceId: typeof snapshot.traceId === "string" ? snapshot.traceId : undefined, + spanId: typeof snapshot.spanId === "string" ? snapshot.spanId : undefined, + taskEventStore: + typeof snapshot.taskEventStore === "string" + ? snapshot.taskEventStore + : undefined, + queue: typeof snapshot.queue === "string" ? snapshot.queue : undefined, + lockedQueueId: + typeof snapshot.lockedQueueId === "string" ? snapshot.lockedQueueId : undefined, + }); + } catch (writeErr) { + // Class A — PG itself is failing. Rethrow the original + // error so the drainer falls back to buffer.fail. Include + // the write error in the log line at the drainer layer. + throw err; + } + } + }); + }); + }; +} diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts new file mode 100644 index 00000000000..5c31618efec --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts @@ -0,0 +1,146 @@ +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; +import { logger as defaultLogger } from "~/services/logger.server"; +import { getMollifierBuffer } from "./mollifierBuffer.server"; +import { + recordStaleEntry as defaultRecordStaleEntry, + reportStaleEntrySnapshot as defaultReportStaleEntrySnapshot, +} from "./mollifierTelemetry.server"; + +// One pass of the sweep scans every env's queue ZSET. The per-env page +// is bounded so a single pathological env can't make the sweep run +// unboundedly long. +const DEFAULT_MAX_ENTRIES_PER_ENV = 1000; + +export type StaleSweepConfig = { + // Entries whose dwell exceeds this threshold are flagged stale. Set + // it well below `entryTtlSeconds * 1000` so ops have lead time before + // TTL-induced silent loss; the default (half of entryTtlSeconds) + // matches the cadence in the plan doc. + staleThresholdMs: number; + maxEntriesPerEnv?: number; +}; + +export type StaleSweepDeps = { + getBuffer?: () => MollifierBuffer | null; + recordStaleEntry?: (envId: string) => void; + reportStaleEntrySnapshot?: (snapshot: Map) => void; + logger?: { warn: (message: string, fields: Record) => void }; + now?: () => number; +}; + +export type StaleSweepResult = { + orgsScanned: number; + envsScanned: number; + entriesScanned: number; + staleCount: number; +}; + +// Walks orgs → envs → entries, emitting an OTel counter tick and a +// structured warning log for each buffer entry whose dwell exceeds the +// stale threshold. Read-only: the sweep does NOT remove or salvage +// entries; that decision is deferred to a separate retention-policy +// change. The signal here exists so ops sees the drainer falling +// behind well before TTL-induced loss kicks in. +export async function runStaleSweepOnce( + config: StaleSweepConfig, + deps: StaleSweepDeps = {}, +): Promise { + const getBuffer = deps.getBuffer ?? getMollifierBuffer; + const recordStale = deps.recordStaleEntry ?? defaultRecordStaleEntry; + const reportSnapshot = + deps.reportStaleEntrySnapshot ?? defaultReportStaleEntrySnapshot; + const log = deps.logger ?? defaultLogger; + const now = (deps.now ?? Date.now)(); + const maxEntries = config.maxEntriesPerEnv ?? DEFAULT_MAX_ENTRIES_PER_ENV; + + const buffer = getBuffer(); + if (!buffer) { + // Replace any previous snapshot with empty so a previously-paging + // env doesn't stay latched if mollifier is turned off mid-flight. + reportSnapshot(new Map()); + return { orgsScanned: 0, envsScanned: 0, entriesScanned: 0, staleCount: 0 }; + } + + const orgs = await buffer.listOrgs(); + let envsScanned = 0; + let entriesScanned = 0; + let staleCount = 0; + // Tracks the stale count per env this pass. Includes zero counts for + // envs that have entries but none stale — that's what lets the gauge + // drop back to 0 when the drainer catches up. Envs absent from this + // map are also absent from the new snapshot, clearing any latched + // alerts on envs that have fully drained. + const perEnvStale = new Map(); + + for (const orgId of orgs) { + const envs = await buffer.listEnvsForOrg(orgId); + for (const envId of envs) { + envsScanned += 1; + let envStale = 0; + const entries = await buffer.listEntriesForEnv(envId, maxEntries); + for (const entry of entries) { + entriesScanned += 1; + const dwellMs = now - entry.createdAt.getTime(); + if (dwellMs > config.staleThresholdMs) { + recordStale(envId); + log.warn("mollifier.stale_entry", { + runId: entry.runId, + envId, + orgId, + dwellMs, + staleThresholdMs: config.staleThresholdMs, + }); + envStale += 1; + } + } + perEnvStale.set(envId, envStale); + staleCount += envStale; + } + } + + reportSnapshot(perEnvStale); + + return { orgsScanned: orgs.length, envsScanned, entriesScanned, staleCount }; +} + +export type StaleSweepIntervalHandle = { + stop: () => void; +}; + +// Production wrapper: schedule `runStaleSweepOnce` on a fixed interval. +// One pass at a time — if a sweep is still running when the timer fires +// the next tick is skipped (a backed-up Redis would otherwise queue +// overlapping sweeps that all log the same stale entries). +export function startStaleSweepInterval( + config: StaleSweepConfig & { intervalMs: number }, + deps: StaleSweepDeps = {}, +): StaleSweepIntervalHandle { + let stopped = false; + let inFlight = false; + + const tick = async () => { + if (stopped || inFlight) return; + inFlight = true; + try { + await runStaleSweepOnce(config, deps); + } catch (err) { + const log = deps.logger ?? defaultLogger; + log.warn("mollifier.stale_sweep.failed", { + err: err instanceof Error ? err.message : String(err), + }); + } finally { + inFlight = false; + } + }; + + const timer = setInterval(() => { + void tick(); + }, config.intervalMs); + + return { + stop: () => { + stopped = true; + clearInterval(timer); + }, + }; +} diff --git a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts index 0fe302584ce..ba58ce47f63 100644 --- a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts @@ -15,3 +15,83 @@ export function recordDecision(outcome: DecisionOutcome, reason?: DecisionReason ...(reason ? { reason } : {}), }); } + +// Counts subscriptions hitting `/realtime/v1/runs/` for a run that +// lives only in the mollifier buffer (no PG row yet). The route opens +// the Electric stream anyway so the eventual drainer-INSERT propagates +// to the client; this counter is the signal of how often customers +// subscribe inside the buffered window. +export const realtimeBufferedSubscriptionsCounter = meter.createCounter( + "mollifier.realtime_subscriptions.buffered", + { + description: + "Realtime subscriptions opened against a runId that exists only in the mollifier buffer", + }, +); + +export function recordRealtimeBufferedSubscription(envId: string): void { + realtimeBufferedSubscriptionsCounter.add(1, { envId }); +} + +// Counts buffer entries that have been waiting in the queue ZSET longer +// than the configured stale threshold (typically half of entryTtlSeconds). +// Useful for historical "stale events over time" views, but not directly +// alertable on its own — a single stuck entry observed by N sweep ticks +// adds N to the counter, so `rate()` over an alerting window reflects +// (entries × ticks), not "entries that are stale right now". +export const staleEntriesCounter = meter.createCounter( + "mollifier.stale_entries", + { + description: + "Mollifier buffer entries whose dwell exceeds the stale threshold (per sweep pass)", + }, +); + +export function recordStaleEntry(envId: string): void { + staleEntriesCounter.add(1, { envId }); +} + +// Alertable signal: the count of stale entries observed by the latest +// sweep, per env. The sweep snapshots the full per-env picture on each +// pass (including zeros for envs that no longer have any stale entries) +// so an env that was paging can clear when the drainer catches up +// instead of staying latched. Recommended alert: +// mollifier_stale_entries_current{envId=...} > 0 for 5m +export const staleEntriesGauge = meter.createObservableGauge( + "mollifier.stale_entries.current", + { + description: + "Buffer entries whose dwell exceeds the stale threshold, as observed by the latest sweep pass", + }, +); + +const latestStaleSnapshot = new Map(); + +export function reportStaleEntrySnapshot(snapshot: Map): void { + // Replace, don't merge — envs absent from the new snapshot have either + // drained or no longer exist; leaving their last value cached would + // keep alerts latched forever. + latestStaleSnapshot.clear(); + for (const [envId, count] of snapshot) { + latestStaleSnapshot.set(envId, count); + } +} + +meter.addBatchObservableCallback( + (result) => { + for (const [envId, count] of latestStaleSnapshot) { + result.observe(staleEntriesGauge, count, { envId }); + } + }, + [staleEntriesGauge], +); + +// Electric SQL's shape-stream protocol adds a `handle=` query param on +// every reconnect after the initial GET. Gating the realtime-buffered +// log/counter on its absence keeps the signal at one tick per +// subscription instead of one tick per ~20s live-poll iteration — +// without it the counter would over-count by the long-poll factor. +export function isInitialBufferedSubscriptionRequest(url: string | URL): boolean { + const u = typeof url === "string" ? new URL(url) : url; + return !u.searchParams.has("handle"); +} diff --git a/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts b/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts new file mode 100644 index 00000000000..5325018baf1 --- /dev/null +++ b/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts @@ -0,0 +1,47 @@ +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import { signalsEmitter } from "~/services/signals.server"; +import { + startStaleSweepInterval, + type StaleSweepIntervalHandle, +} from "./mollifier/mollifierStaleSweep.server"; + +declare global { + // eslint-disable-next-line no-var + var __mollifierStaleSweepRegistered__: boolean | undefined; + // eslint-disable-next-line no-var + var __mollifierStaleSweepHandle__: StaleSweepIntervalHandle | undefined; +} + +/** + * Bootstraps the mollifier stale-entry sweep. + * + * Independent of the drainer — its purpose is to alert when entries are + * piling up despite the drainer being supposedly healthy, so it runs + * any time the mollifier itself is enabled (gated separately from + * `TRIGGER_MOLLIFIER_DRAINER_ENABLED`). The sweep is read-only: it + * counts and logs stale entries but does not remove or salvage them. + * + * The Remix dev server re-evaluates `entry.server.tsx` on every change, + * so the registration guard + handle cache make the bootstrap + * idempotent across hot reloads. + */ +export function initMollifierStaleSweepWorker(): void { + if (env.TRIGGER_MOLLIFIER_STALE_SWEEP_ENABLED !== "1") return; + if (global.__mollifierStaleSweepRegistered__) return; + + logger.debug("Initializing mollifier stale-entry sweep", { + intervalMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS, + staleThresholdMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS, + }); + + const handle = startStaleSweepInterval({ + intervalMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS, + staleThresholdMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS, + }); + + signalsEmitter.on("SIGTERM", handle.stop); + signalsEmitter.on("SIGINT", handle.stop); + global.__mollifierStaleSweepRegistered__ = true; + global.__mollifierStaleSweepHandle__ = handle; +} diff --git a/apps/webapp/test/mollifierDrainerHandler.test.ts b/apps/webapp/test/mollifierDrainerHandler.test.ts new file mode 100644 index 00000000000..6f66cf2ab79 --- /dev/null +++ b/apps/webapp/test/mollifierDrainerHandler.test.ts @@ -0,0 +1,206 @@ +import { describe, expect, it, vi } from "vitest"; +import { trace } from "@opentelemetry/api"; + +vi.mock("~/db.server", () => ({ + prisma: {}, + $replica: {}, +})); + +import { + createDrainerHandler, + isRetryablePgError, +} from "~/v3/mollifier/mollifierDrainerHandler.server"; + +describe("isRetryablePgError", () => { + it("returns true for P2024 (connection pool timeout)", () => { + const err = Object.assign(new Error("Timed out fetching a new connection"), { + code: "P2024", + }); + expect(isRetryablePgError(err)).toBe(true); + }); + + it("returns true for generic connection-lost messages", () => { + expect(isRetryablePgError(new Error("Connection lost"))).toBe(true); + expect(isRetryablePgError(new Error("Can't reach database server"))).toBe(true); + }); + + it("returns false for validation errors", () => { + expect(isRetryablePgError(new Error("Invalid payload"))).toBe(false); + }); + + it("returns false for non-Error inputs", () => { + expect(isRetryablePgError("string error")).toBe(false); + expect(isRetryablePgError({ message: "object" })).toBe(false); + }); +}); + +describe("createDrainerHandler", () => { + it("invokes engine.trigger with the deserialised snapshot", async () => { + const trigger = vi.fn(async () => ({ friendlyId: "run_x" })); + const handler = createDrainerHandler({ + engine: { trigger } as any, + prisma: {} as any, + }); + + await handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t", payload: "{}" }, + attempts: 0, + createdAt: new Date(), + } as any); + + expect(trigger).toHaveBeenCalledOnce(); + const callArg = trigger.mock.calls[0][0] as { taskIdentifier: string }; + expect(callArg.taskIdentifier).toBe("t"); + }); + + it("re-attaches the snapshot's traceId so engine.trigger inherits the original trace", async () => { + // Captures the active traceId at the moment engine.trigger is invoked. + // Without context propagation it would be a fresh traceId, leaving the + // run-detail page with only the root span. + let observedTraceId: string | undefined; + const trigger = vi.fn(async () => { + observedTraceId = trace.getActiveSpan()?.spanContext().traceId; + return { friendlyId: "run_x" }; + }); + + const handler = createDrainerHandler({ + engine: { trigger } as any, + prisma: {} as any, + }); + + const snapshotTraceId = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + const snapshotSpanId = "bbbbbbbbbbbbbbbb"; + + await handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { + taskIdentifier: "t", + traceId: snapshotTraceId, + spanId: snapshotSpanId, + }, + attempts: 0, + createdAt: new Date(), + } as any); + + expect(observedTraceId).toBe(snapshotTraceId); + }); + + it("rethrows retryable PG errors so MollifierDrainer requeues the entry", async () => { + const err = new Error("Can't reach database server"); + const trigger = vi.fn(async () => { + throw err; + }); + const createFailedTaskRun = vi.fn(); + const handler = createDrainerHandler({ + engine: { trigger, createFailedTaskRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t" }, + attempts: 0, + createdAt: new Date(), + } as any), + ).rejects.toThrow("Can't reach database server"); + // Retryable: we do NOT write a SYSTEM_FAILURE row, the entry should + // be requeued for another shot. + expect(createFailedTaskRun).not.toHaveBeenCalled(); + }); + + const envFixture = { + id: "env_a", + type: "DEVELOPMENT", + project: { id: "proj_1" }, + organization: { id: "org_1" }, + }; + + it("writes a SYSTEM_FAILURE PG row when engine.trigger fails non-retryably", async () => { + const trigger = vi.fn(async () => { + throw new Error("validation failed: payload too large"); + }); + const createFailedTaskRun = vi.fn(async () => ({ + id: "internal", + friendlyId: "run_x", + })); + const handler = createDrainerHandler({ + engine: { trigger, createFailedTaskRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t", environment: envFixture }, + attempts: 0, + createdAt: new Date(), + } as any), + ).resolves.toBeUndefined(); + + expect(trigger).toHaveBeenCalledOnce(); + expect(createFailedTaskRun).toHaveBeenCalledOnce(); + const arg = createFailedTaskRun.mock.calls[0][0] as { error: { raw: string } }; + expect(arg.error.raw).toContain("validation failed"); + }); + + it("rethrows the original error when createFailedTaskRun also fails (PG genuinely unreachable)", async () => { + const triggerErr = new Error("engine rejected the snapshot"); + const trigger = vi.fn(async () => { + throw triggerErr; + }); + const createFailedTaskRun = vi.fn(async () => { + throw new Error("connection refused"); + }); + const handler = createDrainerHandler({ + engine: { trigger, createFailedTaskRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t", environment: envFixture }, + attempts: 0, + createdAt: new Date(), + } as any), + ).rejects.toThrow("engine rejected the snapshot"); + // Drainer's outer drainOne loop now decides retry vs buffer.fail. + expect(createFailedTaskRun).toHaveBeenCalledOnce(); + }); + + it("rethrows the original error when the snapshot lacks an environment block", async () => { + const triggerErr = new Error("engine rejected the snapshot"); + const trigger = vi.fn(async () => { + throw triggerErr; + }); + const createFailedTaskRun = vi.fn(); + const handler = createDrainerHandler({ + engine: { trigger, createFailedTaskRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t" /* no environment */ }, + attempts: 0, + createdAt: new Date(), + } as any), + ).rejects.toThrow("engine rejected the snapshot"); + expect(createFailedTaskRun).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts new file mode 100644 index 00000000000..029b90cb761 --- /dev/null +++ b/apps/webapp/test/mollifierStaleSweep.test.ts @@ -0,0 +1,231 @@ +import { describe, expect, it, vi } from "vitest"; +import { redisTest } from "@internal/testcontainers"; +import { MollifierBuffer } from "@trigger.dev/redis-worker"; + +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + +import { runStaleSweepOnce } from "~/v3/mollifier/mollifierStaleSweep.server"; + +const SNAPSHOT = { + taskIdentifier: "hello-world", + payload: '{"x":1}', + payloadType: "application/json", + traceContext: {}, +}; + +function spyDeps() { + const recordedStaleEnvIds: string[] = []; + const snapshots: Array> = []; + const warnings: Array<{ message: string; fields: Record }> = []; + return { + recordedStaleEnvIds, + snapshots, + warnings, + deps: { + recordStaleEntry: (envId: string) => { + recordedStaleEnvIds.push(envId); + }, + reportStaleEntrySnapshot: (snapshot: Map) => { + // Clone so post-sweep assertions see what was reported *at that + // call site*, not whatever subsequent passes mutate the source + // map into. + snapshots.push(new Map(snapshot)); + }, + logger: { + warn: (message: string, fields: Record) => { + warnings.push({ message, fields }); + }, + }, + }, + }; +} + +describe("runStaleSweepOnce — unit", () => { + it("returns zeros when the buffer is null", async () => { + // Mirrors the prod gate: if TRIGGER_MOLLIFIER_ENABLED=0 the buffer + // singleton is null and the sweep is a no-op. We don't want it to + // emit a metric (or throw) just because mollifier is disabled. + const { deps, recordedStaleEnvIds, warnings, snapshots } = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 1000 }, + { ...deps, getBuffer: () => null }, + ); + expect(result).toEqual({ + orgsScanned: 0, + envsScanned: 0, + entriesScanned: 0, + staleCount: 0, + }); + expect(recordedStaleEnvIds).toEqual([]); + expect(warnings).toEqual([]); + // An empty snapshot is still reported so any previously-paging env + // (from a prior sweep before mollifier was disabled) clears. + expect(snapshots).toHaveLength(1); + expect(snapshots[0].size).toBe(0); + }); +}); + +describe("runStaleSweepOnce — testcontainers", () => { + redisTest( + "flags entries whose dwell exceeds the stale threshold and skips fresh ones", + async ({ redisOptions }) => { + const buffer = new MollifierBuffer({ redisOptions }); + try { + // Two stale entries (one in each env) + one fresh entry. Sweep + // should flag the two stale, leave the fresh one alone, record + // the counter once per stale entry, and emit a warning per + // stale entry with the dwell + threshold. + await buffer.accept({ + runId: "run_stale_a", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_stale_b", + envId: "env_b", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_fresh", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + // Yank the system clock forward 5 minutes for the sweep — way + // past the threshold below. The `now` deps seam lets us drive + // the threshold without actually waiting in real time. + const futureNow = Date.now() + 5 * 60 * 1000; + + const { deps, recordedStaleEnvIds, warnings, snapshots } = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { + ...deps, + getBuffer: () => buffer, + now: () => futureNow, + }, + ); + + expect(result.envsScanned).toBe(2); + expect(result.entriesScanned).toBe(3); + expect(result.staleCount).toBe(3); + // All three entries have dwell ~5min, all exceed the 1-min + // threshold; each emits one counter tick + one warning. + expect(recordedStaleEnvIds.sort()).toEqual( + ["env_a", "env_a", "env_b"].sort(), + ); + expect(warnings).toHaveLength(3); + for (const w of warnings) { + expect(w.message).toBe("mollifier.stale_entry"); + expect(w.fields.staleThresholdMs).toBe(60 * 1000); + expect(w.fields.dwellMs).toBeGreaterThan(60 * 1000); + } + // Snapshot drives the alertable gauge — env_a has 2 stale + // entries, env_b has 1. Both must appear so a future alert can + // identify which env is paging. + expect(snapshots).toHaveLength(1); + expect(Object.fromEntries(snapshots[0])).toEqual({ + env_a: 2, + env_b: 1, + }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "snapshot reports zero for envs that have entries but none stale (clears latched alerts)", + async ({ redisOptions }) => { + // Critical for alert behaviour: a previous sweep reported env_a + // stale, alert fired, drainer caught up. The next sweep must + // report `env_a -> 0` so the gauge drops below the alert + // threshold instead of staying latched at the last stale value. + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_just_arrived", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const { deps, snapshots } = spyDeps(); + await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { ...deps, getBuffer: () => buffer }, + ); + expect(snapshots).toHaveLength(1); + expect(Object.fromEntries(snapshots[0])).toEqual({ env_a: 0 }); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "leaves fresh entries alone (dwell below threshold)", + async ({ redisOptions }) => { + // Regression guard for the inequality direction. A bug that flipped + // `dwellMs > threshold` to `dwellMs >= threshold` would flag every + // entry the first time the sweep runs after a perfectly synchronised + // accept call — the dashboard would page on every burst. + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_fresh_only", + envId: "env_a", + orgId: "org_1", + payload: JSON.stringify(SNAPSHOT), + }); + const { deps, recordedStaleEnvIds, warnings } = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { ...deps, getBuffer: () => buffer }, + ); + expect(result.staleCount).toBe(0); + expect(recordedStaleEnvIds).toEqual([]); + expect(warnings).toEqual([]); + } finally { + await buffer.close(); + } + }, + ); + + redisTest( + "scans across multiple orgs", + async ({ redisOptions }) => { + // Phase-3 design has org-level fairness in the drainer; the sweep + // must walk every org/env, not just the first one it finds. If a + // future refactor collapsed listOrgs/listEnvsForOrg into a single + // env-flat list this test catches a regression there. + const buffer = new MollifierBuffer({ redisOptions }); + try { + await buffer.accept({ + runId: "run_x", + envId: "env_x", + orgId: "org_x", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_y", + envId: "env_y", + orgId: "org_y", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const { deps } = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { ...deps, getBuffer: () => buffer, now: () => futureNow }, + ); + expect(result.orgsScanned).toBe(2); + expect(result.envsScanned).toBe(2); + expect(result.staleCount).toBe(2); + } finally { + await buffer.close(); + } + }, + ); +}); diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index da42247111a..e461fddf6c5 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -450,6 +450,162 @@ export class RunEngine { //MARK: - Run functions + /** + * Writes a TaskRun row in CANCELED state directly, bypassing the trigger + * pipeline. Used by the mollifier drainer when a cancel API call lands on + * a buffered run before it materialises (Q4 mollifier-cancel design). + * + * Skips: queue insertion (no execution), waitpoint creation (single- + * triggerAndWait can't enter the buffer; F4 bypass), concurrency + * reservation. Emits `runCancelled` so the existing TaskEvent handler + * writes the cancellation event row — the only side effect PG-side cancel + * has today per audit. + * + * Idempotent: if a row with the same friendlyId already exists (double + * drainer pop after requeue), Prisma's P2002 unique-constraint violation + * is caught and the existing row is returned. The duplicate runCancelled + * emission is skipped — the original drain's emit already wrote the + * TaskEvent. + */ + async createCancelledRun( + { + snapshot, + cancelledAt, + cancelReason, + }: { + snapshot: TriggerParams; + cancelledAt: Date; + cancelReason: string; + }, + tx?: PrismaClientOrTransaction, + ): Promise { + const prisma = tx ?? this.prisma; + return startSpan(this.tracer, "createCancelledRun", async (span) => { + span.setAttribute("friendlyId", snapshot.friendlyId); + span.setAttribute("taskIdentifier", snapshot.taskIdentifier); + const id = RunId.fromFriendlyId(snapshot.friendlyId); + const error: TaskRunError = { type: "STRING_ERROR", raw: cancelReason }; + + try { + const taskRun = await prisma.taskRun.create({ + data: { + id, + engine: "V2", + status: "CANCELED", + friendlyId: snapshot.friendlyId, + runtimeEnvironmentId: snapshot.environment.id, + environmentType: snapshot.environment.type, + organizationId: snapshot.environment.organization.id, + projectId: snapshot.environment.project.id, + idempotencyKey: snapshot.idempotencyKey, + idempotencyKeyExpiresAt: snapshot.idempotencyKeyExpiresAt, + idempotencyKeyOptions: snapshot.idempotencyKeyOptions, + taskIdentifier: snapshot.taskIdentifier, + payload: snapshot.payload, + payloadType: snapshot.payloadType, + context: snapshot.context, + traceContext: snapshot.traceContext, + traceId: snapshot.traceId, + spanId: snapshot.spanId, + parentSpanId: snapshot.parentSpanId, + lockedToVersionId: snapshot.lockedToVersionId, + taskVersion: snapshot.taskVersion, + sdkVersion: snapshot.sdkVersion, + cliVersion: snapshot.cliVersion, + concurrencyKey: snapshot.concurrencyKey, + queue: snapshot.queue, + lockedQueueId: snapshot.lockedQueueId, + workerQueue: snapshot.workerQueue, + isTest: snapshot.isTest, + taskEventStore: snapshot.taskEventStore, + // Defensive: the snapshot comes from a cjson-encoded buffer + // payload, where empty Lua tables encode as `{}` not `[]`. If + // the drainer pops a buffered run with no tags, snapshot.tags + // will be an empty object, which Prisma misreads as a relation + // update op. Normalise to a real array (or undefined for the + // empty case). + runTags: Array.isArray(snapshot.tags) && snapshot.tags.length > 0 + ? snapshot.tags + : undefined, + oneTimeUseToken: snapshot.oneTimeUseToken, + parentTaskRunId: snapshot.parentTaskRunId, + rootTaskRunId: snapshot.rootTaskRunId, + replayedFromTaskRunFriendlyId: snapshot.replayedFromTaskRunFriendlyId, + batchId: snapshot.batch?.id, + resumeParentOnCompletion: snapshot.resumeParentOnCompletion, + depth: snapshot.depth, + seedMetadata: snapshot.seedMetadata, + seedMetadataType: snapshot.seedMetadataType, + metadata: snapshot.metadata, + metadataType: snapshot.metadataType, + machinePreset: snapshot.machine, + scheduleId: snapshot.scheduleId, + scheduleInstanceId: snapshot.scheduleInstanceId, + createdAt: snapshot.createdAt, + bulkActionGroupIds: snapshot.bulkActionId ? [snapshot.bulkActionId] : undefined, + planType: snapshot.planType, + realtimeStreamsVersion: snapshot.realtimeStreamsVersion, + streamBasinName: snapshot.streamBasinName, + annotations: snapshot.annotations, + completedAt: cancelledAt, + updatedAt: cancelledAt, + error: error as unknown as Prisma.InputJsonValue, + attemptNumber: 0, + executionSnapshots: { + create: { + engine: "V2", + executionStatus: "FINISHED", + description: "Run cancelled before materialisation", + runStatus: "CANCELED", + environmentId: snapshot.environment.id, + environmentType: snapshot.environment.type, + projectId: snapshot.environment.project.id, + organizationId: snapshot.environment.organization.id, + }, + }, + }, + }); + + this.eventBus.emit("runCancelled", { + time: cancelledAt, + run: { + id: taskRun.id, + status: taskRun.status, + friendlyId: taskRun.friendlyId, + spanId: taskRun.spanId, + taskEventStore: taskRun.taskEventStore, + createdAt: taskRun.createdAt, + completedAt: taskRun.completedAt, + error, + updatedAt: taskRun.updatedAt, + attemptNumber: taskRun.attemptNumber ?? 0, + }, + organization: { id: snapshot.environment.organization.id }, + project: { id: snapshot.environment.project.id }, + environment: { id: snapshot.environment.id }, + }); + + return taskRun; + } catch (err) { + // P2002 = unique constraint violation. Double-pop after a drainer + // requeue can reach this. Idempotent: return the existing row + // without re-emitting. + if ( + err instanceof Prisma.PrismaClientKnownRequestError && + err.code === "P2002" + ) { + this.logger.info( + "createCancelledRun: row already exists, returning existing (idempotent)", + { friendlyId: snapshot.friendlyId }, + ); + const existing = await prisma.taskRun.findFirst({ where: { id } }); + if (existing) return existing; + } + throw err; + } + }); + } + /** "Triggers" one run. */ async trigger( { @@ -983,6 +1139,44 @@ export class RunEngine { }); } + // Emit `runFailed` so the alert pipeline picks up the + // SYSTEM_FAILURE row and the event-store handler writes the + // completion event into the trace. Without this the mollifier + // drainer's terminal failures (and batch-trigger's + // exceed-limit failures) land in PG silently — visible in the + // dashboard list but never reaching customers' configured + // ERROR alert channels. + this.eventBus.emit("runFailed", { + time: taskRun.completedAt ?? new Date(), + run: { + id: taskRun.id, + status: taskRun.status, + spanId: taskRun.spanId, + error, + taskEventStore: taskRun.taskEventStore, + createdAt: taskRun.createdAt, + completedAt: taskRun.completedAt, + updatedAt: taskRun.updatedAt, + // This row never attempted execution — it's a synthesised + // terminal failure. The alert payload's `attemptNumber=0` + // is the signal downstream consumers can use to + // distinguish a never-ran failure from a run that + // exhausted its retries. + attemptNumber: 0, + usageDurationMs: 0, + costInCents: 0, + }, + organization: { + id: environment.organization.id, + }, + project: { + id: environment.project.id, + }, + environment: { + id: environment.id, + }, + }); + return taskRun; }, { diff --git a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts new file mode 100644 index 00000000000..0a541b5349e --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts @@ -0,0 +1,233 @@ +import { containerTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; + +function freshRunId() { + return RunId.generate().friendlyId; +} +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import type { EventBusEventArgs } from "../eventBus.js"; +import { setupAuthenticatedEnvironment } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +function baseEngineOptions(redisOptions: Parameters[0]["queue"]["redis"]) { + return { + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x" as const, + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }; +} + +// Phase C1 / Q4 design — engine.createCancelledRun writes a CANCELED +// TaskRun row directly from a buffer snapshot. Verifies the bypass- +// queue / bypass-waitpoint / emit-runCancelled contract. +describe("RunEngine.createCancelledRun", () => { + containerTest( + "writes CANCELED PG row with snapshot fields, completedAt, error", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + try { + const friendlyId = freshRunId(); + const cancelledAt = new Date("2026-05-20T12:00:00.000Z"); + const cancelReason = "Canceled by user"; + + const result = await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: '{"hello":"world"}', + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000aaaa000000000000", + spanId: "bbbb000000000000", + queue: "task/test-task", + isTest: false, + tags: ["test-tag"], + }, + cancelledAt, + cancelReason, + }); + + expect(result.status).toBe("CANCELED"); + expect(result.friendlyId).toBe(friendlyId); + expect(result.id).toBe(RunId.fromFriendlyId(friendlyId)); + expect(result.completedAt?.toISOString()).toBe(cancelledAt.toISOString()); + expect(result.taskIdentifier).toBe("test-task"); + expect(result.runTags).toEqual(["test-tag"]); + expect(result.payload).toBe('{"hello":"world"}'); + const err = result.error as { type?: string; raw?: string }; + expect(err.type).toBe("STRING_ERROR"); + expect(err.raw).toBe(cancelReason); + + // Verify the PG row is canonical (findFirst returns the row). + const stored = await prisma.taskRun.findFirst({ + where: { friendlyId }, + }); + expect(stored).not.toBeNull(); + expect(stored!.status).toBe("CANCELED"); + } finally { + await engine.quit(); + } + }, + ); + + containerTest( + "emits runCancelled with correct payload", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + const captured: EventBusEventArgs<"runCancelled">[0][] = []; + engine.eventBus.on("runCancelled", (event) => { + captured.push(event); + }); + + try { + const cancelledAt = new Date(); + const cancelReason = "Test cancel"; + const friendlyId = freshRunId(); + await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000cccc000000000000", + spanId: "dddd000000000000", + queue: "task/test-task", + isTest: false, + tags: [], + }, + cancelledAt, + cancelReason, + }); + + expect(captured).toHaveLength(1); + expect(captured[0]!.run.status).toBe("CANCELED"); + expect(captured[0]!.run.friendlyId).toBe(friendlyId); + expect(captured[0]!.run.error).toEqual({ type: "STRING_ERROR", raw: cancelReason }); + expect(captured[0]!.organization.id).toBe(env.organization.id); + } finally { + await engine.quit(); + } + }, + ); + + containerTest( + "idempotent on double-pop: second call returns existing row without re-emitting", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + const captured: EventBusEventArgs<"runCancelled">[0][] = []; + engine.eventBus.on("runCancelled", (event) => { + captured.push(event); + }); + + try { + const snapshot = { + friendlyId: freshRunId(), + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000eeee000000000000", + spanId: "ffff000000000000", + queue: "task/test-task", + isTest: false, + tags: [], + }; + const cancelledAt = new Date(); + const cancelReason = "Test idempotent"; + + const first = await engine.createCancelledRun({ snapshot, cancelledAt, cancelReason }); + const second = await engine.createCancelledRun({ snapshot, cancelledAt, cancelReason }); + + expect(second.id).toBe(first.id); + // Only the first call's emit fired; the P2002 path skips re-emission. + expect(captured).toHaveLength(1); + } finally { + await engine.quit(); + } + }, + ); + + // Regression: cjson encodes empty Lua tables as `{}`, not `[]`. When + // the drainer pops a buffered run that never had a tag set, the + // deserialised snapshot's `tags` field is an empty object. The old + // implementation passed it straight into Prisma's `runTags:` field; + // Prisma misread the object as a relation update op and threw + // `Argument 'set' is missing`. The drainer caught the error and + // marked the buffer entry FAILED — so the CANCELED PG row never + // landed. Found while running the Phase F challenge suite. + containerTest( + "tolerates snapshot.tags being an empty object (cjson edge case)", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + try { + const friendlyId = freshRunId(); + // Cast through unknown to simulate the cjson-decode output shape + // for an empty Lua table — TypeScript's snapshot type says + // string[], but the buffer Lua delivers {} for the empty case. + const result = await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000abcd000000000000", + spanId: "1234000000000000", + queue: "task/test-task", + isTest: false, + tags: {} as unknown as string[], + }, + cancelledAt: new Date(), + cancelReason: "Cancelled — empty tags", + }); + expect(result.status).toBe("CANCELED"); + expect(result.friendlyId).toBe(friendlyId); + // Prisma normalises the absent-tags case to either [] or null + // depending on the column default; assert it's an empty array. + expect(result.runTags).toEqual([]); + } finally { + await engine.quit(); + } + }, + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts new file mode 100644 index 00000000000..0619eeffc2f --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts @@ -0,0 +1,111 @@ +import { containerTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic"; +import { expect } from "vitest"; +import { RunEngine } from "../index.js"; +import { EventBusEventArgs } from "../eventBus.js"; +import { setupAuthenticatedEnvironment } from "./setup.js"; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("RunEngine.createFailedTaskRun", () => { + containerTest("emits runFailed so the alert pipeline wakes up", async ({ prisma, redisOptions }) => { + // The mollifier drainer (and batch-trigger over-limit path) call + // createFailedTaskRun to write a terminal SYSTEM_FAILURE PG row + // for runs that never actually executed. Without an explicit + // runFailed emit, the row lands silently — the + // runEngineHandlers' `runFailed` listener (which enqueues + // PerformTaskRunAlertsService) never fires, so customers' + // configured TASK_RUN alert channels miss the failure entirely. + // + // Regression intent: if the emit is removed or moved out of + // createFailedTaskRun's success path, this test fails. The + // shape assertions pin the fields the alert delivery service + // reads from the event payload (run.id, run.status, error, + // attemptNumber=0 as the never-ran-marker). + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const failedEvents: EventBusEventArgs<"runFailed">[0][] = []; + engine.eventBus.on("runFailed", (event) => { + failedEvents.push(event); + }); + + const friendlyId = generateFriendlyId("run"); + const taskIdentifier = "drainer-terminal-test"; + + const failed = await engine.createFailedTaskRun({ + friendlyId, + environment: { + id: authenticatedEnvironment.id, + type: authenticatedEnvironment.type, + project: { id: authenticatedEnvironment.project.id }, + organization: { id: authenticatedEnvironment.organization.id }, + }, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + error: { + type: "STRING_ERROR", + raw: "Mollifier drainer terminal failure: synthetic engine.trigger panic", + }, + traceId: "0123456789abcdef0123456789abcdef", + spanId: "fedcba9876543210", + }); + + expect(failed.status).toBe("SYSTEM_FAILURE"); + + expect(failedEvents).toHaveLength(1); + const event = failedEvents[0]; + expect(event.run.id).toBe(failed.id); + expect(event.run.status).toBe("SYSTEM_FAILURE"); + expect(event.run.spanId).toBe("fedcba9876543210"); + // attemptNumber=0 is the marker that the run never executed — + // it's a synthesised terminal failure, not an exhausted-retries + // failure. Downstream consumers can use this to distinguish. + expect(event.run.attemptNumber).toBe(0); + expect(event.run.usageDurationMs).toBe(0); + expect(event.run.costInCents).toBe(0); + expect(event.run.error).toEqual({ + type: "STRING_ERROR", + raw: "Mollifier drainer terminal failure: synthetic engine.trigger panic", + }); + expect(event.organization.id).toBe(authenticatedEnvironment.organization.id); + expect(event.project.id).toBe(authenticatedEnvironment.project.id); + expect(event.environment.id).toBe(authenticatedEnvironment.id); + } finally { + await engine.quit(); + } + }); +}); From e3a2e11bc922a76f2d072dac5be559bdad631399 Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Tue, 26 May 2026 15:05:53 +0100 Subject: [PATCH 02/14] fix(webapp,run-engine): replay-layer code-review follow-ups - `isRetryablePgError`: also accept `errorCode === "P1001"` so `PrismaClientInitializationError` (which surfaces P1001 on a different field than `PrismaClientKnownRequestError`) retries. - Drop `envId` from OTel metric labels on `mollifier.realtime_subscriptions.buffered`, `mollifier.stale_entries`, and the `mollifier.stale_entries.current` gauge. `envId` is a banned high-cardinality attribute; the structured warn log alongside each counter tick still carries envId for forensic drill-down. - Stale-sweep test name + comments now match the assertion shape (all three entries stale, not "two stale + one fresh"). - `RunEngine.createCancelledRun` P2002 path now requires the existing row's status to be CANCELED; a non-canceled conflict throws rather than silently reporting success, so the caller can route to `engine.cancelRun()` or skip. - Regression test pins the new conflict guard. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../mollifierDrainerHandler.server.ts | 7 ++ .../mollifier/mollifierStaleSweep.server.ts | 7 +- .../v3/mollifier/mollifierTelemetry.server.ts | 54 +++++++------ apps/webapp/test/mollifierStaleSweep.test.ts | 80 ++++++++++--------- .../run-engine/src/engine/index.ts | 16 +++- .../engine/tests/createCancelledRun.test.ts | 61 ++++++++++++++ 6 files changed, 160 insertions(+), 65 deletions(-) diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts index 7f2608d5b21..741f1afed07 100644 --- a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts @@ -10,8 +10,15 @@ const tracer = trace.getTracer("mollifier-drainer"); export function isRetryablePgError(err: unknown): boolean { if (!(err instanceof Error)) return false; const msg = err.message ?? ""; + // Prisma surfaces P1001 ("Can't reach database server") via two + // different error classes — `PrismaClientKnownRequestError` exposes + // it as `err.code`, `PrismaClientInitializationError` exposes it as + // `err.errorCode`. Check both so reconnection-time errors retry + // regardless of which class fires. const code = (err as { code?: string }).code; + const errorCode = (err as { errorCode?: string }).errorCode; if (code === "P2024") return true; + if (code === "P1001" || errorCode === "P1001") return true; if (msg.includes("Can't reach database server")) return true; if (msg.includes("Connection lost")) return true; if (msg.includes("ECONNRESET")) return true; diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts index 5c31618efec..c2f94495cc9 100644 --- a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts @@ -22,7 +22,10 @@ export type StaleSweepConfig = { export type StaleSweepDeps = { getBuffer?: () => MollifierBuffer | null; - recordStaleEntry?: (envId: string) => void; + // No `envId` arg — `envId` is a high-cardinality metric attribute and + // is intentionally not emitted as a metric label. The structured warn + // log below carries envId for forensic drill-down. + recordStaleEntry?: () => void; reportStaleEntrySnapshot?: (snapshot: Map) => void; logger?: { warn: (message: string, fields: Record) => void }; now?: () => number; @@ -82,7 +85,7 @@ export async function runStaleSweepOnce( entriesScanned += 1; const dwellMs = now - entry.createdAt.getTime(); if (dwellMs > config.staleThresholdMs) { - recordStale(envId); + recordStale(); log.warn("mollifier.stale_entry", { runId: entry.runId, envId, diff --git a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts index ba58ce47f63..f9c7ca72f1f 100644 --- a/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierTelemetry.server.ts @@ -29,16 +29,21 @@ export const realtimeBufferedSubscriptionsCounter = meter.createCounter( }, ); -export function recordRealtimeBufferedSubscription(envId: string): void { - realtimeBufferedSubscriptionsCounter.add(1, { envId }); +// No `envId` attribute — `envId` is a banned high-cardinality metric +// label per the repo's OTel rules. The structured warn log emitted +// alongside the counter tick (in `mollifierStaleSweep.server.ts`) +// carries the envId / orgId / runId for forensic drill-down; the +// metric stays an aggregate. +export function recordRealtimeBufferedSubscription(): void { + realtimeBufferedSubscriptionsCounter.add(1); } // Counts buffer entries that have been waiting in the queue ZSET longer -// than the configured stale threshold (typically half of entryTtlSeconds). -// Useful for historical "stale events over time" views, but not directly -// alertable on its own — a single stuck entry observed by N sweep ticks -// adds N to the counter, so `rate()` over an alerting window reflects -// (entries × ticks), not "entries that are stale right now". +// than the configured stale threshold. Useful for historical "stale +// events over time" views, but not directly alertable on its own — a +// single stuck entry observed by N sweep ticks adds N to the counter, +// so `rate()` over an alerting window reflects (entries × ticks), not +// "entries that are stale right now". export const staleEntriesCounter = meter.createCounter( "mollifier.stale_entries", { @@ -47,16 +52,16 @@ export const staleEntriesCounter = meter.createCounter( }, ); -export function recordStaleEntry(envId: string): void { - staleEntriesCounter.add(1, { envId }); +// No `envId` attribute — see comment above. +export function recordStaleEntry(): void { + staleEntriesCounter.add(1); } -// Alertable signal: the count of stale entries observed by the latest -// sweep, per env. The sweep snapshots the full per-env picture on each -// pass (including zeros for envs that no longer have any stale entries) -// so an env that was paging can clear when the drainer catches up -// instead of staying latched. Recommended alert: -// mollifier_stale_entries_current{envId=...} > 0 for 5m +// Alertable signal: the total count of stale entries observed by the +// latest sweep. The sweep snapshots the full picture on each pass so +// the gauge drops back to 0 when the drainer catches up instead of +// staying latched. Recommended alert: +// mollifier_stale_entries_current > 0 for 5m export const staleEntriesGauge = meter.createObservableGauge( "mollifier.stale_entries.current", { @@ -65,23 +70,22 @@ export const staleEntriesGauge = meter.createObservableGauge( }, ); -const latestStaleSnapshot = new Map(); +let latestStaleTotal = 0; export function reportStaleEntrySnapshot(snapshot: Map): void { - // Replace, don't merge — envs absent from the new snapshot have either - // drained or no longer exist; leaving their last value cached would - // keep alerts latched forever. - latestStaleSnapshot.clear(); - for (const [envId, count] of snapshot) { - latestStaleSnapshot.set(envId, count); + // Sum across envs. Per-env breakdown is intentionally NOT emitted as + // a metric label (high-cardinality); the structured warn log lines + // from the sweep carry per-env detail for ops to drill down. + let total = 0; + for (const count of snapshot.values()) { + total += count; } + latestStaleTotal = total; } meter.addBatchObservableCallback( (result) => { - for (const [envId, count] of latestStaleSnapshot) { - result.observe(staleEntriesGauge, count, { envId }); - } + result.observe(staleEntriesGauge, latestStaleTotal); }, [staleEntriesGauge], ); diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts index 029b90cb761..6b57931407f 100644 --- a/apps/webapp/test/mollifierStaleSweep.test.ts +++ b/apps/webapp/test/mollifierStaleSweep.test.ts @@ -14,16 +14,21 @@ const SNAPSHOT = { }; function spyDeps() { - const recordedStaleEnvIds: string[] = []; + // Counter ticks — metric carries no `envId` label (high-cardinality) + // so the spy is a simple call count. Per-env detail lives on the + // structured warn log and the snapshot map. + let staleEntryCount = 0; const snapshots: Array> = []; const warnings: Array<{ message: string; fields: Record }> = []; return { - recordedStaleEnvIds, + get staleEntryCount() { + return staleEntryCount; + }, snapshots, warnings, deps: { - recordStaleEntry: (envId: string) => { - recordedStaleEnvIds.push(envId); + recordStaleEntry: () => { + staleEntryCount += 1; }, reportStaleEntrySnapshot: (snapshot: Map) => { // Clone so post-sweep assertions see what was reported *at that @@ -45,10 +50,10 @@ describe("runStaleSweepOnce — unit", () => { // Mirrors the prod gate: if TRIGGER_MOLLIFIER_ENABLED=0 the buffer // singleton is null and the sweep is a no-op. We don't want it to // emit a metric (or throw) just because mollifier is disabled. - const { deps, recordedStaleEnvIds, warnings, snapshots } = spyDeps(); + const spies = spyDeps(); const result = await runStaleSweepOnce( { staleThresholdMs: 1000 }, - { ...deps, getBuffer: () => null }, + { ...spies.deps, getBuffer: () => null }, ); expect(result).toEqual({ orgsScanned: 0, @@ -56,8 +61,9 @@ describe("runStaleSweepOnce — unit", () => { entriesScanned: 0, staleCount: 0, }); - expect(recordedStaleEnvIds).toEqual([]); - expect(warnings).toEqual([]); + expect(spies.staleEntryCount).toBe(0); + expect(spies.warnings).toEqual([]); + const snapshots = spies.snapshots; // An empty snapshot is still reported so any previously-paging env // (from a prior sweep before mollifier was disabled) clears. expect(snapshots).toHaveLength(1); @@ -67,14 +73,15 @@ describe("runStaleSweepOnce — unit", () => { describe("runStaleSweepOnce — testcontainers", () => { redisTest( - "flags entries whose dwell exceeds the stale threshold and skips fresh ones", + "flags every entry whose dwell exceeds the stale threshold", async ({ redisOptions }) => { const buffer = new MollifierBuffer({ redisOptions }); try { - // Two stale entries (one in each env) + one fresh entry. Sweep - // should flag the two stale, leave the fresh one alone, record - // the counter once per stale entry, and emit a warning per - // stale entry with the dwell + threshold. + // Three entries across two envs in the same org. The sweep below + // runs against a `now` advanced by 5 minutes, so all three have + // dwell ~5min and ALL THREE are stale against a 1-minute + // threshold — there is no "fresh" entry in this scenario. The + // assertions below pin the all-three-stale shape. await buffer.accept({ runId: "run_stale_a", envId: "env_a", @@ -88,7 +95,7 @@ describe("runStaleSweepOnce — testcontainers", () => { payload: JSON.stringify(SNAPSHOT), }); await buffer.accept({ - runId: "run_fresh", + runId: "run_stale_c", envId: "env_a", orgId: "org_1", payload: JSON.stringify(SNAPSHOT), @@ -98,11 +105,11 @@ describe("runStaleSweepOnce — testcontainers", () => { // the threshold without actually waiting in real time. const futureNow = Date.now() + 5 * 60 * 1000; - const { deps, recordedStaleEnvIds, warnings, snapshots } = spyDeps(); + const spies = spyDeps(); const result = await runStaleSweepOnce( { staleThresholdMs: 60 * 1000 }, { - ...deps, + ...spies.deps, getBuffer: () => buffer, now: () => futureNow, }, @@ -111,22 +118,21 @@ describe("runStaleSweepOnce — testcontainers", () => { expect(result.envsScanned).toBe(2); expect(result.entriesScanned).toBe(3); expect(result.staleCount).toBe(3); - // All three entries have dwell ~5min, all exceed the 1-min - // threshold; each emits one counter tick + one warning. - expect(recordedStaleEnvIds.sort()).toEqual( - ["env_a", "env_a", "env_b"].sort(), - ); - expect(warnings).toHaveLength(3); - for (const w of warnings) { + // All three entries exceed the threshold; each emits one + // counter tick + one warning. + expect(spies.staleEntryCount).toBe(3); + expect(spies.warnings).toHaveLength(3); + for (const w of spies.warnings) { expect(w.message).toBe("mollifier.stale_entry"); expect(w.fields.staleThresholdMs).toBe(60 * 1000); expect(w.fields.dwellMs).toBeGreaterThan(60 * 1000); } // Snapshot drives the alertable gauge — env_a has 2 stale - // entries, env_b has 1. Both must appear so a future alert can - // identify which env is paging. - expect(snapshots).toHaveLength(1); - expect(Object.fromEntries(snapshots[0])).toEqual({ + // entries, env_b has 1. Per-env detail is still passed to + // `reportStaleEntrySnapshot` for forensic value even though the + // gauge itself aggregates the total. + expect(spies.snapshots).toHaveLength(1); + expect(Object.fromEntries(spies.snapshots[0])).toEqual({ env_a: 2, env_b: 1, }); @@ -151,13 +157,13 @@ describe("runStaleSweepOnce — testcontainers", () => { orgId: "org_1", payload: JSON.stringify(SNAPSHOT), }); - const { deps, snapshots } = spyDeps(); + const spies = spyDeps(); await runStaleSweepOnce( { staleThresholdMs: 60 * 1000 }, - { ...deps, getBuffer: () => buffer }, + { ...spies.deps, getBuffer: () => buffer }, ); - expect(snapshots).toHaveLength(1); - expect(Object.fromEntries(snapshots[0])).toEqual({ env_a: 0 }); + expect(spies.snapshots).toHaveLength(1); + expect(Object.fromEntries(spies.snapshots[0])).toEqual({ env_a: 0 }); } finally { await buffer.close(); } @@ -179,14 +185,14 @@ describe("runStaleSweepOnce — testcontainers", () => { orgId: "org_1", payload: JSON.stringify(SNAPSHOT), }); - const { deps, recordedStaleEnvIds, warnings } = spyDeps(); + const spies = spyDeps(); const result = await runStaleSweepOnce( { staleThresholdMs: 60 * 1000 }, - { ...deps, getBuffer: () => buffer }, + { ...spies.deps, getBuffer: () => buffer }, ); expect(result.staleCount).toBe(0); - expect(recordedStaleEnvIds).toEqual([]); - expect(warnings).toEqual([]); + expect(spies.staleEntryCount).toBe(0); + expect(spies.warnings).toEqual([]); } finally { await buffer.close(); } @@ -215,10 +221,10 @@ describe("runStaleSweepOnce — testcontainers", () => { payload: JSON.stringify(SNAPSHOT), }); const futureNow = Date.now() + 5 * 60 * 1000; - const { deps } = spyDeps(); + const spies = spyDeps(); const result = await runStaleSweepOnce( { staleThresholdMs: 60 * 1000 }, - { ...deps, getBuffer: () => buffer, now: () => futureNow }, + { ...spies.deps, getBuffer: () => buffer, now: () => futureNow }, ); expect(result.orgsScanned).toBe(2); expect(result.envsScanned).toBe(2); diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index e461fddf6c5..244d2b014ed 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -599,7 +599,21 @@ export class RunEngine { { friendlyId: snapshot.friendlyId }, ); const existing = await prisma.taskRun.findFirst({ where: { id } }); - if (existing) return existing; + if (existing) { + // Only treat the conflict as idempotent when the existing + // row is ALREADY canceled. If a non-canceled row landed + // first (e.g. the drainer's normal `engine.trigger` replay + // path raced ahead of the cancel) we surface a conflict + // rather than silently reporting "cancelled" — the run is + // genuinely live and the caller must decide between + // engine.cancelRun() and skipping. + if (existing.status === "CANCELED") { + return existing; + } + throw new Error( + `createCancelledRun conflict: existing run ${snapshot.friendlyId} has status ${existing.status}`, + ); + } } throw err; } diff --git a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts index 0a541b5349e..71223ce0773 100644 --- a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts +++ b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts @@ -230,4 +230,65 @@ describe("RunEngine.createCancelledRun", () => { } }, ); + + // Regression: the P2002-on-id idempotency path used to return ANY + // existing row, which would silently report success even if a live + // (non-CANCELED) row landed first. The guard now requires the + // existing row's status to be CANCELED; anything else surfaces a + // conflict so the caller can route to engine.cancelRun() or skip. + containerTest( + "P2002 conflict with non-CANCELED existing row throws (does not silently succeed)", + async ({ prisma, redisOptions }) => { + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + try { + const friendlyId = freshRunId(); + const id = RunId.fromFriendlyId(friendlyId); + + // Plant a live (non-CANCELED) row with the same id so the + // cancelled-run INSERT hits P2002 and the guard finds a row + // that ISN'T CANCELED. + await prisma.taskRun.create({ + data: { + id, + friendlyId, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + status: "PENDING", + runtimeEnvironmentId: env.id, + projectId: env.project.id, + organizationId: env.organizationId, + queue: "task/test-task", + traceId: "0000000000000000aaaa000000000000", + spanId: "bbbb000000000000", + engine: "V2", + }, + }); + + await expect( + engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000aaaa000000000000", + spanId: "bbbb000000000000", + queue: "task/test-task", + isTest: false, + tags: [], + }, + cancelledAt: new Date(), + cancelReason: "Should not silently overwrite a live row", + }), + ).rejects.toThrow(/createCancelledRun conflict.*PENDING/); + } finally { + await engine.quit(); + } + }, + ); }); From 0c776a95f99e9a64af4a60a878fd2463b7170e8e Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Tue, 26 May 2026 15:44:06 +0100 Subject: [PATCH 03/14] test(webapp): isolate mollifierDrainerWorker test from runEngine singleton MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Importing the production drainer wiring transitively loads \`~/v3/runEngine.server\`, whose top-level \`singleton(...)\` eagerly constructs a RunEngine. The constructor spins up Prisma + Redis workers that try to connect to localhost — in CI (no PG, no Redis) that produces an unhandled \`PrismaClientInitializationError\` which fails the run even though every assertion passes. Mock the runEngine and prisma modules so the unit test exercises only the bootstrap's error classification, not a live engine. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/webapp/test/mollifierDrainerWorker.test.ts | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/apps/webapp/test/mollifierDrainerWorker.test.ts b/apps/webapp/test/mollifierDrainerWorker.test.ts index e5f38229d8f..0d4e931fd83 100644 --- a/apps/webapp/test/mollifierDrainerWorker.test.ts +++ b/apps/webapp/test/mollifierDrainerWorker.test.ts @@ -1,4 +1,17 @@ -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; + +// Importing `~/v3/mollifier/mollifierDrainer.server` (below) transitively +// loads `~/v3/runEngine.server`, whose top-level `singleton(...)` call +// eagerly constructs a RunEngine. That spins up Prisma + Redis workers +// that try to connect to localhost — which in CI (no PG, no Redis) +// produces an unhandled `PrismaClientInitializationError` that fails +// the test run even though the assertions all pass. Mocking the +// runEngine module short-circuits the singleton so no worker starts. +vi.mock("~/v3/runEngine.server", () => ({ engine: {} })); +// Same problem: prisma.server.ts's top-level singleton tries to open a +// PG client. The test never makes a query; an empty stub is enough. +vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); + import { MollifierConfigurationError } from "~/v3/mollifier/mollifierDrainer.server"; import { initMollifierDrainerWorker } from "~/v3/mollifierDrainerWorker.server"; From 79f4090e97df6a58e268856af78cddca58b92cd4 Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Tue, 26 May 2026 16:12:22 +0100 Subject: [PATCH 04/14] test(webapp): bump mollifierStaleSweep redisTest timeouts to 20s Container startup + the sweep loop can exceed Vitest's 5s default on CI runners (passes in ~1.7-2s locally). Matches the explicit \`{ timeout: 20_000 }\` other mollifier redisTests carry across the project. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/webapp/test/mollifierStaleSweep.test.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts index 6b57931407f..cc452aff9f6 100644 --- a/apps/webapp/test/mollifierStaleSweep.test.ts +++ b/apps/webapp/test/mollifierStaleSweep.test.ts @@ -74,6 +74,7 @@ describe("runStaleSweepOnce — unit", () => { describe("runStaleSweepOnce — testcontainers", () => { redisTest( "flags every entry whose dwell exceeds the stale threshold", + { timeout: 20_000 }, async ({ redisOptions }) => { const buffer = new MollifierBuffer({ redisOptions }); try { @@ -144,6 +145,7 @@ describe("runStaleSweepOnce — testcontainers", () => { redisTest( "snapshot reports zero for envs that have entries but none stale (clears latched alerts)", + { timeout: 20_000 }, async ({ redisOptions }) => { // Critical for alert behaviour: a previous sweep reported env_a // stale, alert fired, drainer caught up. The next sweep must @@ -172,6 +174,7 @@ describe("runStaleSweepOnce — testcontainers", () => { redisTest( "leaves fresh entries alone (dwell below threshold)", + { timeout: 20_000 }, async ({ redisOptions }) => { // Regression guard for the inequality direction. A bug that flipped // `dwellMs > threshold` to `dwellMs >= threshold` would flag every @@ -201,6 +204,7 @@ describe("runStaleSweepOnce — testcontainers", () => { redisTest( "scans across multiple orgs", + { timeout: 20_000 }, async ({ redisOptions }) => { // Phase-3 design has org-level fairness in the drainer; the sweep // must walk every org/env, not just the first one it finds. If a From d7ab9fc78e7ea074da27a140a15335aeab4981db Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Tue, 26 May 2026 17:32:17 +0100 Subject: [PATCH 05/14] fix(webapp,run-engine): replay-layer Devin follow-ups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs flagged by Devin on PR #3754: 1. entry.server.tsx reverted to \`void sessionsReplicationInstance;\`, which esbuild tree-shakes under \`"sideEffects": false\`. Restored the globalThis assignment + warning comment from #3738 (incident TRI-9864). Without this the sessions→ClickHouse logical replication slot stops being consumed at boot. 2. createFailedTaskRun unconditionally emitted \`runFailed\`, which the \`completeFailedRunEvent\` listener uses to write a span completion into ClickHouse. But TriggerFailedTaskService.call() already wraps createFailedTaskRun inside \`repository.traceEvent({ incomplete: false, isError: true })\` which writes its own completion row for the same (traceId, spanId). Two completions racing on the same span row is a real observability bug. Added an \`emitRunFailedEvent: boolean = true\` opt-out. The TriggerFailedTaskService.call() path now passes \`false\` and enqueues \`PerformTaskRunAlertsService\` directly after the trace event closes so the alerts side of \`runFailed\` is preserved. \`callWithoutTraceEvents\` and the mollifier drainer's terminal- failure path keep the default emit (they have no outer trace event managing the span). Regression test pins the opt-out: \`emitRunFailedEvent: false\` writes the PG row but does NOT fire the bus event. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/webapp/app/entry.server.tsx | 11 +++- .../services/triggerFailedTask.server.ts | 27 ++++++++ .../run-engine/src/engine/index.ts | 27 ++++++++ .../engine/tests/createFailedTaskRun.test.ts | 65 +++++++++++++++++++ 4 files changed, 129 insertions(+), 1 deletion(-) diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index ca53d03eb67..9996eb7b30a 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -31,8 +31,17 @@ import { // on webapp startup. The singleton's initializer wires start (gated on // `clickhouseFactory.isReady()`) and SIGTERM/SIGINT shutdown — mirrors // runsReplicationInstance. +// +// IMPORTANT: do NOT replace this with `void sessionsReplicationInstance;`. +// `apps/webapp/package.json` declares `"sideEffects": false`, so esbuild +// treats `void ;` as a pure expression statement and tree-shakes +// the entire import — the singleton's initializer never fires and the +// sessions→ClickHouse logical replication slot stops being consumed. Assigning +// to globalThis is an unambiguous side effect the bundler must preserve. See +// TRI-9864 for the incident write-up. import { sessionsReplicationInstance } from "./services/sessionsReplicationInstance.server"; -void sessionsReplicationInstance; +(globalThis as Record).__sessionsReplicationInstance = + sessionsReplicationInstance; const ABORT_DELAY = 30000; diff --git a/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts b/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts index 5f985b684c1..a26f6c3e74d 100644 --- a/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts +++ b/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts @@ -6,6 +6,7 @@ import type { PrismaClientOrTransaction } from "@trigger.dev/database"; import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { getEventRepository } from "~/v3/eventRepository/index.server"; +import { PerformTaskRunAlertsService } from "~/v3/services/alerts/performTaskRunAlerts.server"; import { DefaultQueueManager } from "../concerns/queues.server"; import type { TriggerTaskRequest } from "../types"; @@ -176,6 +177,14 @@ export class TriggerFailedTaskService { event.setAttribute("runId", failedRunFriendlyId); event.failWithError(taskRunError); + // `emitRunFailedEvent: false` because this call site owns the + // trace-event lifecycle via the outer `traceEvent({ + // incomplete: false, isError: true })`. Letting the engine + // emit `runFailed` here would race the + // `completeFailedRunEvent` listener against the outer trace + // event's own completion write for the same (traceId, spanId). + // We re-trigger the alerts side directly after the trace + // event closes, below. return await this.engine.createFailedTaskRun({ friendlyId: failedRunFriendlyId, environment: { @@ -200,12 +209,30 @@ export class TriggerFailedTaskService { spanId: event.spanId, traceContext: traceContext as Record, taskEventStore: store, + emitRunFailedEvent: false, ...(queueName !== undefined && { queue: queueName }), ...(lockedQueueId !== undefined && { lockedQueueId }), }); } ); + // Alerts side of `runFailed` — the engine emit was suppressed + // above so the trace-event completion isn't double-written; we + // still need the alert pipeline to fire so customers' ERROR + // channels see the failure. Best-effort: a failed enqueue logs + // but doesn't block returning the friendlyId, mirroring the + // engine handler's behaviour at runEngineHandlers.server.ts:81. + try { + await PerformTaskRunAlertsService.enqueue(failedRun.id); + } catch (alertsError) { + logger.warn("TriggerFailedTaskService: alert enqueue failed", { + taskId: request.taskId, + friendlyId: failedRun.friendlyId, + error: + alertsError instanceof Error ? alertsError.message : String(alertsError), + }); + } + return failedRun.friendlyId; } catch (createError) { const createErrorMsg = diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index 244d2b014ed..ee8cf06cf3c 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -1051,6 +1051,7 @@ export class RunEngine { taskEventStore, queue: queueOverride, lockedQueueId: lockedQueueIdOverride, + emitRunFailedEvent = true, }: { friendlyId: string; environment: { @@ -1078,6 +1079,19 @@ export class RunEngine { queue?: string; /** Resolved TaskQueue.id when the task is locked to a specific queue. */ lockedQueueId?: string; + /** + * Whether to emit the `runFailed` engine-bus event. Defaults to true. + * + * Set to `false` when the caller is ALREADY managing the trace-event + * lifecycle for this run via `repository.traceEvent({ incomplete: false, + * isError: true, ... })`. In that path the outer trace event handles + * span completion itself; emitting `runFailed` from here causes the + * `runFailed` → `completeFailedRunEvent` handler to write a second + * completion row for the same (traceId, spanId), racing with the + * outer trace event's own write. The alert side of `runFailed` is + * preserved by emitting from the caller after `traceEvent` returns. + */ + emitRunFailedEvent?: boolean; }): Promise { return startSpan( this.tracer, @@ -1160,6 +1174,19 @@ export class RunEngine { // exceed-limit failures) land in PG silently — visible in the // dashboard list but never reaching customers' configured // ERROR alert channels. + // + // Gated by `emitRunFailedEvent` so call sites that already wrap + // this inside `repository.traceEvent({ incomplete: false, + // isError: true })` can opt out — the outer trace event writes + // the completion row itself, and a second write via + // `completeFailedRunEvent` would race against it. Callers that + // disable the emit are responsible for triggering the alerts + // side themselves (e.g. by calling + // `PerformTaskRunAlertsService.enqueue` directly after the + // trace event closes). + if (!emitRunFailedEvent) { + return taskRun; + } this.eventBus.emit("runFailed", { time: taskRun.completedAt ?? new Date(), run: { diff --git a/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts index 0619eeffc2f..84d33baa87d 100644 --- a/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts +++ b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts @@ -108,4 +108,69 @@ describe("RunEngine.createFailedTaskRun", () => { await engine.quit(); } }); + + // The TriggerFailedTaskService.call() path wraps createFailedTaskRun + // inside `repository.traceEvent({ incomplete: false, isError: true })` + // which already writes the completion row for the (traceId, spanId). + // Emitting `runFailed` from here would cause the + // `completeFailedRunEvent` handler to race a second write against + // the same span — the `emitRunFailedEvent: false` opt-out is what + // suppresses the emit. The PG row + alert side stay correct because + // the caller enqueues `PerformTaskRunAlertsService.enqueue(run.id)` + // directly after the trace event closes. + containerTest( + "emitRunFailedEvent: false suppresses the bus emit but still creates the PG row", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, + queue: { redis: redisOptions, masterQueueConsumersDisabled: true, processWorkerQueueDebounceMs: 50 }, + runLock: { redis: redisOptions }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { name: "small-1x" as const, cpu: 0.5, memory: 0.5, centsPerMs: 0.0001 }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const failedEvents: EventBusEventArgs<"runFailed">[0][] = []; + engine.eventBus.on("runFailed", (event) => { + failedEvents.push(event); + }); + + const friendlyId = generateFriendlyId("run"); + const failed = await engine.createFailedTaskRun({ + friendlyId, + environment: { + id: authenticatedEnvironment.id, + type: authenticatedEnvironment.type, + project: { id: authenticatedEnvironment.project.id }, + organization: { id: authenticatedEnvironment.organization.id }, + }, + taskIdentifier: "outer-trace-event-test", + payload: "{}", + payloadType: "application/json", + error: { type: "STRING_ERROR", raw: "outer trace event manages span" }, + traceId: "0123456789abcdef0123456789abcdef", + spanId: "fedcba9876543210", + emitRunFailedEvent: false, + }); + + // PG row landed (caller still gets a usable TaskRun). + expect(failed.status).toBe("SYSTEM_FAILURE"); + expect(failed.friendlyId).toBe(friendlyId); + + // Bus emit was suppressed. + expect(failedEvents).toHaveLength(0); + } finally { + await engine.quit(); + } + }, + ); }); From 5ba96ca56c8b3edb2c2fa2cb386288ca1e31404e Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Wed, 27 May 2026 12:59:04 +0100 Subject: [PATCH 06/14] docs(server-changes): scope mollifier drainer-replay note to this PR Rename the catch-all mollifier.md and trim it to the drainer replay handler, stale sweep, telemetry gauge, and run-engine cancelled/failed APIs; later read/mutation/dashboard work is documented in its own PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- .server-changes/mollifier-drainer-replay.md | 6 ++++++ .server-changes/mollifier.md | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 .server-changes/mollifier-drainer-replay.md delete mode 100644 .server-changes/mollifier.md diff --git a/.server-changes/mollifier-drainer-replay.md b/.server-changes/mollifier-drainer-replay.md new file mode 100644 index 00000000000..a8ff2d4bb00 --- /dev/null +++ b/.server-changes/mollifier-drainer-replay.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Mollifier drainer replay: replay buffered entries into `engine.trigger`, stale-entry sweep, a drainer-health gauge, and run-engine cancelled/failed run APIs. diff --git a/.server-changes/mollifier.md b/.server-changes/mollifier.md deleted file mode 100644 index 399ad5c6507..00000000000 --- a/.server-changes/mollifier.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -area: webapp -type: feature ---- - -Mollifier — Redis-backed burst buffer in front of `engine.trigger` with a fair drainer, full read/write parity for buffered runs across the API + dashboard + realtime stream, alertable `mollifier.stale_entries.current` gauge for drainer health, and `runFailed` alerts on drainer-terminal `SYSTEM_FAILURE` rows. From 47184953a15a27edb32bd17bcb44465c9d56934c Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Wed, 27 May 2026 16:14:53 +0100 Subject: [PATCH 07/14] fix(webapp): honour cancel-wins conflict + requeue on transient PG outage in mollifier drainer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mollifier drainer's cancel bifurcation called engine.createCancelledRun without handling its documented conflict contract: when the normal trigger replay path races ahead and materialises a live (non-CANCELED) row, the engine throws a conflict so the caller can "decide between engine.cancelRun() and skipping". The handler did neither — the conflict propagated, isRetryablePgError returned false, and the drainer buffer.fail()'d the entry, silently losing the cancellation while the run kept executing. Now route conflicts to engine.cancelRun() so the cancel actually wins. Separately, when engine.trigger fails non-retryably and the SYSTEM_FAILURE fallback write then fails because PG is transiently unreachable, rethrowing the original non-retryable error made the drainer buffer.fail() the entry — losing the run with no PG row ever landing, and dropping the write error entirely. Rethrow the retryable write error instead so the drainer requeues; the failure row lands once PG recovers. Non-retryable write failures still rethrow the original error as before. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../mollifierDrainerHandler.server.ts | 64 +++++++++++++--- .../test/mollifierDrainerHandler.test.ts | 76 +++++++++++++++++++ 2 files changed, 129 insertions(+), 11 deletions(-) diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts index 741f1afed07..d08f0e30e80 100644 --- a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts @@ -1,6 +1,7 @@ import { context, trace, TraceFlags } from "@opentelemetry/api"; import type { RunEngine } from "@internal/run-engine"; import type { PrismaClientOrTransaction } from "@trigger.dev/database"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; import type { MollifierDrainerHandler } from "@trigger.dev/redis-worker"; import { startSpan } from "~/v3/tracing.server"; import type { MollifierSnapshot } from "./mollifierSnapshot.server"; @@ -72,14 +73,41 @@ export function createDrainerHandler(deps: { span.setAttribute("mollifier.run_friendly_id", input.runId); span.setAttribute("mollifier.cancel_bifurcation", true); span.setAttribute("taskRunId", input.runId); - await deps.engine.createCancelledRun( - { - snapshot: input.payload as any, - cancelledAt: new Date(cancelledAtStr), - cancelReason, - }, - deps.prisma, - ); + try { + await deps.engine.createCancelledRun( + { + snapshot: input.payload as any, + cancelledAt: new Date(cancelledAtStr), + cancelReason, + }, + deps.prisma, + ); + } catch (err) { + // createCancelledRun throws a conflict when the normal trigger + // replay path won the race and already materialised a live + // (non-CANCELED) row for this friendlyId. Its contract leaves + // the resolution to us: honour the cancel by actually + // cancelling the now-live run. Letting the conflict propagate + // would instead reach the drainer's terminal-failure path + // (isRetryablePgError() is false for it), buffer.fail() the + // entry, and silently lose the cancellation while the run + // keeps executing. + const isConflict = + err instanceof Error && err.message.startsWith("createCancelledRun conflict"); + if (!isConflict) { + throw err; + } + span.setAttribute("mollifier.cancel_conflict", true); + const friendlyId = + typeof input.payload.friendlyId === "string" + ? input.payload.friendlyId + : input.runId; + await deps.engine.cancelRun({ + runId: RunId.fromFriendlyId(friendlyId), + completedAt: new Date(cancelledAtStr), + reason: cancelReason, + }); + } }); }); return; @@ -158,9 +186,23 @@ export function createDrainerHandler(deps: { typeof snapshot.lockedQueueId === "string" ? snapshot.lockedQueueId : undefined, }); } catch (writeErr) { - // Class A — PG itself is failing. Rethrow the original - // error so the drainer falls back to buffer.fail. Include - // the write error in the log line at the drainer layer. + // The terminal SYSTEM_FAILURE write itself failed. If it + // failed because PG is transiently unreachable, rethrow the + // *write* error so the drainer requeues — buffer.fail()ing on + // the original non-retryable error would lose the run with no + // PG row ever landing. Once PG recovers the requeued entry + // writes its failure row and the customer sees it. + if (isRetryablePgError(writeErr)) { + span.setAttribute("mollifier.terminal_write_retryable", true); + throw writeErr; + } + // PG reachable but the write was rejected for another reason + // (genuinely bad snapshot). Rethrow the original trigger error + // so the drainer falls back to buffer.fail. + span.setAttribute( + "mollifier.terminal_write_error", + writeErr instanceof Error ? writeErr.message : String(writeErr) + ); throw err; } } diff --git a/apps/webapp/test/mollifierDrainerHandler.test.ts b/apps/webapp/test/mollifierDrainerHandler.test.ts index 6f66cf2ab79..e1272203560 100644 --- a/apps/webapp/test/mollifierDrainerHandler.test.ts +++ b/apps/webapp/test/mollifierDrainerHandler.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it, vi } from "vitest"; import { trace } from "@opentelemetry/api"; +import { RunId } from "@trigger.dev/core/v3/isomorphic"; vi.mock("~/db.server", () => ({ prisma: {}, @@ -180,6 +181,81 @@ describe("createDrainerHandler", () => { expect(createFailedTaskRun).toHaveBeenCalledOnce(); }); + it("honours the cancel when a buffered cancel races a materialised non-CANCELED row", async () => { + // Cancel-wins-over-trigger (Q4 bifurcation). If the normal trigger + // replay path materialised a live PENDING row before the cancel + // bifurcation drained, engine.createCancelledRun throws a conflict — + // its documented contract is that "the caller must decide between + // engine.cancelRun() and skipping". The drainer handler must honour + // the cancel intent by actually cancelling the now-live run; otherwise + // the conflict propagates, isRetryablePgError() returns false, and the + // drainer buffer.fail()s the entry — silently losing the cancellation + // while the run keeps executing. + const friendlyId = RunId.generate().friendlyId; + const createCancelledRun = vi.fn(async () => { + throw new Error( + `createCancelledRun conflict: existing run ${friendlyId} has status PENDING` + ); + }); + const cancelRun = vi.fn(async () => ({ alreadyFinished: false })); + const handler = createDrainerHandler({ + engine: { createCancelledRun, cancelRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: friendlyId, + envId: "env_a", + orgId: "org_1", + payload: { + friendlyId, + taskIdentifier: "t", + environment: envFixture, + cancelledAt: new Date().toISOString(), + cancelReason: "Canceled by user", + }, + attempts: 0, + createdAt: new Date(), + } as any) + ).resolves.toBeUndefined(); + + // The live run is actually cancelled, by its internal id. + expect(cancelRun).toHaveBeenCalledOnce(); + expect(cancelRun.mock.calls[0][0].runId).toBe(RunId.fromFriendlyId(friendlyId)); + }); + + it("requeues on a transient PG outage during the SYSTEM_FAILURE fallback write", async () => { + // engine.trigger failed non-retryably, so we try to write a terminal + // SYSTEM_FAILURE row. If THAT write fails because PG is transiently + // unreachable, rethrowing the *original* non-retryable error makes the + // drainer buffer.fail() the entry — losing the run with no PG row ever + // landing. Rethrow the retryable write error instead so the drainer + // requeues; once PG recovers the failure row lands and the customer + // sees it. + const trigger = vi.fn(async () => { + throw new Error("validation failed: payload too large"); + }); + const createFailedTaskRun = vi.fn(async () => { + throw new Error("Can't reach database server"); + }); + const handler = createDrainerHandler({ + engine: { trigger, createFailedTaskRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: "run_x", + envId: "env_a", + orgId: "org_1", + payload: { taskIdentifier: "t", environment: envFixture }, + attempts: 0, + createdAt: new Date(), + } as any) + ).rejects.toThrow("Can't reach database server"); + }); + it("rethrows the original error when the snapshot lacks an environment block", async () => { const triggerErr = new Error("engine rejected the snapshot"); const trigger = vi.fn(async () => { From 855398dc39fcde1758756ac5c8de24e1b1a9460e Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Wed, 27 May 2026 17:41:09 +0100 Subject: [PATCH 08/14] docs(mollifier): strip internal planning labels from comments Remove plan-tracking shorthand (Q# bifurcation, Phase C1/Q4) from replay-layer mollifier comments and test names; reword to plain English. Comment/test-name only; no behaviour change. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts | 2 +- apps/webapp/test/mollifierDrainerHandler.test.ts | 2 +- .../run-engine/src/engine/tests/createCancelledRun.test.ts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts index d08f0e30e80..8ee89b895a2 100644 --- a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts @@ -53,7 +53,7 @@ export function createDrainerHandler(deps: { }) : context.active(); - // Cancel-wins-over-trigger (Q4 bifurcation). If a cancel API call + // Cancel-wins-over-trigger. If a cancel API call // landed on this entry while it was QUEUED, the snapshot carries // `cancelledAt` + `cancelReason`. Skip the normal materialise path // and write a CANCELED PG row directly. The existing runCancelled diff --git a/apps/webapp/test/mollifierDrainerHandler.test.ts b/apps/webapp/test/mollifierDrainerHandler.test.ts index e1272203560..1a20fdbb69f 100644 --- a/apps/webapp/test/mollifierDrainerHandler.test.ts +++ b/apps/webapp/test/mollifierDrainerHandler.test.ts @@ -182,7 +182,7 @@ describe("createDrainerHandler", () => { }); it("honours the cancel when a buffered cancel races a materialised non-CANCELED row", async () => { - // Cancel-wins-over-trigger (Q4 bifurcation). If the normal trigger + // Cancel-wins-over-trigger. If the normal trigger // replay path materialised a live PENDING row before the cancel // bifurcation drained, engine.createCancelledRun throws a conflict — // its documented contract is that "the caller must decide between diff --git a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts index 71223ce0773..1f47d0b2a1b 100644 --- a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts +++ b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts @@ -44,7 +44,7 @@ function baseEngineOptions(redisOptions: Parameters[0]["queue" }; } -// Phase C1 / Q4 design — engine.createCancelledRun writes a CANCELED +// engine.createCancelledRun writes a CANCELED // TaskRun row directly from a buffer snapshot. Verifies the bypass- // queue / bypass-waitpoint / emit-runCancelled contract. describe("RunEngine.createCancelledRun", () => { From 5a1cbf605717274f345bc3fbd6ec37dfb57181da Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 28 May 2026 10:04:38 +0100 Subject: [PATCH 09/14] docs(mollifier): fix stale ZSET comments and strip residual plan-doc labels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mollifier queue moved from ZSET to LIST in an earlier refactor, but two comments still described it as a ZSET (`mollifierStaleSweep.server.ts:9` and `env.server.ts:1104` — both narrating the periodic stale sweep). Update to LIST. Also clean four residual internal plan-doc labels left over from prior cleanup passes: - `createCancelledRun` docstring (`engine/index.ts`) referenced "Q4 mollifier- cancel design" and "F4 bypass" — both dead nomenclature now that the gate's C1/C3/F4 labels have been rewritten. Restate the waitpoint-skip rationale in plain English: the mollifier gate refuses to buffer triggerAndWait children, so a cancelled buffered run never has a waiting parent to unblock. - `createCancelledRun.test.ts` empty-tags regression dropped "Found while running the Phase F challenge suite." — the comment describes the bug itself, which is self-contained. - `mollifierStaleSweep.test.ts` "scans across multiple orgs" rephrased away from "Phase-3 design has org-level fairness"; the prose now states the invariant directly. Comment/docstring-only; no behaviour change. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/webapp/app/env.server.ts | 2 +- .../app/v3/mollifier/mollifierStaleSweep.server.ts | 2 +- apps/webapp/test/mollifierStaleSweep.test.ts | 9 +++++---- internal-packages/run-engine/src/engine/index.ts | 13 +++++++------ .../src/engine/tests/createCancelledRun.test.ts | 2 +- 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 3328d5b1fb7..a7e9cdef256 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1101,7 +1101,7 @@ const EnvironmentSchema = z TRIGGER_MOLLIFIER_DRAIN_MAX_ATTEMPTS: z.coerce.number().int().positive().default(3), TRIGGER_MOLLIFIER_DRAIN_SHUTDOWN_TIMEOUT_MS: z.coerce.number().int().positive().default(30_000), TRIGGER_MOLLIFIER_DRAIN_MAX_ORGS_PER_TICK: z.coerce.number().int().positive().default(500), - // Periodic sweep that scans buffer queue ZSETs for entries whose + // Periodic sweep that scans buffer queue LISTs for entries whose // dwell exceeds the stale threshold. Independent of the drainer — // its job is exactly to make a stuck/offline drainer visible to // ops. Defaults: enabled when the mollifier is enabled, run every diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts index c2f94495cc9..4ce219f3305 100644 --- a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts @@ -6,7 +6,7 @@ import { reportStaleEntrySnapshot as defaultReportStaleEntrySnapshot, } from "./mollifierTelemetry.server"; -// One pass of the sweep scans every env's queue ZSET. The per-env page +// One pass of the sweep scans every env's queue LIST. The per-env page // is bounded so a single pathological env can't make the sweep run // unboundedly long. const DEFAULT_MAX_ENTRIES_PER_ENV = 1000; diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts index cc452aff9f6..460fcb37e5c 100644 --- a/apps/webapp/test/mollifierStaleSweep.test.ts +++ b/apps/webapp/test/mollifierStaleSweep.test.ts @@ -206,10 +206,11 @@ describe("runStaleSweepOnce — testcontainers", () => { "scans across multiple orgs", { timeout: 20_000 }, async ({ redisOptions }) => { - // Phase-3 design has org-level fairness in the drainer; the sweep - // must walk every org/env, not just the first one it finds. If a - // future refactor collapsed listOrgs/listEnvsForOrg into a single - // env-flat list this test catches a regression there. + // The drainer pops with org-level fairness, so the sweep must + // walk every org/env to surface stale entries across all of them + // — not just stop at the first env it finds. If a future refactor + // collapsed listOrgs/listEnvsForOrg into a single env-flat list, + // this test catches a regression there. const buffer = new MollifierBuffer({ redisOptions }); try { await buffer.accept({ diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index ee8cf06cf3c..cab2925872f 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -453,13 +453,14 @@ export class RunEngine { /** * Writes a TaskRun row in CANCELED state directly, bypassing the trigger * pipeline. Used by the mollifier drainer when a cancel API call lands on - * a buffered run before it materialises (Q4 mollifier-cancel design). + * a buffered run before it materialises. * - * Skips: queue insertion (no execution), waitpoint creation (single- - * triggerAndWait can't enter the buffer; F4 bypass), concurrency - * reservation. Emits `runCancelled` so the existing TaskEvent handler - * writes the cancellation event row — the only side effect PG-side cancel - * has today per audit. + * Skips: queue insertion (no execution), waitpoint creation (the + * mollifier gate refuses to buffer triggerAndWait children, so a + * cancelled buffered run never has a waiting parent to unblock), + * concurrency reservation. Emits `runCancelled` so the existing + * TaskEvent handler writes the cancellation event row — the only side + * effect PG-side cancel has today per audit. * * Idempotent: if a row with the same friendlyId already exists (double * drainer pop after requeue), Prisma's P2002 unique-constraint violation diff --git a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts index 1f47d0b2a1b..f825cdcf8cf 100644 --- a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts +++ b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts @@ -191,7 +191,7 @@ describe("RunEngine.createCancelledRun", () => { // Prisma misread the object as a relation update op and threw // `Argument 'set' is missing`. The drainer caught the error and // marked the buffer entry FAILED — so the CANCELED PG row never - // landed. Found while running the Phase F challenge suite. + // landed. containerTest( "tolerates snapshot.tags being an empty object (cjson edge case)", async ({ prisma, redisOptions }) => { From 603e1eb3129fc210871d251a7945556d85d46974 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 28 May 2026 10:40:17 +0100 Subject: [PATCH 10/14] perf(webapp): shard the mollifier stale sweep to bound per-tick work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous sweep was unbounded along two dimensions: every tick walked every org and every env (via buffer.listOrgs + listEnvsForOrg). At the sweep's default per-env entry cap of 1000, an incident-scale fan-out gave O(orgs * envs * 1000) Redis round-trips per tick — running far longer than the 5-minute interval and triggering the inFlight guard to drop every subsequent tick until the slow pass finished. Shard the work via a durable cursor: - New file `mollifierStaleSweepState.server.ts` owns three Redis keys (`mollifier:stale_sweep:{cursor,org_list,counts}`), all under the mollifier namespace but separated from the buffer's own state. The state class has its own Redis client; the buffer's existing `MollifierBuffer` API surface is untouched. - On `cursor === 0` the org list is rebuilt by snapshotting `buffer.listOrgs()` into the frozen LIST — the cycle's frozen view. - Each tick consumes up to `maxOrgsPerPass` orgs (default 100), processes them, and advances the cursor. When the cursor reaches the end of the LIST it wraps to 0; the next tick rebuilds and starts the next cycle. - The per-env counts HASH is the source of truth for the gauge snapshot. Visiting an env with zero stale entries HDEL's its hash field — gauge clears immediately on revisit. Envs not revisited this tick keep their last-known value (durability across ticks AND across webapp restarts), accepting a worst-case lag of one full cursor cycle before a no-longer-stale env clears. Snapshot contract change: only envs with non-zero stale counts appear in the reported `Map`. The telemetry layer (`mollifierTelemetry.server.ts` `reportStaleEntrySnapshot`) sums values, so absence is equivalent to zero for the gauge — the alert behaviour is unchanged. Tests: - New: "shards work across ticks: cursor advances by maxOrgsPerPass and wraps after a full cycle" — drives a 5-org fixture with cap=2, asserts the cursor's three-tick progression and wrap. - New: "clears an env from the durable snapshot on revisit when it has entries but none currently stale" — same entry flips stale→not-stale by varying the sweep's `now`, asserts HDEL on revisit. - Existing tests updated to inject `state`; one assertion shape rewritten ("snapshot omits envs that have entries but none stale") to match the new HDEL semantics. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../mollifier/mollifierStaleSweep.server.ts | 115 +++++++-- .../mollifierStaleSweepState.server.ts | 124 +++++++++ .../v3/mollifierStaleSweepWorker.server.ts | 36 ++- apps/webapp/test/mollifierStaleSweep.test.ts | 237 +++++++++++++++--- 4 files changed, 446 insertions(+), 66 deletions(-) create mode 100644 apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts index 4ce219f3305..570ad1a72e8 100644 --- a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts @@ -1,15 +1,22 @@ import type { MollifierBuffer } from "@trigger.dev/redis-worker"; import { logger as defaultLogger } from "~/services/logger.server"; import { getMollifierBuffer } from "./mollifierBuffer.server"; +import { MollifierStaleSweepState, type StaleSweepStateStore } from "./mollifierStaleSweepState.server"; import { recordStaleEntry as defaultRecordStaleEntry, reportStaleEntrySnapshot as defaultReportStaleEntrySnapshot, } from "./mollifierTelemetry.server"; -// One pass of the sweep scans every env's queue LIST. The per-env page -// is bounded so a single pathological env can't make the sweep run -// unboundedly long. +// One pass of the sweep scans a bounded slice of orgs from the buffer's +// queue LIST, identified by a durable cursor in Redis. Per-env entry +// scan is also bounded so a single pathological env can't extend the +// pass. const DEFAULT_MAX_ENTRIES_PER_ENV = 1000; +// Max orgs visited per tick. Together with `maxEntriesPerEnv` this +// caps Redis traffic per pass. One "cycle" (visiting every org once) +// takes `ceil(N_orgs / cap)` ticks, after which the cursor wraps and a +// fresh org list is taken. +const DEFAULT_MAX_ORGS_PER_PASS = 100; export type StaleSweepConfig = { // Entries whose dwell exceeds this threshold are flagged stale. Set @@ -18,10 +25,21 @@ export type StaleSweepConfig = { // matches the cadence in the plan doc. staleThresholdMs: number; maxEntriesPerEnv?: number; + // Hard cap on orgs visited per tick. Bounds the per-pass Redis traffic + // and wall-time. Default 100 — at typical fleet sizes one or two + // ticks cover everyone; under incident-scale fan-out a full cycle + // takes a handful of ticks (~minutes) which is still well below the + // staleness signal latency that ops cares about. + maxOrgsPerPass?: number; }; export type StaleSweepDeps = { getBuffer?: () => MollifierBuffer | null; + // Durable cursor + per-env counts hash. Required: the sweep is + // useless without persistent state across ticks. The webapp wires up + // a real `MollifierStaleSweepState`; tests pass one constructed + // against the test container. + state: StaleSweepStateStore; // No `envId` arg — `envId` is a high-cardinality metric attribute and // is intentionally not emitted as a metric label. The structured warn // log below carries envId for forensic drill-down. @@ -38,15 +56,32 @@ export type StaleSweepResult = { staleCount: number; }; -// Walks orgs → envs → entries, emitting an OTel counter tick and a -// structured warning log for each buffer entry whose dwell exceeds the -// stale threshold. Read-only: the sweep does NOT remove or salvage -// entries; that decision is deferred to a separate retention-policy -// change. The signal here exists so ops sees the drainer falling +// Walks a bounded slice of `orgs → envs → entries`, emitting an OTel +// counter tick and a structured warning log for each buffer entry whose +// dwell exceeds the stale threshold. Read-only on the buffer's own +// state; writes only to the sweep's three dedicated keys +// (`mollifier:stale_sweep:*`). The sweep does NOT remove or salvage +// buffer entries; that decision is deferred to a separate retention- +// policy change. The signal here exists so ops sees the drainer falling // behind well before TTL-induced loss kicks in. +// +// Sharding contract: +// - Cursor starts at 0. On cursor=0 the org list is refreshed by +// snapshotting `buffer.listOrgs()` into the durable LIST — that is +// the cycle's frozen view of orgs to visit. +// - Each tick consumes up to `maxOrgsPerPass` orgs from the LIST, +// advances the cursor, and persists. +// - When the cursor reaches the end of the LIST it wraps to 0; the next +// tick rebuilds the org list, capturing any orgs that joined the +// buffer mid-cycle. +// - The per-env counts HASH carries over across ticks: an env visited +// on tick N and not revisited until tick N+M keeps its last-known +// stale count in the gauge for that window. This is the price of +// sharding — accepted because the alternative (re-scan everything +// every tick) does not bound work. export async function runStaleSweepOnce( config: StaleSweepConfig, - deps: StaleSweepDeps = {}, + deps: StaleSweepDeps, ): Promise { const getBuffer = deps.getBuffer ?? getMollifierBuffer; const recordStale = deps.recordStaleEntry ?? defaultRecordStaleEntry; @@ -55,27 +90,40 @@ export async function runStaleSweepOnce( const log = deps.logger ?? defaultLogger; const now = (deps.now ?? Date.now)(); const maxEntries = config.maxEntriesPerEnv ?? DEFAULT_MAX_ENTRIES_PER_ENV; + const maxOrgsPerPass = config.maxOrgsPerPass ?? DEFAULT_MAX_ORGS_PER_PASS; const buffer = getBuffer(); if (!buffer) { // Replace any previous snapshot with empty so a previously-paging // env doesn't stay latched if mollifier is turned off mid-flight. + // Also clear the durable state so a re-enable starts from a clean + // slate instead of resuming on a stale cursor. + await deps.state.clearAll(); reportSnapshot(new Map()); return { orgsScanned: 0, envsScanned: 0, entriesScanned: 0, staleCount: 0 }; } - const orgs = await buffer.listOrgs(); + let cursor = await deps.state.readCursor(); + if (cursor === 0) { + // Fresh cycle — capture the current set of orgs into the frozen + // LIST. Any orgs that join after this snapshot wait until the next + // cycle to be visited. Acceptable for an observational sweep; the + // staleness signal would only fire on entries that have been + // dwelling for `staleThresholdMs` anyway, so they're not new. + const orgs = await buffer.listOrgs(); + await deps.state.rebuildOrgList(orgs); + } + + const { orgs: slice, total } = await deps.state.readOrgListSlice( + cursor, + maxOrgsPerPass, + ); + let envsScanned = 0; let entriesScanned = 0; let staleCount = 0; - // Tracks the stale count per env this pass. Includes zero counts for - // envs that have entries but none stale — that's what lets the gauge - // drop back to 0 when the drainer catches up. Envs absent from this - // map are also absent from the new snapshot, clearing any latched - // alerts on envs that have fully drained. - const perEnvStale = new Map(); - - for (const orgId of orgs) { + + for (const orgId of slice) { const envs = await buffer.listEnvsForOrg(orgId); for (const envId of envs) { envsScanned += 1; @@ -96,18 +144,31 @@ export async function runStaleSweepOnce( envStale += 1; } } - perEnvStale.set(envId, envStale); + // Persist the per-env count to the durable hash. HSET when stale + // > 0, HDEL when it dropped back to zero — the hash is the source + // of truth for the gauge snapshot below. + await deps.state.setEnvStaleCount(envId, envStale); staleCount += envStale; } } - reportSnapshot(perEnvStale); + // Advance the cursor. If the slice consumed the end of the LIST, wrap + // to 0 so the next tick rebuilds the org list and starts a new cycle. + const advanced = cursor + slice.length; + const newCursor = advanced >= total ? 0 : advanced; + await deps.state.writeCursor(newCursor); + + // Emit the snapshot from the durable hash, which carries values for + // envs visited in earlier ticks too. This is what makes the gauge + // stable across ticks (and across webapp restarts). + const snapshot = await deps.state.readAllEnvStaleCounts(); + reportSnapshot(snapshot); - return { orgsScanned: orgs.length, envsScanned, entriesScanned, staleCount }; + return { orgsScanned: slice.length, envsScanned, entriesScanned, staleCount }; } export type StaleSweepIntervalHandle = { - stop: () => void; + stop: () => Promise; }; // Production wrapper: schedule `runStaleSweepOnce` on a fixed interval. @@ -116,7 +177,7 @@ export type StaleSweepIntervalHandle = { // overlapping sweeps that all log the same stale entries). export function startStaleSweepInterval( config: StaleSweepConfig & { intervalMs: number }, - deps: StaleSweepDeps = {}, + deps: StaleSweepDeps, ): StaleSweepIntervalHandle { let stopped = false; let inFlight = false; @@ -141,9 +202,15 @@ export function startStaleSweepInterval( }, config.intervalMs); return { - stop: () => { + stop: async () => { stopped = true; clearInterval(timer); + // Close the durable-state Redis client if the deps own a real + // `MollifierStaleSweepState`. Tests may inject a fake without a + // `close()`; guard accordingly. + if (deps.state instanceof MollifierStaleSweepState) { + await deps.state.close(); + } }, }; } diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts new file mode 100644 index 00000000000..fb98e8da923 --- /dev/null +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts @@ -0,0 +1,124 @@ +import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis"; +import { Logger } from "@trigger.dev/core/logger"; + +// Durable per-tick state for the sharded stale sweep. Three Redis keys, +// all in the `mollifier:` namespace alongside the buffer's own state: +// +// mollifier:stale_sweep:cursor STRING next position in org_list (0 = fresh cycle) +// mollifier:stale_sweep:org_list LIST org IDs frozen at the start of the cycle +// mollifier:stale_sweep:counts HASH envId -> last-known stale count +// +// The state survives webapp restarts: a restarted process picks up the +// cursor where the previous one left off and re-emits the last-known +// gauge values immediately, rather than blinking to zero until the next +// cycle visits each env. +// +// Storage is owned by this class rather than added to MollifierBuffer +// because the keys are sweep-internal — the buffer abstracts the +// drainer/queue state, this abstracts sweep state. They share a +// namespace prefix but no API surface. + +export interface StaleSweepStateStore { + readCursor(): Promise; + writeCursor(value: number): Promise; + /** Replaces the cycle's frozen org_list. Called at cursor=0. */ + rebuildOrgList(orgs: string[]): Promise; + /** Returns up to `count` org IDs starting at `start`, plus the LIST's total length. */ + readOrgListSlice(start: number, count: number): Promise<{ orgs: string[]; total: number }>; + /** HSET when count > 0, HDEL when count === 0 (so the snapshot reflects current truth). */ + setEnvStaleCount(envId: string, count: number): Promise; + readAllEnvStaleCounts(): Promise>; + clearAll(): Promise; + close(): Promise; +} + +const CURSOR_KEY = "mollifier:stale_sweep:cursor"; +const ORG_LIST_KEY = "mollifier:stale_sweep:org_list"; +const COUNTS_KEY = "mollifier:stale_sweep:counts"; + +export class MollifierStaleSweepState implements StaleSweepStateStore { + private readonly redis: Redis; + private readonly logger: Logger; + + constructor(options: { redisOptions: RedisOptions; logger?: Logger }) { + this.logger = options.logger ?? new Logger("MollifierStaleSweepState", "debug"); + this.redis = createRedisClient( + { ...options.redisOptions, maxRetriesPerRequest: 20 }, + { + onError: (error) => { + this.logger.error("MollifierStaleSweepState redis client error:", { error }); + }, + }, + ); + } + + async readCursor(): Promise { + const raw = await this.redis.get(CURSOR_KEY); + if (raw === null) return 0; + const n = Number.parseInt(raw, 10); + return Number.isFinite(n) && n >= 0 ? n : 0; + } + + async writeCursor(value: number): Promise { + await this.redis.set(CURSOR_KEY, String(value)); + } + + async rebuildOrgList(orgs: string[]): Promise { + // DEL + RPUSH in a pipeline — close enough to atomic for an + // observational sweep (the inFlight guard at startStaleSweepInterval + // serialises sweep passes; nothing else writes these keys). + const pipeline = this.redis.pipeline(); + pipeline.del(ORG_LIST_KEY); + if (orgs.length > 0) { + pipeline.rpush(ORG_LIST_KEY, ...orgs); + } + await pipeline.exec(); + } + + async readOrgListSlice( + start: number, + count: number, + ): Promise<{ orgs: string[]; total: number }> { + const pipeline = this.redis.pipeline(); + pipeline.lrange(ORG_LIST_KEY, start, start + count - 1); + pipeline.llen(ORG_LIST_KEY); + const results = await pipeline.exec(); + if (!results) return { orgs: [], total: 0 }; + const [lrangeErr, lrangeRes] = results[0] as [Error | null, string[] | null]; + const [llenErr, llenRes] = results[1] as [Error | null, number | null]; + if (lrangeErr || llenErr) { + this.logger.error("MollifierStaleSweepState.readOrgListSlice failed", { + lrangeErr: lrangeErr?.message, + llenErr: llenErr?.message, + }); + return { orgs: [], total: 0 }; + } + return { orgs: lrangeRes ?? [], total: llenRes ?? 0 }; + } + + async setEnvStaleCount(envId: string, count: number): Promise { + if (count > 0) { + await this.redis.hset(COUNTS_KEY, envId, String(count)); + } else { + await this.redis.hdel(COUNTS_KEY, envId); + } + } + + async readAllEnvStaleCounts(): Promise> { + const raw = await this.redis.hgetall(COUNTS_KEY); + const out = new Map(); + for (const [envId, value] of Object.entries(raw)) { + const n = Number.parseInt(value, 10); + if (Number.isFinite(n)) out.set(envId, n); + } + return out; + } + + async clearAll(): Promise { + await this.redis.del(CURSOR_KEY, ORG_LIST_KEY, COUNTS_KEY); + } + + async close(): Promise { + await this.redis.quit(); + } +} diff --git a/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts b/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts index 5325018baf1..de05ab24671 100644 --- a/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts +++ b/apps/webapp/app/v3/mollifierStaleSweepWorker.server.ts @@ -5,6 +5,7 @@ import { startStaleSweepInterval, type StaleSweepIntervalHandle, } from "./mollifier/mollifierStaleSweep.server"; +import { MollifierStaleSweepState } from "./mollifier/mollifierStaleSweepState.server"; declare global { // eslint-disable-next-line no-var @@ -35,13 +36,38 @@ export function initMollifierStaleSweepWorker(): void { staleThresholdMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS, }); - const handle = startStaleSweepInterval({ - intervalMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS, - staleThresholdMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS, + // Construct the sweep's durable-state Redis client using the same + // mollifier-Redis credentials as the buffer. Keeping this client + // separate from the buffer's own client keeps state ownership clean: + // the buffer abstracts queue/entry state, this abstracts sweep state. + const state = new MollifierStaleSweepState({ + redisOptions: { + keyPrefix: "", + host: env.TRIGGER_MOLLIFIER_REDIS_HOST, + port: env.TRIGGER_MOLLIFIER_REDIS_PORT, + username: env.TRIGGER_MOLLIFIER_REDIS_USERNAME, + password: env.TRIGGER_MOLLIFIER_REDIS_PASSWORD, + enableAutoPipelining: true, + ...(env.TRIGGER_MOLLIFIER_REDIS_TLS_DISABLED === "true" ? {} : { tls: {} }), + }, }); - signalsEmitter.on("SIGTERM", handle.stop); - signalsEmitter.on("SIGINT", handle.stop); + const handle = startStaleSweepInterval( + { + intervalMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_INTERVAL_MS, + staleThresholdMs: env.TRIGGER_MOLLIFIER_STALE_SWEEP_THRESHOLD_MS, + }, + { state }, + ); + + // `handle.stop` is now async (it closes the Redis client). The signals + // emitter swallows promise rejections from listeners, so wrap it in a + // void-returning shim to be explicit about discarding the promise. + const onShutdown = (): void => { + void handle.stop(); + }; + signalsEmitter.on("SIGTERM", onShutdown); + signalsEmitter.on("SIGINT", onShutdown); global.__mollifierStaleSweepRegistered__ = true; global.__mollifierStaleSweepHandle__ = handle; } diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts index 460fcb37e5c..57a55cf68e5 100644 --- a/apps/webapp/test/mollifierStaleSweep.test.ts +++ b/apps/webapp/test/mollifierStaleSweep.test.ts @@ -5,6 +5,7 @@ import { MollifierBuffer } from "@trigger.dev/redis-worker"; vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); import { runStaleSweepOnce } from "~/v3/mollifier/mollifierStaleSweep.server"; +import { MollifierStaleSweepState } from "~/v3/mollifier/mollifierStaleSweepState.server"; const SNAPSHOT = { taskIdentifier: "hello-world", @@ -13,6 +14,39 @@ const SNAPSHOT = { traceContext: {}, }; +// In-memory fake state for unit tests that don't have a Redis container. +// The testcontainer tests use a real MollifierStaleSweepState against +// the test Redis instead. +function makeFakeState() { + let cursor = 0; + let orgList: string[] = []; + const counts = new Map(); + return { + readCursor: async () => cursor, + writeCursor: async (v: number) => { + cursor = v; + }, + rebuildOrgList: async (orgs: string[]) => { + orgList = [...orgs]; + }, + readOrgListSlice: async (start: number, count: number) => ({ + orgs: orgList.slice(start, start + count), + total: orgList.length, + }), + setEnvStaleCount: async (envId: string, count: number) => { + if (count > 0) counts.set(envId, count); + else counts.delete(envId); + }, + readAllEnvStaleCounts: async () => new Map(counts), + clearAll: async () => { + cursor = 0; + orgList = []; + counts.clear(); + }, + close: async () => {}, + }; +} + function spyDeps() { // Counter ticks — metric carries no `envId` label (high-cardinality) // so the spy is a simple call count. Per-env detail lives on the @@ -53,7 +87,7 @@ describe("runStaleSweepOnce — unit", () => { const spies = spyDeps(); const result = await runStaleSweepOnce( { staleThresholdMs: 1000 }, - { ...spies.deps, getBuffer: () => null }, + { ...spies.deps, getBuffer: () => null, state: makeFakeState() }, ); expect(result).toEqual({ orgsScanned: 0, @@ -107,36 +141,42 @@ describe("runStaleSweepOnce — testcontainers", () => { const futureNow = Date.now() + 5 * 60 * 1000; const spies = spyDeps(); - const result = await runStaleSweepOnce( - { staleThresholdMs: 60 * 1000 }, - { - ...spies.deps, - getBuffer: () => buffer, - now: () => futureNow, - }, - ); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000 }, + { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }, + ); - expect(result.envsScanned).toBe(2); - expect(result.entriesScanned).toBe(3); - expect(result.staleCount).toBe(3); - // All three entries exceed the threshold; each emits one - // counter tick + one warning. - expect(spies.staleEntryCount).toBe(3); - expect(spies.warnings).toHaveLength(3); - for (const w of spies.warnings) { - expect(w.message).toBe("mollifier.stale_entry"); - expect(w.fields.staleThresholdMs).toBe(60 * 1000); - expect(w.fields.dwellMs).toBeGreaterThan(60 * 1000); + expect(result.envsScanned).toBe(2); + expect(result.entriesScanned).toBe(3); + expect(result.staleCount).toBe(3); + // All three entries exceed the threshold; each emits one + // counter tick + one warning. + expect(spies.staleEntryCount).toBe(3); + expect(spies.warnings).toHaveLength(3); + for (const w of spies.warnings) { + expect(w.message).toBe("mollifier.stale_entry"); + expect(w.fields.staleThresholdMs).toBe(60 * 1000); + expect(w.fields.dwellMs).toBeGreaterThan(60 * 1000); + } + // Snapshot drives the alertable gauge — env_a has 2 stale + // entries, env_b has 1. Per-env detail is still passed to + // `reportStaleEntrySnapshot` for forensic value even though the + // gauge itself aggregates the total. + expect(spies.snapshots).toHaveLength(1); + expect(Object.fromEntries(spies.snapshots[0])).toEqual({ + env_a: 2, + env_b: 1, + }); + } finally { + await state.close(); } - // Snapshot drives the alertable gauge — env_a has 2 stale - // entries, env_b has 1. Per-env detail is still passed to - // `reportStaleEntrySnapshot` for forensic value even though the - // gauge itself aggregates the total. - expect(spies.snapshots).toHaveLength(1); - expect(Object.fromEntries(spies.snapshots[0])).toEqual({ - env_a: 2, - env_b: 1, - }); } finally { await buffer.close(); } @@ -144,14 +184,20 @@ describe("runStaleSweepOnce — testcontainers", () => { ); redisTest( - "snapshot reports zero for envs that have entries but none stale (clears latched alerts)", + "snapshot omits envs that have entries but none stale (durable hash HDEL's zeros)", { timeout: 20_000 }, async ({ redisOptions }) => { - // Critical for alert behaviour: a previous sweep reported env_a + // Critical for alert behaviour: a previous sweep flagged env_a // stale, alert fired, drainer caught up. The next sweep must - // report `env_a -> 0` so the gauge drops below the alert - // threshold instead of staying latched at the last stale value. + // remove env_a from the durable counts hash so the gauge drops + // below the alert threshold instead of staying latched at the + // last stale value. With the sharded design the snapshot is + // sourced from the HASH directly — visiting an env with zero + // stale entries HDEL's it, so it's simply absent from the + // snapshot (telemetry sums values, so absence is equivalent to + // zero for the gauge). const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); try { await buffer.accept({ runId: "run_just_arrived", @@ -162,11 +208,13 @@ describe("runStaleSweepOnce — testcontainers", () => { const spies = spyDeps(); await runStaleSweepOnce( { staleThresholdMs: 60 * 1000 }, - { ...spies.deps, getBuffer: () => buffer }, + { ...spies.deps, getBuffer: () => buffer, state }, ); expect(spies.snapshots).toHaveLength(1); - expect(Object.fromEntries(spies.snapshots[0])).toEqual({ env_a: 0 }); + // env_a has entries but none stale → not in the snapshot. + expect(spies.snapshots[0].has("env_a")).toBe(false); } finally { + await state.close(); await buffer.close(); } }, @@ -181,6 +229,7 @@ describe("runStaleSweepOnce — testcontainers", () => { // entry the first time the sweep runs after a perfectly synchronised // accept call — the dashboard would page on every burst. const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); try { await buffer.accept({ runId: "run_fresh_only", @@ -191,12 +240,124 @@ describe("runStaleSweepOnce — testcontainers", () => { const spies = spyDeps(); const result = await runStaleSweepOnce( { staleThresholdMs: 60 * 1000 }, - { ...spies.deps, getBuffer: () => buffer }, + { ...spies.deps, getBuffer: () => buffer, state }, ); expect(result.staleCount).toBe(0); expect(spies.staleEntryCount).toBe(0); expect(spies.warnings).toEqual([]); } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "shards work across ticks: cursor advances by maxOrgsPerPass and wraps after a full cycle", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // Without sharding the sweep walks every org/env every tick — at + // any meaningful backlog that runs longer than the tick interval + // and the next tick gets dropped by the inFlight guard. Sharding + // splits the work: each tick visits at most `maxOrgsPerPass` orgs, + // advances a durable cursor, and resumes from there next tick. + // Over `ceil(N / cap)` ticks the cycle covers every org. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + for (let i = 0; i < 5; i++) { + await buffer.accept({ + runId: `run_shard_${i}`, + envId: `env_shard_${i}`, + orgId: `org_shard_${i}`, + payload: JSON.stringify(SNAPSHOT), + }); + } + const futureNow = Date.now() + 5 * 60 * 1000; + const spies = spyDeps(); + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 2 }; + const baseDeps = { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }; + + // Tick 1: cursor starts at 0, scans 2 orgs. + const r1 = await runStaleSweepOnce(cfg, baseDeps); + expect(r1.orgsScanned).toBe(2); + expect(spies.snapshots[0].size).toBe(2); + + // Tick 2: cursor was 2, scans 2 more orgs. + const r2 = await runStaleSweepOnce(cfg, baseDeps); + expect(r2.orgsScanned).toBe(2); + // Snapshot is the durable HASH — accumulates across ticks. + expect(spies.snapshots[1].size).toBe(4); + + // Tick 3: cursor was 4, scans the last 1 org and wraps to 0. + const r3 = await runStaleSweepOnce(cfg, baseDeps); + expect(r3.orgsScanned).toBe(1); + expect(spies.snapshots[2].size).toBe(5); + + // Tick 4: cycle complete, cursor is back at 0 — starts over. + const r4 = await runStaleSweepOnce(cfg, baseDeps); + expect(r4.orgsScanned).toBe(2); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "clears an env from the durable snapshot on revisit when it has entries but none currently stale", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // Stale state in the durable hash must be HDEL'd, not just left + // stale, when a previously-flagged env no longer has any entries + // whose dwell exceeds the threshold (drainer caught up, alert + // condition cleared). The same `entry` flips from stale to + // not-stale between two sweep ticks by varying the sweep's `now` + // — tick 1 uses a future clock so the entry is flagged stale; + // tick 2 uses real time so the same entry has near-zero dwell and + // is no longer stale. The env stays in the active set throughout + // (queue still has an entry), so the cursor revisits it and the + // hash field is cleared. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_drain", + envId: "env_drain", + orgId: "org_drain", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const spies = spyDeps(); + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 }; + + // Tick 1 with future clock: entry's dwell is 5min vs 1min + // threshold → flagged stale. + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }); + expect(spies.snapshots[0].get("env_drain")).toBe(1); + + // Tick 2 with real time: same entry, but its dwell is now ~ms + // vs the same 1min threshold → not stale. The env is revisited + // (cursor wrapped to 0 after tick 1, only 1 org in the list), + // setEnvStaleCount called with 0 → HDEL. + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => buffer, + state, + }); + expect(spies.snapshots[1].has("env_drain")).toBe(false); + } finally { + await state.close(); await buffer.close(); } }, @@ -212,6 +373,7 @@ describe("runStaleSweepOnce — testcontainers", () => { // collapsed listOrgs/listEnvsForOrg into a single env-flat list, // this test catches a regression there. const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); try { await buffer.accept({ runId: "run_x", @@ -229,12 +391,13 @@ describe("runStaleSweepOnce — testcontainers", () => { const spies = spyDeps(); const result = await runStaleSweepOnce( { staleThresholdMs: 60 * 1000 }, - { ...spies.deps, getBuffer: () => buffer, now: () => futureNow }, + { ...spies.deps, getBuffer: () => buffer, state, now: () => futureNow }, ); expect(result.orgsScanned).toBe(2); expect(result.envsScanned).toBe(2); expect(result.staleCount).toBe(2); } finally { + await state.close(); await buffer.close(); } }, From 0005f1cfa19fb49afb7d95290939b81db76ae1a9 Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 28 May 2026 10:47:24 +0100 Subject: [PATCH 11/14] test(webapp): fill mollifier stale-sweep coverage gaps Five gaps in the sharded sweep's test coverage, added 9 new tests covering each: - State durability across process restart: state1 populates the cursor + counts hash, closes its Redis client (simulated webapp restart), state2 constructs against the same Redis and picks up exactly where state1 left off. This is the headline benefit of storing sweep state in Redis instead of process memory; without this test it could silently regress. - Cycle wrap rebuilds the org list: a third org joins between cycles and is visible only in the next cycle's snapshot. Pins the rebuildOrgList-on-cursor=0 contract. - Empty buffer (no orgs) advances cleanly with zero work, empty snapshot, cursor stays at 0 instead of tripping the wrap math. - Buffer-null branch's clearAll: previously asserted only "snapshot is empty"; now also asserts the durable state was actually wiped (cursor=0, counts hash empty) so a re-enable doesn't resume on a stale cursor. - MollifierStaleSweepState direct unit tests (5 tests): readCursor default + corrupt-value tolerance, writeCursor round-trip, rebuildOrgList replace-not-append semantics, setEnvStaleCount HSET vs HDEL, clearAll DELs all three keys. Suite total: 7 existing + 9 new = 16 tests. Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/webapp/test/mollifierStaleSweep.test.ts | 314 +++++++++++++++++++ 1 file changed, 314 insertions(+) diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts index 57a55cf68e5..7af5c346d12 100644 --- a/apps/webapp/test/mollifierStaleSweep.test.ts +++ b/apps/webapp/test/mollifierStaleSweep.test.ts @@ -402,4 +402,318 @@ describe("runStaleSweepOnce — testcontainers", () => { } }, ); + + redisTest( + "state survives process restart: a second state instance picks up the cursor and counts", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // This is the headline reason the sweep state is durable in Redis + // instead of process-local — a webapp restart mid-cycle must not + // re-emit the gauge as fresh-zero for previously-flagged envs nor + // restart the cursor walk from scratch. Simulated here by closing + // state1 (its Redis client quits cleanly) and constructing state2 + // against the same Redis. The cursor + counts that state1 wrote + // are visible to state2 on its first tick. + const buffer = new MollifierBuffer({ redisOptions }); + const state1 = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_a", + envId: "env_a", + orgId: "org_a", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_b", + envId: "env_b", + orgId: "org_b", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 1 }; + const spies1 = spyDeps(); + + // Tick 1 with state1: visits 1 of 2 orgs. + await runStaleSweepOnce(cfg, { + ...spies1.deps, + getBuffer: () => buffer, + state: state1, + now: () => futureNow, + }); + expect(spies1.snapshots[0].size).toBe(1); + } finally { + // Simulate webapp restart: state1's Redis client closes cleanly. + await state1.close(); + } + + // New process boots, constructs a fresh state pointing at the + // same Redis. The cycle's frozen org_list, the cursor, and the + // counts hash are all preserved — state2 picks up at the second + // org of the cycle. + const state2 = new MollifierStaleSweepState({ redisOptions }); + try { + const futureNow = Date.now() + 5 * 60 * 1000; + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 1 }; + const spies2 = spyDeps(); + + await runStaleSweepOnce(cfg, { + ...spies2.deps, + getBuffer: () => buffer, + state: state2, + now: () => futureNow, + }); + // Snapshot now has BOTH envs: the one tick 1 flagged (still in + // the counts hash from state1) plus the one tick 2 just flagged. + // A non-durable design would show only the second. + expect(spies2.snapshots[0].size).toBe(2); + } finally { + await state2.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "cycle wrap rebuilds the org list, so orgs that joined mid-cycle get visited on the next cycle", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // The docstring promises "orgs joining mid-cycle wait until the + // next cycle to be visited." The mechanism is rebuildOrgList at + // cursor=0: a fresh snapshot of buffer.listOrgs() replaces the + // previous frozen LIST. Verified here by adding a third org + // between cycles and asserting it shows up only in the next + // cycle's snapshot. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_init_a", + envId: "env_init_a", + orgId: "org_init_a", + payload: JSON.stringify(SNAPSHOT), + }); + await buffer.accept({ + runId: "run_init_b", + envId: "env_init_b", + orgId: "org_init_b", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const spies = spyDeps(); + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 }; + const baseDeps = { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }; + + // Tick 1: cycle 1. Visits both initial orgs; cursor wraps to 0. + await runStaleSweepOnce(cfg, baseDeps); + expect(spies.snapshots[0].size).toBe(2); + + // Mid-flight: a third org joins the buffer. It must NOT have + // been part of cycle 1's frozen LIST. + await buffer.accept({ + runId: "run_mid", + envId: "env_mid", + orgId: "org_mid", + payload: JSON.stringify(SNAPSHOT), + }); + + // Tick 2: cycle 2 begins (cursor was 0 after tick 1's wrap). + // rebuildOrgList captures all 3 orgs; this tick visits all 3. + const r2 = await runStaleSweepOnce(cfg, baseDeps); + expect(r2.orgsScanned).toBe(3); + expect(spies.snapshots[1].size).toBe(3); + expect(spies.snapshots[1].has("env_mid")).toBe(true); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "empty buffer (no orgs) advances cleanly with zero work and an empty snapshot", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // `mollifier:orgs` is empty (no entries ever accepted, or every + // entry has been drained). The sweep must handle the boundary: + // rebuildOrgList with [], readOrgListSlice returns total=0, + // the org loop is skipped, and the cursor stays at 0 instead of + // tripping the wrap math. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + const spies = spyDeps(); + const result = await runStaleSweepOnce( + { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 }, + { ...spies.deps, getBuffer: () => buffer, state }, + ); + expect(result).toEqual({ + orgsScanned: 0, + envsScanned: 0, + entriesScanned: 0, + staleCount: 0, + }); + expect(spies.snapshots).toHaveLength(1); + expect(spies.snapshots[0].size).toBe(0); + // Cursor stayed at 0 — nothing to advance through. + expect(await state.readCursor()).toBe(0); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + + redisTest( + "buffer-null branch wipes the durable state so a re-enable starts fresh", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // The unit test above asserts the snapshot is empty when the + // buffer is null, but doesn't verify the durable state was + // actually cleared. Without clearAll the next re-enable would + // resume on a stale cursor + carry over a stale counts hash. + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_seed", + envId: "env_seed", + orgId: "org_seed", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 }; + const spies = spyDeps(); + + // Tick 1: populate state. + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }); + expect(spies.snapshots[0].size).toBe(1); + expect((await state.readAllEnvStaleCounts()).size).toBe(1); + + // Tick 2: mollifier flips OFF — getBuffer returns null. The + // sweep must clear the durable state. + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => null, + state, + }); + expect(spies.snapshots[1].size).toBe(0); + expect((await state.readAllEnvStaleCounts()).size).toBe(0); + expect(await state.readCursor()).toBe(0); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); +}); + +describe("MollifierStaleSweepState — direct unit tests", () => { + redisTest("readCursor returns 0 when the key is absent", { timeout: 20_000 }, async ({ redisOptions }) => { + const state = new MollifierStaleSweepState({ redisOptions }); + try { + expect(await state.readCursor()).toBe(0); + } finally { + await state.close(); + } + }); + + redisTest( + "writeCursor + readCursor round-trip; readCursor parses a non-numeric value as 0", + { timeout: 20_000 }, + async ({ redisOptions }) => { + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await state.writeCursor(42); + expect(await state.readCursor()).toBe(42); + + // Defensive: a corrupted/garbage value must not throw or + // propagate NaN into the sweep's cursor arithmetic. + await state["redis"].set("mollifier:stale_sweep:cursor", "not-a-number"); + expect(await state.readCursor()).toBe(0); + } finally { + await state.close(); + } + }, + ); + + redisTest( + "rebuildOrgList replaces the previous list (DEL + RPUSH, in order)", + { timeout: 20_000 }, + async ({ redisOptions }) => { + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await state.rebuildOrgList(["org_a", "org_b", "org_c"]); + let slice = await state.readOrgListSlice(0, 10); + expect(slice.total).toBe(3); + expect(slice.orgs).toEqual(["org_a", "org_b", "org_c"]); + + // Replacement, not append. + await state.rebuildOrgList(["org_x"]); + slice = await state.readOrgListSlice(0, 10); + expect(slice.total).toBe(1); + expect(slice.orgs).toEqual(["org_x"]); + + // Empty rebuild leaves the list empty (DEL fires, no RPUSH). + await state.rebuildOrgList([]); + slice = await state.readOrgListSlice(0, 10); + expect(slice.total).toBe(0); + expect(slice.orgs).toEqual([]); + } finally { + await state.close(); + } + }, + ); + + redisTest( + "setEnvStaleCount HSETs when count > 0 and HDELs when count === 0", + { timeout: 20_000 }, + async ({ redisOptions }) => { + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await state.setEnvStaleCount("env_a", 3); + await state.setEnvStaleCount("env_b", 1); + let counts = await state.readAllEnvStaleCounts(); + expect(Object.fromEntries(counts)).toEqual({ env_a: 3, env_b: 1 }); + + // Zero clears the field (HDEL), not stores 0. + await state.setEnvStaleCount("env_a", 0); + counts = await state.readAllEnvStaleCounts(); + expect(Object.fromEntries(counts)).toEqual({ env_b: 1 }); + expect(counts.has("env_a")).toBe(false); + } finally { + await state.close(); + } + }, + ); + + redisTest( + "clearAll DELs cursor, org_list, and counts in one call", + { timeout: 20_000 }, + async ({ redisOptions }) => { + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await state.writeCursor(7); + await state.rebuildOrgList(["org_a", "org_b"]); + await state.setEnvStaleCount("env_a", 5); + + await state.clearAll(); + + expect(await state.readCursor()).toBe(0); + expect((await state.readOrgListSlice(0, 10)).total).toBe(0); + expect((await state.readAllEnvStaleCounts()).size).toBe(0); + } finally { + await state.close(); + } + }, + ); }); From 5cd2f1636a1430686e0efdfd4dd53b19b170a2a3 Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Thu, 28 May 2026 12:06:40 +0100 Subject: [PATCH 12/14] fix(webapp): GC drained-env orphans in stale sweep + propagate batch on terminal failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two Devin review findings on PR #3754, both real and unresolved: 1. Sharded stale sweep's counts hash never cleared for fully-drained envs — gauge stayed permanently elevated, false-alerting the recommended `> 0 for 5m` rule. Root cause: when an env's last buffered entry is popped, the buffer's atomic Lua removes the env from `mollifier:org-envs:${orgId}` (and removes the org from `mollifier:orgs` if it has no other envs). The sweep's inner loop walks `buffer.listEnvsForOrg(orgId)`, so the env disappears from the iteration entirely — `setEnvStaleCount(envId, 0)` (which HDELs the field) is never called, and the counts hash retains the env's last-known stale count forever. Fix (Devin's Approach 2): cycle-bounded reconciliation. Add a Redis SET `mollifier:stale_sweep:visited` that the sweep SADDs into for every env it touches. When the cursor wraps (cycle complete), `reconcileVisited()` does `HKEYS counts → SMEMBERS visited → HDEL the difference → DEL visited`. Pipelined; orphans clear within at most one full cursor cycle of the env going quiet, which matches the sharding contract's existing one-cycle freshness window. Test: "evicts fully-drained envs from the counts hash at cycle wrap" — accepts an entry, sweep flags it stale, pops the entry (env vanishes from listEnvsForOrg), runs another sweep that triggers wrap, asserts the env is HDEL'd from both the snapshot and the underlying counts hash. 2. Drainer handler's terminal SYSTEM_FAILURE write dropped the snapshot's `batch` field. If the buffered run was part of a batch, the failure row wasn't associated with the batch and the batch parent's completion tracking could hang indefinitely waiting on a child that landed but isn't visible to the batch. Fix: extract `snapshot.batch` with structural type guards and pass it through to `createFailedTaskRun`. Same defensive pattern as the other snapshot fields in this code path (the snapshot is typed `Record` because it came from cjson-decoded buffer payload). Test: "propagates the batch association into createFailedTaskRun" — asserts the call site receives `{ id, index }` from the snapshot. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../mollifierDrainerHandler.server.ts | 16 ++++ .../mollifier/mollifierStaleSweep.server.ts | 19 ++++- .../mollifierStaleSweepState.server.ts | 56 ++++++++++++- .../test/mollifierDrainerHandler.test.ts | 41 ++++++++++ apps/webapp/test/mollifierStaleSweep.test.ts | 82 +++++++++++++++++++ 5 files changed, 211 insertions(+), 3 deletions(-) diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts index 8ee89b895a2..59d21180cce 100644 --- a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts @@ -153,6 +153,21 @@ export function createDrainerHandler(deps: { // Drainer's outer catch will buffer.fail this entry. throw err; } + // Extract batch association from the snapshot if present. + // Without this, a SYSTEM_FAILURE row for a buffered batch + // child won't be linked to its batch, and the batch parent's + // completion tracking can hang indefinitely waiting on a + // child that landed but isn't visible to the batch. + const rawBatch = snapshot.batch; + const batch = + rawBatch && + typeof rawBatch === "object" && + "id" in rawBatch && + typeof (rawBatch as { id: unknown }).id === "string" && + "index" in rawBatch && + typeof (rawBatch as { index: unknown }).index === "number" + ? (rawBatch as { id: string; index: number }) + : undefined; try { await deps.engine.createFailedTaskRun({ friendlyId: input.runId, @@ -175,6 +190,7 @@ export function createDrainerHandler(deps: { : undefined, depth: typeof snapshot.depth === "number" ? snapshot.depth : 0, resumeParentOnCompletion: snapshot.resumeParentOnCompletion === true, + batch, traceId: typeof snapshot.traceId === "string" ? snapshot.traceId : undefined, spanId: typeof snapshot.spanId === "string" ? snapshot.spanId : undefined, taskEventStore: diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts index 570ad1a72e8..e77b0916b5d 100644 --- a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts @@ -148,6 +148,13 @@ export async function runStaleSweepOnce( // > 0, HDEL when it dropped back to zero — the hash is the source // of truth for the gauge snapshot below. await deps.state.setEnvStaleCount(envId, envStale); + // Track that this env was visited during the current cycle. The + // reconcile step at cycle wrap uses this to HDEL counts hash + // entries for envs that fully drained mid-cycle (they disappear + // from listEnvsForOrg, so the inner loop above never reaches them + // and never HDELs their hash field — without reconcile the gauge + // would stay elevated forever). + await deps.state.markEnvVisited(envId); staleCount += envStale; } } @@ -155,9 +162,19 @@ export async function runStaleSweepOnce( // Advance the cursor. If the slice consumed the end of the LIST, wrap // to 0 so the next tick rebuilds the org list and starts a new cycle. const advanced = cursor + slice.length; - const newCursor = advanced >= total ? 0 : advanced; + const wrapped = advanced >= total; + const newCursor = wrapped ? 0 : advanced; await deps.state.writeCursor(newCursor); + if (wrapped) { + // Cycle ended. HDEL any env still in the counts hash that didn't + // appear in any tick of the just-completed cycle — these are envs + // that fully drained from the buffer mid-cycle and would otherwise + // hold their stale gauge value forever. Also DELs the visited set + // so the next cycle starts clean. + await deps.state.reconcileVisited(); + } + // Emit the snapshot from the durable hash, which carries values for // envs visited in earlier ticks too. This is what makes the gauge // stable across ticks (and across webapp restarts). diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts index fb98e8da923..da44d65808f 100644 --- a/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweepState.server.ts @@ -1,18 +1,27 @@ import { createRedisClient, type Redis, type RedisOptions } from "@internal/redis"; import { Logger } from "@trigger.dev/core/logger"; -// Durable per-tick state for the sharded stale sweep. Three Redis keys, +// Durable per-tick state for the sharded stale sweep. Four Redis keys, // all in the `mollifier:` namespace alongside the buffer's own state: // // mollifier:stale_sweep:cursor STRING next position in org_list (0 = fresh cycle) // mollifier:stale_sweep:org_list LIST org IDs frozen at the start of the cycle // mollifier:stale_sweep:counts HASH envId -> last-known stale count +// mollifier:stale_sweep:visited SET envIds visited during the current cycle // // The state survives webapp restarts: a restarted process picks up the // cursor where the previous one left off and re-emits the last-known // gauge values immediately, rather than blinking to zero until the next // cycle visits each env. // +// The `visited` set exists to GC the `counts` hash at cycle wrap: an env +// that drains completely between sweep ticks disappears from +// `buffer.listEnvsForOrg`, so the sweep's inner loop never revisits it +// and never HDELs its counts entry. Without the visited-set GC the +// counts hash retains the env's last-known stale count forever and the +// gauge stays permanently elevated. At cursor wrap we diff the hash +// against the cycle's visited set and HDEL the difference. +// // Storage is owned by this class rather than added to MollifierBuffer // because the keys are sweep-internal — the buffer abstracts the // drainer/queue state, this abstracts sweep state. They share a @@ -28,6 +37,14 @@ export interface StaleSweepStateStore { /** HSET when count > 0, HDEL when count === 0 (so the snapshot reflects current truth). */ setEnvStaleCount(envId: string, count: number): Promise; readAllEnvStaleCounts(): Promise>; + /** SADD `envId` to the current cycle's visited set. Called once per env scanned per tick. */ + markEnvVisited(envId: string): Promise; + /** + * HDEL every env in the counts hash that is NOT in the visited set, then + * DEL the visited set. Called when the cursor wraps (cycle ends) so + * envs that fully drained mid-cycle get cleaned out of the gauge. + */ + reconcileVisited(): Promise; clearAll(): Promise; close(): Promise; } @@ -35,6 +52,7 @@ export interface StaleSweepStateStore { const CURSOR_KEY = "mollifier:stale_sweep:cursor"; const ORG_LIST_KEY = "mollifier:stale_sweep:org_list"; const COUNTS_KEY = "mollifier:stale_sweep:counts"; +const VISITED_KEY = "mollifier:stale_sweep:visited"; export class MollifierStaleSweepState implements StaleSweepStateStore { private readonly redis: Redis; @@ -114,8 +132,42 @@ export class MollifierStaleSweepState implements StaleSweepStateStore { return out; } + async markEnvVisited(envId: string): Promise { + await this.redis.sadd(VISITED_KEY, envId); + } + + async reconcileVisited(): Promise { + // HKEYS + SMEMBERS in a pipeline, then HDEL the difference locally. + // For typical fleet sizes (counts and visited both bounded by the + // count of buffered envs) this is well within a single RTT plus one + // small HDEL. + const pipeline = this.redis.pipeline(); + pipeline.hkeys(COUNTS_KEY); + pipeline.smembers(VISITED_KEY); + const results = await pipeline.exec(); + if (!results) return; + const [hkeysErr, hkeysRes] = results[0] as [Error | null, string[] | null]; + const [smembersErr, smembersRes] = results[1] as [Error | null, string[] | null]; + if (hkeysErr || smembersErr) { + this.logger.error("MollifierStaleSweepState.reconcileVisited failed", { + hkeysErr: hkeysErr?.message, + smembersErr: smembersErr?.message, + }); + return; + } + const hashEnvs = hkeysRes ?? []; + const visited = new Set(smembersRes ?? []); + const orphans = hashEnvs.filter((envId) => !visited.has(envId)); + const cleanup = this.redis.pipeline(); + if (orphans.length > 0) { + cleanup.hdel(COUNTS_KEY, ...orphans); + } + cleanup.del(VISITED_KEY); + await cleanup.exec(); + } + async clearAll(): Promise { - await this.redis.del(CURSOR_KEY, ORG_LIST_KEY, COUNTS_KEY); + await this.redis.del(CURSOR_KEY, ORG_LIST_KEY, COUNTS_KEY, VISITED_KEY); } async close(): Promise { diff --git a/apps/webapp/test/mollifierDrainerHandler.test.ts b/apps/webapp/test/mollifierDrainerHandler.test.ts index 1a20fdbb69f..59c11eb14f8 100644 --- a/apps/webapp/test/mollifierDrainerHandler.test.ts +++ b/apps/webapp/test/mollifierDrainerHandler.test.ts @@ -154,6 +154,47 @@ describe("createDrainerHandler", () => { expect(arg.error.raw).toContain("validation failed"); }); + it("propagates the batch association into createFailedTaskRun (so batch parents don't hang on missing children)", async () => { + // Devin's ANALYSIS report on PR #3754: the terminal-failure path + // extracts most snapshot fields (parentTaskRunId, rootTaskRunId, + // depth, etc.) but dropped `batch`. If the original trigger was + // part of a batch, the SYSTEM_FAILURE row isn't associated with + // the batch, so the batch parent's completion-tracking can hang + // indefinitely waiting on a child that landed but isn't linked. + const trigger = vi.fn(async () => { + throw new Error("validation failed: payload too large"); + }); + const createFailedTaskRun = vi.fn(async () => ({ + id: "internal", + friendlyId: "run_x", + })); + const handler = createDrainerHandler({ + engine: { trigger, createFailedTaskRun } as any, + prisma: {} as any, + }); + + await expect( + handler({ + runId: "run_batched", + envId: "env_a", + orgId: "org_1", + payload: { + taskIdentifier: "t", + environment: envFixture, + batch: { id: "batch_xyz", index: 7 }, + }, + attempts: 0, + createdAt: new Date(), + } as any), + ).resolves.toBeUndefined(); + + expect(createFailedTaskRun).toHaveBeenCalledOnce(); + const arg = createFailedTaskRun.mock.calls[0][0] as { + batch?: { id: string; index: number }; + }; + expect(arg.batch).toEqual({ id: "batch_xyz", index: 7 }); + }); + it("rethrows the original error when createFailedTaskRun also fails (PG genuinely unreachable)", async () => { const triggerErr = new Error("engine rejected the snapshot"); const trigger = vi.fn(async () => { diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts index 7af5c346d12..dddaa9d247c 100644 --- a/apps/webapp/test/mollifierStaleSweep.test.ts +++ b/apps/webapp/test/mollifierStaleSweep.test.ts @@ -21,6 +21,7 @@ function makeFakeState() { let cursor = 0; let orgList: string[] = []; const counts = new Map(); + let visited = new Set(); return { readCursor: async () => cursor, writeCursor: async (v: number) => { @@ -38,10 +39,20 @@ function makeFakeState() { else counts.delete(envId); }, readAllEnvStaleCounts: async () => new Map(counts), + markEnvVisited: async (envId: string) => { + visited.add(envId); + }, + reconcileVisited: async () => { + for (const envId of [...counts.keys()]) { + if (!visited.has(envId)) counts.delete(envId); + } + visited = new Set(); + }, clearAll: async () => { cursor = 0; orgList = []; counts.clear(); + visited = new Set(); }, close: async () => {}, }; @@ -363,6 +374,77 @@ describe("runStaleSweepOnce — testcontainers", () => { }, ); + redisTest( + "evicts fully-drained envs from the counts hash at cycle wrap (no permanent alert)", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // Devin's BUG report on PR #3754: an env that drains completely + // between sweep ticks disappears from `mollifier:org-envs:${orgId}` + // entirely, so the inner loop at runStaleSweepOnce never visits it + // and `setEnvStaleCount(envId, 0)` (which HDELs the field) is + // never called. The counts hash retains the env's last-known + // stale count forever, the gauge stays elevated, and the + // recommended alert `> 0 for 5m` fires indefinitely. + // + // Fix: at cycle wrap (cursor returned to 0) HDEL any env in the + // counts hash that wasn't visited during the just-completed cycle. + // Verified here by: + // 1. Flagging env_will_drain stale, confirming it's in the hash + // 2. Draining its only entry — now invisible to listEnvsForOrg + // 3. Running a sweep tick that triggers cycle wrap + // 4. Asserting the env is no longer in the snapshot + const buffer = new MollifierBuffer({ redisOptions }); + const state = new MollifierStaleSweepState({ redisOptions }); + try { + await buffer.accept({ + runId: "run_will_drain", + envId: "env_will_drain", + orgId: "org_will_drain", + payload: JSON.stringify(SNAPSHOT), + }); + const futureNow = Date.now() + 5 * 60 * 1000; + const cfg = { staleThresholdMs: 60 * 1000, maxOrgsPerPass: 10 }; + const spies = spyDeps(); + + // Tick 1: env_will_drain is flagged stale → enters counts hash. + // Cursor wraps to 0 (only 1 org in the list). + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }); + expect(spies.snapshots[0].get("env_will_drain")).toBe(1); + + // Drain the only entry. mollifier:queue:env_will_drain is now + // empty, and the buffer's atomic Lua removes env_will_drain + // from `mollifier:org-envs:org_will_drain` (and removes the org + // from `mollifier:orgs` since it has no other envs). The env is + // now invisible to listEnvsForOrg. + const popped = await buffer.pop("env_will_drain"); + expect(popped?.runId).toBe("run_will_drain"); + + // Tick 2: cursor was 0 after tick 1's wrap, so this rebuilds + // the org list (now empty) and immediately wraps again. The + // wrap-handler must HDEL env_will_drain from the counts hash + // because it wasn't in the visited set for this cycle. + await runStaleSweepOnce(cfg, { + ...spies.deps, + getBuffer: () => buffer, + state, + now: () => futureNow, + }); + expect(spies.snapshots[1].has("env_will_drain")).toBe(false); + // And the durable hash is genuinely empty, not just absent from + // this snapshot. + expect((await state.readAllEnvStaleCounts()).size).toBe(0); + } finally { + await state.close(); + await buffer.close(); + } + }, + ); + redisTest( "scans across multiple orgs", { timeout: 20_000 }, From 4a1bcfe19def128ded3d33255bd2f43597016646 Mon Sep 17 00:00:00 2001 From: Dan Sutton Date: Thu, 28 May 2026 14:17:07 +0100 Subject: [PATCH 13/14] fix(webapp): stale-sweep stop() awaits in-flight tick before closing Redis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devin review on PR #3754: `stop()` previously called `deps.state.close()` immediately after `clearInterval`, but `tick()` only checks `stopped` at entry. A tick that was already past that guard would keep making `state.*` calls against an ioredis client that `stop()` had already `quit()`ed — those calls would throw, the tick's own try/catch would swallow them as `mollifier.stale_sweep.failed` warnings, and every graceful shutdown would emit spurious noise. Track the current tick promise as `currentTick`. `stop()` awaits it (if present) before invoking `state.close()`, so the tick's last state call lands BEFORE the Redis client is quit. The tick's own try/catch handles the (unexpected) case where it rejects; the await in `stop()` is solely for ordering. Also drop the `instanceof MollifierStaleSweepState` guard around `state.close()` — `close()` is part of the `StaleSweepStateStore` contract, so unconditional invocation is correct. Test fake states implement `close()` as a no-op. Test: `stop() waits for an in-flight tick to finish before closing the state` — gates a fake state's `readCursor` on a Deferred, kicks off the interval, waits for the tick to start, then races `stop()` against the gate. Asserts the stop promise stays unresolved while the tick is mid-flight and that the tick's final state operation lands before `close()`. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../mollifier/mollifierStaleSweep.server.ts | 53 +++++--- apps/webapp/test/mollifierStaleSweep.test.ts | 121 +++++++++++++++++- 2 files changed, 158 insertions(+), 16 deletions(-) diff --git a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts index e77b0916b5d..d135824032c 100644 --- a/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierStaleSweep.server.ts @@ -198,20 +198,33 @@ export function startStaleSweepInterval( ): StaleSweepIntervalHandle { let stopped = false; let inFlight = false; + // Tracks the current tick so `stop()` can await it before closing the + // state's Redis client. Without this, a tick that's already past the + // `stopped` guard at entry would continue making `state.*` calls + // against an ioredis client that `stop()` has already `quit()`ed, + // raising errors that the tick's own try/catch then logs as + // `mollifier.stale_sweep.failed` warnings — spurious noise on every + // graceful shutdown. + let currentTick: Promise | null = null; const tick = async () => { if (stopped || inFlight) return; inFlight = true; - try { - await runStaleSweepOnce(config, deps); - } catch (err) { - const log = deps.logger ?? defaultLogger; - log.warn("mollifier.stale_sweep.failed", { - err: err instanceof Error ? err.message : String(err), - }); - } finally { - inFlight = false; - } + const run = (async () => { + try { + await runStaleSweepOnce(config, deps); + } catch (err) { + const log = deps.logger ?? defaultLogger; + log.warn("mollifier.stale_sweep.failed", { + err: err instanceof Error ? err.message : String(err), + }); + } finally { + inFlight = false; + currentTick = null; + } + })(); + currentTick = run; + await run; }; const timer = setInterval(() => { @@ -222,12 +235,22 @@ export function startStaleSweepInterval( stop: async () => { stopped = true; clearInterval(timer); - // Close the durable-state Redis client if the deps own a real - // `MollifierStaleSweepState`. Tests may inject a fake without a - // `close()`; guard accordingly. - if (deps.state instanceof MollifierStaleSweepState) { - await deps.state.close(); + // Drain any tick that started before `stopped` flipped. Its + // `state.*` calls must land before we close the Redis client. + if (currentTick) { + try { + await currentTick; + } catch { + // tick has its own catch — this await is just to ensure + // ordering, not to surface errors that have already been + // logged inside the tick. + } } + // Close the state's underlying resource. The `close()` method is + // part of the `StaleSweepStateStore` contract — production's + // `MollifierStaleSweepState` shuts down its ioredis client; fake + // test states implement a no-op. + await deps.state.close(); }, }; } diff --git a/apps/webapp/test/mollifierStaleSweep.test.ts b/apps/webapp/test/mollifierStaleSweep.test.ts index dddaa9d247c..05a15b042a1 100644 --- a/apps/webapp/test/mollifierStaleSweep.test.ts +++ b/apps/webapp/test/mollifierStaleSweep.test.ts @@ -4,7 +4,10 @@ import { MollifierBuffer } from "@trigger.dev/redis-worker"; vi.mock("~/db.server", () => ({ prisma: {}, $replica: {} })); -import { runStaleSweepOnce } from "~/v3/mollifier/mollifierStaleSweep.server"; +import { + runStaleSweepOnce, + startStaleSweepInterval, +} from "~/v3/mollifier/mollifierStaleSweep.server"; import { MollifierStaleSweepState } from "~/v3/mollifier/mollifierStaleSweepState.server"; const SNAPSHOT = { @@ -799,3 +802,119 @@ describe("MollifierStaleSweepState — direct unit tests", () => { }, ); }); + +describe("startStaleSweepInterval — lifecycle", () => { + it("stop() waits for an in-flight tick to finish before closing the state", async () => { + // Devin's BUG report on PR #3754: `stop()` previously called + // `deps.state.close()` immediately after `clearInterval`, but the + // `tick` function only checks `stopped` at entry. A tick that was + // already past that check would keep making `state.*` Redis calls + // against a now-closed ioredis client, throw, get caught by tick's + // own try/catch, and log a `mollifier.stale_sweep.failed` warning + // for every graceful shutdown. + // + // The fix tracks the current tick promise so `stop()` can await it + // before closing. This test pins that order by gating one of the + // tick's state calls on a Deferred — until we resolve it, the tick + // can't progress, and `stop()` must hang in the meantime. + let resolveGate: () => void = () => {}; + const gate = new Promise((r) => { + resolveGate = r; + }); + + const callOrder: string[] = []; + let closeCalled = false; + const state = { + readCursor: async () => { + callOrder.push("readCursor:start"); + await gate; + callOrder.push("readCursor:end"); + return 0; + }, + writeCursor: async () => { + callOrder.push("writeCursor"); + }, + rebuildOrgList: async () => { + callOrder.push("rebuildOrgList"); + }, + readOrgListSlice: async () => { + callOrder.push("readOrgListSlice"); + // Return zero orgs so the org loop is a no-op — we only care + // about ordering of state calls vs close, not the work. + return { orgs: [] as string[], total: 0 }; + }, + setEnvStaleCount: async () => { + callOrder.push("setEnvStaleCount"); + }, + readAllEnvStaleCounts: async () => { + callOrder.push("readAllEnvStaleCounts"); + return new Map(); + }, + markEnvVisited: async () => { + callOrder.push("markEnvVisited"); + }, + reconcileVisited: async () => { + callOrder.push("reconcileVisited"); + }, + clearAll: async () => { + callOrder.push("clearAll"); + }, + close: async () => { + callOrder.push("close"); + closeCalled = true; + }, + }; + + const fakeBuffer = { + listOrgs: async () => [], + listEnvsForOrg: async () => [], + listEntriesForEnv: async () => [], + } as any; + + const handle = startStaleSweepInterval( + { + intervalMs: 20, + staleThresholdMs: 60_000, + maxOrgsPerPass: 10, + }, + { + state, + getBuffer: () => fakeBuffer, + recordStaleEntry: () => {}, + reportStaleEntrySnapshot: () => {}, + logger: { warn: () => {} }, + now: () => Date.now(), + }, + ); + + // Wait for the interval to fire one tick. The tick will start, call + // readCursor, and then block on `gate`. + await new Promise((r) => setTimeout(r, 80)); + expect(callOrder).toContain("readCursor:start"); + expect(closeCalled).toBe(false); + + // Call stop() concurrently — its promise MUST NOT resolve while the + // tick is still mid-flight. + let stopResolved = false; + const stopPromise = handle.stop().then(() => { + stopResolved = true; + }); + await new Promise((r) => setTimeout(r, 50)); + expect(stopResolved).toBe(false); + expect(closeCalled).toBe(false); + + // Release the gate. The tick can now finish, and only then should + // stop() resolve and close the state. + resolveGate(); + await stopPromise; + expect(stopResolved).toBe(true); + expect(closeCalled).toBe(true); + + // The tick's readCursor:end MUST appear before the close — otherwise + // we closed the Redis client out from under an in-flight tick. + expect(callOrder.indexOf("readCursor:end")).toBeGreaterThan(-1); + expect(callOrder.indexOf("close")).toBeGreaterThan( + callOrder.indexOf("readCursor:end"), + ); + }); +}); From d45191811774d7cd813f5dc973d89f215a60925d Mon Sep 17 00:00:00 2001 From: Daniel Sutton Date: Thu, 28 May 2026 15:04:00 +0100 Subject: [PATCH 14/14] fix(webapp,run-engine): three follow-ups on Devin review (PR #3754) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three unresolved Devin threads, all addressed in this commit. Committed locally only — not pushed. 1. `callWithoutTraceEvents` was inheriting the new `emitRunFailedEvent` default of `true`, so its `createFailedTaskRun` call would fire the `runFailed` bus emit and the listener would write a ClickHouse completion event row with empty `traceId`/`spanId` — orphan row, directly contradicting the method's "without trace events" contract. Pass `emitRunFailedEvent: false` and enqueue `PerformTaskRunAlertsService` directly, mirroring the `call()` pattern so customers' ERROR channels still see the failure. 2. The cjson empty-tags defense lived only on `createCancelledRun`, not on `engine.trigger`. When the mollifier buffer's mutate-side Lua re-serialises a payload (e.g. `append_tags` on a buffered run that never had tags), an empty Lua table encodes as `{}` and decodes back to a JS object — and the previous `tags.length === 0` check passes that object straight to Prisma's `String[]` column. Mirror the same `Array.isArray && tags.length > 0 ? tags : undefined` guard `createCancelledRun` already uses. The defense is symmetric with the existing tested case for createCancelledRun, so the same contract holds for the trigger replay path. 3. `runCancelled` handler's `cancelRunEvent` lookup fails for buffered-only runs (no primary trace event exists, since the mollifier gate skipped `repository.traceEvent` for the not-yet-materialised run). The handler's `tryCatch` swallowed the error, but the systematic `[runCancelled] Failed to cancel run event` log fired on every cancelled buffered run. Add `emitRunCancelledEvent: boolean = true` to `createCancelledRun` (symmetric with the existing `emitRunFailedEvent` flag on `createFailedTaskRun`); drainer handler passes `false`. CANCELED PG row still writes; only the trace-event mirror is skipped. Tests: - `RunEngine.createCancelledRun > emitRunCancelledEvent: false suppresses the bus emit but still writes the CANCELED PG row` — pins the new flag's semantics. - `createDrainerHandler > calls createCancelledRun with emitRunCancelledEvent: false (suppresses orphan trace-event log noise)` — pins the call site's contract. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../services/triggerFailedTask.server.ts | 26 ++++++- .../mollifierDrainerHandler.server.ts | 13 ++-- .../test/mollifierDrainerHandler.test.ts | 39 ++++++++++ .../run-engine/src/engine/index.ts | 77 +++++++++++++------ .../engine/tests/createCancelledRun.test.ts | 51 ++++++++++++ 5 files changed, 177 insertions(+), 29 deletions(-) diff --git a/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts b/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts index a26f6c3e74d..a8a7cbf0f3b 100644 --- a/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts +++ b/apps/webapp/app/runEngine/services/triggerFailedTask.server.ts @@ -291,7 +291,7 @@ export class TriggerFailedTaskService { } } - await this.engine.createFailedTaskRun({ + const failedRun = await this.engine.createFailedTaskRun({ friendlyId: failedRunFriendlyId, environment: { id: opts.environmentId, @@ -313,8 +313,32 @@ export class TriggerFailedTaskService { depth, resumeParentOnCompletion: opts.resumeParentOnCompletion, batch: opts.batch, + // Suppress the engine's `runFailed` bus emit — the listener + // (`runEngineHandlers.server.ts` `runFailed`) calls + // `completeFailedRunEvent`, which writes a ClickHouse trace event + // row keyed on (traceId, spanId). This caller has no trace + // context (the method name is literally `callWithoutTraceEvents`) + // so the emit would write a row with empty traceId/spanId — + // orphan event in the store. We still want alert coverage, + // though, so enqueue directly below. + emitRunFailedEvent: false, }); + // Alerts side of `runFailed` — the engine emit was suppressed + // above so we don't create an orphan trace event; enqueue the + // alert directly so customers' ERROR channels still see the + // failure. Best-effort, mirroring the `call()` path. + try { + await PerformTaskRunAlertsService.enqueue(failedRun.id); + } catch (alertsError) { + logger.warn("TriggerFailedTaskService.callWithoutTraceEvents: alert enqueue failed", { + taskId: opts.taskId, + friendlyId: failedRun.friendlyId, + error: + alertsError instanceof Error ? alertsError.message : String(alertsError), + }); + } + return failedRunFriendlyId; } catch (createError) { logger.error("TriggerFailedTaskService: failed to create pre-failed TaskRun (no trace)", { diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts index 59d21180cce..ad9f9be6d2b 100644 --- a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts +++ b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts @@ -53,11 +53,13 @@ export function createDrainerHandler(deps: { }) : context.active(); - // Cancel-wins-over-trigger. If a cancel API call - // landed on this entry while it was QUEUED, the snapshot carries - // `cancelledAt` + `cancelReason`. Skip the normal materialise path - // and write a CANCELED PG row directly. The existing runCancelled - // handler writes the TaskEvent. + // Cancel-wins-over-trigger. If a cancel API call landed on this + // entry while it was QUEUED, the snapshot carries `cancelledAt` + + // `cancelReason`. Skip the normal materialise path and write a + // CANCELED PG row directly. The `runCancelled` bus emit is + // suppressed here because a buffered-only run never had a primary + // trace event written for it — the runCancelled handler's + // `cancelRunEvent` lookup would fail and log noise per cancel. const cancelledAtStr = typeof input.payload.cancelledAt === "string" ? input.payload.cancelledAt : undefined; if (cancelledAtStr) { @@ -79,6 +81,7 @@ export function createDrainerHandler(deps: { snapshot: input.payload as any, cancelledAt: new Date(cancelledAtStr), cancelReason, + emitRunCancelledEvent: false, }, deps.prisma, ); diff --git a/apps/webapp/test/mollifierDrainerHandler.test.ts b/apps/webapp/test/mollifierDrainerHandler.test.ts index 59c11eb14f8..795582c90ca 100644 --- a/apps/webapp/test/mollifierDrainerHandler.test.ts +++ b/apps/webapp/test/mollifierDrainerHandler.test.ts @@ -222,6 +222,45 @@ describe("createDrainerHandler", () => { expect(createFailedTaskRun).toHaveBeenCalledOnce(); }); + it("calls createCancelledRun with emitRunCancelledEvent: false (suppresses orphan trace-event log noise)", async () => { + // Buffered-only runs never had a primary trace event written for + // them — the mollifier gate skipped `repository.traceEvent` since + // the run hadn't materialised in PG yet. The `runCancelled` handler + // would log `[runCancelled] Failed to cancel run event` for every + // cancelled buffered run if we let the emit fire. Suppress it. + const friendlyId = RunId.generate().friendlyId; + const createCancelledRun = vi.fn(async () => ({ + id: "internal", + friendlyId, + status: "CANCELED", + })); + const handler = createDrainerHandler({ + engine: { createCancelledRun } as any, + prisma: {} as any, + }); + + await handler({ + runId: friendlyId, + envId: "env_a", + orgId: "org_1", + payload: { + friendlyId, + taskIdentifier: "t", + environment: envFixture, + cancelledAt: new Date().toISOString(), + cancelReason: "Canceled by user", + }, + attempts: 0, + createdAt: new Date(), + } as any); + + expect(createCancelledRun).toHaveBeenCalledOnce(); + const arg = createCancelledRun.mock.calls[0][0] as { + emitRunCancelledEvent?: boolean; + }; + expect(arg.emitRunCancelledEvent).toBe(false); + }); + it("honours the cancel when a buffered cancel races a materialised non-CANCELED row", async () => { // Cancel-wins-over-trigger. If the normal trigger // replay path materialised a live PENDING row before the cancel diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index cab2925872f..835ff90cc48 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -458,25 +458,45 @@ export class RunEngine { * Skips: queue insertion (no execution), waitpoint creation (the * mollifier gate refuses to buffer triggerAndWait children, so a * cancelled buffered run never has a waiting parent to unblock), - * concurrency reservation. Emits `runCancelled` so the existing - * TaskEvent handler writes the cancellation event row — the only side - * effect PG-side cancel has today per audit. + * concurrency reservation. Emits `runCancelled` by default — callers + * working on buffered-only runs (no primary trace event exists) can + * opt out via `emitRunCancelledEvent: false` to avoid the systematic + * "Failed to cancel run event" noise the handler would log when its + * `cancelRunEvent` call can't find a span. * * Idempotent: if a row with the same friendlyId already exists (double * drainer pop after requeue), Prisma's P2002 unique-constraint violation * is caught and the existing row is returned. The duplicate runCancelled * emission is skipped — the original drain's emit already wrote the - * TaskEvent. + * TaskEvent (when applicable). */ async createCancelledRun( { snapshot, cancelledAt, cancelReason, + emitRunCancelledEvent = true, }: { snapshot: TriggerParams; cancelledAt: Date; cancelReason: string; + /** + * Whether to emit the `runCancelled` engine-bus event. Defaults to + * true. + * + * Set to `false` for buffered-only runs that never had a primary + * trace event written (the mollifier gate never called + * `repository.traceEvent` for them). The `runCancelled` handler in + * `runEngineHandlers.server.ts` calls `cancelRunEvent`, which + * looks up the run's primary span in the event store — for + * buffered-only runs that span doesn't exist, so the lookup fails, + * the handler's `tryCatch` swallows it, and a "[runCancelled] + * Failed to cancel run event" error is logged for every cancelled + * buffered run. Suppressing the emit avoids that systematic noise. + * The CANCELED PG row is still written; only the trace-event + * mirror is skipped. + */ + emitRunCancelledEvent?: boolean; }, tx?: PrismaClientOrTransaction, ): Promise { @@ -567,24 +587,26 @@ export class RunEngine { }, }); - this.eventBus.emit("runCancelled", { - time: cancelledAt, - run: { - id: taskRun.id, - status: taskRun.status, - friendlyId: taskRun.friendlyId, - spanId: taskRun.spanId, - taskEventStore: taskRun.taskEventStore, - createdAt: taskRun.createdAt, - completedAt: taskRun.completedAt, - error, - updatedAt: taskRun.updatedAt, - attemptNumber: taskRun.attemptNumber ?? 0, - }, - organization: { id: snapshot.environment.organization.id }, - project: { id: snapshot.environment.project.id }, - environment: { id: snapshot.environment.id }, - }); + if (emitRunCancelledEvent) { + this.eventBus.emit("runCancelled", { + time: cancelledAt, + run: { + id: taskRun.id, + status: taskRun.status, + friendlyId: taskRun.friendlyId, + spanId: taskRun.spanId, + taskEventStore: taskRun.taskEventStore, + createdAt: taskRun.createdAt, + completedAt: taskRun.completedAt, + error, + updatedAt: taskRun.updatedAt, + attemptNumber: taskRun.attemptNumber ?? 0, + }, + organization: { id: snapshot.environment.organization.id }, + project: { id: snapshot.environment.project.id }, + environment: { id: snapshot.environment.id }, + }); + } return taskRun; } catch (err) { @@ -819,7 +841,16 @@ export class RunEngine { priorityMs, queueTimestamp: queueTimestamp ?? delayUntil ?? new Date(), ttl: resolvedTtl, - runTags: tags.length === 0 ? undefined : tags, + // Defensive: when the mollifier drainer replays a buffered + // snapshot whose payload was rewritten by a buffer-side Lua + // mutate (e.g. append_tags clears an empty list), cjson + // encodes an empty Lua table as `{}` rather than `[]`. JS + // parses that back as an empty object, and `{}.length` is + // undefined — the original `tags.length === 0` check would + // pass `{}` straight to Prisma's `String[]` column. Mirror + // the same Array.isArray guard that `createCancelledRun` + // uses for symmetry with the trigger replay path. + runTags: Array.isArray(tags) && tags.length > 0 ? tags : undefined, oneTimeUseToken, parentTaskRunId, rootTaskRunId, diff --git a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts index f825cdcf8cf..68662074ea2 100644 --- a/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts +++ b/internal-packages/run-engine/src/engine/tests/createCancelledRun.test.ts @@ -144,6 +144,57 @@ describe("RunEngine.createCancelledRun", () => { }, ); + containerTest( + "emitRunCancelledEvent: false suppresses the bus emit but still writes the CANCELED PG row", + async ({ prisma, redisOptions }) => { + // The mollifier drainer passes `emitRunCancelledEvent: false` for + // buffered-only runs because the runCancelled handler's + // `cancelRunEvent` lookup fails for them (no primary trace event + // span exists — the mollifier gate never called + // `repository.traceEvent` for this run). Without the gate, every + // cancelled buffered run produces a `[runCancelled] Failed to + // cancel run event` error log. This pins the gate's contract: PG + // row still lands, bus emit suppressed. + const env = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const engine = new RunEngine({ prisma, ...baseEngineOptions(redisOptions) }); + const captured: EventBusEventArgs<"runCancelled">[0][] = []; + engine.eventBus.on("runCancelled", (event) => { + captured.push(event); + }); + + try { + const friendlyId = freshRunId(); + const result = await engine.createCancelledRun({ + snapshot: { + friendlyId, + environment: env, + taskIdentifier: "test-task", + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "0000000000000000eeee000000000000", + spanId: "ffff000000000000", + queue: "task/test-task", + isTest: false, + tags: [], + }, + cancelledAt: new Date(), + cancelReason: "Test cancel (silent emit)", + emitRunCancelledEvent: false, + }); + + // PG row still lands. + expect(result.status).toBe("CANCELED"); + expect(result.friendlyId).toBe(friendlyId); + // Bus emit suppressed. + expect(captured).toHaveLength(0); + } finally { + await engine.quit(); + } + }, + ); + containerTest( "idempotent on double-pop: second call returns existing row without re-emitting", async ({ prisma, redisOptions }) => {