From 51b234a7d9c620e8bec4f67a36b344130ef64456 Mon Sep 17 00:00:00 2001 From: Nathan Heskew Date: Thu, 4 Jun 2026 11:30:45 -0700 Subject: [PATCH 1/6] fix: idle-based startup readiness and port-safe teardown recycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses two harness races (HarperFast/integration-testing#8) that cause intermittent failures under sharded/concurrent CI. Race 1 — startup deadline: runHarperCommand used a single wall-clock deadline, so a slow-but-healthy boot was indistinguishable from a hang. Replace it with an idle timeout that resets on every chunk of Harper output (limit = time-since-last -progress), plus a generous CI-aware absolute ceiling as a backstop. New startupMaxMs option and HARPER_INTEGRATION_TEST_STARTUP_MAX_MS env var. Race 2 — teardown loopback recycle: killHarper escalated to SIGKILL after only 200ms and resolved without confirming exit; teardownHarper then immediately recycled the loopback address while Harper's fixed ports could still be held by lingering workers, causing EADDRINUSE/ECONNREFUSED for the next suite. Give a longer, configurable SIGTERM grace (wait for real exit after SIGKILL), and verify the fixed ports are actually free before recycling the address. Adds the repo's first unit tests (node:test) and an npm test script; tests are excluded from the published build. Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 3 +- README.md | 23 +++- package.json | 3 +- src/harperLifecycle.test.ts | 181 +++++++++++++++++++++++++++++++ src/harperLifecycle.ts | 211 ++++++++++++++++++++++++++++++------ src/portUtils.test.ts | 72 ++++++++++++ src/portUtils.ts | 51 +++++++++ tsconfig.build.json | 1 + 8 files changed, 504 insertions(+), 41 deletions(-) create mode 100644 src/harperLifecycle.test.ts create mode 100644 src/portUtils.test.ts create mode 100644 src/portUtils.ts diff --git a/AGENTS.md b/AGENTS.md index af4d15d..e08433d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -21,4 +21,5 @@ Review the `README.md` and `CONTRIBUTING.md` for all relevant repository informa ## Testing Tips - Use `npm link` in this directory and `npm link @harperfast/integration-testing` in other project directories to test out changes locally - Use `npm run check` to type-check the project without generating a build output -- There are currently no tests for this project +- Use `npm test` to run the unit tests (`node --test` over `src/**/*.test.ts`). Tests are excluded from the build (`tsconfig.build.json`) so they are not published. + - To keep them runnable without a real Harper, internal-only helpers are exported from their modules (e.g. `runHarperCommand` in `harperLifecycle.ts`) but intentionally NOT re-exported from `index.ts`, so they stay out of the public API. diff --git a/README.md b/README.md index df592aa..d15763f 100644 --- a/README.md +++ b/README.md @@ -68,16 +68,20 @@ The Harper binary is resolved in the following order: ```ts interface StartHarperOptions { - startupTimeoutMs?: number; // Default: 30000 or HARPER_INTEGRATION_TEST_STARTUP_TIMEOUT_MS + startupTimeoutMs?: number; // Idle timeout: max gap between startup output chunks. Default: 60000 or HARPER_INTEGRATION_TEST_STARTUP_TIMEOUT_MS + startupMaxMs?: number; // Absolute startup ceiling regardless of output. Default: 120000 (300000 under CI) or HARPER_INTEGRATION_TEST_STARTUP_MAX_MS config?: object; // Harper config overrides (passed via HARPER_SET_CONFIG) env?: object; // Additional environment variables for the Harper process harperBinPath?: string; // Explicit path to dist/bin/harper.js } ``` +Startup readiness is detected by Harper printing `successfully started`. Rather than a single wall-clock deadline (which makes a slow-but-healthy boot indistinguishable from a hang), the watchdog uses an **idle timeout** that resets on every chunk of output — so the limit is time-since-last-progress — plus a generous absolute ceiling as a backstop. This is why a slow CI boot that keeps logging no longer trips the timeout. + **Environment Variables:** -- `HARPER_INTEGRATION_TEST_STARTUP_TIMEOUT_MS` - Default startup timeout +- `HARPER_INTEGRATION_TEST_STARTUP_TIMEOUT_MS` - Idle startup timeout: max time between chunks of startup output before Harper is treated as hung (resets on output). Default `60000`. +- `HARPER_INTEGRATION_TEST_STARTUP_MAX_MS` - Absolute ceiling on total startup time, regardless of ongoing output. Default `120000` (`300000` under CI). - `HARPER_INTEGRATION_TEST_INSTALL_PARENT_DIR` - Parent directory for temp Harper install dirs (default: OS tmpdir) - `HARPER_INTEGRATION_TEST_INSTALL_SCRIPT` - Path to Harper CLI script @@ -85,13 +89,22 @@ interface StartHarperOptions { Like `startHarper()`, but copies a component directory into the Harper install before starting, so it's available on first boot without a deploy. -### `killHarper(ctx)` +### `killHarper(ctx, options?)` + +Sends SIGTERM to the Harper process and waits for it to exit, giving it a grace period to shut down cleanly (flush RocksDB, release ports, reap worker children) before escalating to SIGKILL. After SIGKILL it waits briefly for the process to actually exit. Does not release the loopback address or clean up the install directory. Useful for restart scenarios where the test will call `startHarper` again. -Sends SIGTERM to the Harper process and waits for it to exit. Does not release the loopback address or clean up the install directory. Useful for restart scenarios where the test will call `startHarper` again. +`options.graceMs` overrides the SIGTERM→SIGKILL grace period (default `5000`, or `HARPER_INTEGRATION_TEST_TEARDOWN_GRACE_MS`). ### `teardownHarper(ctx)` -Kills Harper, releases the loopback address back to the pool, and removes the install directory. Call in a teardown/`after()` hook. +Kills Harper, waits for its fixed ports to be released on the loopback address, releases the address back to the pool, and removes the install directory. Call in a teardown/`after()` hook. + +The pool only verifies that an address is *bindable* (to an ephemeral port), not that Harper's fixed ports (Operations API, HTTP/S, MQTT/S) are free — and Harper's worker children/sockets can briefly outlive the parent. So before recycling the address, teardown waits until those ports are actually free, preventing `EADDRINUSE`/`ECONNREFUSED` for the next suite that grabs the address. If the ports are still held at the deadline, the address is recycled anyway. + +**Environment Variables:** + +- `HARPER_INTEGRATION_TEST_TEARDOWN_GRACE_MS` - Grace period after SIGTERM before escalating to SIGKILL. Default `5000`. +- `HARPER_INTEGRATION_TEST_PORT_RELEASE_TIMEOUT_MS` - Max time to wait for Harper's ports to be released before recycling the loopback address. Default `5000`. ### `sendOperation(context, operation)` diff --git a/package.json b/package.json index fafc6fe..ab16190 100644 --- a/package.json +++ b/package.json @@ -22,7 +22,8 @@ ], "scripts": { "check": "tsc", - "build": "tsc -p tsconfig.build.json" + "build": "tsc -p tsconfig.build.json", + "test": "node --test \"src/**/*.test.ts\"" }, "engines": { "node": ">=20" diff --git a/src/harperLifecycle.test.ts b/src/harperLifecycle.test.ts new file mode 100644 index 0000000..530dd13 --- /dev/null +++ b/src/harperLifecycle.test.ts @@ -0,0 +1,181 @@ +import { test, before, after } from 'node:test'; +import { ok, strictEqual, match, rejects } from 'node:assert'; +import { spawn, type ChildProcess } from 'node:child_process'; +import { once } from 'node:events'; +import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { killHarper, runHarperCommand, HarperStartupError, type StartedHarperTestContext } from './harperLifecycle.ts'; + +// Standalone scripts used as a fake "Harper binary" (passed via harperBinPath) to drive +// runHarperCommand's startup watchdog through specific timing scenarios without a real Harper. +const FIXTURE_SOURCES: Record = { + // Reports ready, then stays alive like a server. + 'ready.cjs': "process.stdout.write('booting\\n');\nsetTimeout(() => process.stdout.write('successfully started\\n'), 50);\nsetInterval(() => {}, 1000);\n", + // Emits one line, then goes silent forever (hung during startup). + 'idle-hang.cjs': "process.stdout.write('booting\\n');\nsetInterval(() => {}, 1000);\n", + // Emits output continuously but never reports ready. + 'chatty.cjs': "setInterval(() => process.stdout.write('tick\\n'), 100);\n", + // Emits progress every 100ms (each gap well under the idle window) and only reports ready + // after ~800ms — longer than the idle window, so a single total-deadline would have failed. + 'idle-reset.cjs': "let n = 0;\nconst t = setInterval(() => {\n n++;\n process.stdout.write('progress ' + n + '\\n');\n if (n >= 8) { clearInterval(t); process.stdout.write('successfully started\\n'); }\n}, 100);\nsetInterval(() => {}, 1000);\n", + // Exits non-zero. + 'exit-nonzero.cjs': "process.stderr.write('boom\\n');\nprocess.exit(1);\n", +}; + +let fixtureDir: string; +const fixtures: Record = {}; + +before(() => { + fixtureDir = mkdtempSync(join(tmpdir(), 'harper-it-fixtures-')); + for (const [name, src] of Object.entries(FIXTURE_SOURCES)) { + const path = join(fixtureDir, name); + writeFileSync(path, src); + fixtures[name] = path; + } +}); + +after(() => { + rmSync(fixtureDir, { recursive: true, force: true }); +}); + +function fakeCtx(process?: ChildProcess): StartedHarperTestContext { + return { harper: { process } } as unknown as StartedHarperTestContext; +} + +/** Resolves once `needle` has appeared on the child's stdout. */ +function waitForOutput(child: ChildProcess, needle: string): Promise { + return new Promise((resolve) => { + let buffer = ''; + child.stdout?.on('data', (chunk: Buffer) => { + buffer += chunk.toString(); + if (buffer.includes(needle)) resolve(); + }); + }); +} + +// --- Startup watchdog (Race 1) --- + +test('runHarperCommand resolves when the completion message appears', async () => { + const result = await runHarperCommand({ + args: [], + env: {}, + completionMessage: 'successfully started', + harperBinPath: fixtures['ready.cjs'], + timeoutMs: 2000, + maxMs: 10000, + }); + try { + match(result.stdout, /successfully started/); + } finally { + result.process.kill('SIGKILL'); + } +}); + +test('runHarperCommand rejects when output is silent past the idle timeout', async () => { + await rejects( + runHarperCommand({ + args: [], + env: {}, + completionMessage: 'successfully started', + harperBinPath: fixtures['idle-hang.cjs'], + timeoutMs: 300, + maxMs: 10000, + }), + (err: Error) => { + ok(err instanceof HarperStartupError); + match(err.message, /no startup output for 300ms/); + return true; + } + ); +}); + +test('runHarperCommand enforces the absolute max even while output keeps flowing', async () => { + await rejects( + runHarperCommand({ + args: [], + env: {}, + completionMessage: 'successfully started', + harperBinPath: fixtures['chatty.cjs'], + timeoutMs: 2000, + maxMs: 500, + }), + (err: Error) => { + ok(err instanceof HarperStartupError); + match(err.message, /maximum startup time of 500ms/); + return true; + } + ); +}); + +test('runHarperCommand keeps a slow-but-progressing boot alive past the idle window', async () => { + const result = await runHarperCommand({ + args: [], + env: {}, + completionMessage: 'successfully started', + harperBinPath: fixtures['idle-reset.cjs'], + timeoutMs: 400, + maxMs: 10000, + }); + try { + match(result.stdout, /successfully started/); + } finally { + result.process.kill('SIGKILL'); + } +}); + +test('runHarperCommand rejects when the process exits non-zero', async () => { + await rejects( + runHarperCommand({ + args: [], + env: {}, + completionMessage: 'successfully started', + harperBinPath: fixtures['exit-nonzero.cjs'], + timeoutMs: 5000, + maxMs: 10000, + }), + (err: Error) => { + ok(err instanceof HarperStartupError); + match(err.message, /failed with exit code\/signal 1/); + return true; + } + ); +}); + +// --- Teardown kill (Race 2) --- + +test('killHarper terminates a process that exits on SIGTERM, before the grace deadline', async () => { + const child = spawn(process.execPath, ['-e', 'setInterval(() => {}, 1000)']); + await once(child, 'spawn'); + const start = Date.now(); + await killHarper(fakeCtx(child), { graceMs: 2000 }); + strictEqual(child.signalCode, 'SIGTERM'); + ok(Date.now() - start < 1500, 'should exit well before the grace deadline'); +}); + +test('killHarper escalates to SIGKILL when SIGTERM is ignored', async () => { + // Announce 'ready' only after the SIGTERM handler is installed, so the parent doesn't race + // the child's startup and send SIGTERM before the handler exists. + const child = spawn(process.execPath, [ + '-e', + "process.on('SIGTERM', () => {}); process.stdout.write('ready\\n'); setInterval(() => {}, 1000)", + ]); + await waitForOutput(child, 'ready'); + const start = Date.now(); + await killHarper(fakeCtx(child), { graceMs: 200 }); + strictEqual(child.signalCode, 'SIGKILL'); + ok(Date.now() - start >= 150, 'should wait the grace period before escalating to SIGKILL'); +}); + +test('killHarper returns immediately when there is no process', async () => { + await killHarper(fakeCtx(undefined)); + await killHarper({} as unknown as StartedHarperTestContext); +}); + +test('killHarper returns immediately for an already-exited process', async () => { + const child = spawn(process.execPath, ['-e', 'process.exit(0)']); + await once(child, 'exit'); + const start = Date.now(); + await killHarper(fakeCtx(child), { graceMs: 5000 }); + ok(Date.now() - start < 200, 'should not wait the grace period for an already-dead process'); +}); diff --git a/src/harperLifecycle.ts b/src/harperLifecycle.ts index 8adc788..d1cddb8 100644 --- a/src/harperLifecycle.ts +++ b/src/harperLifecycle.ts @@ -5,6 +5,7 @@ import { tmpdir } from 'node:os'; import { mkdtemp, mkdir, rm, cp } from 'node:fs/promises'; import { type SuiteContext, type TestContext } from 'node:test'; import { getNextAvailableLoopbackAddress, releaseLoopbackAddress } from './loopbackAddressPool.ts'; +import { waitForPortsFree } from './portUtils.ts'; import { ok, equal } from 'node:assert'; import { createRequire } from 'node:module'; @@ -47,8 +48,52 @@ const MQTTS_PORT = 8883; export const OPERATIONS_API_PORT = 9925; export const DEFAULT_ADMIN_USERNAME = 'admin'; export const DEFAULT_ADMIN_PASSWORD = 'Abc1234!'; + +/** Every fixed port a Harper test instance may bind, all on its assigned loopback address. */ +const ALL_HARPER_PORTS = [OPERATIONS_API_PORT, HTTP_PORT, HTTPS_PORT, MQTT_PORT, MQTTS_PORT]; + +/** Whether we appear to be running in CI (most CI providers set CI=true). */ +const IS_CI = !!process.env.CI; + +/** + * Maximum time to wait between chunks of Harper startup output before treating the process + * as hung. The startup watchdog resets this window on every chunk of output, so the limit is + * time-since-last-progress rather than total boot time: a slow-but-healthy boot that keeps + * logging never trips it — only true silence does. + * + * Override with `HARPER_INTEGRATION_TEST_STARTUP_TIMEOUT_MS`. Default 60s. + */ export const DEFAULT_STARTUP_TIMEOUT_MS = parseInt(process.env.HARPER_INTEGRATION_TEST_STARTUP_TIMEOUT_MS || '', 10) || 60000; +/** + * Absolute ceiling on total startup time, regardless of ongoing output — a generous backstop + * so a process that chatters forever without ever reporting ready still fails. Higher under CI, + * where shared/contended runners boot more slowly. + * + * Override with `HARPER_INTEGRATION_TEST_STARTUP_MAX_MS`. Default 120s (300s under CI). + */ +export const DEFAULT_STARTUP_MAX_MS = parseInt(process.env.HARPER_INTEGRATION_TEST_STARTUP_MAX_MS || '', 10) || (IS_CI ? 300000 : 120000); + +/** + * Grace period after SIGTERM before escalating to SIGKILL during teardown, giving Harper time + * to shut down cleanly (flush RocksDB, release ports, reap worker children). + * + * Override with `HARPER_INTEGRATION_TEST_TEARDOWN_GRACE_MS`. Default 5s. + */ +export const DEFAULT_TEARDOWN_GRACE_MS = parseInt(process.env.HARPER_INTEGRATION_TEST_TEARDOWN_GRACE_MS || '', 10) || 5000; + +/** + * Maximum time teardown waits for Harper's fixed ports to be released on the loopback address + * before recycling it back to the pool. If the ports are still in use at the deadline, the + * address is recycled anyway (no worse than not waiting). + * + * Override with `HARPER_INTEGRATION_TEST_PORT_RELEASE_TIMEOUT_MS`. Default 5s. + */ +export const DEFAULT_PORT_RELEASE_TIMEOUT_MS = parseInt(process.env.HARPER_INTEGRATION_TEST_PORT_RELEASE_TIMEOUT_MS || '', 10) || 5000; + +/** Short backstop wait for the process 'exit' event after sending SIGKILL during teardown. */ +const SIGKILL_EXIT_WAIT_MS = 1000; + /** * The runtime to use for running Harper during tests. * Set via the HARPER_RUNTIME environment variable ('node' or 'bun'). @@ -68,10 +113,16 @@ export const LOG_DIR_MARKER_PREFIX = '[Harper] Logs for this instance will be st */ export interface StartHarperOptions { /** - * Timeout in milliseconds to wait for Harper to start. - * @default 30000 + * Maximum time (ms) to wait between chunks of startup output before treating Harper as hung. + * Resets on every chunk of output, so it bounds silence, not total boot time. + * Falls back to {@link DEFAULT_STARTUP_TIMEOUT_MS} (60s). */ startupTimeoutMs?: number; + /** + * Absolute ceiling (ms) on total startup time, regardless of ongoing output. + * Falls back to {@link DEFAULT_STARTUP_MAX_MS} (120s locally, 300s under CI). + */ + startupMaxMs?: number; /** * Additional configuration options to pass to the Harper CLI. */ @@ -239,8 +290,10 @@ interface RunHarperCommandOptions { /** When set, stdout and stderr are written to files in this directory */ logDir?: string; harperBinPath?: string; - /** Timeout in milliseconds to wait for the process to complete or emit the completionMessage. Falls back to DEFAULT_STARTUP_TIMEOUT_MS. */ + /** Idle timeout (ms): max time between output chunks before treating the process as hung. Resets on output. Falls back to DEFAULT_STARTUP_TIMEOUT_MS. */ timeoutMs?: number; + /** Absolute timeout (ms): ceiling on total time regardless of output. Falls back to DEFAULT_STARTUP_MAX_MS. */ + maxMs?: number; } interface RunHarperCommandResult { @@ -258,14 +311,17 @@ interface RunHarperCommandResult { * (`stdout.log` and `stderr.log`) in that directory. * * @throws {HarperStartupError} If the command times out or exits with a non-zero status code + * + * Exported for unit testing; not part of the public API (not re-exported from `index.ts`). */ -function runHarperCommand({ +export function runHarperCommand({ args, env, completionMessage, logDir, harperBinPath, timeoutMs, + maxMs, }: RunHarperCommandOptions): Promise { const harperScript = getHarperScript(harperBinPath); const runtime = HARPER_RUNTIME; @@ -284,47 +340,93 @@ function runHarperCommand({ stderrStream = createWriteStream(join(logDir, 'stderr.log')); } - const effectiveTimeout = timeoutMs ?? DEFAULT_STARTUP_TIMEOUT_MS; + const idleTimeoutMs = timeoutMs ?? DEFAULT_STARTUP_TIMEOUT_MS; + const maxTimeoutMs = maxMs ?? DEFAULT_STARTUP_MAX_MS; return new Promise((resolve, reject) => { let stdout = ''; let stderr = ''; - let timer = setTimeout(() => { - reject(new HarperStartupError( - `Harper process timed out after ${effectiveTimeout}ms`, - stdout, - stderr - )); + let settled = false; + let idleTimer: NodeJS.Timeout; + let maxTimer: NodeJS.Timeout; + + const clearTimers = () => { + clearTimeout(idleTimer); + clearTimeout(maxTimer); + }; + + const failStartup = (message: string) => { + if (settled) return; + settled = true; + clearTimers(); + reject(new HarperStartupError(message, stdout, stderr)); proc.kill(); - }, effectiveTimeout); + }; + + const succeed = () => { + if (settled) return; + settled = true; + clearTimers(); + resolve({ process: proc, stdout, stderr }); + }; + + // Reset on every chunk of output so the limit is time-since-last-progress, not total boot + // time: a slow-but-healthy boot that keeps logging never trips it, only true silence does. + const resetIdleTimer = () => { + if (settled) return; + clearTimeout(idleTimer); + idleTimer = setTimeout( + () => failStartup(`Harper produced no startup output for ${idleTimeoutMs}ms before reporting ready (likely hung)`), + idleTimeoutMs + ); + }; + + // Absolute backstop, regardless of ongoing output. + maxTimer = setTimeout( + () => failStartup(`Harper did not report ready within the maximum startup time of ${maxTimeoutMs}ms`), + maxTimeoutMs + ); + resetIdleTimer(); proc.stdout?.on('data', (data: Buffer) => { const dataString = stripAnsi(data.toString()); stdoutStream?.write(dataString); + // Once ready, keep streaming logs to disk but stop the watchdog and capture: the + // returned startupOutput is a snapshot taken at readiness, and the server may run + // (and log) for the rest of the suite. + if (settled) return; + resetIdleTimer(); + stdout += dataString; if (completionMessage && dataString.includes(completionMessage)) { - clearTimeout(timer); - resolve({ process: proc, stdout, stderr }); + succeed(); } - stdout += dataString; }); proc.stderr?.on('data', (data: Buffer) => { const dataString = stripAnsi(data.toString()); stderrStream?.write(dataString); + if (settled) return; + resetIdleTimer(); stderr += dataString; }); proc.on('error', (error) => { + if (settled) return; + settled = true; + clearTimers(); reject(error); }); proc.on('exit', (statusCode, signal) => { - clearTimeout(timer); - if (statusCode === 0) { - resolve({ process: proc, stdout, stderr }); - } else { - const errorMessage = `Harper process failed with exit code/signal ${statusCode ?? signal}`; - stderrStream?.write(errorMessage); - reject(new HarperStartupError(errorMessage, stdout, stderr)); + clearTimers(); + if (!settled) { + settled = true; + if (statusCode === 0) { + resolve({ process: proc, stdout, stderr }); + } else { + const errorMessage = `Harper process failed with exit code/signal ${statusCode ?? signal}`; + stderrStream?.write(errorMessage); + reject(new HarperStartupError(errorMessage, stdout, stderr)); + } } stdoutStream?.end(); stderrStream?.end(); @@ -446,6 +548,7 @@ export async function startHarper(ctx: HarperTestContext, options?: StartHarperO logDir, harperBinPath: options?.harperBinPath, timeoutMs: options?.startupTimeoutMs, + maxMs: options?.startupMaxMs, }); ctx.harper = { @@ -466,26 +569,52 @@ export async function startHarper(ctx: HarperTestContext, options?: StartHarperO } /** - * Kill harper process (can be used for teardown, or killing it before a restart) + * Kill harper process (can be used for teardown, or killing it before a restart). + * + * Sends SIGTERM first and gives Harper a grace period to shut down cleanly (flush RocksDB, + * release ports, reap worker children) before escalating to SIGKILL. After SIGKILL it waits + * briefly for the process to actually exit, so callers can rely on it being gone. + * * @param ctx + * @param options.graceMs Time to wait after SIGTERM before sending SIGKILL. Defaults to + * {@link DEFAULT_TEARDOWN_GRACE_MS}. */ -export async function killHarper(ctx: StartedHarperTestContext): Promise { - if (!ctx.harper?.process) return; +export async function killHarper(ctx: StartedHarperTestContext, options?: { graceMs?: number }): Promise { + const proc = ctx.harper?.process; + if (!proc) return; + // Already exited — nothing to do. + if (proc.exitCode !== null || proc.signalCode !== null) return; + + const graceMs = options?.graceMs ?? DEFAULT_TEARDOWN_GRACE_MS; + await new Promise((resolve) => { - let timer: NodeJS.Timeout; - ctx.harper.process.on('exit', () => { + let done = false; + let sigkillTimer: NodeJS.Timeout; + let backstopTimer: NodeJS.Timeout; + + const finish = () => { + if (done) return; + done = true; + clearTimeout(sigkillTimer); + clearTimeout(backstopTimer); resolve(); - clearTimeout(timer); - }); - ctx.harper.process.kill(); - timer = setTimeout(() => { + }; + + proc.once('exit', finish); + + // Ask Harper to shut down cleanly first. + proc.kill(); + + // If it hasn't exited within the grace period, force-kill and wait briefly for the + // 'exit' event before resolving (with a backstop in case it never fires). + sigkillTimer = setTimeout(() => { try { - ctx.harper.process.kill('SIGKILL'); + proc.kill('SIGKILL'); } catch { // possible that the process terminated but the exit event hasn't fired yet } - resolve(); - }, 200); + backstopTimer = setTimeout(finish, SIGKILL_EXIT_WAIT_MS); + }, graceMs); }); } @@ -513,6 +642,20 @@ export async function teardownHarper(ctx: StartedHarperTestContext): Promise { + return new Promise((resolve, reject) => { + const server = createServer(); + server.once('error', reject); + server.listen(port, host, () => resolve(server)); + }); +} + +function close(server: Server): Promise { + return new Promise((resolve) => server.close(() => resolve())); +} + +/** Asks the OS for a free port by binding to port 0 and reading the assigned port. */ +async function getEphemeralPort(host: string): Promise { + const server = await listen(host, 0); + const address = server.address(); + const port = typeof address === 'object' && address ? address.port : 0; + await close(server); + return port; +} + +test('isPortFree returns true for an unbound port', async () => { + const port = await getEphemeralPort(HOST); + strictEqual(await isPortFree(HOST, port), true); +}); + +test('isPortFree returns false for a bound port', async () => { + const port = await getEphemeralPort(HOST); + const server = await listen(HOST, port); + try { + strictEqual(await isPortFree(HOST, port), false); + } finally { + await close(server); + } +}); + +test('waitForPortsFree resolves true immediately when all ports are free', async () => { + const a = await getEphemeralPort(HOST); + const b = await getEphemeralPort(HOST); + const start = Date.now(); + const freed = await waitForPortsFree(HOST, [a, b], 1000, 50); + ok(freed, 'expected ports to be reported free'); + ok(Date.now() - start < 500, 'should not have polled/waited'); +}); + +test('waitForPortsFree waits until a held port is released, then resolves true', async () => { + const port = await getEphemeralPort(HOST); + const server = await listen(HOST, port); + // Release the port shortly after we start waiting. + setTimeout(() => void close(server), 200); + const freed = await waitForPortsFree(HOST, [port], 2000, 50); + ok(freed, 'expected the port to become free before the timeout'); +}); + +test('waitForPortsFree resolves false when a port stays held past the timeout', async () => { + const port = await getEphemeralPort(HOST); + const server = await listen(HOST, port); + try { + const freed = await waitForPortsFree(HOST, [port], 300, 50); + strictEqual(freed, false); + } finally { + await close(server); + } +}); diff --git a/src/portUtils.ts b/src/portUtils.ts new file mode 100644 index 0000000..54ccd98 --- /dev/null +++ b/src/portUtils.ts @@ -0,0 +1,51 @@ +import { createServer } from 'node:net'; +import { setTimeout as sleep } from 'node:timers/promises'; + +/** + * Checks whether a TCP port can be bound on a given host (i.e. the port is free). + * + * Attempts to bind a throwaway server to `host:port`; resolves `true` if the bind + * succeeds (the server is closed immediately) and `false` if it fails (e.g. the port + * is still held by another process or socket). + * + * @param host The host/address to bind on (e.g. "127.0.0.2") + * @param port The port to test + * @returns A promise resolving to `true` if the port is free, `false` otherwise + */ +export function isPortFree(host: string, port: number): Promise { + return new Promise((resolve) => { + const server = createServer(); + server.once('error', () => resolve(false)); + server.listen(port, host, () => { + server.close(() => resolve(true)); + }); + }); +} + +/** + * Polls until every `host:port` in `ports` is free, or `timeoutMs` elapses. + * + * Unlike binding to an ephemeral port (which only proves the *address* is usable), this + * verifies the specific fixed ports a Harper instance binds are actually released, which + * is what the next test suite needs before it can reuse the address. + * + * @param host The host/address the ports are bound on (e.g. "127.0.0.2") + * @param ports The fixed ports to wait on + * @param timeoutMs Maximum time to wait for all ports to become free + * @param pollIntervalMs Delay between polls (default 100ms) + * @returns `true` if all ports became free within the timeout, `false` if it gave up + */ +export async function waitForPortsFree( + host: string, + ports: number[], + timeoutMs: number, + pollIntervalMs = 100 +): Promise { + const deadline = Date.now() + timeoutMs; + for (;;) { + const results = await Promise.all(ports.map((port) => isPortFree(host, port))); + if (results.every(Boolean)) return true; + if (Date.now() >= deadline) return false; + await sleep(pollIntervalMs); + } +} diff --git a/tsconfig.build.json b/tsconfig.build.json index 12dc414..802eb97 100644 --- a/tsconfig.build.json +++ b/tsconfig.build.json @@ -1,5 +1,6 @@ { "extends": "./tsconfig.json", + "exclude": ["src/**/*.test.ts"], "compilerOptions": { "noEmit": false, "sourceMap": true, From 2bb471b38310773a40e992b37d21fefea067cd9e Mon Sep 17 00:00:00 2001 From: Nathan Heskew Date: Thu, 4 Jun 2026 14:49:08 -0700 Subject: [PATCH 2/6] fix: address cross-model review findings Gemini: - Match the startup completion marker against accumulated stdout, not a single chunk, so a marker split across stream chunks is still detected. - startHarper now waits for the fixed ports to be free before spawning, closing the killHarper()->startHarper() restart race (no-op for fresh pool addresses). - killHarper unregisters its 'exit' listener in finish() so it can't leak if the SIGKILL backstop fires. - Close the log streams in runHarperCommand's spawn-error handler ('exit' never fires for a failed spawn), avoiding leaked file descriptors. - Document the startupTimeoutMs semantic change (absolute deadline -> idle window). Codex: - Raise devEngines runtime to >=22.18.0 so `npm test` (which runs .ts directly via native type stripping) works across the declared contributor Node range. - Update CONTRIBUTING.md / AGENTS.md for the new `npm test` command and Node req. Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 2 +- CONTRIBUTING.md | 5 ++++- README.md | 2 ++ package.json | 2 +- src/harperLifecycle.ts | 18 +++++++++++++++++- 5 files changed, 25 insertions(+), 4 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index e08433d..5331df9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -3,7 +3,7 @@ Review the `README.md` and `CONTRIBUTING.md` for all relevant repository information. ## Development Tips -- Ensure you're on at least Node.js v22 or greater when contributing +- Ensure you're on at least Node.js v22.18 or greater when contributing (native TypeScript stripping for `npm test` requires it; see `devEngines`) - Use `npm install` to install dependencies - Use `npm run build` to build the project - Do not run `npm version` or `npm publish`; these commands are for humans only. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3c78fda..5a3b260 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,9 +22,12 @@ There are two `tsconfig` files: ```sh npm run check # Type-check only (no output) npm run build # Compile src/ → dist/ +npm test # Run the unit tests (node:test over src/**/*.test.ts) ``` -There are no automated tests in this package yet. Validation is type-checking plus manual testing via dependent projects. +Unit tests live alongside the source as `src/*.test.ts` and run on the built-in Node.js test runner via `npm test`. They execute the `.ts` files directly using Node's native type stripping, which requires **Node 22.18+** (reflected in `devEngines`). Tests are excluded from the published build (`tsconfig.build.json`), so they are not shipped. Beyond unit tests, validation also includes type-checking and manual testing via dependent projects. + +Internal-only helpers may be exported from their modules for testing (e.g. `runHarperCommand` in `harperLifecycle.ts`) but are deliberately **not** re-exported from `src/index.ts`, keeping them out of the public API. ## Releases diff --git a/README.md b/README.md index d15763f..cd3f614 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,8 @@ interface StartHarperOptions { Startup readiness is detected by Harper printing `successfully started`. Rather than a single wall-clock deadline (which makes a slow-but-healthy boot indistinguishable from a hang), the watchdog uses an **idle timeout** that resets on every chunk of output — so the limit is time-since-last-progress — plus a generous absolute ceiling as a backstop. This is why a slow CI boot that keeps logging no longer trips the timeout. +> **Behavior change:** `startupTimeoutMs` (and `HARPER_INTEGRATION_TEST_STARTUP_TIMEOUT_MS`) previously meant an absolute startup deadline. It now means the **idle / no-output window**. If you relied on it as a hard ceiling to fail slow boots quickly, set `startupMaxMs` (or `HARPER_INTEGRATION_TEST_STARTUP_MAX_MS`) instead. + **Environment Variables:** - `HARPER_INTEGRATION_TEST_STARTUP_TIMEOUT_MS` - Idle startup timeout: max time between chunks of startup output before Harper is treated as hung (resets on output). Default `60000`. diff --git a/package.json b/package.json index ab16190..f8156be 100644 --- a/package.json +++ b/package.json @@ -31,7 +31,7 @@ "devEngines": { "runtime": { "name": "node", - "version": ">=22", + "version": ">=22.18.0", "onFail": "error" }, "packageManager": { diff --git a/src/harperLifecycle.ts b/src/harperLifecycle.ts index d1cddb8..a66ce4e 100644 --- a/src/harperLifecycle.ts +++ b/src/harperLifecycle.ts @@ -397,7 +397,9 @@ export function runHarperCommand({ if (settled) return; resetIdleTimer(); stdout += dataString; - if (completionMessage && dataString.includes(completionMessage)) { + // Match against the accumulated output, not just this chunk, so a marker split across + // two stream chunks is still detected. + if (completionMessage && stdout.includes(completionMessage)) { succeed(); } }); @@ -414,6 +416,9 @@ export function runHarperCommand({ if (settled) return; settled = true; clearTimers(); + // 'exit' won't fire for a failed spawn, so close the log streams here to avoid leaking FDs. + stdoutStream?.end(); + stderrStream?.end(); reject(error); }); proc.on('exit', (statusCode, signal) => { @@ -541,6 +546,16 @@ export async function startHarper(ctx: HarperTestContext, options?: StartHarperO ...options?.env, }; + // Defend against a prior instance's ports still being held on this address before we spawn — + // e.g. a quick killHarper()→startHarper() restart (which bypasses teardown's port-free wait), + // or a crashed earlier suite. The pool only guarantees the address is bindable to an ephemeral + // port, not that Harper's fixed ports are free, so without this the new instance can hit + // EADDRINUSE. For a freshly-allocated address the ports are already free and this returns at once. + const portsFree = await waitForPortsFree(loopbackAddress, ALL_HARPER_PORTS, DEFAULT_PORT_RELEASE_TIMEOUT_MS); + if (!portsFree) { + console.warn(`Harper ports on ${loopbackAddress} still in use after ${DEFAULT_PORT_RELEASE_TIMEOUT_MS}ms; starting anyway`); + } + const result = await runHarperCommand({ args, env: harperEnv, @@ -595,6 +610,7 @@ export async function killHarper(ctx: StartedHarperTestContext, options?: { grac const finish = () => { if (done) return; done = true; + proc.off('exit', finish); clearTimeout(sigkillTimer); clearTimeout(backstopTimer); resolve(); From 382eac8ddda394e7a0d7fb5426516d52d2b0d774 Mon Sep 17 00:00:00 2001 From: Nathan Heskew Date: Thu, 4 Jun 2026 15:24:19 -0700 Subject: [PATCH 3/6] refactor: move tests from src/ to test/ to match Harper conventions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test files don't belong in src/ (the compiled tree) and no Harper repo lays them out that way — the peer oauth package uses test/, core uses unitTests/. Move both *.test.ts into test/, point `npm test` at test/**/*.test.ts, have tsconfig.json type-check both src/ and test/ while tsconfig.build.json narrows the emit to src/ (so tests are never published). Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 2 +- CONTRIBUTING.md | 4 ++-- package.json | 2 +- {src => test}/harperLifecycle.test.ts | 2 +- {src => test}/portUtils.test.ts | 2 +- tsconfig.build.json | 2 +- tsconfig.json | 3 ++- 7 files changed, 9 insertions(+), 8 deletions(-) rename {src => test}/harperLifecycle.test.ts (99%) rename {src => test}/portUtils.test.ts (97%) diff --git a/AGENTS.md b/AGENTS.md index 5331df9..d6cc0a2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -21,5 +21,5 @@ Review the `README.md` and `CONTRIBUTING.md` for all relevant repository informa ## Testing Tips - Use `npm link` in this directory and `npm link @harperfast/integration-testing` in other project directories to test out changes locally - Use `npm run check` to type-check the project without generating a build output -- Use `npm test` to run the unit tests (`node --test` over `src/**/*.test.ts`). Tests are excluded from the build (`tsconfig.build.json`) so they are not published. +- Use `npm test` to run the tests (`node --test` over `test/**/*.test.ts`). Tests live in `test/`, separate from `src/`, so the build (which emits only `src/`) never includes them. - To keep them runnable without a real Harper, internal-only helpers are exported from their modules (e.g. `runHarperCommand` in `harperLifecycle.ts`) but intentionally NOT re-exported from `index.ts`, so they stay out of the public API. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5a3b260..9d69b22 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,10 +22,10 @@ There are two `tsconfig` files: ```sh npm run check # Type-check only (no output) npm run build # Compile src/ → dist/ -npm test # Run the unit tests (node:test over src/**/*.test.ts) +npm test # Run the tests (node:test over test/**/*.test.ts) ``` -Unit tests live alongside the source as `src/*.test.ts` and run on the built-in Node.js test runner via `npm test`. They execute the `.ts` files directly using Node's native type stripping, which requires **Node 22.18+** (reflected in `devEngines`). Tests are excluded from the published build (`tsconfig.build.json`), so they are not shipped. Beyond unit tests, validation also includes type-checking and manual testing via dependent projects. +Tests live in `test/` (separate from `src/`) and run on the built-in Node.js test runner via `npm test`. They execute the `.ts` files directly using Node's native type stripping, which requires **Node 22.18+** (reflected in `devEngines`). Because they live outside `src/`, the published build (which emits only `src/` → `dist/`) never includes them. `tsconfig.json` type-checks both `src/` and `test/`; `tsconfig.build.json` narrows the build to `src/`. Beyond these tests, validation also includes type-checking and manual testing via dependent projects. Internal-only helpers may be exported from their modules for testing (e.g. `runHarperCommand` in `harperLifecycle.ts`) but are deliberately **not** re-exported from `src/index.ts`, keeping them out of the public API. diff --git a/package.json b/package.json index f8156be..6eb8577 100644 --- a/package.json +++ b/package.json @@ -23,7 +23,7 @@ "scripts": { "check": "tsc", "build": "tsc -p tsconfig.build.json", - "test": "node --test \"src/**/*.test.ts\"" + "test": "node --test \"test/**/*.test.ts\"" }, "engines": { "node": ">=20" diff --git a/src/harperLifecycle.test.ts b/test/harperLifecycle.test.ts similarity index 99% rename from src/harperLifecycle.test.ts rename to test/harperLifecycle.test.ts index 530dd13..f91a5ad 100644 --- a/src/harperLifecycle.test.ts +++ b/test/harperLifecycle.test.ts @@ -5,7 +5,7 @@ import { once } from 'node:events'; import { mkdtempSync, writeFileSync, rmSync } from 'node:fs'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; -import { killHarper, runHarperCommand, HarperStartupError, type StartedHarperTestContext } from './harperLifecycle.ts'; +import { killHarper, runHarperCommand, HarperStartupError, type StartedHarperTestContext } from '../src/harperLifecycle.ts'; // Standalone scripts used as a fake "Harper binary" (passed via harperBinPath) to drive // runHarperCommand's startup watchdog through specific timing scenarios without a real Harper. diff --git a/src/portUtils.test.ts b/test/portUtils.test.ts similarity index 97% rename from src/portUtils.test.ts rename to test/portUtils.test.ts index a043733..4fe9117 100644 --- a/src/portUtils.test.ts +++ b/test/portUtils.test.ts @@ -1,7 +1,7 @@ import { test } from 'node:test'; import { ok, strictEqual } from 'node:assert'; import { createServer, type Server } from 'node:net'; -import { isPortFree, waitForPortsFree } from './portUtils.ts'; +import { isPortFree, waitForPortsFree } from '../src/portUtils.ts'; const HOST = '127.0.0.1'; diff --git a/tsconfig.build.json b/tsconfig.build.json index 802eb97..4fc3b1d 100644 --- a/tsconfig.build.json +++ b/tsconfig.build.json @@ -1,6 +1,6 @@ { "extends": "./tsconfig.json", - "exclude": ["src/**/*.test.ts"], + "include": ["src/**/*"], "compilerOptions": { "noEmit": false, "sourceMap": true, diff --git a/tsconfig.json b/tsconfig.json index 57d1652..fd25f38 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,6 +1,7 @@ { "include": [ - "src/**/*" + "src/**/*", + "test/**/*" ], "compilerOptions": { // Allow JS files for mixed codebase From 6ecc58ed3b824557aa021fbed500bdfded7d33b3 Mon Sep 17 00:00:00 2001 From: Nathan Heskew Date: Fri, 5 Jun 2026 10:55:20 -0700 Subject: [PATCH 4/6] rework Race 2 per review: kill the process tree (root cause), demote port-poll to assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Ethan's review: the loopback pool already serializes address ownership, so the collision was the *temporal* recycle race — the address was unlocked before the previous tenant's ports were actually free. The real fix is in killHarper, not a port poll. - Spawn Harper detached (POSIX) so it leads its own process group; killHarper now signals the whole tree (negative PID on POSIX, `taskkill /T` on Windows), SIGTERM → grace → SIGKILL, then waits for exit. A dead tree releases its listening sockets, so ports are free once killHarper returns. - Drop the proactive startHarper pre-spawn port wait (redundant given a complete kill). - Keep waitForPortsFree only as a warn-only post-kill assertion in teardownHarper: it should pass instantly; a warning means a child process escaped the tree kill. - Add a test proving the group kill reaps a child process, not just the parent. Note: Harper's listeners run in worker_threads (in-process), so they die with the parent regardless; the tree kill is belt-and-suspenders for component-spawned child processes. Windows taskkill path is untested locally. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/harperLifecycle.ts | 98 +++++++++++++++++++++++++----------- test/harperLifecycle.test.ts | 61 ++++++++++++++++++++-- 2 files changed, 124 insertions(+), 35 deletions(-) diff --git a/src/harperLifecycle.ts b/src/harperLifecycle.ts index a66ce4e..608c998 100644 --- a/src/harperLifecycle.ts +++ b/src/harperLifecycle.ts @@ -83,9 +83,11 @@ export const DEFAULT_STARTUP_MAX_MS = parseInt(process.env.HARPER_INTEGRATION_TE export const DEFAULT_TEARDOWN_GRACE_MS = parseInt(process.env.HARPER_INTEGRATION_TEST_TEARDOWN_GRACE_MS || '', 10) || 5000; /** - * Maximum time teardown waits for Harper's fixed ports to be released on the loopback address - * before recycling it back to the pool. If the ports are still in use at the deadline, the - * address is recycled anyway (no worse than not waiting). + * Time teardown's safety assertion waits for Harper's fixed ports to be free before recycling the + * loopback address. Killing Harper's process tree (and waiting for exit) should free them + * immediately, so this normally returns on the first check; it only matters if a child process + * escaped the kill. If the ports are still in use at the deadline, the address is recycled anyway + * (no worse than not waiting) and a warning is logged. * * Override with `HARPER_INTEGRATION_TEST_PORT_RELEASE_TIMEOUT_MS`. Default 5s. */ @@ -331,6 +333,11 @@ export function runHarperCommand({ : ['--trace-warnings', '--force-node-api-uncaught-exceptions-policy=true', harperScript, ...args]; const proc = spawn(runtime, runtimeArgs, { env: { ...process.env, ...env }, + // On POSIX, run Harper as its own process-group leader so teardown can signal the whole + // group (parent + any worker children), not just the direct child. Windows has no process + // groups; killHarper uses `taskkill /T` there instead. stdio stays piped (not detached), + // and we never unref, so output capture and lifetime management are unchanged. + detached: process.platform !== 'win32', }); let stdoutStream: WriteStream | undefined; @@ -546,16 +553,6 @@ export async function startHarper(ctx: HarperTestContext, options?: StartHarperO ...options?.env, }; - // Defend against a prior instance's ports still being held on this address before we spawn — - // e.g. a quick killHarper()→startHarper() restart (which bypasses teardown's port-free wait), - // or a crashed earlier suite. The pool only guarantees the address is bindable to an ephemeral - // port, not that Harper's fixed ports are free, so without this the new instance can hit - // EADDRINUSE. For a freshly-allocated address the ports are already free and this returns at once. - const portsFree = await waitForPortsFree(loopbackAddress, ALL_HARPER_PORTS, DEFAULT_PORT_RELEASE_TIMEOUT_MS); - if (!portsFree) { - console.warn(`Harper ports on ${loopbackAddress} still in use after ${DEFAULT_PORT_RELEASE_TIMEOUT_MS}ms; starting anyway`); - } - const result = await runHarperCommand({ args, env: harperEnv, @@ -583,12 +580,56 @@ export async function startHarper(ctx: HarperTestContext, options?: StartHarperO return ctx as StartedHarperTestContext; } +/** + * Signals Harper's entire process tree, not just the direct child. + * + * Harper runs its listeners in worker threads (in-process), but components can also spawn + * child processes, so we target the whole tree to be safe: + * - POSIX: Harper is spawned as its own process-group leader (`detached`), so a negative PID + * signals the group (parent + descendants). + * - Windows: there are no process groups, so we shell out to `taskkill /T` (tree). `/F` (force) + * is required to actually terminate console processes, so it is used for the SIGKILL step. + * + * Best-effort: errors (e.g. the process already exited) are swallowed; teardown's port assertion + * and the wait-for-exit are the safety nets. + */ +function signalHarperTree(proc: ChildProcess, signal: 'SIGTERM' | 'SIGKILL'): void { + const pid = proc.pid; + if (pid === undefined) return; + if (process.platform === 'win32') { + const args = ['/pid', String(pid), '/T']; + if (signal === 'SIGKILL') args.push('/F'); + try { + spawn('taskkill', args, { stdio: 'ignore' }).on('error', () => {}); + } catch { + try { + proc.kill(signal); + } catch { + // process already gone + } + } + return; + } + try { + // Negative PID => signal the whole process group (proc is the group leader via `detached`). + process.kill(-pid, signal); + } catch { + // Group already gone, or (defensively) not a group leader — fall back to the direct child. + try { + proc.kill(signal); + } catch { + // process already gone + } + } +} + /** * Kill harper process (can be used for teardown, or killing it before a restart). * - * Sends SIGTERM first and gives Harper a grace period to shut down cleanly (flush RocksDB, - * release ports, reap worker children) before escalating to SIGKILL. After SIGKILL it waits - * briefly for the process to actually exit, so callers can rely on it being gone. + * Sends SIGTERM to Harper's whole process tree first and gives it a grace period to shut down + * cleanly (flush RocksDB, release ports, reap worker children) before escalating to SIGKILL. + * After SIGKILL it waits briefly for the process to actually exit, so callers can rely on it + * being gone — and, since a dead process releases its listening sockets, on its ports being free. * * @param ctx * @param options.graceMs Time to wait after SIGTERM before sending SIGKILL. Defaults to @@ -618,17 +659,13 @@ export async function killHarper(ctx: StartedHarperTestContext, options?: { grac proc.once('exit', finish); - // Ask Harper to shut down cleanly first. - proc.kill(); + // Ask the whole Harper tree to shut down cleanly first. + signalHarperTree(proc, 'SIGTERM'); - // If it hasn't exited within the grace period, force-kill and wait briefly for the + // If it hasn't exited within the grace period, force-kill the tree and wait briefly for the // 'exit' event before resolving (with a backstop in case it never fires). sigkillTimer = setTimeout(() => { - try { - proc.kill('SIGKILL'); - } catch { - // possible that the process terminated but the exit event hasn't fired yet - } + signalHarperTree(proc, 'SIGKILL'); backstopTimer = setTimeout(finish, SIGKILL_EXIT_WAIT_MS); }, graceMs); }); @@ -658,16 +695,17 @@ export async function teardownHarper(ctx: StartedHarperTestContext): Promise { }); } +/** Resolves with the regex match once it appears on the child's stdout. */ +function waitForMatch(child: ChildProcess, regex: RegExp): Promise { + return new Promise((resolve) => { + let buffer = ''; + child.stdout?.on('data', (chunk: Buffer) => { + buffer += chunk.toString(); + const matched = buffer.match(regex); + if (matched) resolve(matched); + }); + }); +} + +/** Polls (signal 0) until `pid` no longer exists, or the timeout elapses. */ +async function waitProcessGone(pid: number, timeoutMs: number): Promise { + const deadline = Date.now() + timeoutMs; + for (;;) { + try { + process.kill(pid, 0); + } catch { + return true; // ESRCH — process is gone + } + if (Date.now() >= deadline) return false; + await sleep(50); + } +} + // --- Startup watchdog (Race 1) --- test('runHarperCommand resolves when the completion message appears', async () => { @@ -144,8 +171,11 @@ test('runHarperCommand rejects when the process exits non-zero', async () => { // --- Teardown kill (Race 2) --- +// Children are spawned `detached` to mirror how Harper is spawned: as a process-group leader, +// so killHarper's group signal (negative PID on POSIX) targets the whole tree. + test('killHarper terminates a process that exits on SIGTERM, before the grace deadline', async () => { - const child = spawn(process.execPath, ['-e', 'setInterval(() => {}, 1000)']); + const child = spawn(process.execPath, ['-e', 'setInterval(() => {}, 1000)'], { detached: true }); await once(child, 'spawn'); const start = Date.now(); await killHarper(fakeCtx(child), { graceMs: 2000 }); @@ -156,10 +186,11 @@ test('killHarper terminates a process that exits on SIGTERM, before the grace de test('killHarper escalates to SIGKILL when SIGTERM is ignored', async () => { // Announce 'ready' only after the SIGTERM handler is installed, so the parent doesn't race // the child's startup and send SIGTERM before the handler exists. - const child = spawn(process.execPath, [ - '-e', - "process.on('SIGTERM', () => {}); process.stdout.write('ready\\n'); setInterval(() => {}, 1000)", - ]); + const child = spawn( + process.execPath, + ['-e', "process.on('SIGTERM', () => {}); process.stdout.write('ready\\n'); setInterval(() => {}, 1000)"], + { detached: true } + ); await waitForOutput(child, 'ready'); const start = Date.now(); await killHarper(fakeCtx(child), { graceMs: 200 }); @@ -167,6 +198,26 @@ test('killHarper escalates to SIGKILL when SIGTERM is ignored', async () => { ok(Date.now() - start >= 150, 'should wait the grace period before escalating to SIGKILL'); }); +test('killHarper kills the whole process tree, not just the direct child', { skip: process.platform === 'win32' }, async () => { + // A detached parent that spawns its own (non-detached) child, so the child shares the parent's + // process group. A group-targeted kill must reap the child too — the core of the fix. + const parent = spawn( + process.execPath, + [ + '-e', + "const{spawn}=require('node:child_process');" + + "const c=spawn(process.execPath,['-e','setInterval(()=>{},1000)'],{stdio:'ignore'});" + + "process.stdout.write('child:'+c.pid+'\\n');" + + 'setInterval(()=>{},1000)', + ], + { detached: true } + ); + const childPid = parseInt((await waitForMatch(parent, /child:(\d+)/))[1], 10); + await killHarper(fakeCtx(parent), { graceMs: 200 }); + ok(parent.exitCode !== null || parent.signalCode !== null, 'parent should be dead'); + ok(await waitProcessGone(childPid, 2000), `grandchild ${childPid} should have been killed with the group`); +}); + test('killHarper returns immediately when there is no process', async () => { await killHarper(fakeCtx(undefined)); await killHarper({} as unknown as StartedHarperTestContext); From 140731332b7ec2803db663b522af2e6fbcb226ec Mon Sep 17 00:00:00 2001 From: Nathan Heskew Date: Fri, 5 Jun 2026 11:20:47 -0700 Subject: [PATCH 5/6] reap Harper trees on runner exit/interrupt (detached orphan mitigation) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spawning Harper detached means it survives signals sent to the runner's process group (Ctrl+C, CI group-kill), so an interrupted runner could orphan it holding the fixed ports. Track live Harper processes and SIGKILL their groups on the runner's 'exit'/SIGINT/SIGTERM (re-raising the signal so the runner still dies). This addresses the orphan window introduced by `detached`. The deeper pool issue (removeDeadProcessesFromPool keys on the runner PID and only sweeps when full) is pre-existing and unaffected by this change — tracked separately. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 8 ++++---- src/harperLifecycle.ts | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index cd3f614..0644b7d 100644 --- a/README.md +++ b/README.md @@ -93,20 +93,20 @@ Like `startHarper()`, but copies a component directory into the Harper install b ### `killHarper(ctx, options?)` -Sends SIGTERM to the Harper process and waits for it to exit, giving it a grace period to shut down cleanly (flush RocksDB, release ports, reap worker children) before escalating to SIGKILL. After SIGKILL it waits briefly for the process to actually exit. Does not release the loopback address or clean up the install directory. Useful for restart scenarios where the test will call `startHarper` again. +Terminates Harper's whole process tree and waits for it to exit. It sends SIGTERM first, giving Harper a grace period to shut down cleanly (flush RocksDB, release ports, reap workers) before escalating to SIGKILL, then waits briefly for the actual exit. Because Harper is spawned as its own process group (`detached` on POSIX), the signal targets the group — parent and any child processes — rather than only the direct child; on Windows it uses `taskkill /T`. A dead process releases its listening sockets, so once `killHarper` returns the fixed ports are free. Does not release the loopback address or clean up the install directory. Useful for restart scenarios where the test will call `startHarper` again. `options.graceMs` overrides the SIGTERM→SIGKILL grace period (default `5000`, or `HARPER_INTEGRATION_TEST_TEARDOWN_GRACE_MS`). ### `teardownHarper(ctx)` -Kills Harper, waits for its fixed ports to be released on the loopback address, releases the address back to the pool, and removes the install directory. Call in a teardown/`after()` hook. +Kills Harper's process tree, releases the loopback address back to the pool, and removes the install directory. Call in a teardown/`after()` hook. -The pool only verifies that an address is *bindable* (to an ephemeral port), not that Harper's fixed ports (Operations API, HTTP/S, MQTT/S) are free — and Harper's worker children/sockets can briefly outlive the parent. So before recycling the address, teardown waits until those ports are actually free, preventing `EADDRINUSE`/`ECONNREFUSED` for the next suite that grabs the address. If the ports are still held at the deadline, the address is recycled anyway. +Since `killHarper` waits for the process tree to exit, its fixed ports (Operations API, HTTP/S, MQTT/S) are already released by the time the address is recycled. As a safety assertion, teardown still verifies those ports are free before recycling — the pool only guarantees the *address* is bindable, not that these specific ports are free — and logs a warning if any are somehow still held (a sign a Harper child process escaped the kill). The address is recycled regardless. **Environment Variables:** - `HARPER_INTEGRATION_TEST_TEARDOWN_GRACE_MS` - Grace period after SIGTERM before escalating to SIGKILL. Default `5000`. -- `HARPER_INTEGRATION_TEST_PORT_RELEASE_TIMEOUT_MS` - Max time to wait for Harper's ports to be released before recycling the loopback address. Default `5000`. +- `HARPER_INTEGRATION_TEST_PORT_RELEASE_TIMEOUT_MS` - Max time teardown's safety assertion waits for Harper's ports to be free before recycling the loopback address (normally instant, since the process tree is already dead). Default `5000`. ### `sendOperation(context, operation)` diff --git a/src/harperLifecycle.ts b/src/harperLifecycle.ts index 608c998..1a2a5d9 100644 --- a/src/harperLifecycle.ts +++ b/src/harperLifecycle.ts @@ -340,6 +340,10 @@ export function runHarperCommand({ detached: process.platform !== 'win32', }); + // Reap this process's tree if the runner exits/is interrupted before teardown (it's detached, + // so it would otherwise survive signals sent to the runner's group). + trackHarperProcess(proc); + let stdoutStream: WriteStream | undefined; let stderrStream: WriteStream | undefined; if (logDir) { @@ -623,6 +627,38 @@ function signalHarperTree(proc: ChildProcess, signal: 'SIGTERM' | 'SIGKILL'): vo } } +/** + * Tracks live Harper processes so that if the test runner exits or is interrupted before teardown + * runs (Ctrl+C, or a CI job killing the runner), their process trees are reaped instead of left + * orphaned holding the fixed ports. This matters specifically because Harper is spawned `detached` + * (its own process group), so it would otherwise survive signals delivered to the runner's group. + * + * Best-effort: on POSIX the group SIGKILL is delivered synchronously (works even from the 'exit' + * handler); on Windows the `taskkill` shell-out may not complete before the runner exits. + */ +const liveHarperProcesses = new Set(); +let runnerCleanupRegistered = false; + +function trackHarperProcess(proc: ChildProcess): void { + liveHarperProcesses.add(proc); + proc.once('exit', () => liveHarperProcesses.delete(proc)); + + if (runnerCleanupRegistered) return; + runnerCleanupRegistered = true; + + const reapAll = () => { + for (const child of liveHarperProcesses) signalHarperTree(child, 'SIGKILL'); + }; + process.once('exit', reapAll); + // SIGINT/SIGTERM don't fire 'exit'; reap, then re-raise so the runner still terminates normally. + for (const signal of ['SIGINT', 'SIGTERM'] as const) { + process.once(signal, () => { + reapAll(); + process.kill(process.pid, signal); + }); + } +} + /** * Kill harper process (can be used for teardown, or killing it before a restart). * From 2db4dfe515b8c230bba6fd449294615659574460 Mon Sep 17 00:00:00 2001 From: Nathan Heskew Date: Mon, 8 Jun 2026 16:17:57 -0700 Subject: [PATCH 6/6] =?UTF-8?q?refactor(tests):=20address=20review=20?= =?UTF-8?q?=E2=80=94=20bind-to-0=20port=20tests,=20retry=20the=20free-port?= =?UTF-8?q?=20race,=20deflake=20timing=20asserts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - runHarperCommand startup-failure path now signalHarperTree(SIGKILL) instead of proc.kill(), so the detached process tree is reaped rather than just the direct child - portUtils.test: bound-port tests bind to port 0 and assert the live port (no re-listen race); the two free-port tests retry via withFreePorts() to absorb the irreducible TOCTOU - deflake timing asserts: drop redundant <1500 SIGTERM bound (signalCode already proves pre-grace death), loosen already-dead <200 -> <1000, gate waitForPortsFree <500 behind !IS_CI; keep the >=150 lower bound - waitForPortsFree: for(;;) -> while(true) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/harperLifecycle.ts | 4 +- src/portUtils.ts | 2 +- test/harperLifecycle.test.ts | 9 +++-- test/portUtils.test.ts | 76 ++++++++++++++++++++++++++---------- 4 files changed, 65 insertions(+), 26 deletions(-) diff --git a/src/harperLifecycle.ts b/src/harperLifecycle.ts index 1a2a5d9..e937cf7 100644 --- a/src/harperLifecycle.ts +++ b/src/harperLifecycle.ts @@ -371,7 +371,9 @@ export function runHarperCommand({ settled = true; clearTimers(); reject(new HarperStartupError(message, stdout, stderr)); - proc.kill(); + // Harper is spawned detached (its own process group), so `proc.kill()` would only hit the + // direct child and orphan any worker children still holding ports. Reap the whole tree. + signalHarperTree(proc, 'SIGKILL'); }; const succeed = () => { diff --git a/src/portUtils.ts b/src/portUtils.ts index 54ccd98..bb1ca06 100644 --- a/src/portUtils.ts +++ b/src/portUtils.ts @@ -42,7 +42,7 @@ export async function waitForPortsFree( pollIntervalMs = 100 ): Promise { const deadline = Date.now() + timeoutMs; - for (;;) { + while (true) { const results = await Promise.all(ports.map((port) => isPortFree(host, port))); if (results.every(Boolean)) return true; if (Date.now() >= deadline) return false; diff --git a/test/harperLifecycle.test.ts b/test/harperLifecycle.test.ts index 8f16985..b214b0e 100644 --- a/test/harperLifecycle.test.ts +++ b/test/harperLifecycle.test.ts @@ -177,10 +177,11 @@ test('runHarperCommand rejects when the process exits non-zero', async () => { test('killHarper terminates a process that exits on SIGTERM, before the grace deadline', async () => { const child = spawn(process.execPath, ['-e', 'setInterval(() => {}, 1000)'], { detached: true }); await once(child, 'spawn'); - const start = Date.now(); await killHarper(fakeCtx(child), { graceMs: 2000 }); + // A SIGTERM signalCode proves it died from the SIGTERM, before killHarper escalated to SIGKILL at + // the grace deadline — so this also covers "terminated before the grace deadline" without a + // flaky wall-clock upper bound. strictEqual(child.signalCode, 'SIGTERM'); - ok(Date.now() - start < 1500, 'should exit well before the grace deadline'); }); test('killHarper escalates to SIGKILL when SIGTERM is ignored', async () => { @@ -228,5 +229,7 @@ test('killHarper returns immediately for an already-exited process', async () => await once(child, 'exit'); const start = Date.now(); await killHarper(fakeCtx(child), { graceMs: 5000 }); - ok(Date.now() - start < 200, 'should not wait the grace period for an already-dead process'); + // Generous ceiling, far under the 5s grace: proves it took the already-exited fast path rather + // than waiting out the grace, while leaving plenty of headroom for a contended-CI stall. + ok(Date.now() - start < 1000, 'should not wait the grace period for an already-dead process'); }); diff --git a/test/portUtils.test.ts b/test/portUtils.test.ts index 4fe9117..a8ee671 100644 --- a/test/portUtils.test.ts +++ b/test/portUtils.test.ts @@ -1,9 +1,10 @@ import { test } from 'node:test'; import { ok, strictEqual } from 'node:assert'; -import { createServer, type Server } from 'node:net'; +import { createServer, type AddressInfo, type Server } from 'node:net'; import { isPortFree, waitForPortsFree } from '../src/portUtils.ts'; const HOST = '127.0.0.1'; +const IS_CI = !!process.env.CI; /** Binds a server to host:port and resolves once it is listening. */ function listen(host: string, port: number): Promise { @@ -18,23 +19,55 @@ function close(server: Server): Promise { return new Promise((resolve) => server.close(() => resolve())); } -/** Asks the OS for a free port by binding to port 0 and reading the assigned port. */ -async function getEphemeralPort(host: string): Promise { +/** Binds to port 0 and returns the still-listening server plus its OS-assigned port. */ +async function listenEphemeral(host: string): Promise<{ server: Server; port: number }> { const server = await listen(host, 0); - const address = server.address(); - const port = typeof address === 'object' && address ? address.port : 0; - await close(server); - return port; + const { port } = server.address() as AddressInfo; + return { server, port }; +} + +/** + * Acquires `count` distinct, (very probably) free ports by binding that many ephemeral servers at + * once — distinct because two live servers can't share a port — then releasing them all. + * + * This is the one unavoidable TOCTOU in these tests: nothing can hold a port open *as free*, so a + * test that needs a free port has to accept that something else could grab it between release and + * the assertion. {@link withFreePorts} re-runs the body to absorb that rare race. + */ +async function getFreePorts(host: string, count: number): Promise { + const servers = await Promise.all(Array.from({ length: count }, () => listenEphemeral(host))); + const ports = servers.map((s) => s.port); + await Promise.all(servers.map((s) => close(s.server))); + return ports; +} + +/** Runs `body` with freshly-acquired free ports, retrying with new ports if the TOCTOU race trips. */ +async function withFreePorts( + host: string, + count: number, + body: (ports: number[]) => Promise, + attempts = 5 +): Promise { + let lastErr: unknown; + for (let i = 0; i < attempts; i++) { + try { + await body(await getFreePorts(host, count)); + return; + } catch (err) { + lastErr = err; + } + } + throw lastErr; } test('isPortFree returns true for an unbound port', async () => { - const port = await getEphemeralPort(HOST); - strictEqual(await isPortFree(HOST, port), true); + await withFreePorts(HOST, 1, async ([port]) => { + strictEqual(await isPortFree(HOST, port), true); + }); }); test('isPortFree returns false for a bound port', async () => { - const port = await getEphemeralPort(HOST); - const server = await listen(HOST, port); + const { server, port } = await listenEphemeral(HOST); try { strictEqual(await isPortFree(HOST, port), false); } finally { @@ -43,17 +76,19 @@ test('isPortFree returns false for a bound port', async () => { }); test('waitForPortsFree resolves true immediately when all ports are free', async () => { - const a = await getEphemeralPort(HOST); - const b = await getEphemeralPort(HOST); - const start = Date.now(); - const freed = await waitForPortsFree(HOST, [a, b], 1000, 50); - ok(freed, 'expected ports to be reported free'); - ok(Date.now() - start < 500, 'should not have polled/waited'); + await withFreePorts(HOST, 2, async (ports) => { + const start = Date.now(); + const freed = await waitForPortsFree(HOST, ports, 1000, 50); + ok(freed, 'expected ports to be reported free'); + // `freed` is the correctness assertion. The upper-bound timing check is itself a flake source + // on contended CI (a GC/scheduler stall can blow the ceiling even when the code is correct), + // so it only runs locally as a no-busy-wait regression guard. + if (!IS_CI) ok(Date.now() - start < 500, 'should not have polled/waited'); + }); }); test('waitForPortsFree waits until a held port is released, then resolves true', async () => { - const port = await getEphemeralPort(HOST); - const server = await listen(HOST, port); + const { server, port } = await listenEphemeral(HOST); // Release the port shortly after we start waiting. setTimeout(() => void close(server), 200); const freed = await waitForPortsFree(HOST, [port], 2000, 50); @@ -61,8 +96,7 @@ test('waitForPortsFree waits until a held port is released, then resolves true', }); test('waitForPortsFree resolves false when a port stays held past the timeout', async () => { - const port = await getEphemeralPort(HOST); - const server = await listen(HOST, port); + const { server, port } = await listenEphemeral(HOST); try { const freed = await waitForPortsFree(HOST, [port], 300, 50); strictEqual(freed, false);