From 09d21a4b7cedb1584a04360778a49560e3cd54a8 Mon Sep 17 00:00:00 2001 From: Kris Zyp Date: Mon, 15 Jun 2026 07:12:38 -0600 Subject: [PATCH 1/2] fix(loopback): prevent SO_REUSEPORT co-binds via an operations-port conflict canary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Under concurrency, the loopback pool can hand the same 127.0.0.x to two suites that are alive at the same time (e.g. a slot freed while its Harper node lingered). On Linux, Harper binds its HTTP/replication ports with SO_REUSEPORT, so the freshly-assigned node silently co-binds an address another node still owns — the kernel then splits connections between the two nodes and corrupts both suites (manifests as replication sockets that never connect, "no encryption keys", missing databases/tables, etc.). macOS binds those ports without SO_REUSEPORT, which is why this only reproduces on Linux. Two defenses: - Conflict canary in getNextAvailableLoopbackAddress: after claiming a slot, probe the operations-API port (9925) with a plain, non-SO_REUSEPORT bind. The operations API is the one port Harper binds exclusively (main-thread only), so EADDRINUSE here means a node still holds the address. On conflict we park the slot under our PID (so no other process retries the poisoned address) and claim the next one. Override the probe port via HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT. - teardownHarper no longer recycles an address whose ports are still held by a child that escaped the tree-kill; it parks the slot (reclaimed when the per-file process exits). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/harperLifecycle.ts | 22 ++++++++---- src/loopbackAddressPool.ts | 71 +++++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 8 deletions(-) diff --git a/src/harperLifecycle.ts b/src/harperLifecycle.ts index e937cf7..ea0d88b 100644 --- a/src/harperLifecycle.ts +++ b/src/harperLifecycle.ts @@ -739,16 +739,24 @@ export async function teardownHarper(ctx: StartedHarperTestContext): Promise { }); } +/** + * Conflict canary: detects whether a Harper node is already bound to `loopbackAddress` + * by attempting an exclusive (non-SO_REUSEPORT) bind on the operations-API port. Node's + * `net` server does not set SO_REUSEPORT, so if a stale/overlapping Harper node still + * owns the address's operations port, this bind fails with EADDRINUSE. + * + * Returns `true` if the address appears in use (EADDRINUSE), `false` if it is free. + * Re-throws any other bind error (e.g. EADDRNOTAVAIL) — that's a real configuration + * problem the caller should surface, not a transient conflict to skip. + * + * @param loopbackAddress The loopback IP address to probe (e.g., "127.0.0.2") + */ +function isLoopbackAddressInUse(loopbackAddress: string): Promise { + return new Promise((resolve, reject) => { + const server = createServer(); + server.once('error', (error: NodeJS.ErrnoException) => { + if (error.code === 'EADDRINUSE') { + resolve(true); + return; + } + const enhancedError = error as LoopbackAddressError; + enhancedError.loopbackAddress = loopbackAddress; + reject(enhancedError); + }); + server.listen(CONFLICT_PROBE_PORT, loopbackAddress, () => { + server.close(() => { + resolve(false); + }); + }); + }); +} + /** * This method attempts to validate all loopback addresses in the pool by trying to * bind to each one. It returns an object containing arrays of successfully bound @@ -322,11 +368,34 @@ export async function getNextAvailableLoopbackAddress(): Promise { const loopbackAddress = `127.0.0.${assignedIndex + HARPER_LOOPBACK_POOL_START}`; try { await validateLoopbackAddress(loopbackAddress); - return loopbackAddress; } catch (error) { // Validation failed - throw a proper error instead of breaking throw new LoopbackAddressValidationError(loopbackAddress, error as Error); } + + // Conflict canary: a stale or still-running Harper node from an overlapping + // suite may already hold this address (e.g. the pool slot was freed while its + // node lingered). SO_REUSEPORT on Harper's shared HTTP/replication ports would + // otherwise let our node silently co-bind it, splitting connections between two + // nodes and corrupting both suites. Detect that here via the exclusive + // operations port and skip the poisoned address. + let inUse: boolean; + try { + inUse = await isLoopbackAddressInUse(loopbackAddress); + } catch (error) { + throw new LoopbackAddressValidationError(loopbackAddress, error as Error); + } + if (!inUse) { + return loopbackAddress; + } + + // Leave the slot parked under our PID so no other process retries this poisoned + // address, then loop to claim the next available one. Parked slots are reclaimed + // when this (short-lived, per-file) process exits and its PID goes dead. + console.warn( + `[loopback-pool] ${loopbackAddress} is still in use by another Harper node (operations port ${CONFLICT_PROBE_PORT} bound); skipping to avoid an SO_REUSEPORT co-bind.` + ); + continue; } // No available addresses; wait and retry From ec2a7cee8a4040c389f9e481c8c305e6baa70669 Mon Sep 17 00:00:00 2001 From: Kris Zyp Date: Mon, 15 Jun 2026 07:24:26 -0600 Subject: [PATCH 2/2] =?UTF-8?q?fix(loopback):=20address=20review=20?= =?UTF-8?q?=E2=80=94=20avoid=20park-under-PID=20deadlock=20+=20edge=20case?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From PR review (gemini-code-assist): - Pool-exhaustion deadlock: parking poisoned slots under our own live PID could fill the pool with our PID, leaving nothing reclaimable and spinning forever. Instead release poisoned slots back to the pool and skip them via a per-attempt local Set (cleared before each wait so they can be re-probed once the lingering node exits). Removed the now-unused findAvailableIndex helper. - teardownHarper: nest the release under `if (ctx.harper.hostname)` so an early startup failure (no hostname) can't call releaseLoopbackAddress(undefined) and mask the real error with a TypeError. - CONFLICT_PROBE_PORT: fall back to 9925 on a NaN/out-of-range env override instead of crashing on server.listen(NaN). - isLoopbackAddressInUse: remove the 'error' listener once listen() succeeds so a post-bind socket error during close() can't flip the verdict. Validated locally: skip-in-use returns the next address; full-pool exhaustion recovers without deadlock once an address frees; an invalid probe-port env falls back cleanly. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/harperLifecycle.ts | 30 ++++++++---------- src/loopbackAddressPool.ts | 65 ++++++++++++++++++++++---------------- 2 files changed, 51 insertions(+), 44 deletions(-) diff --git a/src/harperLifecycle.ts b/src/harperLifecycle.ts index ea0d88b..e88c911 100644 --- a/src/harperLifecycle.ts +++ b/src/harperLifecycle.ts @@ -739,23 +739,21 @@ export async function teardownHarper(ctx: StartedHarperTestContext): Promise { + const parsed = parseInt(process.env.HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT || '', 10); + // Fall back to the default if unset/non-numeric/out-of-range, so an invalid override + // can't turn into a `server.listen(NaN)` crash during allocation. + return Number.isNaN(parsed) || parsed < 0 || parsed > 65535 ? 9925 : parsed; +})(); // Type definitions type LoopbackPool = (number | null)[]; @@ -172,22 +175,6 @@ async function writePoolFile(pool: LoopbackPool): Promise { await writeFile(HARPER_LOOPBACK_POOL_PATH, JSON.stringify(pool)); } -/** - * Finds the first available (null) index in the pool. This implements a simple - * first-available allocation strategy. - * - * @param pool The loopback pool array - * @returns The first available index, or null if the pool is full - */ -function findAvailableIndex(pool: LoopbackPool): number | null { - for (let i = 0; i < pool.length; i++) { - if (pool[i] === null) { - return i; - } - } - return null; -} - /** * Validates the format of a loopback address and extracts the pool index. * @@ -256,7 +243,7 @@ function validateLoopbackAddress(loopbackAddress: string): Promise { function isLoopbackAddressInUse(loopbackAddress: string): Promise { return new Promise((resolve, reject) => { const server = createServer(); - server.once('error', (error: NodeJS.ErrnoException) => { + const onError = (error: NodeJS.ErrnoException) => { if (error.code === 'EADDRINUSE') { resolve(true); return; @@ -264,8 +251,12 @@ function isLoopbackAddressInUse(loopbackAddress: string): Promise { const enhancedError = error as LoopbackAddressError; enhancedError.loopbackAddress = loopbackAddress; reject(enhancedError); - }); + }; + server.once('error', onError); server.listen(CONFLICT_PROBE_PORT, loopbackAddress, () => { + // Bind succeeded — the address is free. Drop the error listener so a stray + // post-bind socket error during close() can't flip the resolved verdict. + server.off('error', onError); server.close(() => { resolve(false); }); @@ -342,16 +333,31 @@ export async function getNextAvailableLoopbackAddress(): Promise { // This continues until all loopback addresses are used, at which point new processes will wait until an address becomes available // Since multiple processes may be trying to get a loopback address at the same time, we need to implement a simple file-based locking mechanism to prevent race conditions + + // Indices found in-use by the conflict canary during THIS call. Tracked locally (not by + // parking them in the shared pool) so we never deadlock ourselves: if we parked every + // poisoned slot under our own live PID, the pool could fill with our PID, the free-slot search + // would find nothing, removeDeadProcessesFromPool couldn't reclaim them (we're alive), and we'd + // spin forever. Instead we release poisoned slots back to the pool and just avoid re-trying + // them within this attempt; the set is cleared before each wait so they can be re-probed once + // the lingering node has had time to exit. + const triedIndices = new Set(); while (true) { const assignedIndex = await withLock(async () => { // Read the pool file const loopbackPool = await readPoolFile(); - // Find the first available index - const index = findAvailableIndex(loopbackPool); + // Find the first available index we haven't already found in-use this attempt + let index: number | null = null; + for (let i = 0; i < loopbackPool.length; i++) { + if (loopbackPool[i] === null && !triedIndices.has(i)) { + index = i; + break; + } + } if (index === null) { - // No available addresses - remove any dead processes from the pool and wait for one to become available + // Nothing available - remove any dead processes from the pool and wait for one to become available removeDeadProcessesFromPool(loopbackPool); } else { // Assign the process PID to that index to mark it as used @@ -389,16 +395,19 @@ export async function getNextAvailableLoopbackAddress(): Promise { return loopbackAddress; } - // Leave the slot parked under our PID so no other process retries this poisoned - // address, then loop to claim the next available one. Parked slots are reclaimed - // when this (short-lived, per-file) process exits and its PID goes dead. + // Release the slot back to the pool (so it isn't leaked under our PID) and remember + // not to re-try it this attempt; loop to claim the next available one. + await releaseLoopbackAddress(loopbackAddress); + triedIndices.add(assignedIndex); console.warn( `[loopback-pool] ${loopbackAddress} is still in use by another Harper node (operations port ${CONFLICT_PROBE_PORT} bound); skipping to avoid an SO_REUSEPORT co-bind.` ); continue; } - // No available addresses; wait and retry + // No available addresses; clear the per-attempt skip set so poisoned addresses can be + // re-probed (their lingering node may have exited during the wait), then wait and retry. + triedIndices.clear(); await sleep(RETRY_DELAY_MS); } }