diff --git a/src/harperLifecycle.ts b/src/harperLifecycle.ts index e937cf7..e88c911 100644 --- a/src/harperLifecycle.ts +++ b/src/harperLifecycle.ts @@ -741,15 +741,21 @@ export async function teardownHarper(ctx: StartedHarperTestContext): Promise { + const parsed = parseInt(process.env.HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT || '', 10); + // Fall back to the default if unset/non-numeric/out-of-range, so an invalid override + // can't turn into a `server.listen(NaN)` crash during allocation. + return Number.isNaN(parsed) || parsed < 0 || parsed > 65535 ? 9925 : parsed; +})(); + // Type definitions type LoopbackPool = (number | null)[]; @@ -158,22 +175,6 @@ async function writePoolFile(pool: LoopbackPool): Promise { await writeFile(HARPER_LOOPBACK_POOL_PATH, JSON.stringify(pool)); } -/** - * Finds the first available (null) index in the pool. This implements a simple - * first-available allocation strategy. - * - * @param pool The loopback pool array - * @returns The first available index, or null if the pool is full - */ -function findAvailableIndex(pool: LoopbackPool): number | null { - for (let i = 0; i < pool.length; i++) { - if (pool[i] === null) { - return i; - } - } - return null; -} - /** * Validates the format of a loopback address and extracts the pool index. * @@ -227,6 +228,42 @@ function validateLoopbackAddress(loopbackAddress: string): Promise { }); } +/** + * Conflict canary: detects whether a Harper node is already bound to `loopbackAddress` + * by attempting an exclusive (non-SO_REUSEPORT) bind on the operations-API port. Node's + * `net` server does not set SO_REUSEPORT, so if a stale/overlapping Harper node still + * owns the address's operations port, this bind fails with EADDRINUSE. + * + * Returns `true` if the address appears in use (EADDRINUSE), `false` if it is free. + * Re-throws any other bind error (e.g. EADDRNOTAVAIL) — that's a real configuration + * problem the caller should surface, not a transient conflict to skip. + * + * @param loopbackAddress The loopback IP address to probe (e.g., "127.0.0.2") + */ +function isLoopbackAddressInUse(loopbackAddress: string): Promise { + return new Promise((resolve, reject) => { + const server = createServer(); + const onError = (error: NodeJS.ErrnoException) => { + if (error.code === 'EADDRINUSE') { + resolve(true); + return; + } + const enhancedError = error as LoopbackAddressError; + enhancedError.loopbackAddress = loopbackAddress; + reject(enhancedError); + }; + server.once('error', onError); + server.listen(CONFLICT_PROBE_PORT, loopbackAddress, () => { + // Bind succeeded — the address is free. Drop the error listener so a stray + // post-bind socket error during close() can't flip the resolved verdict. + server.off('error', onError); + server.close(() => { + resolve(false); + }); + }); + }); +} + /** * This method attempts to validate all loopback addresses in the pool by trying to * bind to each one. It returns an object containing arrays of successfully bound @@ -296,16 +333,31 @@ export async function getNextAvailableLoopbackAddress(): Promise { // This continues until all loopback addresses are used, at which point new processes will wait until an address becomes available // Since multiple processes may be trying to get a loopback address at the same time, we need to implement a simple file-based locking mechanism to prevent race conditions + + // Indices found in-use by the conflict canary during THIS call. Tracked locally (not by + // parking them in the shared pool) so we never deadlock ourselves: if we parked every + // poisoned slot under our own live PID, the pool could fill with our PID, the free-slot search + // would find nothing, removeDeadProcessesFromPool couldn't reclaim them (we're alive), and we'd + // spin forever. Instead we release poisoned slots back to the pool and just avoid re-trying + // them within this attempt; the set is cleared before each wait so they can be re-probed once + // the lingering node has had time to exit. + const triedIndices = new Set(); while (true) { const assignedIndex = await withLock(async () => { // Read the pool file const loopbackPool = await readPoolFile(); - // Find the first available index - const index = findAvailableIndex(loopbackPool); + // Find the first available index we haven't already found in-use this attempt + let index: number | null = null; + for (let i = 0; i < loopbackPool.length; i++) { + if (loopbackPool[i] === null && !triedIndices.has(i)) { + index = i; + break; + } + } if (index === null) { - // No available addresses - remove any dead processes from the pool and wait for one to become available + // Nothing available - remove any dead processes from the pool and wait for one to become available removeDeadProcessesFromPool(loopbackPool); } else { // Assign the process PID to that index to mark it as used @@ -322,14 +374,40 @@ export async function getNextAvailableLoopbackAddress(): Promise { const loopbackAddress = `127.0.0.${assignedIndex + HARPER_LOOPBACK_POOL_START}`; try { await validateLoopbackAddress(loopbackAddress); - return loopbackAddress; } catch (error) { // Validation failed - throw a proper error instead of breaking throw new LoopbackAddressValidationError(loopbackAddress, error as Error); } + + // Conflict canary: a stale or still-running Harper node from an overlapping + // suite may already hold this address (e.g. the pool slot was freed while its + // node lingered). SO_REUSEPORT on Harper's shared HTTP/replication ports would + // otherwise let our node silently co-bind it, splitting connections between two + // nodes and corrupting both suites. Detect that here via the exclusive + // operations port and skip the poisoned address. + let inUse: boolean; + try { + inUse = await isLoopbackAddressInUse(loopbackAddress); + } catch (error) { + throw new LoopbackAddressValidationError(loopbackAddress, error as Error); + } + if (!inUse) { + return loopbackAddress; + } + + // Release the slot back to the pool (so it isn't leaked under our PID) and remember + // not to re-try it this attempt; loop to claim the next available one. + await releaseLoopbackAddress(loopbackAddress); + triedIndices.add(assignedIndex); + console.warn( + `[loopback-pool] ${loopbackAddress} is still in use by another Harper node (operations port ${CONFLICT_PROBE_PORT} bound); skipping to avoid an SO_REUSEPORT co-bind.` + ); + continue; } - // No available addresses; wait and retry + // No available addresses; clear the per-attempt skip set so poisoned addresses can be + // re-probed (their lingering node may have exited during the wait), then wait and retry. + triedIndices.clear(); await sleep(RETRY_DELAY_MS); } }