From 09d21a4b7cedb1584a04360778a49560e3cd54a8 Mon Sep 17 00:00:00 2001
From: Kris Zyp <kriszyp@gmail.com>
Date: Mon, 15 Jun 2026 07:12:38 -0600
Subject: [PATCH 1/2] fix(loopback): prevent SO_REUSEPORT co-binds via an
 operations-port conflict canary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Under concurrency, the loopback pool can hand the same 127.0.0.x to two suites that
are alive at the same time (e.g. a slot freed while its Harper node lingered). On Linux,
Harper binds its HTTP/replication ports with SO_REUSEPORT, so the freshly-assigned node
silently co-binds an address another node still owns — the kernel then splits connections
between the two nodes and corrupts both suites (manifests as replication sockets that
never connect, "no encryption keys", missing databases/tables, etc.). macOS binds those
ports without SO_REUSEPORT, which is why this only reproduces on Linux.

Two defenses:
- Conflict canary in getNextAvailableLoopbackAddress: after claiming a slot, probe the
  operations-API port (9925) with a plain, non-SO_REUSEPORT bind. The operations API is
  the one port Harper binds exclusively (main-thread only), so EADDRINUSE here means a
  node still holds the address. On conflict we park the slot under our PID (so no other
  process retries the poisoned address) and claim the next one. Override the probe port
  via HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT.
- teardownHarper no longer recycles an address whose ports are still held by a child that
  escaped the tree-kill; it parks the slot (reclaimed when the per-file process exits).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/harperLifecycle.ts     | 22 ++++++++----
 src/loopbackAddressPool.ts | 71 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 8 deletions(-)

diff --git a/src/harperLifecycle.ts b/src/harperLifecycle.ts
index e937cf7..ea0d88b 100644
--- a/src/harperLifecycle.ts
+++ b/src/harperLifecycle.ts
@@ -739,16 +739,24 @@ export async function teardownHarper(ctx: StartedHarperTestContext): Promise<voi
 	// specific ports are free) and warn if anything is somehow still holding them — that warning
 	// is a signal that a Harper child process escaped the tree kill, not normal operation. The
 	// address is recycled regardless.
+	let portsFreed = true;
 	if (ctx.harper.hostname) {
-		const portsFreed = await waitForPortsFree(ctx.harper.hostname, ALL_HARPER_PORTS, DEFAULT_PORT_RELEASE_TIMEOUT_MS);
-		if (!portsFreed) {
-			console.warn(
-				`Harper ports on ${ctx.harper.hostname} still in use after teardown (${DEFAULT_PORT_RELEASE_TIMEOUT_MS}ms); recycling the address anyway. This usually means a Harper child process outlived the kill.`
-			);
-		}
+		portsFreed = await waitForPortsFree(ctx.harper.hostname, ALL_HARPER_PORTS, DEFAULT_PORT_RELEASE_TIMEOUT_MS);
 	}
 
-	await releaseLoopbackAddress(ctx.harper.hostname);
+	if (portsFreed) {
+		await releaseLoopbackAddress(ctx.harper.hostname);
+	} else {
+		// A Harper child escaped the tree kill and still holds this address's ports. Do NOT
+		// recycle the address — under SO_REUSEPORT a later suite could silently co-bind it
+		// (the exact failure the loopback pool exists to prevent). Leave the slot parked under
+		// this process's PID; it is reclaimed when this (per-file) process exits and its PID
+		// goes dead. The conflict canary in getNextAvailableLoopbackAddress is the backstop if
+		// the address is somehow handed out before then.
+		console.warn(
+			`Harper ports on ${ctx.harper.hostname} still in use after teardown (${DEFAULT_PORT_RELEASE_TIMEOUT_MS}ms); NOT recycling the address (a Harper child outlived the kill). The slot will be reclaimed when this process exits.`
+		);
+	}
 
 	// a few retries are typically necessary, might take a sec for a process to finish, especially since rocksdb may be flushing
 	try {
diff --git a/src/loopbackAddressPool.ts b/src/loopbackAddressPool.ts
index 440ee71..b2947e0 100644
--- a/src/loopbackAddressPool.ts
+++ b/src/loopbackAddressPool.ts
@@ -28,6 +28,20 @@ const HARPER_LOOPBACK_POOL_LOCK_PATH = join(tmpdir(), 'harper-integration-test-l
 const LOCK_STALE_TIMEOUT_MS = 10000;
 const RETRY_DELAY_MS = 1000;
 
+// Port used as a conflict canary when allocating an address. This MUST be a port that
+// Harper binds WITHOUT SO_REUSEPORT (the operations API — `OPERATIONS_API_PORT` in
+// harperLifecycle.ts — does exactly that, main-thread-only). Because that port is
+// exclusive, a stale/overlapping Harper node still holding the address makes a plain
+// (non-reusePort) bind here fail with EADDRINUSE — which is precisely the signal we
+// need. Without this check, SO_REUSEPORT on the shared HTTP/replication ports would let
+// a freshly-assigned node silently co-bind an address another node still owns, corrupting
+// both suites. Override via HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT if the operations
+// port ever changes. (Hardcoded rather than imported from harperLifecycle to avoid a
+// circular import between these two modules.)
+const CONFLICT_PROBE_PORT = process.env.HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT
+	? parseInt(process.env.HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT, 10)
+	: 9925;
+
 // Type definitions
 type LoopbackPool = (number | null)[];
 
@@ -227,6 +241,38 @@ function validateLoopbackAddress(loopbackAddress: string): Promise<string> {
 	});
 }
 
+/**
+ * Conflict canary: detects whether a Harper node is already bound to `loopbackAddress`
+ * by attempting an exclusive (non-SO_REUSEPORT) bind on the operations-API port. Node's
+ * `net` server does not set SO_REUSEPORT, so if a stale/overlapping Harper node still
+ * owns the address's operations port, this bind fails with EADDRINUSE.
+ *
+ * Returns `true` if the address appears in use (EADDRINUSE), `false` if it is free.
+ * Re-throws any other bind error (e.g. EADDRNOTAVAIL) — that's a real configuration
+ * problem the caller should surface, not a transient conflict to skip.
+ *
+ * @param loopbackAddress The loopback IP address to probe (e.g., "127.0.0.2")
+ */
+function isLoopbackAddressInUse(loopbackAddress: string): Promise<boolean> {
+	return new Promise((resolve, reject) => {
+		const server = createServer();
+		server.once('error', (error: NodeJS.ErrnoException) => {
+			if (error.code === 'EADDRINUSE') {
+				resolve(true);
+				return;
+			}
+			const enhancedError = error as LoopbackAddressError;
+			enhancedError.loopbackAddress = loopbackAddress;
+			reject(enhancedError);
+		});
+		server.listen(CONFLICT_PROBE_PORT, loopbackAddress, () => {
+			server.close(() => {
+				resolve(false);
+			});
+		});
+	});
+}
+
 /**
  * This method attempts to validate all loopback addresses in the pool by trying to
  * bind to each one. It returns an object containing arrays of successfully bound
@@ -322,11 +368,34 @@ export async function getNextAvailableLoopbackAddress(): Promise<string> {
 			const loopbackAddress = `127.0.0.${assignedIndex + HARPER_LOOPBACK_POOL_START}`;
 			try {
 				await validateLoopbackAddress(loopbackAddress);
-				return loopbackAddress;
 			} catch (error) {
 				// Validation failed - throw a proper error instead of breaking
 				throw new LoopbackAddressValidationError(loopbackAddress, error as Error);
 			}
+
+			// Conflict canary: a stale or still-running Harper node from an overlapping
+			// suite may already hold this address (e.g. the pool slot was freed while its
+			// node lingered). SO_REUSEPORT on Harper's shared HTTP/replication ports would
+			// otherwise let our node silently co-bind it, splitting connections between two
+			// nodes and corrupting both suites. Detect that here via the exclusive
+			// operations port and skip the poisoned address.
+			let inUse: boolean;
+			try {
+				inUse = await isLoopbackAddressInUse(loopbackAddress);
+			} catch (error) {
+				throw new LoopbackAddressValidationError(loopbackAddress, error as Error);
+			}
+			if (!inUse) {
+				return loopbackAddress;
+			}
+
+			// Leave the slot parked under our PID so no other process retries this poisoned
+			// address, then loop to claim the next available one. Parked slots are reclaimed
+			// when this (short-lived, per-file) process exits and its PID goes dead.
+			console.warn(
+				`[loopback-pool] ${loopbackAddress} is still in use by another Harper node (operations port ${CONFLICT_PROBE_PORT} bound); skipping to avoid an SO_REUSEPORT co-bind.`
+			);
+			continue;
 		}
 
 		// No available addresses; wait and retry

From ec2a7cee8a4040c389f9e481c8c305e6baa70669 Mon Sep 17 00:00:00 2001
From: Kris Zyp <kriszyp@gmail.com>
Date: Mon, 15 Jun 2026 07:24:26 -0600
Subject: [PATCH 2/2] =?UTF-8?q?fix(loopback):=20address=20review=20?=
 =?UTF-8?q?=E2=80=94=20avoid=20park-under-PID=20deadlock=20+=20edge=20case?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

From PR review (gemini-code-assist):
- Pool-exhaustion deadlock: parking poisoned slots under our own live PID could fill
  the pool with our PID, leaving nothing reclaimable and spinning forever. Instead
  release poisoned slots back to the pool and skip them via a per-attempt local Set
  (cleared before each wait so they can be re-probed once the lingering node exits).
  Removed the now-unused findAvailableIndex helper.
- teardownHarper: nest the release under `if (ctx.harper.hostname)` so an early
  startup failure (no hostname) can't call releaseLoopbackAddress(undefined) and mask
  the real error with a TypeError.
- CONFLICT_PROBE_PORT: fall back to 9925 on a NaN/out-of-range env override instead of
  crashing on server.listen(NaN).
- isLoopbackAddressInUse: remove the 'error' listener once listen() succeeds so a
  post-bind socket error during close() can't flip the verdict.

Validated locally: skip-in-use returns the next address; full-pool exhaustion recovers
without deadlock once an address frees; an invalid probe-port env falls back cleanly.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/harperLifecycle.ts     | 30 ++++++++----------
 src/loopbackAddressPool.ts | 65 ++++++++++++++++++++++----------------
 2 files changed, 51 insertions(+), 44 deletions(-)

diff --git a/src/harperLifecycle.ts b/src/harperLifecycle.ts
index ea0d88b..e88c911 100644
--- a/src/harperLifecycle.ts
+++ b/src/harperLifecycle.ts
@@ -739,23 +739,21 @@ export async function teardownHarper(ctx: StartedHarperTestContext): Promise<voi
 	// specific ports are free) and warn if anything is somehow still holding them — that warning
 	// is a signal that a Harper child process escaped the tree kill, not normal operation. The
 	// address is recycled regardless.
-	let portsFreed = true;
 	if (ctx.harper.hostname) {
-		portsFreed = await waitForPortsFree(ctx.harper.hostname, ALL_HARPER_PORTS, DEFAULT_PORT_RELEASE_TIMEOUT_MS);
-	}
-
-	if (portsFreed) {
-		await releaseLoopbackAddress(ctx.harper.hostname);
-	} else {
-		// A Harper child escaped the tree kill and still holds this address's ports. Do NOT
-		// recycle the address — under SO_REUSEPORT a later suite could silently co-bind it
-		// (the exact failure the loopback pool exists to prevent). Leave the slot parked under
-		// this process's PID; it is reclaimed when this (per-file) process exits and its PID
-		// goes dead. The conflict canary in getNextAvailableLoopbackAddress is the backstop if
-		// the address is somehow handed out before then.
-		console.warn(
-			`Harper ports on ${ctx.harper.hostname} still in use after teardown (${DEFAULT_PORT_RELEASE_TIMEOUT_MS}ms); NOT recycling the address (a Harper child outlived the kill). The slot will be reclaimed when this process exits.`
-		);
+		const portsFreed = await waitForPortsFree(ctx.harper.hostname, ALL_HARPER_PORTS, DEFAULT_PORT_RELEASE_TIMEOUT_MS);
+		if (portsFreed) {
+			await releaseLoopbackAddress(ctx.harper.hostname);
+		} else {
+			// A Harper child escaped the tree kill and still holds this address's ports. Do NOT
+			// recycle the address — under SO_REUSEPORT a later suite could silently co-bind it
+			// (the exact failure the loopback pool exists to prevent). Leave the slot parked under
+			// this process's PID; it is reclaimed when this (per-file) process exits and its PID
+			// goes dead. The conflict canary in getNextAvailableLoopbackAddress is the backstop if
+			// the address is somehow handed out before then.
+			console.warn(
+				`Harper ports on ${ctx.harper.hostname} still in use after teardown (${DEFAULT_PORT_RELEASE_TIMEOUT_MS}ms); NOT recycling the address (a Harper child outlived the kill). The slot will be reclaimed when this process exits.`
+			);
+		}
 	}
 
 	// a few retries are typically necessary, might take a sec for a process to finish, especially since rocksdb may be flushing
diff --git a/src/loopbackAddressPool.ts b/src/loopbackAddressPool.ts
index b2947e0..41103a5 100644
--- a/src/loopbackAddressPool.ts
+++ b/src/loopbackAddressPool.ts
@@ -38,9 +38,12 @@ const RETRY_DELAY_MS = 1000;
 // both suites. Override via HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT if the operations
 // port ever changes. (Hardcoded rather than imported from harperLifecycle to avoid a
 // circular import between these two modules.)
-const CONFLICT_PROBE_PORT = process.env.HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT
-	? parseInt(process.env.HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT, 10)
-	: 9925;
+const CONFLICT_PROBE_PORT = (() => {
+	const parsed = parseInt(process.env.HARPER_INTEGRATION_TEST_CONFLICT_PROBE_PORT || '', 10);
+	// Fall back to the default if unset/non-numeric/out-of-range, so an invalid override
+	// can't turn into a `server.listen(NaN)` crash during allocation.
+	return Number.isNaN(parsed) || parsed < 0 || parsed > 65535 ? 9925 : parsed;
+})();
 
 // Type definitions
 type LoopbackPool = (number | null)[];
@@ -172,22 +175,6 @@ async function writePoolFile(pool: LoopbackPool): Promise<void> {
 	await writeFile(HARPER_LOOPBACK_POOL_PATH, JSON.stringify(pool));
 }
 
-/**
- * Finds the first available (null) index in the pool. This implements a simple
- * first-available allocation strategy.
- *
- * @param pool The loopback pool array
- * @returns The first available index, or null if the pool is full
- */
-function findAvailableIndex(pool: LoopbackPool): number | null {
-	for (let i = 0; i < pool.length; i++) {
-		if (pool[i] === null) {
-			return i;
-		}
-	}
-	return null;
-}
-
 /**
  * Validates the format of a loopback address and extracts the pool index.
  *
@@ -256,7 +243,7 @@ function validateLoopbackAddress(loopbackAddress: string): Promise<string> {
 function isLoopbackAddressInUse(loopbackAddress: string): Promise<boolean> {
 	return new Promise((resolve, reject) => {
 		const server = createServer();
-		server.once('error', (error: NodeJS.ErrnoException) => {
+		const onError = (error: NodeJS.ErrnoException) => {
 			if (error.code === 'EADDRINUSE') {
 				resolve(true);
 				return;
@@ -264,8 +251,12 @@ function isLoopbackAddressInUse(loopbackAddress: string): Promise<boolean> {
 			const enhancedError = error as LoopbackAddressError;
 			enhancedError.loopbackAddress = loopbackAddress;
 			reject(enhancedError);
-		});
+		};
+		server.once('error', onError);
 		server.listen(CONFLICT_PROBE_PORT, loopbackAddress, () => {
+			// Bind succeeded — the address is free. Drop the error listener so a stray
+			// post-bind socket error during close() can't flip the resolved verdict.
+			server.off('error', onError);
 			server.close(() => {
 				resolve(false);
 			});
@@ -342,16 +333,31 @@ export async function getNextAvailableLoopbackAddress(): Promise<string> {
 	// This continues until all loopback addresses are used, at which point new processes will wait until an address becomes available
 
 	// Since multiple processes may be trying to get a loopback address at the same time, we need to implement a simple file-based locking mechanism to prevent race conditions
+
+	// Indices found in-use by the conflict canary during THIS call. Tracked locally (not by
+	// parking them in the shared pool) so we never deadlock ourselves: if we parked every
+	// poisoned slot under our own live PID, the pool could fill with our PID, the free-slot search
+	// would find nothing, removeDeadProcessesFromPool couldn't reclaim them (we're alive), and we'd
+	// spin forever. Instead we release poisoned slots back to the pool and just avoid re-trying
+	// them within this attempt; the set is cleared before each wait so they can be re-probed once
+	// the lingering node has had time to exit.
+	const triedIndices = new Set<number>();
 	while (true) {
 		const assignedIndex = await withLock(async () => {
 			// Read the pool file
 			const loopbackPool = await readPoolFile();
 
-			// Find the first available index
-			const index = findAvailableIndex(loopbackPool);
+			// Find the first available index we haven't already found in-use this attempt
+			let index: number | null = null;
+			for (let i = 0; i < loopbackPool.length; i++) {
+				if (loopbackPool[i] === null && !triedIndices.has(i)) {
+					index = i;
+					break;
+				}
+			}
 
 			if (index === null) {
-				// No available addresses - remove any dead processes from the pool and wait for one to become available
+				// Nothing available - remove any dead processes from the pool and wait for one to become available
 				removeDeadProcessesFromPool(loopbackPool);
 			} else {
 				// Assign the process PID to that index to mark it as used
@@ -389,16 +395,19 @@ export async function getNextAvailableLoopbackAddress(): Promise<string> {
 				return loopbackAddress;
 			}
 
-			// Leave the slot parked under our PID so no other process retries this poisoned
-			// address, then loop to claim the next available one. Parked slots are reclaimed
-			// when this (short-lived, per-file) process exits and its PID goes dead.
+			// Release the slot back to the pool (so it isn't leaked under our PID) and remember
+			// not to re-try it this attempt; loop to claim the next available one.
+			await releaseLoopbackAddress(loopbackAddress);
+			triedIndices.add(assignedIndex);
 			console.warn(
 				`[loopback-pool] ${loopbackAddress} is still in use by another Harper node (operations port ${CONFLICT_PROBE_PORT} bound); skipping to avoid an SO_REUSEPORT co-bind.`
 			);
 			continue;
 		}
 
-		// No available addresses; wait and retry
+		// No available addresses; clear the per-attempt skip set so poisoned addresses can be
+		// re-probed (their lingering node may have exited during the wait), then wait and retry.
+		triedIndices.clear();
 		await sleep(RETRY_DELAY_MS);
 	}
 }