From 83b9cebfe99b9dc3bd6b7144879cbbb406459166 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 12 Jun 2026 14:19:49 -0500 Subject: [PATCH 1/7] Listen for unused port (never used for a engineered homeserver in the process before) --- federation/server.go | 61 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/federation/server.go b/federation/server.go index 3e0ed3cfb..5140effe1 100644 --- a/federation/server.go +++ b/federation/server.go @@ -528,7 +528,61 @@ func (s *Server) Mux() *mux.Router { return s.mux } -// Listen for federation server requests - call the returned function to gracefully close the server. +// Keep track of the ports that we've previously used so that we never use the same port +// (and therefore the same `server_name`) to two different servers. +// +// A `Server` is identified over federation solely by its `server_name` (which looks +// like `hostname:port` for these Complement engineered homeservers). When the OS recycles a +// freed port, a new Server could otherwise get a `server_name` that is identical to a +// previously torn-down one. +// +// To explain an actual situation where this becomes a problem: A real homeserver under +// test (that is participating in a room with the now-dead engineered homeserver) might +// still try to reach the dead server, but since the `server_name` is the same, it's now +// hitting the new server unexpectedly (cross-test pollution). +// +// This particularly happens when you try to share a `deployment` across many tests and +// then each test creates a engineered homeservers to interact against. +// +// Retiring each port for the lifetime of the process keeps stray requests pointed at a +// dead port (connection refused) instead of a live, unrelated server. +var ( + usedPortsMu sync.Mutex + usedPorts = make(map[int]struct{}) +) + +// listenOnUnusedPort listens on an OS-assigned high-numbered port that no federation Server has +// used before in this process. +func listenOnUnusedPort(t ct.TestLike) net.Listener { + usedPortsMu.Lock() + defer usedPortsMu.Unlock() + + // Keep any already-claimed listeners open until we've found a fresh port, otherwise the OS + // could keep offering us a port we're about to reject. + var rejected []net.Listener + defer func() { + for _, ln := range rejected { + ln.Close() + } + }() + + for attempt := 0; attempt < 100; attempt++ { + ln, err := net.Listen("tcp", ":0") //nolint + if err != nil { + ct.Fatalf(t, "listenOnUnusedPort: net.Listen failed: %s", err) + } + port := ln.Addr().(*net.TCPAddr).Port + if _, used := usedPorts[port]; used { + rejected = append(rejected, ln) + continue + } + usedPorts[port] = struct{}{} + return ln + } + ct.Fatalf(t, "listenOnUnusedPort: could not find an unused port after 100 attempts") + return nil +} + func (s *Server) Listen() (cancel func()) { if s.listening { return @@ -536,10 +590,7 @@ func (s *Server) Listen() (cancel func()) { var wg sync.WaitGroup wg.Add(1) - ln, err := net.Listen("tcp", ":0") //nolint - if err != nil { - ct.Fatalf(s.t, "ListenFederationServer: net.Listen failed: %s", err) - } + ln := listenOnUnusedPort(s.t) port := ln.Addr().(*net.TCPAddr).Port s.serverName = spec.ServerName(fmt.Sprintf("%s:%d", s.serverName, port)) s.listening = true From c103b26d7d07f5102a08758365b5a157f57494fd Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 12 Jun 2026 14:43:25 -0500 Subject: [PATCH 2/7] Try more --- federation/server.go | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/federation/server.go b/federation/server.go index 5140effe1..c0c6601cc 100644 --- a/federation/server.go +++ b/federation/server.go @@ -566,7 +566,24 @@ func listenOnUnusedPort(t ct.TestLike) net.Listener { } }() - for attempt := 0; attempt < 100; attempt++ { + // We try the same number of times as the number of ports that have been previously + // used, as the OS could sequentially hand back the same ports and we need to find the + // next unused port. + max_attempts := len(usedPorts) + // Sanity check that we haven't already exhausted the entire port range + if max_attempts > 65535 { + // If this ever becomes a problem, we can namespace used ports by `deployment` since + // that has to be passed into `NewServer(...)` anyway + ct.Fatalf( + t, "listenOnUnusedPort: We've exhausted the whole port range 0 - 65,535. "+ + "(see comment here if you run into this)", + ) + } + + for attempt := 0; attempt < max_attempts; attempt++ { + // This just gets us an unused port (could be random, could be the next sequential + // unused port, we don't know). Ideally, we could ask for the next unused port after + // N to avoid a bunch of work. ln, err := net.Listen("tcp", ":0") //nolint if err != nil { ct.Fatalf(t, "listenOnUnusedPort: net.Listen failed: %s", err) @@ -579,7 +596,7 @@ func listenOnUnusedPort(t ct.TestLike) net.Listener { usedPorts[port] = struct{}{} return ln } - ct.Fatalf(t, "listenOnUnusedPort: could not find an unused port after 100 attempts") + ct.Fatalf(t, "listenOnUnusedPort: could not find an unused port after %s attempts", max_attempts) return nil } From 2d51bccc9f5f8bd5a93bc37a94720f8b2ae79f7e Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 12 Jun 2026 14:44:55 -0500 Subject: [PATCH 3/7] More better comment --- federation/server.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/federation/server.go b/federation/server.go index c0c6601cc..1195bf887 100644 --- a/federation/server.go +++ b/federation/server.go @@ -573,7 +573,9 @@ func listenOnUnusedPort(t ct.TestLike) net.Listener { // Sanity check that we haven't already exhausted the entire port range if max_attempts > 65535 { // If this ever becomes a problem, we can namespace used ports by `deployment` since - // that has to be passed into `NewServer(...)` anyway + // that has to be passed into `NewServer(...)` anyway and the whole point of this is + // that a homeserver from the `deployment` doesn't try to reach out to a previous + // engineered homeserver it knows about. ct.Fatalf( t, "listenOnUnusedPort: We've exhausted the whole port range 0 - 65,535. "+ "(see comment here if you run into this)", From b7be7b6f6f4e07f53f3ad45531b98dae774f2584 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 12 Jun 2026 14:50:22 -0500 Subject: [PATCH 4/7] Try `+ 1` times --- federation/server.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/federation/server.go b/federation/server.go index 1195bf887..2591ac301 100644 --- a/federation/server.go +++ b/federation/server.go @@ -569,7 +569,10 @@ func listenOnUnusedPort(t ct.TestLike) net.Listener { // We try the same number of times as the number of ports that have been previously // used, as the OS could sequentially hand back the same ports and we need to find the // next unused port. - max_attempts := len(usedPorts) + // + // `+ 1` as we want to find the next one after everything we might have tried before + // and it also means we try at-least one time when `len(usedPorts)` is `0`. + max_attempts := len(usedPorts) + 1 // Sanity check that we haven't already exhausted the entire port range if max_attempts > 65535 { // If this ever becomes a problem, we can namespace used ports by `deployment` since @@ -583,9 +586,9 @@ func listenOnUnusedPort(t ct.TestLike) net.Listener { } for attempt := 0; attempt < max_attempts; attempt++ { - // This just gets us an unused port (could be random, could be the next sequential - // unused port, we don't know). Ideally, we could ask for the next unused port after - // N to avoid a bunch of work. + // Using `:0` means an unused port is automatically picked for us (could be random, + // could be the next sequential unused port, we don't know). Ideally, we could ask + // for the next unused port after X to avoid a bunch of work. ln, err := net.Listen("tcp", ":0") //nolint if err != nil { ct.Fatalf(t, "listenOnUnusedPort: net.Listen failed: %s", err) From 820b4286e37a0fa26033ae8c2f7975d5a398f5f4 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 12 Jun 2026 15:24:05 -0500 Subject: [PATCH 5/7] Sequential port scan --- federation/server.go | 87 ++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/federation/server.go b/federation/server.go index 2591ac301..caba7f00d 100644 --- a/federation/server.go +++ b/federation/server.go @@ -547,61 +547,62 @@ func (s *Server) Mux() *mux.Router { // Retiring each port for the lifetime of the process keeps stray requests pointed at a // dead port (connection refused) instead of a live, unrelated server. var ( - usedPortsMu sync.Mutex - usedPorts = make(map[int]struct{}) + // Use a mutex so only one thread can advance `lastUsedPort` at a time. We don't want + // multiple threads clobbering `lastUsedPort`. + lastUsedPortMu sync.Mutex + // Start at 1024 (1023 + 1) to avoid the priviged ports used by the system + // + // Since we sequentially try each port, we just need to keep track of the last one we tried + lastUsedPort = 1023 ) -// listenOnUnusedPort listens on an OS-assigned high-numbered port that no federation Server has +// listenOnUnusedPort listens on an unused port that no other federation `Server` has // used before in this process. func listenOnUnusedPort(t ct.TestLike) net.Listener { - usedPortsMu.Lock() - defer usedPortsMu.Unlock() - - // Keep any already-claimed listeners open until we've found a fresh port, otherwise the OS - // could keep offering us a port we're about to reject. - var rejected []net.Listener - defer func() { - for _, ln := range rejected { - ln.Close() - } - }() + lastUsedPortMu.Lock() + defer lastUsedPortMu.Unlock() - // We try the same number of times as the number of ports that have been previously - // used, as the OS could sequentially hand back the same ports and we need to find the - // next unused port. + // We use this sequential port scan strategy over guess and check with an OS-assigned + // port (by using `:0`) as it's more efficient. The OS may recycle and re-use freed + // ports meaning we could regress to O(n^2) behavior trying to search for each new + // port we want to find. // - // `+ 1` as we want to find the next one after everything we might have tried before - // and it also means we try at-least one time when `len(usedPorts)` is `0`. - max_attempts := len(usedPorts) + 1 - // Sanity check that we haven't already exhausted the entire port range - if max_attempts > 65535 { - // If this ever becomes a problem, we can namespace used ports by `deployment` since - // that has to be passed into `NewServer(...)` anyway and the whole point of this is - // that a homeserver from the `deployment` doesn't try to reach out to a previous - // engineered homeserver it knows about. - ct.Fatalf( - t, "listenOnUnusedPort: We've exhausted the whole port range 0 - 65,535. "+ - "(see comment here if you run into this)", - ) - } + // Using `:0` means an unused port is automatically picked for us (could be random, + // could be the next sequential unused port, we don't know). Ideally, we could ask for + // the next unused port after X to avoid a bunch of work. When using using `:0`, the + // pathological case that is O(n^2) is if OS hands back next lowest unused port + // sequentially which would mean we would have to probe and hold each listener until + // we finally got something new. + + // Try up to 1000 ports + attempts := 1000 + for i := 0; i < attempts; i++ { + port := lastUsedPort + 1 + if port > 65535 { + // If this ever becomes a problem, we can namespace used ports by `deployment` since + // that has to be passed into `NewServer(...)` anyway and the whole point of this is + // that a homeserver from the `deployment` doesn't try to reach out to a previous + // engineered homeserver it knows about. + // + // As another alternative, we could also wrap-around to the beginning of the port + // range again although that is slightly unsound. + ct.Fatalf( + t, "listenOnUnusedPort: We've exhausted the whole port range 0 - 65,535. "+ + "(see comment here if you run into this)", + ) + } - for attempt := 0; attempt < max_attempts; attempt++ { - // Using `:0` means an unused port is automatically picked for us (could be random, - // could be the next sequential unused port, we don't know). Ideally, we could ask - // for the next unused port after X to avoid a bunch of work. - ln, err := net.Listen("tcp", ":0") //nolint + // Check port availability + ln, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) + lastUsedPort = port if err != nil { - ct.Fatalf(t, "listenOnUnusedPort: net.Listen failed: %s", err) - } - port := ln.Addr().(*net.TCPAddr).Port - if _, used := usedPorts[port]; used { - rejected = append(rejected, ln) + // Port unavailable, skip continue } - usedPorts[port] = struct{}{} + return ln } - ct.Fatalf(t, "listenOnUnusedPort: could not find an unused port after %s attempts", max_attempts) + ct.Fatalf(t, "listenOnUnusedPort: could not find an unused port in the range %s - %s", lastUsedPort-attempts, lastUsedPort) return nil } From 9f186bc4d53515533e7f0379513fc59a98744637 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 12 Jun 2026 15:37:46 -0500 Subject: [PATCH 6/7] Fix little things --- federation/server.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/federation/server.go b/federation/server.go index caba7f00d..a6cc47e81 100644 --- a/federation/server.go +++ b/federation/server.go @@ -574,9 +574,10 @@ func listenOnUnusedPort(t ct.TestLike) net.Listener { // sequentially which would mean we would have to probe and hold each listener until // we finally got something new. - // Try up to 1000 ports - attempts := 1000 - for i := 0; i < attempts; i++ { + // Try the whole port range (untested but it's probably fast to do so) + max_attempts := 65535 + var lastErr error + for i := 0; i < max_attempts; i++ { port := lastUsedPort + 1 if port > 65535 { // If this ever becomes a problem, we can namespace used ports by `deployment` since @@ -596,13 +597,14 @@ func listenOnUnusedPort(t ct.TestLike) net.Listener { ln, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) lastUsedPort = port if err != nil { + lastErr = err // Port unavailable, skip continue } return ln } - ct.Fatalf(t, "listenOnUnusedPort: could not find an unused port in the range %s - %s", lastUsedPort-attempts, lastUsedPort) + ct.Fatalf(t, "listenOnUnusedPort: could not find an unused port in the entire port range (0 - 65535). Last error: %w", lastErr) return nil } From 40e09fe754e38ba40313334a5a02cb25d9afd0d7 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 12 Jun 2026 15:43:51 -0500 Subject: [PATCH 7/7] Better errors --- federation/server.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/federation/server.go b/federation/server.go index a6cc47e81..36487a1ac 100644 --- a/federation/server.go +++ b/federation/server.go @@ -588,8 +588,8 @@ func listenOnUnusedPort(t ct.TestLike) net.Listener { // As another alternative, we could also wrap-around to the beginning of the port // range again although that is slightly unsound. ct.Fatalf( - t, "listenOnUnusedPort: We've exhausted the whole port range 0 - 65,535. "+ - "(see comment here if you run into this)", + t, "listenOnUnusedPort: could not find an unused port in the entire port range (0 - 65535). "+ + "(see comment here if you run into this). Last error: %s", lastErr, ) } @@ -604,7 +604,9 @@ func listenOnUnusedPort(t ct.TestLike) net.Listener { return ln } - ct.Fatalf(t, "listenOnUnusedPort: could not find an unused port in the entire port range (0 - 65535). Last error: %w", lastErr) + // Since we try the entire port range, we don't really expect to get here but we have + // it in case there is a programming error above + ct.Fatalf(t, "listenOnUnusedPort: Programming error") return nil }