From 605d44c4f985c9faf5c6a4750db0d1147736d396 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zolt=C3=A1n=20Papp?= Date: Sat, 15 Nov 2025 00:08:47 +0100 Subject: [PATCH] Fix race condition in relay peer reconnection handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a peer reconnects with the same ID, other peers were not reliably notified that the old connection went offline. This caused "connection already exists" errors when attempting to establish new connections to the reconnected peer. The issue occurred because the old peer's cleanup notification raced with the new peer's online notification. If reconnection happened before cleanup, the offline notification was silently dropped. The fix sends an offline notification synchronously during reconnection (when AddPeer returns true), ensuring all subscribed peers receive events in the correct order (offline → online). Added TestBindReconnectRace to validate the fix with 1000 reconnection iterations. --- relay/server/relay.go | 1 + shared/relay/client/client_test.go | 103 ++++++++++++++++++++++++++++- 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/relay/server/relay.go b/relay/server/relay.go index d86684937..771eea4fd 100644 --- a/relay/server/relay.go +++ b/relay/server/relay.go @@ -132,6 +132,7 @@ func (r *Relay) Accept(conn net.Conn) { storeTime := time.Now() if isReconnection := r.store.AddPeer(peer); isReconnection { r.metrics.RecordPeerReconnection() + r.notifier.PeerWentOffline(peer.ID()) } r.notifier.PeerCameOnline(peer.ID()) diff --git a/shared/relay/client/client_test.go b/shared/relay/client/client_test.go index 8fe5f04f4..0f9997340 100644 --- a/shared/relay/client/client_test.go +++ b/shared/relay/client/client_test.go @@ -11,11 +11,11 @@ import ( "go.opentelemetry.io/otel" "github.com/netbirdio/netbird/client/iface" + "github.com/netbirdio/netbird/relay/server" "github.com/netbirdio/netbird/shared/relay/auth/allow" "github.com/netbirdio/netbird/shared/relay/auth/hmac" + "github.com/netbirdio/netbird/shared/relay/messages" "github.com/netbirdio/netbird/util" - - "github.com/netbirdio/netbird/relay/server" ) var ( @@ -427,6 +427,105 @@ func TestBindReconnect(t *testing.T) { } } +func TestBindReconnectRace(t *testing.T) { + ctx := context.Background() + + srvCfg := server.ListenerConfig{Address: serverListenAddr} + srv, err := server.NewServer(serverCfg) + if err != nil { + t.Fatalf("failed to create server: %s", err) + } + errChan := make(chan error, 1) + go func() { + err := srv.Listen(srvCfg) + if err != nil { + errChan <- err + } + }() + + defer func() { + log.Infof("closing server") + err := srv.Shutdown(ctx) + if err != nil { + t.Errorf("failed to close server: %s", err) + } + }() + + // wait for servers to start + if err := waitForServerToStart(errChan); err != nil { + t.Fatalf("failed to start server: %s", err) + } + + clientBob := NewClient(serverURL, hmacTokenStore, "bob", iface.DefaultMTU) + err = clientBob.Connect(ctx) + if err != nil { + t.Fatalf("failed to connect to server: %s", err) + } + defer clientBob.Close() + + // Run the reconnection scenario multiple times to expose the race + failures := 0 + iterations := 1000 + + for i := 0; i < iterations; i++ { + log.Infof("Iteration %d/%d", i+1, iterations) + + // Alice connects + clientAlice := NewClient(serverURL, hmacTokenStore, "alice", iface.DefaultMTU) + err = clientAlice.Connect(ctx) + if err != nil { + t.Fatalf("iteration %d: failed to connect alice: %s", i, err) + } + + // Bob opens connection to Alice + _, err = clientBob.OpenConn(ctx, "alice") + if err != nil { + t.Fatalf("iteration %d: failed to open conn from bob: %s", i, err) + } + + // Close Alice immediately + err = clientAlice.Close() + if err != nil { + t.Errorf("iteration %d: failed to close alice: %s", i, err) + } + + // Reconnect Alice immediately (this is where the race occurs) + clientAlice = NewClient(serverURL, hmacTokenStore, "alice", iface.DefaultMTU) + err = clientAlice.Connect(ctx) + if err != nil { + t.Fatalf("iteration %d: failed to reconnect alice: %s", i, err) + } + + // Bob tries to open a new connection to the reconnected Alice + // Without the fix, this will sometimes fail with "connection already exists" + // because Bob still has the old connection in its map + _, err = clientBob.OpenConn(ctx, "alice") + if err != nil { + log.Errorf("iteration %d: RACE DETECTED - failed to open new conn after reconnect: %s", i, err) + failures++ + } + + // Clean up + clientAlice.Close() + + // Close Bob's connection to Alice to prepare for next iteration + clientBob.mu.Lock() + aliceID := messages.HashID("alice") + if container, ok := clientBob.conns[aliceID]; ok { + container.close() + delete(clientBob.conns, aliceID) + } + clientBob.mu.Unlock() + } + + if failures > 0 { + t.Errorf("Race condition detected in %d out of %d iterations (%.1f%%)", + failures, iterations, float64(failures)/float64(iterations)*100) + } else { + log.Infof("No race detected in %d iterations (fix is working or race didn't trigger)", iterations) + } +} + func TestCloseConn(t *testing.T) { ctx := context.Background()