Remove verbose comments

Recheck watcher ctx cancellation under conn.mu in onWGDisconnected
onWGDisconnected only checked conn.ctx (the engine-scoped context), never the watcher's own context. disableWgWatcherIfNeeded cancels the wgWatcherCtx, not conn.ctx, so a disabled watcher's timeout callback did not see the cancellation. handshakeCheck runs lock-free, so between the ctx check in periodicHandshakeCheck and acquiring conn.mu, a fast disconnect/reconnect can slip in: the stale watcher then acquires the lock and tears down the *new*, healthy connection based on the old timeout, forcing the guard into an unnecessary reconnect (flap). Recheck watcherCtx.Err() under conn.mu so a superseded watcher exits without touching the connection that replaced it.
2026-07-04 05:40:02 +00:00 · 2026-07-03 17:26:20 +02:00 · 2026-07-03 12:15:24 +02:00 · 2026-07-03 12:02:50 +02:00 · 2026-07-03 11:37:41 +02:00 · 2026-07-03 10:31:06 +02:00
11 changed files with 449 additions and 807 deletions
--- a/client/internal/engine.go
+++ b/client/internal/engine.go
@@ -220,12 +220,6 @@ type Engine struct {
 	// networkSerial is the latest CurrentSerial (state ID) of the network sent by the Management service
 	networkSerial uint64

-	// forwardingRules holds the ingress forward rules applied for the current target.
-	// Wholesale sections (incl. forward rules) run only on the first pass of a target;
-	// it is stashed here so the final, peer-converged pass can build the lazy-connection
-	// exclude list without recomputing them on every bounded peer pass.
-	forwardingRules []firewallManager.ForwardRule
-
 	networkMonitor *networkmonitor.NetworkMonitor

 	sshServer sshServer
@@ -780,15 +774,7 @@ func (e *Engine) blockLanAccess() {

 // modifyPeers updates peers that have been modified (e.g. IP address has been changed).
 // It closes the existing connection, removes it from the peerConns map, and creates a new one.
-// maxPeersPerSyncPass is the default per-pass cap on how many peers each of
-// removePeers/modifyPeers/addNewPeers applies, so syncMsgMux is held only for a
-// batch at a time and other subsystems can interleave between passes. It is
-// passed in (not read globally) so tests can exercise the multi-pass path.
-const maxPeersPerSyncPass = 300
-
-// modifyPeers re-applies up to maxBatch changed peers per call. It returns true
-// when more changed peers remained than the cap, so the caller re-runs.
-func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch int) (bool, error) {
+func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig) error {

 	// first, check if peers have been modified
 	var modified []*mgmProto.RemotePeerConfig
@@ -818,32 +804,26 @@ func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch
 		}
 	}

-	more := false
-	if len(modified) > maxBatch {
-		modified = modified[:maxBatch]
-		more = true
-	}
-
 	// second, close all modified connections and remove them from the state map
 	for _, p := range modified {
-		if err := e.removePeer(p.GetWgPubKey()); err != nil {
-			return false, err
+		err := e.removePeer(p.GetWgPubKey())
+		if err != nil {
+			return err
 		}
 	}
 	// third, add the peer connections again
 	for _, p := range modified {
-		if err := e.addNewPeer(p); err != nil {
-			return false, err
+		err := e.addNewPeer(p)
+		if err != nil {
+			return err
 		}
 	}
-	return more, nil
+	return nil
 }

 // removePeers finds and removes peers that do not exist anymore in the network map received from the Management Service.
 // It also removes peers that have been modified (e.g. change of IP address). They will be added again in addPeers method.
-// removePeers removes up to maxBatch peers per call. It returns true when more
-// peers remained to remove than the cap, so the caller re-runs.
-func (e *Engine) removePeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch int) (bool, error) {
+func (e *Engine) removePeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
 	newPeers := make([]string, 0, len(peersUpdate))
 	for _, p := range peersUpdate {
 		newPeers = append(newPeers, p.GetWgPubKey())
@@ -851,19 +831,14 @@ func (e *Engine) removePeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch

 	toRemove := util.SliceDiff(e.peerStore.PeersPubKey(), newPeers)

-	more := false
-	if len(toRemove) > maxBatch {
-		toRemove = toRemove[:maxBatch]
-		more = true
-	}
-
 	for _, p := range toRemove {
-		if err := e.removePeer(p); err != nil {
-			return false, err
+		err := e.removePeer(p)
+		if err != nil {
+			return err
 		}
 		log.Infof("removed peer %s", p)
 	}
-	return more, nil
+	return nil
 }

 func (e *Engine) removeAllPeers() error {
@@ -942,17 +917,19 @@ func (e *Engine) phase(name string) func() {
 	}
 }

-// applySyncPass applies one bounded pass of the sync update under syncMsgMux and
-// returns true if more peers remained than the per-pass cap. It is driven by the
-// mapStateManager, which re-invokes it (releasing the lock between passes) until
-// the update is fully applied.
-func (e *Engine) applySyncPass(update *mgmProto.SyncResponse, firstPass bool) (bool, error) {
+func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
+	started := time.Now()
+	defer func() {
+		duration := time.Since(started)
+		log.Infof("sync finished in %s", duration)
+		e.clientMetrics.RecordSyncDuration(e.ctx, duration)
+	}()
 	e.syncMsgMux.Lock()
 	defer e.syncMsgMux.Unlock()

 	// Check context INSIDE lock to ensure atomicity with shutdown
 	if e.ctx.Err() != nil {
-		return false, e.ctx.Err()
+		return e.ctx.Err()
 	}

 	if update.NetworkMap != nil && update.NetworkMap.PeerConfig != nil {
@@ -963,7 +940,7 @@ func (e *Engine) applySyncPass(update *mgmProto.SyncResponse, firstPass bool) (b
 	err := e.updateNetbirdConfig(update.GetNetbirdConfig())
 	done()
 	if err != nil {
-		return false, err
+		return err
 	}

 	// Posture checks are bound to the network map presence:
@@ -973,25 +950,28 @@ func (e *Engine) applySyncPass(update *mgmProto.SyncResponse, firstPass bool) (b
 	//                                        leave the previously applied checks untouched
 	nm := update.GetNetworkMap()
 	if nm == nil {
-		return false, nil
+		return nil
 	}

 	done = e.phase("checks")
 	err = e.updateChecksIfNew(update.Checks)
 	done()
 	if err != nil {
-		return false, err
+		return err
 	}

+	done = e.phase("persist")
+	e.persistSyncResponse(update)
+	done()
+
 	// only apply new changes and ignore old ones
-	more, err := e.updateNetworkMap(nm, maxPeersPerSyncPass, firstPass)
-	if err != nil {
-		return false, err
+	if err := e.updateNetworkMap(nm); err != nil {
+		return err
 	}

 	e.statusRecorder.PublishEvent(cProto.SystemEvent_INFO, cProto.SystemEvent_SYSTEM, "Network map updated", "", nil)

-	return more, nil
+	return nil
 }

 // updateNetbirdConfig applies the management-provided NetBird configuration:
@@ -1039,13 +1019,6 @@ func (e *Engine) updateNetbirdConfig(wCfg *mgmProto.NetbirdConfig) error {
 // (not syncMsgMux) is held for the whole Set so the store cannot be cleared (disabled /
 // engine close) mid-call and have this write resurrect a file that was just removed.
 func (e *Engine) persistSyncResponse(update *mgmProto.SyncResponse) {
-	// Only persist updates that carry a network map. Config-only updates (e.g. relay
-	// token rotation, STUN/TURN) have a nil NetworkMap; persisting them would overwrite
-	// the last full map on disk and break restore-on-restart.
-	if update.GetNetworkMap() == nil {
-		return
-	}
-
 	e.syncRespMux.RLock()
 	defer e.syncRespMux.RUnlock()

@@ -1333,24 +1306,7 @@ func (e *Engine) receiveManagementEvents() {
 		}
 		e.applyInfoFlags(info)

-		// The map-state manager converges the latest update in the background in
-		// bounded passes; the stream callback only hands it the newest target.
-		persist := func(u *mgmProto.SyncResponse) {
-			done := e.phase("persist")
-			e.persistSyncResponse(u)
-			done()
-		}
-		manager := newMapStateManager(e.applySyncPass, persist, func(d time.Duration) {
-			log.Infof("sync finished in %s", d)
-			e.clientMetrics.RecordSyncDuration(e.ctx, d)
-		})
-		e.shutdownWg.Add(1)
-		go func() {
-			defer e.shutdownWg.Done()
-			manager.run(e.ctx)
-		}()
-
-		err := e.mgmClient.Sync(e.ctx, info, manager.SetTarget)
+		err := e.mgmClient.Sync(e.ctx, info, e.handleSync)
 		if err != nil {
 			// happens if management is unavailable for a long time.
 			// We want to cancel the operation of the whole client
@@ -1401,107 +1357,21 @@ func (e *Engine) updateTURNs(turns []*mgmProto.ProtectedHostConfig) error {
 	return nil
 }

-// updateNetworkMap applies the wholesale parts (config, routes, ACL, DNS) in full
-// and up to maxBatch peers per phase. It returns true when more peers remained
-// than the cap, so the caller re-runs until convergence.
-func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap, maxBatch int, firstPass bool) (bool, error) {
+func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
 	// intentionally leave it before checking serial because for now it can happen that peer IP changed but serial didn't
 	if networkMap.GetPeerConfig() != nil {
 		err := e.updateConfig(networkMap.GetPeerConfig())
 		if err != nil {
-			return false, err
+			return err
 		}
 	}

 	serial := networkMap.GetSerial()
 	if e.networkSerial > serial {
 		log.Debugf("received outdated NetworkMap with serial %d, ignoring", serial)
-		return false, nil
+		return nil
 	}

-	// Wholesale sections (firewall/ACL, DNS, routes, forward rules) are applied
-	// up-front and only once per target: they are cheap, local, idempotent and must
-	// be in place before peers come up (fail-closed). On the bounded re-runs that only
-	// drain the remaining peer batches they are skipped — the applied forward rules are
-	// reused from e.forwardingRules for the lazy-exclude finalize.
-	if firstPass {
-		e.applyWholesale(networkMap, serial)
-	}
-
-	log.Debugf("got peers update from Management Service, total peers to connect to = %d", len(networkMap.GetRemotePeers()))
-
-	doneOffline := e.phase("offline_peers")
-	e.updateOfflinePeers(networkMap.GetOfflinePeers())
-	doneOffline()
-
-	// Filter out own peer from the remote peers list
-	localPubKey := e.config.WgPrivateKey.PublicKey().String()
-	remotePeers := make([]*mgmProto.RemotePeerConfig, 0, len(networkMap.GetRemotePeers()))
-	for _, p := range networkMap.GetRemotePeers() {
-		if p.GetWgPubKey() != localPubKey {
-			remotePeers = append(remotePeers, p)
-		}
-	}
-
-	// No special case for cleanup: when management signals RemotePeersIsEmpty (e.g. our
-	// peer was deleted), remotePeers is already empty, so the bounded diff below removes
-	// every peer in batches — same path as a normal update, no unbounded removeAllPeers
-	// held under syncMsgMux in one shot.
-	doneRemoved := e.phase("removed_peers")
-	removeMore, err := e.removePeers(remotePeers, maxBatch)
-	doneRemoved()
-	if err != nil {
-		return false, err
-	}
-
-	doneModified := e.phase("modified_peers")
-	modifyMore, err := e.modifyPeers(remotePeers, maxBatch)
-	doneModified()
-	if err != nil {
-		return false, err
-	}
-
-	doneAdded := e.phase("added_peers")
-	addMore, err := e.addNewPeers(remotePeers, maxBatch)
-	doneAdded()
-	if err != nil {
-		return false, err
-	}
-
-	// needMore signals the caller to re-run when a peer phase hit its per-pass cap.
-	needMore := removeMore || modifyMore || addMore
-
-	e.statusRecorder.FinishPeerListModifications()
-
-	e.updatePeerSSHHostKeys(remotePeers)
-
-	if err := e.updateSSHClientConfig(remotePeers); err != nil {
-		log.Warnf("failed to update SSH client config: %v", err)
-	}
-
-	e.updateSSHServerAuth(networkMap.GetSshAuth())
-
-	// Set the exclude list only once peers have fully converged (this pass added
-	// the last batch). It needs all target peers present in the store, and
-	// ExcludePeer has replace-semantics — a partial set mid-convergence would be wrong.
-	if !needMore {
-		doneLazy := e.phase("lazy_exclude")
-		excludedLazyPeers := e.toExcludedLazyPeers(e.forwardingRules, remotePeers)
-		e.connMgr.SetExcludeList(e.ctx, excludedLazyPeers)
-		doneLazy()
-	}
-
-	e.networkSerial = serial
-
-	return needMore, nil
-}
-
-// applyWholesale applies the cheap, local, idempotent map sections — lazy feature
-// flag, firewall/legacy management, DNS, routes, ACL filtering, DNS forwarder and
-// ingress forward rules — that must be in place before peers come up. It runs once
-// per target (first pass only); the resulting forward rules are stashed in
-// e.forwardingRules for the lazy-exclude finalize on the peer-converged pass.
-func (e *Engine) applyWholesale(networkMap *mgmProto.NetworkMap, serial uint64) {
 	if err := e.connMgr.UpdatedRemoteFeatureFlag(e.ctx, networkMap.GetPeerConfig().GetLazyConnectionEnabled()); err != nil {
 		log.Errorf("failed to update lazy connection feature flag: %v", err)
 	}
@@ -1574,7 +1444,84 @@ func (e *Engine) applyWholesale(networkMap *mgmProto.NetworkMap, serial uint64)
 		log.Errorf("failed to update forward rules, err: %v", err)
 	}
 	done()
-	e.forwardingRules = forwardingRules
+
+	log.Debugf("got peers update from Management Service, total peers to connect to = %d", len(networkMap.GetRemotePeers()))
+
+	done = e.phase("offline_peers")
+	e.updateOfflinePeers(networkMap.GetOfflinePeers())
+	done()
+
+	remotePeers, err := e.reconcilePeers(networkMap)
+	if err != nil {
+		return err
+	}
+
+	// The exclude list must be set after the peers are added; otherwise the manager cannot look up the peers' parameters in the store.
+	done = e.phase("lazy_exclude")
+	excludedLazyPeers := e.toExcludedLazyPeers(forwardingRules, remotePeers)
+	e.connMgr.SetExcludeList(e.ctx, excludedLazyPeers)
+	done()
+
+	e.networkSerial = serial
+
+	return nil
+}
+
+// reconcilePeers applies the remote peer list from the network map (removing,
+// modifying and adding peers, then updating SSH config) and returns the remote
+// peers with our own peer filtered out, for use by later sync steps.
+func (e *Engine) reconcilePeers(networkMap *mgmProto.NetworkMap) ([]*mgmProto.RemotePeerConfig, error) {
+	// Filter out own peer from the remote peers list
+	localPubKey := e.config.WgPrivateKey.PublicKey().String()
+	remotePeers := make([]*mgmProto.RemotePeerConfig, 0, len(networkMap.GetRemotePeers()))
+	for _, p := range networkMap.GetRemotePeers() {
+		if p.GetWgPubKey() != localPubKey {
+			remotePeers = append(remotePeers, p)
+		}
+	}
+
+	// cleanup request, most likely our peer has been deleted
+	if networkMap.GetRemotePeersIsEmpty() {
+		err := e.removeAllPeers()
+		e.statusRecorder.FinishPeerListModifications()
+		if err != nil {
+			return nil, err
+		}
+		return remotePeers, nil
+	}
+
+	done := e.phase("removed_peers")
+	err := e.removePeers(remotePeers)
+	done()
+	if err != nil {
+		return nil, err
+	}
+
+	done = e.phase("modified_peers")
+	err = e.modifyPeers(remotePeers)
+	done()
+	if err != nil {
+		return nil, err
+	}
+
+	done = e.phase("added_peers")
+	err = e.addNewPeers(remotePeers)
+	done()
+	if err != nil {
+		return nil, err
+	}
+
+	e.statusRecorder.FinishPeerListModifications()
+
+	e.updatePeerSSHHostKeys(remotePeers)
+
+	if err := e.updateSSHClientConfig(remotePeers); err != nil {
+		log.Warnf("failed to update SSH client config: %v", err)
+	}
+
+	e.updateSSHServerAuth(networkMap.GetSshAuth())
+
+	return remotePeers, nil
 }

 func toDNSFeatureFlag(networkMap *mgmProto.NetworkMap) bool {
@@ -1754,23 +1701,14 @@ func addrToString(addr netip.Addr) string {
 }

 // addNewPeers adds peers that were not know before but arrived from the Management service with the update
-// addNewPeers adds up to maxBatch not-yet-present peers per call. It returns true
-// when more new peers remained than the cap, so the caller re-runs.
-func (e *Engine) addNewPeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch int) (bool, error) {
-	added := 0
+func (e *Engine) addNewPeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
 	for _, p := range peersUpdate {
-		if _, ok := e.peerStore.PeerConn(p.GetWgPubKey()); ok {
-			continue // already present (cheap skip), does not count toward the cap
+		err := e.addNewPeer(p)
+		if err != nil {
+			return err
 		}
-		if added >= maxBatch {
-			return true, nil // at least one more new peer remains
-		}
-		if err := e.addNewPeer(p); err != nil {
-			return false, err
-		}
-		added++
 	}
-	return false, nil
+	return nil
 }

 // addNewPeer add peer if connection doesn't exist
--- a/client/internal/engine_privileged_test.go
+++ b/client/internal/engine_privileged_test.go
@@ -124,7 +124,7 @@ func TestEngine_SSH(t *testing.T) {
 		RemotePeersIsEmpty: false,
 	}

-	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
+	err = engine.updateNetworkMap(networkMap)
 	require.NoError(t, err)

 	assert.Nil(t, engine.sshServer)
@@ -146,7 +146,7 @@ func TestEngine_SSH(t *testing.T) {
 		RemotePeersIsEmpty: false,
 	}

-	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
+	err = engine.updateNetworkMap(networkMap)
 	require.NoError(t, err)

 	time.Sleep(250 * time.Millisecond)
@@ -159,7 +159,7 @@ func TestEngine_SSH(t *testing.T) {
 		RemotePeersIsEmpty: false,
 	}

-	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
+	err = engine.updateNetworkMap(networkMap)
 	require.NoError(t, err)

 	// time.Sleep(250 * time.Millisecond)
@@ -174,7 +174,7 @@ func TestEngine_SSH(t *testing.T) {
 		RemotePeersIsEmpty: false,
 	}

-	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
+	err = engine.updateNetworkMap(networkMap)
 	require.NoError(t, err)

 	assert.Nil(t, engine.sshServer)
--- a/client/internal/engine_test.go
+++ b/client/internal/engine_test.go
@@ -437,7 +437,7 @@ func TestEngine_UpdateNetworkMap(t *testing.T) {

 	for _, c := range []testCase{case1, case2, case3, case4, case5, case6} {
 		t.Run(c.name, func(t *testing.T) {
-			_, err = engine.updateNetworkMap(c.networkMap, maxPeersPerSyncPass, true)
+			err = engine.updateNetworkMap(c.networkMap)
 			if err != nil {
 				t.Fatal(err)
 				return
@@ -464,47 +464,6 @@ func TestEngine_UpdateNetworkMap(t *testing.T) {
 			}
 		})
 	}
-
-	// chunked apply: with a per-pass cap smaller than the number of peers, a
-	// single updateNetworkMap applies one batch and reports more==true; the
-	// caller re-runs until convergence. (engine currently holds 0 peers.)
-	t.Run("chunked add converges over multiple passes", func(t *testing.T) {
-		nm := &mgmtProto.NetworkMap{
-			Serial:      6,
-			RemotePeers: []*mgmtProto.RemotePeerConfig{peer1, peer2, peer3},
-		}
-
-		more, err := engine.updateNetworkMap(nm, 1, true)
-		require.NoError(t, err)
-		require.True(t, more, "pass 1 should signal more")
-		require.Len(t, engine.peerStore.PeersPubKey(), 1)
-
-		more, err = engine.updateNetworkMap(nm, 1, false)
-		require.NoError(t, err)
-		require.True(t, more, "pass 2 should signal more")
-		require.Len(t, engine.peerStore.PeersPubKey(), 2)
-
-		more, err = engine.updateNetworkMap(nm, 1, false)
-		require.NoError(t, err)
-		require.False(t, more, "pass 3 should converge")
-		require.Len(t, engine.peerStore.PeersPubKey(), 3)
-	})
-
-	t.Run("chunked remove converges over multiple passes", func(t *testing.T) {
-		nm := &mgmtProto.NetworkMap{
-			Serial:      7,
-			RemotePeers: []*mgmtProto.RemotePeerConfig{peer1}, // remove peer2, peer3
-		}
-
-		more, err := engine.updateNetworkMap(nm, 1, true)
-		require.NoError(t, err)
-		require.True(t, more, "pass 1 should signal more (2 to remove, cap 1)")
-
-		more, err = engine.updateNetworkMap(nm, 1, false)
-		require.NoError(t, err)
-		require.False(t, more, "pass 2 should converge")
-		require.Len(t, engine.peerStore.PeersPubKey(), 1)
-	})
 }

 func TestEngine_UpdateNetworkMapWithRoutes(t *testing.T) {
@@ -675,7 +634,7 @@ func TestEngine_UpdateNetworkMapWithRoutes(t *testing.T) {
 				}
 			}()

-			_, err = engine.updateNetworkMap(testCase.networkMap, maxPeersPerSyncPass, true)
+			err = engine.updateNetworkMap(testCase.networkMap)
 			assert.NoError(t, err, "shouldn't return error")
 			assert.Equal(t, testCase.expectedSerial, input.inputSerial, "serial should match")
 			assert.Len(t, input.clientRoutes, testCase.expectedLen, "clientRoutes len should match")
@@ -879,7 +838,7 @@ func TestEngine_UpdateNetworkMapWithDNSUpdate(t *testing.T) {
 				}
 			}()

-			_, err = engine.updateNetworkMap(testCase.networkMap, maxPeersPerSyncPass, true)
+			err = engine.updateNetworkMap(testCase.networkMap)
 			assert.NoError(t, err, "shouldn't return error")
 			assert.Equal(t, testCase.expectedSerial, input.inputSerial, "serial should match")
 			assert.Len(t, input.inputNSGroups, testCase.expectedZonesLen, "zones len should match")
--- a/client/internal/mapsync.go
+++ b/client/internal/mapsync.go
@@ -1,214 +0,0 @@
-package internal
-
-import (
-	"context"
-	"sync"
-	"time"
-
-	log "github.com/sirupsen/logrus"
-
-	mgmProto "github.com/netbirdio/netbird/shared/management/proto"
-)
-
-// mapStateManager is the single read/write point between the management stream
-// (writes) and the convergence loop (reads/applies).
-//
-// The stream calls SetTarget with the latest full SyncResponse — the complete
-// desired state. A single background goroutine (run) applies it to the engine in
-// bounded passes via apply() until converged, releasing syncMsgMux between passes
-// so other subsystems interleave. If a newer update arrives mid-flight, the loop
-// coalesces: it keeps converging toward the latest target and the intermediate one
-// is SKIPPED — never applied on its own (logged, no onConverged).
-//
-// Convergence is a single comparison: appliedGen == targetGen. targetGen
-// increments on every SetTarget (an internal generation counter, so it also covers
-// config-only updates that carry no network-map serial).
-//
-// onConverged fires once for each — and only each — map that is actually processed
-// (i.e. converged as the target). Skipped/superseded maps and dropped-on-error maps
-// do NOT fire it. So "sync finished in X" / RecordSyncDuration always corresponds
-// to a real, completed alignment.
-type mapStateManager struct {
-	// apply performs one bounded apply pass and reports whether more passes are needed.
-	// firstPass is true on the first pass of a given target, so the caller can run
-	// wholesale (firewall/routes/DNS/forward-rules) once per target and skip it on the
-	// re-runs that only drain the bounded peer batches. The manager owns this signal
-	// because it owns the convergence boundary; the engine need not track serials for it.
-	apply func(update *mgmProto.SyncResponse, firstPass bool) (bool, error)
-	// onConverged is called once per processed map, with the elapsed time since that
-	// map was received (for the sync-duration metric / "sync finished" log).
-	onConverged func(time.Duration)
-	// persist snapshots an update to disk for restore-on-restart. Called once per
-	// update received from management (in SetTarget), including ones later coalesced
-	// or skipped from apply, so the on-disk state mirrors what management last sent.
-	// The impl skips config-only updates (nil NetworkMap). May be nil.
-	persist func(*mgmProto.SyncResponse)
-
-	mu          sync.Mutex
-	target      *mgmProto.SyncResponse
-	targetGen   uint64
-	appliedGen  uint64
-	targetSetAt time.Time
-
-	wake chan struct{}
-}
-
-func newMapStateManager(apply func(update *mgmProto.SyncResponse, firstPass bool) (bool, error), persist func(*mgmProto.SyncResponse), onConverged func(time.Duration)) *mapStateManager {
-	return &mapStateManager{
-		apply:       apply,
-		persist:     persist,
-		onConverged: onConverged,
-		wake:        make(chan struct{}, 1),
-	}
-}
-
-// SetTarget records the latest update as the desired state and wakes the loop.
-// It returns immediately; convergence happens in the background. Serial-based
-// staleness of the network map is still enforced inside apply (updateNetworkMap).
-func (m *mapStateManager) SetTarget(update *mgmProto.SyncResponse) error {
-	m.mu.Lock()
-	// A target that has not settled yet (targetGen > appliedGen) is being superseded
-	// before it converged: we coalesce to the latest map and never apply this one on
-	// its own. It is SKIPPED — logged here, and it will not fire onConverged.
-	if m.target != nil && m.targetGen > m.appliedGen {
-		log.Debugf("sync map (gen %d) superseded before convergence, skipping", m.targetGen)
-	}
-	m.target = m.mergeTarget(m.target, update)
-	// Bump an internal generation counter, NOT the map serial: config-only updates
-	// (relay token rotation, STUN/TURN) arrive with NetworkMap == nil and carry no
-	// serial, yet must still be applied. Every SetTarget is therefore a distinct
-	// target regardless of payload. Map-serial staleness is enforced separately
-	// inside apply (updateNetworkMap).
-	m.targetGen++
-	m.targetSetAt = time.Now()
-	m.mu.Unlock()
-
-	select {
-	case m.wake <- struct{}{}:
-	default:
-	}
-
-	// Persist every update received from management — once per update (not per apply
-	// pass), and including ones that get coalesced/skipped from apply, so the on-disk
-	// state always reflects the latest map management sent. Done after waking the loop
-	// so convergence can start in parallel with the disk write. The persist impl skips
-	// config-only updates (nil NetworkMap).
-	if m.persist != nil {
-		m.persist(update)
-	}
-	return nil
-}
-
-// mergeTarget combines the currently pending target with a freshly received update
-// and returns the new desired state. It is called under m.mu from SetTarget and is
-// the single seam where the replace-vs-squash decision lives.
-//
-// Today management always sends a FULL map (the complete desired state), so the
-// update simply replaces whatever was pending — prev is ignored. When management
-// starts sending incremental/delta updates, squash `update` onto `prev` here; the
-// rest of the manager (generation tracking, convergence, signaling) is unaffected
-// because it already treats target as "the complete desired state, whatever it is".
-func (m *mapStateManager) mergeTarget(prev, update *mgmProto.SyncResponse) *mgmProto.SyncResponse {
-	// Nothing pending to preserve (no prev, or prev already fully applied): plain replace.
-	if prev == nil || update == nil || m.targetGen == m.appliedGen {
-		return update
-	}
-
-	// prev still has unapplied state (targetGen > appliedGen). In the sync protocol a
-	// nil component means "no change", so if `update` omits a component that prev
-	// carried, carry prev's forward — otherwise coalescing an update that superseded a
-	// not-yet-applied one would silently drop the map or config it uniquely brought.
-	// A present component in `update` is newer and wins. Management may send map-only
-	// updates (nil config) and config-only updates (nil map); both are handled here.
-	// A nil component in `update` means "no change", so fill it in from prev — otherwise
-	// coalescing an update that superseded a not-yet-applied one would drop the map or
-	// config it uniquely carried. A present component in `update` is newer and wins.
-	// We mutate `update` in place: it is a fresh per-message allocation from the sync
-	// stream (see receiveUpdatesEvents — not reused), and persisting this squashed target
-	// is correct, since it is the current full (superset) desired state.
-	if update.GetNetworkMap() == nil && prev.GetNetworkMap() != nil {
-		update.NetworkMap = prev.GetNetworkMap()
-		update.Checks = prev.Checks // checks travel with the map
-	}
-	if update.GetNetbirdConfig() == nil && prev.GetNetbirdConfig() != nil {
-		update.NetbirdConfig = prev.GetNetbirdConfig()
-	}
-	return update
-}
-
-// run drives convergence until ctx is done. It is meant to run in its own goroutine.
-func (m *mapStateManager) run(ctx context.Context) {
-	// passGen is the generation of the most recent apply() call (0 = none). A pass is
-	// the first for its target when its generation differs from the previous one —
-	// true on a fresh target and on a coalesced switch to a newer target mid-flight.
-	var passGen uint64
-	for {
-		m.mu.Lock()
-		target, tg, ag := m.target, m.targetGen, m.appliedGen
-		m.mu.Unlock()
-
-		// Fully converged (or nothing yet): block until a new target arrives.
-		if target == nil || ag == tg {
-			select {
-			case <-ctx.Done():
-				return
-			case <-m.wake:
-				continue
-			}
-		}
-
-		firstPass := tg != passGen
-		passGen = tg
-		more, err := m.apply(target, firstPass)
-		if err != nil {
-			if ctx.Err() != nil {
-				return
-			}
-			// Log and DROP this target — do not retry it. A deterministic failure
-			// (e.g. a malformed peer in the map) would otherwise spin every pass
-			// making no progress. Management is the source of truth and re-delivers
-			// the full map on the next sync, so dropping is safe; peers already
-			// applied this convergence stay (idempotent diffs) and the remainder is
-			// reconciled by the next target. Mirrors the legacy handleSync path,
-			// where the apply error was logged by the gRPC client and the update
-			// dropped. No onConverged: this target did not converge.
-			log.Errorf("apply sync pass, dropping update: %v", err)
-			m.settle(tg, false)
-			continue
-		}
-
-		if more {
-			// keep converging the current target; syncMsgMux was released by apply
-			// between passes so other subsystems interleave.
-			continue
-		}
-
-		// This pass converged. Mark applied and signal this one map.
-		m.settle(tg, true)
-		// if a newer target arrived mid-pass, settle is a no-op (targetGen != tg) and
-		// ag<tg next iteration -> apply it; this generation was skipped (logged in
-		// SetTarget) and is not signaled.
-	}
-}
-
-// settle marks generation tg as processed so the loop goes idle instead of
-// re-applying the same target. It is a no-op when a newer target arrived during the
-// pass (targetGen != tg), leaving appliedGen behind so that target re-applies — the
-// just-finished generation was already counted as skipped.
-//
-// When signal is true (the pass converged) it fires onConverged once for this map;
-// when false (the target was dropped on error) it does not — the map did not converge.
-func (m *mapStateManager) settle(tg uint64, signal bool) {
-	m.mu.Lock()
-	if m.targetGen != tg {
-		m.mu.Unlock()
-		return
-	}
-	m.appliedGen = tg
-	setAt := m.targetSetAt
-	m.mu.Unlock()
-
-	if signal && m.onConverged != nil {
-		m.onConverged(time.Since(setAt))
-	}
-}
--- a/client/internal/mapsync_test.go
+++ b/client/internal/mapsync_test.go
@@ -1,281 +0,0 @@
-package internal
-
-import (
-	"context"
-	"errors"
-	"sync/atomic"
-	"testing"
-	"time"
-
-	"github.com/stretchr/testify/require"
-
-	mgmProto "github.com/netbirdio/netbird/shared/management/proto"
-)
-
-// mergeTarget fills components missing from the incoming update with the pending
-// (not-yet-applied) prev's, in place, so a coalesced/superseded update does not drop
-// the map or config it uniquely carried.
-func TestMapStateManager_MergeTargetPreservesPendingState(t *testing.T) {
-	m := newMapStateManager(nil, nil, nil)
-
-	// config-only update while a full map is still converging (targetGen > appliedGen):
-	// the pending map (+ checks) is filled into the update in place
-	m.targetGen, m.appliedGen = 5, 4
-	prev := &mgmProto.SyncResponse{NetworkMap: &mgmProto.NetworkMap{Serial: 5}}
-	update := &mgmProto.SyncResponse{NetbirdConfig: &mgmProto.NetbirdConfig{}}
-	merged := m.mergeTarget(prev, update)
-	require.Same(t, update, merged, "merges in place, returns the update")
-	require.EqualValues(t, 5, merged.GetNetworkMap().GetSerial(), "pending map preserved")
-	require.NotNil(t, merged.GetNetbirdConfig(), "new config kept")
-
-	// symmetric: map-only update while a config-only update is pending -> keep the config
-	m.targetGen, m.appliedGen = 5, 4
-	prev = &mgmProto.SyncResponse{NetbirdConfig: &mgmProto.NetbirdConfig{}}
-	update = &mgmProto.SyncResponse{NetworkMap: &mgmProto.NetworkMap{Serial: 7}}
-	merged = m.mergeTarget(prev, update)
-	require.EqualValues(t, 7, merged.GetNetworkMap().GetSerial(), "new map kept")
-	require.NotNil(t, merged.GetNetbirdConfig(), "pending config preserved")
-
-	// prev already applied (targetGen == appliedGen): plain replace, no fill-in
-	m.targetGen, m.appliedGen = 5, 5
-	prev = &mgmProto.SyncResponse{NetworkMap: &mgmProto.NetworkMap{Serial: 5}}
-	update = &mgmProto.SyncResponse{NetbirdConfig: &mgmProto.NetbirdConfig{}}
-	merged = m.mergeTarget(prev, update)
-	require.Same(t, update, merged)
-	require.Nil(t, merged.GetNetworkMap(), "no map grafted when prev already applied")
-
-	// nothing to carry (update has a map, prev has no config): plain replace
-	m.targetGen, m.appliedGen = 5, 4
-	prev = &mgmProto.SyncResponse{NetworkMap: &mgmProto.NetworkMap{Serial: 5}}
-	update = &mgmProto.SyncResponse{NetworkMap: &mgmProto.NetworkMap{Serial: 6}}
-	require.Same(t, update, m.mergeTarget(prev, update))
-}
-
-// converges over the bounded passes (apply returns more until the 3rd pass),
-// fires onConverged exactly once, then blocks (no further apply) until a new target.
-func TestMapStateManager_ConvergesThenStops(t *testing.T) {
-	var passes int32
-	var firstPasses int32
-	converged := make(chan struct{}, 1)
-
-	apply := func(_ *mgmProto.SyncResponse, firstPass bool) (bool, error) {
-		n := atomic.AddInt32(&passes, 1)
-		if firstPass {
-			atomic.AddInt32(&firstPasses, 1)
-		}
-		return n < 3, nil // more on pass 1 and 2, converge on pass 3
-	}
-	m := newMapStateManager(apply, nil, func(time.Duration) { converged <- struct{}{} })
-
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-	go m.run(ctx)
-
-	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
-
-	select {
-	case <-converged:
-	case <-time.After(2 * time.Second):
-		t.Fatal("manager did not converge")
-	}
-	require.EqualValues(t, 3, atomic.LoadInt32(&passes))
-	require.EqualValues(t, 1, atomic.LoadInt32(&firstPasses), "firstPass true only on pass 1, false on re-runs of the same target")
-
-	// once converged the loop blocks: no further apply calls
-	time.Sleep(100 * time.Millisecond)
-	require.EqualValues(t, 3, atomic.LoadInt32(&passes), "apply must not run after convergence")
-}
-
-// persist runs once per received update (not per apply pass), regardless of how many
-// bounded passes that target takes to converge.
-func TestMapStateManager_PersistsOncePerUpdate(t *testing.T) {
-	var passes, persists int32
-	converged := make(chan struct{}, 1)
-	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
-		n := atomic.AddInt32(&passes, 1)
-		return n < 3, nil // 3 passes for one target
-	}
-	persist := func(*mgmProto.SyncResponse) { atomic.AddInt32(&persists, 1) }
-	m := newMapStateManager(apply, persist, func(time.Duration) { converged <- struct{}{} })
-
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-	go m.run(ctx)
-
-	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
-	select {
-	case <-converged:
-	case <-time.After(2 * time.Second):
-		t.Fatal("did not converge")
-	}
-	require.EqualValues(t, 3, atomic.LoadInt32(&passes))
-	require.EqualValues(t, 1, atomic.LoadInt32(&persists), "persist once per update, not per pass")
-}
-
-// every update received from management is persisted — even one that is coalesced /
-// skipped from apply before it ever converges.
-func TestMapStateManager_PersistsEveryUpdateIncludingSkipped(t *testing.T) {
-	release := make(chan struct{})
-	var persists int32
-	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
-		<-release // hold the first apply so the second update coalesces/skips
-		return false, nil
-	}
-	persist := func(*mgmProto.SyncResponse) { atomic.AddInt32(&persists, 1) }
-	m := newMapStateManager(apply, persist, nil)
-
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-	go m.run(ctx)
-
-	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{})) // map1 -> apply blocks
-	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{})) // map2 supersedes map1 (skipped from apply)
-	close(release)
-
-	// both updates persisted even though map1 is skipped from apply
-	require.Eventually(t, func() bool { return atomic.LoadInt32(&persists) == 2 }, 2*time.Second, 10*time.Millisecond)
-}
-
-// each map that is actually processed (converged before the next arrives) fires
-// onConverged exactly once — mirroring the legacy per-message handleSync timing.
-func TestMapStateManager_SignalsEachProcessedMap(t *testing.T) {
-	converged := make(chan struct{}, 8)
-	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
-		return false, nil // converge in one pass
-	}
-	m := newMapStateManager(apply, nil, func(time.Duration) { converged <- struct{}{} })
-
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-	go m.run(ctx)
-
-	const maps = 3
-	for i := 0; i < maps; i++ {
-		require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
-		select { // wait for this map to converge before sending the next (no coalescing)
-		case <-converged:
-		case <-time.After(2 * time.Second):
-			t.Fatalf("map %d not signaled", i)
-		}
-	}
-
-	// no extra signals once the stream goes quiet
-	select {
-	case <-converged:
-		t.Fatal("unexpected extra onConverged")
-	case <-time.After(100 * time.Millisecond):
-	}
-}
-
-// a map superseded before it converges is skipped: only the latest (processed) map
-// fires onConverged, not the skipped one.
-func TestMapStateManager_SkippedMapNotSignaled(t *testing.T) {
-	release := make(chan struct{})
-	var applies, converged atomic.Int32
-	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
-		applies.Add(1)
-		<-release // hold the first apply in-flight so we can queue a newer target
-		return false, nil
-	}
-	m := newMapStateManager(apply, nil, func(time.Duration) { converged.Add(1) })
-
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-	go m.run(ctx)
-
-	// map1 is picked up; its apply blocks on release
-	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
-	require.Eventually(t, func() bool { return applies.Load() >= 1 }, 2*time.Second, 5*time.Millisecond)
-
-	// map2 supersedes map1 before it settled -> map1 is skipped
-	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
-	close(release) // let both applies proceed
-
-	// only the processed (latest) map signals; the skipped one does not
-	require.Eventually(t, func() bool { return converged.Load() == 1 }, 2*time.Second, 10*time.Millisecond)
-	time.Sleep(150 * time.Millisecond)
-	require.EqualValues(t, 1, converged.Load(), "skipped map must not fire onConverged")
-	require.EqualValues(t, 2, applies.Load(), "both targets entered apply (map1 once, map2 once)")
-}
-
-// an apply error drops the target: no retry of the same target, no onConverged,
-// the loop goes idle — and a fresh target is still applied afterwards.
-func TestMapStateManager_DropsTargetOnError(t *testing.T) {
-	applied := make(chan struct{}, 8)
-	var failNext atomic.Bool
-	failNext.Store(true)
-
-	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
-		applied <- struct{}{}
-		if failNext.Load() {
-			return false, errors.New("boom")
-		}
-		return false, nil // converge in one pass
-	}
-	var converged atomic.Int32
-	m := newMapStateManager(apply, nil, func(time.Duration) { converged.Add(1) })
-
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-	go m.run(ctx)
-
-	// first target errors -> applied once, then dropped (no retry, no onConverged)
-	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
-	select {
-	case <-applied:
-	case <-time.After(2 * time.Second):
-		t.Fatal("errored target not applied")
-	}
-	select {
-	case <-applied:
-		t.Fatal("errored target must not be retried")
-	case <-time.After(150 * time.Millisecond):
-	}
-	require.EqualValues(t, 0, converged.Load(), "onConverged must not fire on error")
-
-	// a new target is still processed normally and converges
-	failNext.Store(false)
-	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
-	select {
-	case <-applied:
-	case <-time.After(2 * time.Second):
-		t.Fatal("new target after error not applied")
-	}
-	require.Eventually(t, func() bool { return converged.Load() == 1 }, 2*time.Second, 10*time.Millisecond)
-}
-
-// a new target after convergence triggers a fresh apply; an idle (converged)
-// manager does not apply on its own.
-func TestMapStateManager_ReappliesOnNewTarget(t *testing.T) {
-	applied := make(chan struct{}, 8)
-	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
-		applied <- struct{}{}
-		return false, nil // converge in one pass
-	}
-	m := newMapStateManager(apply, nil, nil)
-
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-	go m.run(ctx)
-
-	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
-	select {
-	case <-applied:
-	case <-time.After(2 * time.Second):
-		t.Fatal("first target not applied")
-	}
-
-	// converged → must stay idle (no spurious apply)
-	select {
-	case <-applied:
-		t.Fatal("unexpected apply while idle/converged")
-	case <-time.After(150 * time.Millisecond):
-	}
-
-	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
-	select {
-	case <-applied:
-	case <-time.After(2 * time.Second):
-		t.Fatal("new target not applied")
-	}
-}
--- a/client/internal/peer/conn.go
+++ b/client/internal/peer/conn.go
@@ -195,7 +195,6 @@ func NewConn(config ConnConfig, services ServiceDependencies) (*Conn, error) {
 		statusICE:          worker.NewAtomicStatus(),
 		dumpState:          dumpState,
 		endpointUpdater:    NewEndpointUpdater(connLog, config.WgConfig, isController(config)),
-		wgWatcher:          NewWGWatcher(connLog, config.WgConfig.WgInterface, config.Key, dumpState),
 		metricsRecorder:    services.MetricsRecorder,
 	}

@@ -663,11 +662,12 @@ func (conn *Conn) onGuardEvent() {
 	}
 }

-func (conn *Conn) onWGDisconnected() {
+func (conn *Conn) onWGDisconnected(watcherCtx context.Context) {
 	conn.mu.Lock()
 	defer conn.mu.Unlock()

-	if conn.ctx.Err() != nil {
+	// watcherCtx guards against a stale watcher tearing down a connection that already superseded it.
+	if conn.ctx.Err() != nil || watcherCtx.Err() != nil {
 		return
 	}

@@ -802,25 +802,39 @@ func (conn *Conn) isConnectedOnAllWay() (status guard.ConnStatus) {
 	})
 }

+// enableWgWatcherIfNeeded starts a fresh watcher instance per connection attempt, so its
+// lifecycle stays bound to conn.mu and enable/disable can't race an old goroutine's shutdown.
+// Caller must hold conn.mu.
 func (conn *Conn) enableWgWatcherIfNeeded(enabledTime time.Time) {
-	if !conn.wgWatcher.PrepareInitialHandshake() {
+	if conn.wgWatcher != nil {
 		return
 	}

+	watcher := NewWGWatcher(conn.Log, conn.config.WgConfig.WgInterface, conn.config.Key, conn.dumpState)
+	watcher.PrepareInitialHandshake()
+
 	wgWatcherCtx, wgWatcherCancel := context.WithCancel(conn.ctx)
+	conn.wgWatcher = watcher
 	conn.wgWatcherCancel = wgWatcherCancel
+
 	conn.wgWatcherWg.Add(1)
 	go func() {
 		defer conn.wgWatcherWg.Done()
-		conn.wgWatcher.EnableWgWatcher(wgWatcherCtx, enabledTime, conn.onWGDisconnected, conn.onWGHandshakeSuccess)
+		onDisconnected := func() { conn.onWGDisconnected(wgWatcherCtx) }
+		watcher.EnableWgWatcher(wgWatcherCtx, enabledTime, onDisconnected, conn.onWGHandshakeSuccess)
 	}()
 }

+// disableWgWatcherIfNeeded cancels and drops the watcher once no transport is active. It never
+// waits for the goroutine: the timeout path reentrantly calls back here under conn.mu, so
+// blocking would deadlock. Caller must hold conn.mu.
 func (conn *Conn) disableWgWatcherIfNeeded() {
-	if conn.currentConnPriority == conntype.None && conn.wgWatcherCancel != nil {
-		conn.wgWatcherCancel()
-		conn.wgWatcherCancel = nil
+	if conn.currentConnPriority != conntype.None || conn.wgWatcher == nil {
+		return
 	}
+	conn.wgWatcherCancel()
+	conn.wgWatcher = nil
+	conn.wgWatcherCancel = nil
 }

 func (conn *Conn) newProxy(remoteConn net.Conn) (wgproxy.Proxy, error) {
@@ -843,7 +857,9 @@ func (conn *Conn) resetEndpoint() {
 		return
 	}
 	conn.Log.Infof("reset wg endpoint")
-	conn.wgWatcher.Reset()
+	if conn.wgWatcher != nil {
+		conn.wgWatcher.Reset()
+	}
 	if err := conn.endpointUpdater.RemoveEndpointAddress(); err != nil {
 		conn.Log.Warnf("failed to remove endpoint address before update: %v", err)
 	}
--- a/client/internal/peer/wg_watcher.go
+++ b/client/internal/peer/wg_watcher.go
@@ -3,7 +3,6 @@ package peer
 import (
 	"context"
 	"fmt"
-	"sync"
 	"time"

 	log "github.com/sirupsen/logrus"
@@ -24,14 +23,14 @@ type WGInterfaceStater interface {
 	GetStats() (map[string]configurer.WGStats, error)
 }

+// WGWatcher is single-shot: one instance per connection attempt, run once, then discarded.
+// Lifecycle is owned by Conn under conn.mu, so it keeps no "enabled" state to go stale.
 type WGWatcher struct {
 	log           *log.Entry
 	wgIfaceStater WGInterfaceStater
 	peerKey       string
 	stateDump     *stateDump

-	enabled   bool
-	muEnabled sync.Mutex
 	// initialHandshake is not thread-safe; never call PrepareInitialHandshake and EnableWgWatcher concurrently.
 	initialHandshake time.Time

@@ -48,25 +47,14 @@ func NewWGWatcher(log *log.Entry, wgIfaceStater WGInterfaceStater, peerKey strin
 	}
 }

-// PrepareInitialHandshake reserves the watcher and reads the peer's current WireGuard
-// handshake time. It must be called before the peer is (re)configured on the WireGuard
-// interface, so the captured baseline reflects the state prior to this connection attempt
-// instead of racing with that configuration. Returns ok=false if the watcher is already
-// running, in which case EnableWgWatcher must not be called.
-func (w *WGWatcher) PrepareInitialHandshake() (ok bool) {
-	w.muEnabled.Lock()
-	if w.enabled {
-		w.muEnabled.Unlock()
-		return false
-	}
-
+// PrepareInitialHandshake reads the peer's current WireGuard handshake time. It must be
+// called before the peer is (re)configured on the WireGuard interface, so the captured
+// baseline reflects the state prior to this connection attempt instead of racing with
+// that configuration.
+func (w *WGWatcher) PrepareInitialHandshake() {
 	w.log.Debugf("enable WireGuard watcher")
-	w.enabled = true
-	w.muEnabled.Unlock()
-
 	handshake, _ := w.wgState()
 	w.initialHandshake = handshake
-	return true
 }

 // EnableWgWatcher runs the WireGuard watcher loop using the handshake baseline captured by
@@ -74,10 +62,6 @@ func (w *WGWatcher) PrepareInitialHandshake() (ok bool) {
 // for context lifecycle management.
 func (w *WGWatcher) EnableWgWatcher(ctx context.Context, enabledTime time.Time, onDisconnectedFn func(), onHandshakeSuccessFn func(when time.Time)) {
 	w.periodicHandshakeCheck(ctx, onDisconnectedFn, onHandshakeSuccessFn, enabledTime, w.initialHandshake)
-
-	w.muEnabled.Lock()
-	w.enabled = false
-	w.muEnabled.Unlock()
 }

 // Reset signals the watcher that the WireGuard peer has been reset and a new
@@ -103,6 +87,7 @@ func (w *WGWatcher) periodicHandshakeCheck(ctx context.Context, onDisconnectedFn
 		case <-timer.C:
 			handshake, ok := w.handshakeCheck(lastHandshake)
 			if !ok {
+				// stop silently if the watcher was cancelled while the handshake check was running
 				if ctx.Err() != nil {
 					return
 				}
@@ -147,9 +132,9 @@ func (w *WGWatcher) handshakeCheck(lastHandshake time.Time) (*time.Time, bool) {

 	w.log.Tracef("previous handshake, handshake: %v, %v", lastHandshake, handshake)

-	// the current know handshake did not change
+	// the current known handshake did not change
 	if handshake.Equal(lastHandshake) {
-		w.log.Warnf("WireGuard handshake timed out: %v", handshake)
+		w.log.Warnf("WireGuard handshake not updated: %v", handshake)
 		return nil, false
 	}

--- a/client/internal/peer/wg_watcher_test.go
+++ b/client/internal/peer/wg_watcher_test.go
@@ -7,7 +7,6 @@ import (
 	"time"

 	log "github.com/sirupsen/logrus"
-	"github.com/stretchr/testify/require"

 	"github.com/netbirdio/netbird/client/iface/configurer"
 )
@@ -35,8 +34,7 @@ func TestWGWatcher_EnableWgWatcher(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()

-	ok := watcher.PrepareInitialHandshake()
-	require.True(t, ok, "watcher should not be enabled yet")
+	watcher.PrepareInitialHandshake()

 	onDisconnected := make(chan struct{}, 1)
 	go watcher.EnableWgWatcher(ctx, time.Now(), func() {
@@ -66,8 +64,7 @@ func TestWGWatcher_ReEnable(t *testing.T) {
 	watcher := NewWGWatcher(mlog, mocWgIface, "", newStateDump("peer", mlog, &Status{}))

 	ctx, cancel := context.WithCancel(context.Background())
-	ok := watcher.PrepareInitialHandshake()
-	require.True(t, ok, "watcher should not be enabled yet")
+	watcher.PrepareInitialHandshake()

 	wg := &sync.WaitGroup{}
 	wg.Add(1)
@@ -83,8 +80,7 @@ func TestWGWatcher_ReEnable(t *testing.T) {
 	ctx, cancel = context.WithCancel(context.Background())
 	defer cancel()

-	ok = watcher.PrepareInitialHandshake()
-	require.True(t, ok, "watcher should be re-enabled after the previous run stopped")
+	watcher.PrepareInitialHandshake()

 	onDisconnected := make(chan struct{}, 1)
 	go watcher.EnableWgWatcher(ctx, time.Now(), func() {
--- a/client/internal/routemanager/exit_node_selection_test.go
+++ b/client/internal/routemanager/exit_node_selection_test.go
@@ -0,0 +1,191 @@
+package routemanager
+
+import (
+	"net/netip"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/netbirdio/netbird/client/internal/routeselector"
+	"github.com/netbirdio/netbird/route"
+)
+
+func newExitNodeTestManager() *DefaultManager {
+	return &DefaultManager{routeSelector: routeselector.NewRouteSelector()}
+}
+
+func exitRoute(netID, peer string, skipAutoApply bool) *route.Route {
+	return &route.Route{
+		NetID:         route.NetID(netID),
+		Network:       netip.MustParsePrefix("0.0.0.0/0"),
+		Peer:          peer,
+		SkipAutoApply: skipAutoApply,
+	}
+}
+
+func TestPickPreferredExitNode(t *testing.T) {
+	tests := []struct {
+		name string
+		info exitNodeInfo
+		want route.NetID
+	}{
+		{
+			name: "persisted user selection wins over management",
+			info: exitNodeInfo{
+				allIDs:               []route.NetID{"a", "b", "c"},
+				userSelected:         []route.NetID{"b"},
+				selectedByManagement: []route.NetID{"a"},
+			},
+			want: "b",
+		},
+		{
+			name: "multiple user-selected self-heal to deterministic min",
+			info: exitNodeInfo{
+				allIDs:       []route.NetID{"a", "b", "c"},
+				userSelected: []route.NetID{"c", "a"},
+			},
+			want: "a",
+		},
+		{
+			name: "explicit opt-out keeps none",
+			info: exitNodeInfo{
+				allIDs:         []route.NetID{"a", "b"},
+				userDeselected: []route.NetID{"a", "b"},
+			},
+			want: "",
+		},
+		{
+			name: "fresh defaults to management auto-apply pick",
+			info: exitNodeInfo{
+				allIDs:               []route.NetID{"a", "b", "c"},
+				selectedByManagement: []route.NetID{"b"},
+			},
+			want: "b",
+		},
+		{
+			name: "no user pick and no management auto-apply selects none",
+			info: exitNodeInfo{
+				allIDs: []route.NetID{"c", "a", "b"},
+			},
+			want: "",
+		},
+		{
+			name: "user-deselect does not block a management auto-apply sibling",
+			info: exitNodeInfo{
+				allIDs:               []route.NetID{"a", "b"},
+				userDeselected:       []route.NetID{"a"},
+				selectedByManagement: []route.NetID{"b"},
+			},
+			want: "b",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			assert.Equal(t, tt.want, pickPreferredExitNode(tt.info), "preferred exit node")
+		})
+	}
+}
+
+func TestEnforceSingleExitNode(t *testing.T) {
+	m := newExitNodeTestManager()
+	all := []route.NetID{"a", "b", "c"}
+
+	m.enforceSingleExitNode("b", all)
+	assert.False(t, m.routeSelector.IsSelected("a"), "a should be deselected")
+	assert.True(t, m.routeSelector.IsSelected("b"), "b should be the only selected exit node")
+	assert.False(t, m.routeSelector.IsSelected("c"), "c should be deselected")
+
+	// Switching the preferred node moves the single selection.
+	m.enforceSingleExitNode("c", all)
+	assert.False(t, m.routeSelector.IsSelected("a"), "a stays deselected")
+	assert.False(t, m.routeSelector.IsSelected("b"), "b should now be deselected")
+	assert.True(t, m.routeSelector.IsSelected("c"), "c should now be selected")
+
+	// Empty preferred turns every exit node off.
+	m.enforceSingleExitNode("", all)
+	for _, id := range all {
+		assert.False(t, m.routeSelector.IsSelected(id), "no exit node should be selected")
+	}
+}
+
+func TestEnforceSingleExitNode_RespectsDeselectAll(t *testing.T) {
+	m := newExitNodeTestManager()
+	m.routeSelector.DeselectAllRoutes()
+
+	m.enforceSingleExitNode("b", []route.NetID{"a", "b"})
+
+	assert.True(t, m.routeSelector.IsDeselectAll(), "global deselect-all must stay in effect")
+	assert.False(t, m.routeSelector.IsSelected("b"), "no exit node should be forced on while deselect-all is set")
+}
+
+func TestUpdateRouteSelectorFromManagement_FreshSelectsOne(t *testing.T) {
+	m := newExitNodeTestManager()
+	routes := route.HAMap{
+		"exitA|0.0.0.0/0":    {exitRoute("exitA", "p1", false)},
+		"exitB|0.0.0.0/0":    {exitRoute("exitB", "p2", false)},
+		"lan|192.168.1.0/24": {{NetID: "lan", Network: netip.MustParsePrefix("192.168.1.0/24"), Peer: "p3"}},
+		"exitC|0.0.0.0/0":    {exitRoute("exitC", "p4", false)},
+	}
+
+	m.updateRouteSelectorFromManagement(routes)
+
+	// Exactly one exit node (the lexicographically smallest NetID) is selected.
+	assert.True(t, m.routeSelector.IsSelected("exitA"), "exitA is the deterministic default")
+	assert.False(t, m.routeSelector.IsSelected("exitB"), "exitB must not also be selected")
+	assert.False(t, m.routeSelector.IsSelected("exitC"), "exitC must not also be selected")
+	// Non-exit routes are left at their default-on state.
+	assert.True(t, m.routeSelector.IsSelected("lan"), "non-exit route selection is untouched")
+}
+
+func TestUpdateRouteSelectorFromManagement_HonorsPersistedPick(t *testing.T) {
+	m := newExitNodeTestManager()
+	routes := route.HAMap{
+		"exitA|0.0.0.0/0": {exitRoute("exitA", "p1", false)},
+		"exitB|0.0.0.0/0": {exitRoute("exitB", "p2", false)},
+	}
+	all := []route.NetID{"exitA", "exitB"}
+
+	// Simulate the state the runtime select path leaves behind: exactly one
+	// exit node explicitly selected, its sibling deselected.
+	require.NoError(t, m.routeSelector.SelectRoutes([]route.NetID{"exitB"}, true, all))
+	require.NoError(t, m.routeSelector.DeselectRoutes([]route.NetID{"exitA"}, all))
+
+	m.updateRouteSelectorFromManagement(routes)
+
+	assert.True(t, m.routeSelector.IsSelected("exitB"), "persisted pick must stay selected")
+	assert.False(t, m.routeSelector.IsSelected("exitA"), "the other exit node stays deselected")
+}
+
+func TestUpdateRouteSelectorFromManagement_OptOutKeepsNone(t *testing.T) {
+	m := newExitNodeTestManager()
+	routes := route.HAMap{
+		"exitA|0.0.0.0/0": {exitRoute("exitA", "p1", false)},
+		"exitB|0.0.0.0/0": {exitRoute("exitB", "p2", false)},
+	}
+	all := []route.NetID{"exitA", "exitB"}
+
+	// User explicitly deselected every exit node and selected none.
+	require.NoError(t, m.routeSelector.DeselectRoutes(all, all))
+
+	m.updateRouteSelectorFromManagement(routes)
+
+	assert.False(t, m.routeSelector.IsSelected("exitA"), "opt-out keeps exitA off")
+	assert.False(t, m.routeSelector.IsSelected("exitB"), "opt-out keeps exitB off")
+}
+
+func TestUpdateRouteSelectorFromManagement_NoAutoApplySelectsNone(t *testing.T) {
+	m := newExitNodeTestManager()
+	// SkipAutoApply=true: management offers the exit nodes but doesn't request
+	// auto-activation, so none should be selected until the user picks one.
+	routes := route.HAMap{
+		"exitA|0.0.0.0/0": {exitRoute("exitA", "p1", true)},
+		"exitB|0.0.0.0/0": {exitRoute("exitB", "p2", true)},
+	}
+
+	m.updateRouteSelectorFromManagement(routes)
+
+	assert.False(t, m.routeSelector.IsSelected("exitA"), "no auto-apply keeps exitA off")
+	assert.False(t, m.routeSelector.IsSelected("exitB"), "no auto-apply keeps exitB off")
+}
--- a/client/internal/routemanager/manager.go
+++ b/client/internal/routemanager/manager.go
@@ -701,7 +701,13 @@ func resolveURLsToIPs(urls []string) []net.IP {
 	return ips
 }

-// updateRouteSelectorFromManagement updates the route selector based on the isSelected status from the management server
+// updateRouteSelectorFromManagement reconciles exit-node selection on every
+// network map: it keeps at most one exit node selected — the user's persisted
+// pick, else whatever management marks for auto-apply (SkipAutoApply=false),
+// else none. We never auto-activate an exit node the map doesn't request; it
+// stays off until the user picks it. Exit nodes are mutually exclusive, but the
+// RouteSelector stores routes with default-on semantics, so without this every
+// available exit node would report selected at once.
 func (m *DefaultManager) updateRouteSelectorFromManagement(clientRoutes route.HAMap) {
 	m.mirrorV6ExitPairSelections(clientRoutes)

@@ -712,13 +718,14 @@ func (m *DefaultManager) updateRouteSelectorFromManagement(clientRoutes route.HA
 		return
 	}

-	exitNodeInfo := m.collectExitNodeInfo(clientRoutes)
-	if len(exitNodeInfo.allIDs) == 0 {
+	info := m.collectExitNodeInfo(clientRoutes)
+	if len(info.allIDs) == 0 {
 		return
 	}

-	m.updateExitNodeSelections(exitNodeInfo)
-	m.logExitNodeUpdate(exitNodeInfo)
+	preferred := pickPreferredExitNode(info)
+	m.enforceSingleExitNode(preferred, info.allIDs)
+	m.logExitNodeUpdate(info, preferred)
 }

 // mirrorV6ExitPairSelections keeps every synthesized "-v6" exit route's selection
@@ -746,6 +753,10 @@ type exitNodeInfo struct {
 	userDeselected       []route.NetID
 }

+// collectExitNodeInfo categorises the available exit nodes by their persisted
+// selection state. It keys on the base (v4) NetID and skips the synthesized
+// "-v6" partner, which inherits its base's selection through the RouteSelector
+// — counting it separately would double-count the pair.
 func (m *DefaultManager) collectExitNodeInfo(clientRoutes route.HAMap) exitNodeInfo {
 	var info exitNodeInfo

@@ -755,6 +766,9 @@ func (m *DefaultManager) collectExitNodeInfo(clientRoutes route.HAMap) exitNodeI
 		}

 		netID := haID.NetID()
+		if strings.HasSuffix(string(netID), route.V6ExitSuffix) {
+			continue
+		}
 		info.allIDs = append(info.allIDs, netID)

 		if m.routeSelector.HasUserSelectionForRoute(netID) {
@@ -791,45 +805,52 @@ func (m *DefaultManager) checkManagementSelection(routes []*route.Route, netID r
 	}
 }

-func (m *DefaultManager) updateExitNodeSelections(info exitNodeInfo) {
-	routesToDeselect := m.getRoutesToDeselect(info.allIDs)
-	m.deselectExitNodes(routesToDeselect)
-	m.selectExitNodesByManagement(info.selectedByManagement, info.allIDs)
+// pickPreferredExitNode chooses the single exit node to keep selected. In order:
+//   - a persisted user selection wins (deterministic if several survive from
+//     legacy state, so the set self-heals down to one);
+//   - otherwise activate only what management marks for auto-apply
+//     (SkipAutoApply=false); the lexicographically first if it marks several.
+//
+// Returns "" when neither holds — we never force an arbitrary exit node on. A
+// route the map doesn't auto-apply stays off until the user selects it.
+// info.userDeselected is informational only: an explicit deselect simply keeps
+// that route out of both lists above, so it can't be picked.
+func pickPreferredExitNode(info exitNodeInfo) route.NetID {
+	if len(info.userSelected) > 0 {
+		return minNetID(info.userSelected)
+	}
+	if len(info.selectedByManagement) > 0 {
+		return minNetID(info.selectedByManagement)
+	}
+	return ""
 }

-func (m *DefaultManager) getRoutesToDeselect(allIDs []route.NetID) []route.NetID {
-	var routesToDeselect []route.NetID
-	for _, netID := range allIDs {
-		if !m.routeSelector.HasUserSelectionForRoute(netID) {
-			routesToDeselect = append(routesToDeselect, netID)
+// enforceSingleExitNode makes preferred the only selected exit node: every other
+// available exit node is deselected and preferred (if any) is selected, without
+// disturbing non-exit route selections. The whole reconciliation runs under a
+// single RouteSelector lock (SetExclusiveExitNode) so a concurrent deselect-all
+// cannot interleave and get undone; a global deselect-all is left untouched so
+// the user's "all off" stays in effect.
+func (m *DefaultManager) enforceSingleExitNode(preferred route.NetID, allIDs []route.NetID) {
+	m.routeSelector.SetExclusiveExitNode(preferred, allIDs)
+}
+
+func (m *DefaultManager) logExitNodeUpdate(info exitNodeInfo, preferred route.NetID) {
+	log.Debugf("Exit node selection: %d available, preferred=%q (%d user-selected, %d user-deselected, %d management-selected)",
+		len(info.allIDs), preferred, len(info.userSelected), len(info.userDeselected), len(info.selectedByManagement))
+}
+
+// minNetID returns the lexicographically smallest NetID, for a deterministic
+// default pick that stays stable across restarts.
+func minNetID(ids []route.NetID) route.NetID {
+	if len(ids) == 0 {
+		return ""
+	}
+	best := ids[0]
+	for _, id := range ids[1:] {
+		if id < best {
+			best = id
 		}
 	}
-	return routesToDeselect
-}
-
-func (m *DefaultManager) deselectExitNodes(routesToDeselect []route.NetID) {
-	if len(routesToDeselect) == 0 {
-		return
-	}
-
-	err := m.routeSelector.DeselectRoutes(routesToDeselect, routesToDeselect)
-	if err != nil {
-		log.Warnf("Failed to deselect exit nodes: %v", err)
-	}
-}
-
-func (m *DefaultManager) selectExitNodesByManagement(selectedByManagement []route.NetID, allIDs []route.NetID) {
-	if len(selectedByManagement) == 0 {
-		return
-	}
-
-	err := m.routeSelector.SelectRoutes(selectedByManagement, true, allIDs)
-	if err != nil {
-		log.Warnf("Failed to select exit nodes: %v", err)
-	}
-}
-
-func (m *DefaultManager) logExitNodeUpdate(info exitNodeInfo) {
-	log.Debugf("Updated route selector: %d exit nodes available, %d selected by management, %d user-selected, %d user-deselected",
-		len(info.allIDs), len(info.selectedByManagement), len(info.userSelected), len(info.userDeselected))
+	return best
 }
--- a/client/internal/routeselector/routeselector.go
+++ b/client/internal/routeselector/routeselector.go
@@ -115,7 +115,38 @@ func (rs *RouteSelector) DeselectAllRoutes() {
 	clear(rs.selectedRoutes)
 }

-// IsDeselectAll reports whether the user has explicitly deselected all routes.
+// SetExclusiveExitNode atomically makes preferred the only selected exit node
+// among exitIDs: every other ID in exitIDs is deselected and preferred (when
+// non-empty) is selected, all under a single lock. Holding the lock across the
+// whole reconciliation prevents a concurrent DeselectAllRoutes from interleaving
+// between the deselect and select steps and being silently undone. A global
+// deselect-all is left untouched so the user's "all off" stays in effect;
+// non-exit routes are never referenced, so their selection is preserved.
+func (rs *RouteSelector) SetExclusiveExitNode(preferred route.NetID, exitIDs []route.NetID) {
+	rs.mu.Lock()
+	defer rs.mu.Unlock()
+
+	if rs.deselectAll {
+		return
+	}
+
+	for _, id := range exitIDs {
+		if id == preferred {
+			continue
+		}
+		rs.deselectedRoutes[id] = struct{}{}
+		delete(rs.selectedRoutes, id)
+	}
+
+	if preferred != "" {
+		delete(rs.deselectedRoutes, preferred)
+		rs.selectedRoutes[preferred] = struct{}{}
+	}
+}
+
+// IsDeselectAll reports whether the global "deselect all" flag is set, i.e. the
+// user explicitly disabled every route. Callers enforcing per-route invariants
+// (e.g. single exit node) should leave the selection untouched when it is.
 func (rs *RouteSelector) IsDeselectAll() bool {
 	rs.mu.RLock()
 	defer rs.mu.RUnlock()
Author	SHA1	Message	Date
riccardom	5740dd22e6	Remove verbose comments	2026-07-03 17:26:20 +02:00
riccardom	ec98c930cb	[Recheck watcher ctx cancellation under conn.mu in onWGDisconnected onWGDisconnected only checked conn.ctx (the engine-scoped context), never the watcher's own context. disableWgWatcherIfNeeded cancels the wgWatcherCtx, not conn.ctx, so a disabled watcher's timeout callback did not see the cancellation. handshakeCheck runs lock-free, so between the ctx check in periodicHandshakeCheck and acquiring conn.mu a fast disconnect/reconnect can slip in: the stale watcher then acquires the lock and tears down the new, healthy connection based on the old timeout, forcing the guard into an unnecessary reconnect (flap). Recheck watcherCtx.Err() under conn.mu so a superseded watcher exits without touching the connection that replaced it.	2026-07-03 12:15:24 +02:00
riccardom	60104e000b	Discriminate not updated from timeout handshakes	2026-07-03 12:02:50 +02:00
riccardom	d5a212349f	Stick new watcher creation to actual existence of the conn and its removal to the removal of such same conn. Avoid debouncing and cross-lock deadlocking	2026-07-03 11:37:41 +02:00
Zoltan Papp	f6900fb07c	[client] backport enforce a single selected exit node (#6640 ) * routemanager: enforce a single selected exit node Backport of the exit-node exclusivity reconcile from the 0.75.0 line (upstream commit `966fbec11`) onto v0.74.0. Exit nodes are mutually exclusive, but the RouteSelector stores routes with default-on semantics, so every available exit node reported as selected at once. Reconcile exit-node selection on each network map: keep at most one selected -- the user's persisted pick, else whatever management marks for auto-apply (SkipAutoApply=false), else none. Never auto-activate an exit node the map does not request. Carries over only the manager/routeselector logic and its test; the desktop-only client/server changes and the BumpNetworksRevision UI-push feature from the original commit are intentionally excluded. * routeselector: make exit-node reconciliation atomic enforceSingleExitNode took the RouteSelector lock three separate times (IsDeselectAll, then DeselectRoutes, then SelectRoutes), so a concurrent DeselectAllRoutes could interleave and be silently undone: SelectRoutes on its deselectAll branch clears the flag and re-selects the preferred exit node, overriding the user's "all off". Move the whole reconciliation into a single locked RouteSelector method (SetExclusiveExitNode) that checks deselectAll inside the critical section, so a deselect-all either fully precedes the reconcile (left untouched) or fully follows it (honoured). No interleaving is possible.	2026-07-03 10:31:06 +02:00