Compare commits

...

2 Commits

Author SHA1 Message Date
Theodor Midtlien
3aa6c02b93 [client] Fix backoff.Ticker goroutine leak in reconnect guard 2026-07-03 12:23:11 +02:00
Zoltan Papp
f6900fb07c [client] backport enforce a single selected exit node (#6640)
* routemanager: enforce a single selected exit node

Backport of the exit-node exclusivity reconcile from the 0.75.0 line
(upstream commit 966fbec11) onto v0.74.0. Exit nodes are mutually
exclusive, but the RouteSelector stores routes with default-on semantics,
so every available exit node reported as selected at once.

Reconcile exit-node selection on each network map: keep at most one
selected -- the user's persisted pick, else whatever management marks for
auto-apply (SkipAutoApply=false), else none. Never auto-activate an exit
node the map does not request.

Carries over only the manager/routeselector logic and its test; the
desktop-only client/server changes and the BumpNetworksRevision UI-push
feature from the original commit are intentionally excluded.

* routeselector: make exit-node reconciliation atomic

enforceSingleExitNode took the RouteSelector lock three separate times
(IsDeselectAll, then DeselectRoutes, then SelectRoutes), so a concurrent
DeselectAllRoutes could interleave and be silently undone: SelectRoutes on
its deselectAll branch clears the flag and re-selects the preferred exit
node, overriding the user's "all off".

Move the whole reconciliation into a single locked RouteSelector method
(SetExclusiveExitNode) that checks deselectAll inside the critical section,
so a deselect-all either fully precedes the reconcile (left untouched) or
fully follows it (honoured). No interleaving is possible.
2026-07-03 10:31:06 +02:00
5 changed files with 383 additions and 44 deletions

View File

@@ -85,7 +85,11 @@ func (g *Guard) reconnectLoopWithRetry(ctx context.Context, callback func()) {
defer g.srWatcher.RemoveListener(srReconnectedChan)
ticker := g.initialTicker(ctx)
defer ticker.Stop()
defer func() {
// If backoff.Ticker.send is blocked, context.Done will not close the Ticker goroutine.
// We have to explicitly call Stop, even if we use backoff.WithContext.
ticker.Stop()
}()
tickerChannel := ticker.C

View File

@@ -0,0 +1,92 @@
package guard
import (
"context"
"runtime"
"strings"
"sync"
"testing"
"time"
log "github.com/sirupsen/logrus"
"github.com/netbirdio/netbird/client/internal/peer/ice"
)
func newTestGuard(status connStatusFunc) *Guard {
srw := NewSRWatcher(nil, nil, nil, ice.Config{})
return NewGuard(log.WithField("test", "guard"), status, 50*time.Millisecond, srw)
}
// countBackoffTickerGoroutines returns how many goroutines are currently sitting
// in backoff/v4.(*Ticker).run (a ticker goroutine that has not exited).
func countBackoffTickerGoroutines() int {
buf := make([]byte, 1<<25) // 32MB
n := runtime.Stack(buf, true)
return strings.Count(string(buf[:n]), "backoff/v4.(*Ticker).run")
}
// TestGuard_ReconnectTicker_NoGoroutineLeakOnShutdown reproduces a observed
// leak: after a shutdown burst, ticker run/send goroutines stay parked
// forever even though every reconnect loop has exited.
func TestGuard_ReconnectTicker_NoGoroutineLeakOnShutdown(t *testing.T) {
before := countBackoffTickerGoroutines()
const peers = 6000
cancels := make([]context.CancelFunc, 0, peers)
var wg sync.WaitGroup
// A status check slower than the tick cadence. This models the real
// isConnectedOnAllWay/callback doing work: while the loop is busy in the
// handler, the ticker fires the next tick and parks in send(), because
// send() never selects on ctx.
slowStatus := func() ConnStatus {
time.Sleep(70 * time.Millisecond)
return ConnStatusConnected
}
for range peers {
g := newTestGuard(slowStatus)
ctx, cancel := context.WithCancel(context.Background())
cancels = append(cancels, cancel)
wg.Add(1)
go func() {
defer wg.Done()
g.Start(ctx, func() {})
}()
// Force the live ticker to be a newReconnectTicker.
g.SetRelayedConnDisconnected()
}
// Let the replacement tickers get past their 800ms initial interval, so
// many are parked in send() waiting on the (slow) consumer when we tear
// everything down.
time.Sleep(1500 * time.Millisecond)
// Shutdown burst: cancel every peer at once, like engine teardown.
for _, c := range cancels {
c()
}
// Every reconnect loop must return
waitCh := make(chan struct{})
go func() { wg.Wait(); close(waitCh) }()
select {
case <-waitCh:
case <-time.After(30 * time.Second):
t.Fatal("not all reconnect loops returned after ctx cancel")
}
// Give any correctly-stopped ticker goroutines time to unwind.
for range 50 {
runtime.Gosched()
time.Sleep(10 * time.Millisecond)
}
leaked := countBackoffTickerGoroutines() - before
t.Logf("backoff Ticker.run goroutines still parked after teardown of %d peers: %d", peers, leaked)
if leaked > 0 {
t.Errorf("LEAK: %d backoff ticker goroutines parked after all reconnect loops exited "+
"(defer ticker.Stop() stops the initial ticker, not the live replacement)", leaked)
}
}

View File

@@ -0,0 +1,191 @@
package routemanager
import (
"net/netip"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/netbirdio/netbird/client/internal/routeselector"
"github.com/netbirdio/netbird/route"
)
func newExitNodeTestManager() *DefaultManager {
return &DefaultManager{routeSelector: routeselector.NewRouteSelector()}
}
func exitRoute(netID, peer string, skipAutoApply bool) *route.Route {
return &route.Route{
NetID: route.NetID(netID),
Network: netip.MustParsePrefix("0.0.0.0/0"),
Peer: peer,
SkipAutoApply: skipAutoApply,
}
}
func TestPickPreferredExitNode(t *testing.T) {
tests := []struct {
name string
info exitNodeInfo
want route.NetID
}{
{
name: "persisted user selection wins over management",
info: exitNodeInfo{
allIDs: []route.NetID{"a", "b", "c"},
userSelected: []route.NetID{"b"},
selectedByManagement: []route.NetID{"a"},
},
want: "b",
},
{
name: "multiple user-selected self-heal to deterministic min",
info: exitNodeInfo{
allIDs: []route.NetID{"a", "b", "c"},
userSelected: []route.NetID{"c", "a"},
},
want: "a",
},
{
name: "explicit opt-out keeps none",
info: exitNodeInfo{
allIDs: []route.NetID{"a", "b"},
userDeselected: []route.NetID{"a", "b"},
},
want: "",
},
{
name: "fresh defaults to management auto-apply pick",
info: exitNodeInfo{
allIDs: []route.NetID{"a", "b", "c"},
selectedByManagement: []route.NetID{"b"},
},
want: "b",
},
{
name: "no user pick and no management auto-apply selects none",
info: exitNodeInfo{
allIDs: []route.NetID{"c", "a", "b"},
},
want: "",
},
{
name: "user-deselect does not block a management auto-apply sibling",
info: exitNodeInfo{
allIDs: []route.NetID{"a", "b"},
userDeselected: []route.NetID{"a"},
selectedByManagement: []route.NetID{"b"},
},
want: "b",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.want, pickPreferredExitNode(tt.info), "preferred exit node")
})
}
}
func TestEnforceSingleExitNode(t *testing.T) {
m := newExitNodeTestManager()
all := []route.NetID{"a", "b", "c"}
m.enforceSingleExitNode("b", all)
assert.False(t, m.routeSelector.IsSelected("a"), "a should be deselected")
assert.True(t, m.routeSelector.IsSelected("b"), "b should be the only selected exit node")
assert.False(t, m.routeSelector.IsSelected("c"), "c should be deselected")
// Switching the preferred node moves the single selection.
m.enforceSingleExitNode("c", all)
assert.False(t, m.routeSelector.IsSelected("a"), "a stays deselected")
assert.False(t, m.routeSelector.IsSelected("b"), "b should now be deselected")
assert.True(t, m.routeSelector.IsSelected("c"), "c should now be selected")
// Empty preferred turns every exit node off.
m.enforceSingleExitNode("", all)
for _, id := range all {
assert.False(t, m.routeSelector.IsSelected(id), "no exit node should be selected")
}
}
func TestEnforceSingleExitNode_RespectsDeselectAll(t *testing.T) {
m := newExitNodeTestManager()
m.routeSelector.DeselectAllRoutes()
m.enforceSingleExitNode("b", []route.NetID{"a", "b"})
assert.True(t, m.routeSelector.IsDeselectAll(), "global deselect-all must stay in effect")
assert.False(t, m.routeSelector.IsSelected("b"), "no exit node should be forced on while deselect-all is set")
}
func TestUpdateRouteSelectorFromManagement_FreshSelectsOne(t *testing.T) {
m := newExitNodeTestManager()
routes := route.HAMap{
"exitA|0.0.0.0/0": {exitRoute("exitA", "p1", false)},
"exitB|0.0.0.0/0": {exitRoute("exitB", "p2", false)},
"lan|192.168.1.0/24": {{NetID: "lan", Network: netip.MustParsePrefix("192.168.1.0/24"), Peer: "p3"}},
"exitC|0.0.0.0/0": {exitRoute("exitC", "p4", false)},
}
m.updateRouteSelectorFromManagement(routes)
// Exactly one exit node (the deterministic first) is selected.
assert.True(t, m.routeSelector.IsSelected("exitA"), "exitA is the deterministic default")
assert.False(t, m.routeSelector.IsSelected("exitB"), "exitB must not also be selected")
assert.False(t, m.routeSelector.IsSelected("exitC"), "exitC must not also be selected")
// Non-exit routes are left at their default-on state.
assert.True(t, m.routeSelector.IsSelected("lan"), "non-exit route selection is untouched")
}
func TestUpdateRouteSelectorFromManagement_HonorsPersistedPick(t *testing.T) {
m := newExitNodeTestManager()
routes := route.HAMap{
"exitA|0.0.0.0/0": {exitRoute("exitA", "p1", false)},
"exitB|0.0.0.0/0": {exitRoute("exitB", "p2", false)},
}
all := []route.NetID{"exitA", "exitB"}
// Simulate the state the runtime select path leaves behind: exactly one
// exit node explicitly selected, its sibling deselected.
require.NoError(t, m.routeSelector.SelectRoutes([]route.NetID{"exitB"}, true, all))
require.NoError(t, m.routeSelector.DeselectRoutes([]route.NetID{"exitA"}, all))
m.updateRouteSelectorFromManagement(routes)
assert.True(t, m.routeSelector.IsSelected("exitB"), "persisted pick must stay selected")
assert.False(t, m.routeSelector.IsSelected("exitA"), "the other exit node stays deselected")
}
func TestUpdateRouteSelectorFromManagement_OptOutKeepsNone(t *testing.T) {
m := newExitNodeTestManager()
routes := route.HAMap{
"exitA|0.0.0.0/0": {exitRoute("exitA", "p1", false)},
"exitB|0.0.0.0/0": {exitRoute("exitB", "p2", false)},
}
all := []route.NetID{"exitA", "exitB"}
// User deselected exit nodes and selected none.
require.NoError(t, m.routeSelector.DeselectRoutes(all, all))
m.updateRouteSelectorFromManagement(routes)
assert.False(t, m.routeSelector.IsSelected("exitA"), "opt-out keeps exitA off")
assert.False(t, m.routeSelector.IsSelected("exitB"), "opt-out keeps exitB off")
}
func TestUpdateRouteSelectorFromManagement_NoAutoApplySelectsNone(t *testing.T) {
m := newExitNodeTestManager()
// SkipAutoApply=true: management offers the exit nodes but doesn't request
// auto-activation, so none should be selected until the user picks one.
routes := route.HAMap{
"exitA|0.0.0.0/0": {exitRoute("exitA", "p1", true)},
"exitB|0.0.0.0/0": {exitRoute("exitB", "p2", true)},
}
m.updateRouteSelectorFromManagement(routes)
assert.False(t, m.routeSelector.IsSelected("exitA"), "no auto-apply keeps exitA off")
assert.False(t, m.routeSelector.IsSelected("exitB"), "no auto-apply keeps exitB off")
}

View File

@@ -701,7 +701,13 @@ func resolveURLsToIPs(urls []string) []net.IP {
return ips
}
// updateRouteSelectorFromManagement updates the route selector based on the isSelected status from the management server
// updateRouteSelectorFromManagement reconciles exit-node selection on every
// network map: it keeps at most one exit node selected — the user's persisted
// pick, else whatever management marks for auto-apply (SkipAutoApply=false),
// else none. We never auto-activate an exit node the map doesn't request; it
// stays off until the user picks it. Exit nodes are mutually exclusive, but the
// RouteSelector stores routes with default-on semantics, so without this every
// available exit node would report selected at once.
func (m *DefaultManager) updateRouteSelectorFromManagement(clientRoutes route.HAMap) {
m.mirrorV6ExitPairSelections(clientRoutes)
@@ -712,13 +718,14 @@ func (m *DefaultManager) updateRouteSelectorFromManagement(clientRoutes route.HA
return
}
exitNodeInfo := m.collectExitNodeInfo(clientRoutes)
if len(exitNodeInfo.allIDs) == 0 {
info := m.collectExitNodeInfo(clientRoutes)
if len(info.allIDs) == 0 {
return
}
m.updateExitNodeSelections(exitNodeInfo)
m.logExitNodeUpdate(exitNodeInfo)
preferred := pickPreferredExitNode(info)
m.enforceSingleExitNode(preferred, info.allIDs)
m.logExitNodeUpdate(info, preferred)
}
// mirrorV6ExitPairSelections keeps every synthesized "-v6" exit route's selection
@@ -746,6 +753,10 @@ type exitNodeInfo struct {
userDeselected []route.NetID
}
// collectExitNodeInfo categorises the available exit nodes by their persisted
// selection state. It keys on the base (v4) NetID and skips the synthesized
// "-v6" partner, which inherits its base's selection through the RouteSelector
// — counting it separately would double-count the pair.
func (m *DefaultManager) collectExitNodeInfo(clientRoutes route.HAMap) exitNodeInfo {
var info exitNodeInfo
@@ -755,6 +766,9 @@ func (m *DefaultManager) collectExitNodeInfo(clientRoutes route.HAMap) exitNodeI
}
netID := haID.NetID()
if strings.HasSuffix(string(netID), route.V6ExitSuffix) {
continue
}
info.allIDs = append(info.allIDs, netID)
if m.routeSelector.HasUserSelectionForRoute(netID) {
@@ -791,45 +805,52 @@ func (m *DefaultManager) checkManagementSelection(routes []*route.Route, netID r
}
}
func (m *DefaultManager) updateExitNodeSelections(info exitNodeInfo) {
routesToDeselect := m.getRoutesToDeselect(info.allIDs)
m.deselectExitNodes(routesToDeselect)
m.selectExitNodesByManagement(info.selectedByManagement, info.allIDs)
// pickPreferredExitNode chooses the single exit node to keep selected. In order:
// - a persisted user selection wins (deterministic if several survive from
// legacy state, so the set self-heals down to one);
// - otherwise activate only what management marks for auto-apply
// (SkipAutoApply=false); the lexicographically first if it marks several.
//
// Returns "" when neither holds — we never force an arbitrary exit node on. A
// route the map doesn't auto-apply stays off until the user selects it.
// info.userDeselected is informational only: an explicit deselect simply keeps
// that route out of both lists above, so it can't be picked.
func pickPreferredExitNode(info exitNodeInfo) route.NetID {
if len(info.userSelected) > 0 {
return minNetID(info.userSelected)
}
if len(info.selectedByManagement) > 0 {
return minNetID(info.selectedByManagement)
}
return ""
}
func (m *DefaultManager) getRoutesToDeselect(allIDs []route.NetID) []route.NetID {
var routesToDeselect []route.NetID
for _, netID := range allIDs {
if !m.routeSelector.HasUserSelectionForRoute(netID) {
routesToDeselect = append(routesToDeselect, netID)
// enforceSingleExitNode makes preferred the only selected exit node: every other
// available exit node is deselected and preferred (if any) is selected, without
// disturbing non-exit route selections. The whole reconciliation runs under a
// single RouteSelector lock (SetExclusiveExitNode) so a concurrent deselect-all
// cannot interleave and get undone; a global deselect-all is left untouched so
// the user's "all off" stays in effect.
func (m *DefaultManager) enforceSingleExitNode(preferred route.NetID, allIDs []route.NetID) {
m.routeSelector.SetExclusiveExitNode(preferred, allIDs)
}
func (m *DefaultManager) logExitNodeUpdate(info exitNodeInfo, preferred route.NetID) {
log.Debugf("Exit node selection: %d available, preferred=%q (%d user-selected, %d user-deselected, %d management-selected)",
len(info.allIDs), preferred, len(info.userSelected), len(info.userDeselected), len(info.selectedByManagement))
}
// minNetID returns the lexicographically smallest NetID, for a deterministic
// default pick that stays stable across restarts.
func minNetID(ids []route.NetID) route.NetID {
if len(ids) == 0 {
return ""
}
best := ids[0]
for _, id := range ids[1:] {
if id < best {
best = id
}
}
return routesToDeselect
}
func (m *DefaultManager) deselectExitNodes(routesToDeselect []route.NetID) {
if len(routesToDeselect) == 0 {
return
}
err := m.routeSelector.DeselectRoutes(routesToDeselect, routesToDeselect)
if err != nil {
log.Warnf("Failed to deselect exit nodes: %v", err)
}
}
func (m *DefaultManager) selectExitNodesByManagement(selectedByManagement []route.NetID, allIDs []route.NetID) {
if len(selectedByManagement) == 0 {
return
}
err := m.routeSelector.SelectRoutes(selectedByManagement, true, allIDs)
if err != nil {
log.Warnf("Failed to select exit nodes: %v", err)
}
}
func (m *DefaultManager) logExitNodeUpdate(info exitNodeInfo) {
log.Debugf("Updated route selector: %d exit nodes available, %d selected by management, %d user-selected, %d user-deselected",
len(info.allIDs), len(info.selectedByManagement), len(info.userSelected), len(info.userDeselected))
return best
}

View File

@@ -115,7 +115,38 @@ func (rs *RouteSelector) DeselectAllRoutes() {
clear(rs.selectedRoutes)
}
// IsDeselectAll reports whether the user has explicitly deselected all routes.
// SetExclusiveExitNode atomically makes preferred the only selected exit node
// among exitIDs: every other ID in exitIDs is deselected and preferred (when
// non-empty) is selected, all under a single lock. Holding the lock across the
// whole reconciliation prevents a concurrent DeselectAllRoutes from interleaving
// between the deselect and select steps and being silently undone. A global
// deselect-all is left untouched so the user's "all off" stays in effect;
// non-exit routes are never referenced, so their selection is preserved.
func (rs *RouteSelector) SetExclusiveExitNode(preferred route.NetID, exitIDs []route.NetID) {
rs.mu.Lock()
defer rs.mu.Unlock()
if rs.deselectAll {
return
}
for _, id := range exitIDs {
if id == preferred {
continue
}
rs.deselectedRoutes[id] = struct{}{}
delete(rs.selectedRoutes, id)
}
if preferred != "" {
delete(rs.deselectedRoutes, preferred)
rs.selectedRoutes[preferred] = struct{}{}
}
}
// IsDeselectAll reports whether the global "deselect all" flag is set, i.e. the
// user explicitly disabled every route. Callers enforcing per-route invariants
// (e.g. single exit node) should leave the selection untouched when it is.
func (rs *RouteSelector) IsDeselectAll() bool {
rs.mu.RLock()
defer rs.mu.RUnlock()