Emit IPv6 default permit firewall rule for exit node routes

2026-06-08 17:09:57 +00:00 · 2026-06-08 15:27:33 +02:00
9 changed files with 108 additions and 609 deletions
--- a/management/internals/shared/grpc/proxy.go
+++ b/management/internals/shared/grpc/proxy.go
@@ -666,10 +666,8 @@ func (s *ProxyServiceServer) sender(conn *proxyConnection, errChan chan<- error)
 		case resp := <-conn.sendChan:
 			if err := conn.sendResponse(resp); err != nil {
 				errChan <- err
-				log.WithContext(conn.ctx).Tracef("Failed to send response to proxy %s: %v", conn.proxyID, err)
 				return
 			}
-			log.WithContext(conn.ctx).Tracef("Send response to proxy %s", conn.proxyID)
 		case <-conn.ctx.Done():
 			return
 		}
--- a/management/server/types/networkmap_components.go
+++ b/management/server/types/networkmap_components.go
@@ -557,7 +557,6 @@ func (c *NetworkMapComponents) getRoutingPeerRoutes(peerID string) (enabledRoute
 	return enabledRoutes, disabledRoutes
 }

-
 func (c *NetworkMapComponents) filterRoutesByGroups(routes []*route.Route, groupListMap LookupMap) []*route.Route {
 	var filteredRoutes []*route.Route
 	for _, r := range routes {
@@ -628,9 +627,14 @@ func (c *NetworkMapComponents) getDefaultPermit(r *route.Route, includeIPv6 bool

 	rules := []*RouteFirewallRule{&rule}

-	if includeIPv6 && r.IsDynamic() {
+	isDefaultV4 := r.Network.Addr().Is4() && r.Network.Bits() == 0
+	if includeIPv6 && (r.IsDynamic() || isDefaultV4) {
 		ruleV6 := rule
 		ruleV6.SourceRanges = []string{"::/0"}
+		if isDefaultV4 {
+			ruleV6.Destination = "::/0"
+			ruleV6.RouteID = r.ID + "-v6-default"
+		}
 		rules = append(rules, &ruleV6)
 	}

--- a/management/server/types/networkmap_components_correctness_test.go
+++ b/management/server/types/networkmap_components_correctness_test.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"net"
 	"net/netip"
+	"slices"
 	"testing"
 	"time"

@@ -1029,6 +1030,48 @@ func TestComponents_RouteDefaultPermit(t *testing.T) {
 	assert.True(t, hasDefaultPermit, "route without ACG should have default permit rule with 0.0.0.0/0 source")
 }

+// TestComponents_ExitNodeDefaultPermitIPv6 verifies that a default exit node route
+// (0.0.0.0/0) without AccessControlGroups also emits an IPv6 default permit rule
+// (::/0 source and destination) for peers that support IPv6, mirroring the route
+// the client installs. Without it, IPv6 traffic is routed to the exit node but
+// dropped at the forward chain.
+func TestComponents_ExitNodeDefaultPermitIPv6(t *testing.T) {
+	account, validatedPeers := scalableTestAccount(20, 2)
+
+	routingPeerID := "peer-5"
+	routingPeer := account.Peers[routingPeerID]
+	routingPeer.IPv6 = netip.MustParseAddr("fd00::5")
+	routingPeer.Meta.Capabilities = append(routingPeer.Meta.Capabilities, nbpeer.PeerCapabilityIPv6Overlay)
+
+	account.Routes["route-exit"] = &route.Route{
+		ID: "route-exit", Network: netip.MustParsePrefix("0.0.0.0/0"),
+		PeerID: routingPeerID, Peer: routingPeer.Key,
+		Enabled: true, Groups: []string{"group-all"}, PeerGroups: []string{"group-0"},
+		AccessControlGroups: []string{},
+		AccountID:           "test-account",
+	}
+
+	nm := componentsNetworkMap(account, routingPeerID, validatedPeers)
+	require.NotNil(t, nm)
+
+	hasV4 := false
+	hasV6 := false
+	for _, rfr := range nm.RoutesFirewallRules {
+		switch rfr.Destination {
+		case "0.0.0.0/0":
+			if slices.Contains(rfr.SourceRanges, "0.0.0.0/0") {
+				hasV4 = true
+			}
+		case "::/0":
+			if slices.Contains(rfr.SourceRanges, "::/0") {
+				hasV6 = true
+			}
+		}
+	}
+	assert.True(t, hasV4, "exit node route should have an IPv4 default permit rule (0.0.0.0/0)")
+	assert.True(t, hasV6, "exit node route should have an IPv6 default permit rule (::/0)")
+}
+
 // ──────────────────────────────────────────────────────────────────────────────
 // 15. MULTIPLE ROUTERS PER NETWORK
 // ──────────────────────────────────────────────────────────────────────────────
--- a/proxy/cmd/proxy/cmd/root.go
+++ b/proxy/cmd/proxy/cmd/root.go
@@ -249,7 +249,6 @@ func runServer(cmd *cobra.Command, args []string) error {
 		Private:                  private,
 		MaxDialTimeout:           maxDialTimeout,
 		MaxSessionIdleTimeout:    maxSessionIdleTimeout,
-		MappingBatchWatchdog:     envDurationOrDefault("NB_PROXY_MAPPING_BATCH_WATCHDOG", 0),
 		GeoDataDir:               geoDataDir,
 		CrowdSecAPIURL:           crowdsecAPIURL,
 		CrowdSecAPIKey:           crowdsecAPIKey,
--- a/proxy/internal/roundtrip/netbird.go
+++ b/proxy/internal/roundtrip/netbird.go
@@ -28,10 +28,6 @@ import (

 const deviceNamePrefix = "ingress-proxy-"

-const clientStopTimeout = 30 * time.Second
-
-const createProxyPeerTimeout = 30 * time.Second
-
 // backendKey identifies a backend by its host:port from the target URL.
 type backendKey string

@@ -166,7 +162,6 @@ type NetBird struct {

 	clientsMux     sync.RWMutex
 	clients        map[types.AccountID]*clientEntry
-	lifecycleMu    sync.Map
 	initLogOnce    sync.Once
 	statusNotifier statusNotifier
 	// readyHandler runs after the embedded client for an account reports
@@ -182,10 +177,6 @@ type NetBird struct {
 	// (i.e. when a new client was actually created, not when an existing one
 	// was reused). The duration covers keygen + gRPC CreateProxyPeer + embed.New.
 	OnAddPeer func(d time.Duration, err error)
-
-	// startClient runs the post-create client startup. Nil uses runClientStartup;
-	// tests override it to avoid a real embed client.Start.
-	startClient func(accountID types.AccountID, client *embed.Client)
 }

 // ClientDebugInfo contains debug information about a client.
@@ -209,20 +200,31 @@ type skipTLSVerifyContextKey struct{}
 func (n *NetBird) AddPeer(ctx context.Context, accountID types.AccountID, key ServiceKey, authToken string, serviceID types.ServiceID) error {
 	si := serviceInfo{serviceID: serviceID}

-	if n.registerExistingClient(accountID, key, si) {
-		return nil
-	}
+	n.clientsMux.Lock()

-	lifecycle := n.accountLifecycle(accountID)
-	lifecycle.Lock()
-	transferred := false
-	defer func() {
-		if !transferred {
-			lifecycle.Unlock()
+	entry, exists := n.clients[accountID]
+	if exists {
+		entry.services[key] = si
+		started := entry.started
+		n.clientsMux.Unlock()
+
+		n.logger.WithFields(log.Fields{
+			"account_id":  accountID,
+			"service_key": key,
+		}).Debug("registered service with existing client")
+
+		if started && n.statusNotifier != nil {
+			// Use a background context, not the caller's: the management
+			// connection notification must land even if the request /
+			// stream that triggered this registration is cancelled.
+			// Mirrors the async runClientStartup path.
+			if err := n.statusNotifier.NotifyStatus(context.Background(), accountID, serviceID, true); err != nil {
+				n.logger.WithFields(log.Fields{
+					"account_id":  accountID,
+					"service_key": key,
+				}).WithError(err).Warn("failed to notify status for existing client")
+			}
 		}
-	}()
-
-	if n.registerExistingClient(accountID, key, si) {
 		return nil
 	}

@@ -232,10 +234,10 @@ func (n *NetBird) AddPeer(ctx context.Context, accountID types.AccountID, key Se
 		n.OnAddPeer(time.Since(createStart), err)
 	}
 	if err != nil {
+		n.clientsMux.Unlock()
 		return err
 	}

-	n.clientsMux.Lock()
 	n.clients[accountID] = entry
 	n.clientsMux.Unlock()

@@ -244,64 +246,17 @@ func (n *NetBird) AddPeer(ctx context.Context, accountID types.AccountID, key Se
 		"service_key": key,
 	}).Info("created new client for account")

-	transferred = true
-	go func() {
-		defer lifecycle.Unlock()
-		n.startClientStartup(accountID, entry.client)
-	}()
+	// Attempt to start the client in the background; if this fails we will
+	// retry on the first request via RoundTrip. runClientStartup uses its
+	// own background context so the caller's request-scoped ctx can't
+	// cancel the inbound bring-up.
+	go n.runClientStartup(accountID, entry.client)

 	return nil
 }

-func (n *NetBird) startClientStartup(accountID types.AccountID, client *embed.Client) {
-	if n.startClient != nil {
-		n.startClient(accountID, client)
-		return
-	}
-	n.runClientStartup(accountID, client)
-}
-
-// registerExistingClient registers the service against an already-present
-// client for the account and returns true when it did. It notifies management
-// of the new service when the client is already started.
-func (n *NetBird) registerExistingClient(accountID types.AccountID, key ServiceKey, si serviceInfo) bool {
-	n.clientsMux.Lock()
-	entry, exists := n.clients[accountID]
-	if !exists {
-		n.clientsMux.Unlock()
-		return false
-	}
-	entry.services[key] = si
-	started := entry.started
-	n.clientsMux.Unlock()
-
-	n.logger.WithFields(log.Fields{
-		"account_id":  accountID,
-		"service_key": key,
-	}).Debug("registered service with existing client")
-
-	if started && n.statusNotifier != nil {
-		if err := n.statusNotifier.NotifyStatus(context.Background(), accountID, si.serviceID, true); err != nil {
-			n.logger.WithFields(log.Fields{
-				"account_id":  accountID,
-				"service_key": key,
-			}).WithError(err).Warn("failed to notify status for existing client")
-		}
-	}
-	return true
-}
-
-// accountLifecycle returns the per-account lifecycle mutex, serialising client
-// creation against teardown so a slow client.Stop cannot race a new
-// client.Start for the same account, without blocking clientsMux.
-func (n *NetBird) accountLifecycle(accountID types.AccountID) *sync.Mutex {
-	mu, _ := n.lifecycleMu.LoadOrStore(accountID, &sync.Mutex{})
-	return mu.(*sync.Mutex)
-}
-
 // createClientEntry generates a WireGuard keypair, authenticates with management,
-// and creates an embedded NetBird client. Must be called with the account's
-// lifecycle mutex held.
+// and creates an embedded NetBird client. Must be called with clientsMux held.
 func (n *NetBird) createClientEntry(ctx context.Context, accountID types.AccountID, key ServiceKey, authToken string, si serviceInfo) (*clientEntry, error) {
 	serviceID := si.serviceID
 	n.logger.WithFields(log.Fields{
@@ -321,9 +276,7 @@ func (n *NetBird) createClientEntry(ctx context.Context, accountID types.Account
 		"public_key": publicKey.String(),
 	}).Debug("authenticating new proxy peer with management")

-	createCtx, cancel := context.WithTimeout(ctx, createProxyPeerTimeout)
-	defer cancel()
-	resp, err := n.mgmtClient.CreateProxyPeer(createCtx, &proto.CreateProxyPeerRequest{
+	resp, err := n.mgmtClient.CreateProxyPeer(ctx, &proto.CreateProxyPeerRequest{
 		ServiceId:          string(serviceID),
 		AccountId:          string(accountID),
 		Token:              authToken,
@@ -491,15 +444,6 @@ func (n *NetBird) notifyClientReady(accountID types.AccountID, client *embed.Cli
 // RemovePeer unregisters a service from an account. The client is only stopped
 // when no services are using it anymore.
 func (n *NetBird) RemovePeer(ctx context.Context, accountID types.AccountID, key ServiceKey) error {
-	lifecycle := n.accountLifecycle(accountID)
-	lifecycle.Lock()
-	transferred := false
-	defer func() {
-		if !transferred {
-			lifecycle.Unlock()
-		}
-	}()
-
 	n.clientsMux.Lock()

 	entry, exists := n.clients[accountID]
@@ -522,8 +466,17 @@ func (n *NetBird) RemovePeer(ctx context.Context, accountID types.AccountID, key
 	delete(entry.services, key)

 	stopClient := len(entry.services) == 0
+	var client *embed.Client
+	var transport, insecureTransport *http.Transport
+	var inbound any
+	var stopHandler func(types.AccountID, any)
 	if stopClient {
 		n.logger.WithField("account_id", accountID).Info("stopping client, no more services")
+		client = entry.client
+		transport = entry.transport
+		insecureTransport = entry.insecureTransport
+		inbound = entry.inbound
+		stopHandler = n.stopHandler
 		delete(n.clients, accountID)
 	} else {
 		n.logger.WithFields(log.Fields{
@@ -537,40 +490,19 @@ func (n *NetBird) RemovePeer(ctx context.Context, accountID types.AccountID, key
 	n.notifyDisconnect(ctx, accountID, key, si.serviceID)

 	if stopClient {
-		transferred = true
-		go n.stopClientLocked(accountID, lifecycle, entry)
+		if inbound != nil && stopHandler != nil {
+			stopHandler(accountID, inbound)
+		}
+		transport.CloseIdleConnections()
+		insecureTransport.CloseIdleConnections()
+		if err := client.Stop(ctx); err != nil {
+			n.logger.WithField("account_id", accountID).WithError(err).Warn("failed to stop netbird client")
+		}
 	}

 	return nil
 }

-// stopClientLocked releases a client's resources off the caller's goroutine so a
-// slow client.Stop cannot wedge the mapping receive loop (which calls RemovePeer
-// synchronously). It unlocks lifecycle when done so a new client.Start for the
-// same account waits for this teardown.
-func (n *NetBird) stopClientLocked(accountID types.AccountID, lifecycle *sync.Mutex, entry *clientEntry) {
-	defer lifecycle.Unlock()
-
-	if entry.inbound != nil && n.stopHandler != nil {
-		n.stopHandler(accountID, entry.inbound)
-	}
-	if entry.transport != nil {
-		entry.transport.CloseIdleConnections()
-	}
-	if entry.insecureTransport != nil {
-		entry.insecureTransport.CloseIdleConnections()
-	}
-	if entry.client == nil {
-		return
-	}
-
-	ctx, cancel := context.WithTimeout(context.Background(), clientStopTimeout)
-	defer cancel()
-	if err := entry.client.Stop(ctx); err != nil {
-		n.logger.WithField("account_id", accountID).WithError(err).Warn("failed to stop netbird client")
-	}
-}
-
 func (n *NetBird) notifyDisconnect(ctx context.Context, accountID types.AccountID, key ServiceKey, serviceID types.ServiceID) {
 	if n.statusNotifier == nil {
 		return
--- a/proxy/internal/roundtrip/netbird_test.go
+++ b/proxy/internal/roundtrip/netbird_test.go
@@ -6,7 +6,6 @@ import (
 	"net/netip"
 	"sync"
 	"testing"
-	"time"

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -23,18 +22,6 @@ func (m *mockMgmtClient) CreateProxyPeer(_ context.Context, _ *proto.CreateProxy
 	return &proto.CreateProxyPeerResponse{Success: true}, nil
 }

-// signalMgmtClient closes entered the first time CreateProxyPeer is called, so
-// tests can detect AddPeer reaching client creation.
-type signalMgmtClient struct {
-	entered chan struct{}
-	once    sync.Once
-}
-
-func (m *signalMgmtClient) CreateProxyPeer(_ context.Context, _ *proto.CreateProxyPeerRequest, _ ...grpc.CallOption) (*proto.CreateProxyPeerResponse, error) {
-	m.once.Do(func() { close(m.entered) })
-	return &proto.CreateProxyPeerResponse{Success: true}, nil
-}
-
 type mockStatusNotifier struct {
 	mu       sync.Mutex
 	statuses []statusCall
@@ -65,15 +52,11 @@ func (m *mockStatusNotifier) calls() []statusCall {
 // mockNetBird creates a NetBird instance for testing without actually connecting.
 // It uses an invalid management URL to prevent real connections.
 func mockNetBird() *NetBird {
-	nb := NewNetBird(context.Background(), "test-proxy", "invalid.test", ClientConfig{
+	return NewNetBird(context.Background(), "test-proxy", "invalid.test", ClientConfig{
 		MgmtAddr:     "http://invalid.test:9999",
 		WGPort:       0,
 		PreSharedKey: "",
 	}, nil, nil, &mockMgmtClient{})
-	// Skip the real embed client.Start, which would hang against the unreachable
-	// mgmt URL and (now that the lifecycle lock spans startup) serialise removes.
-	nb.startClient = func(types.AccountID, *embed.Client) {}
-	return nb
 }

 func TestNetBird_AddPeer_CreatesClientForNewAccount(t *testing.T) {
@@ -305,7 +288,6 @@ func TestNetBird_AddPeer_ExistingStartedClient_NotifiesStatus(t *testing.T) {
 		WGPort:       0,
 		PreSharedKey: "",
 	}, nil, notifier, &mockMgmtClient{})
-	nb.startClient = func(types.AccountID, *embed.Client) {}
 	accountID := types.AccountID("account-1")

 	// Add first service — creates a new client entry.
@@ -390,117 +372,6 @@ func TestNetBird_RemovePeer_NotifiesDisconnection(t *testing.T) {
 	assert.False(t, calls[0].connected)
 }

-// TestNetBird_RemovePeer_TeardownIsAsync proves the fix for the receive-loop
-// stall: RemovePeer must return promptly even when the client teardown blocks,
-// because teardown runs off the caller's goroutine. The receive loop calls
-// RemovePeer synchronously, so a blocking teardown inline would wedge it.
-func TestNetBird_RemovePeer_TeardownIsAsync(t *testing.T) {
-	nb := NewNetBird(context.Background(), "test-proxy", "invalid.test", ClientConfig{
-		MgmtAddr: "http://invalid.test:9999",
-	}, nil, &mockStatusNotifier{}, &mockMgmtClient{})
-
-	accountID := types.AccountID("acct-async-teardown")
-	key := DomainServiceKey("svc.example")
-
-	teardownEntered := make(chan struct{})
-	releaseTeardown := make(chan struct{})
-	nb.SetClientLifecycle(nil, func(types.AccountID, any) {
-		close(teardownEntered)
-		<-releaseTeardown
-	})
-
-	nb.clientsMux.Lock()
-	nb.clients[accountID] = &clientEntry{
-		services: map[ServiceKey]serviceInfo{key: {serviceID: types.ServiceID("svc-1")}},
-		started:  true,
-		inbound:  struct{}{},
-	}
-	nb.clientsMux.Unlock()
-
-	done := make(chan error, 1)
-	go func() { done <- nb.RemovePeer(context.Background(), accountID, key) }()
-
-	select {
-	case err := <-done:
-		require.NoError(t, err)
-	case <-time.After(2 * time.Second):
-		t.Fatal("RemovePeer did not return while teardown was blocked — teardown is not async")
-	}
-
-	select {
-	case <-teardownEntered:
-	case <-time.After(2 * time.Second):
-		t.Fatal("teardown never ran")
-	}
-
-	close(releaseTeardown)
-}
-
-// TestNetBird_AddPeer_WaitsForTeardown proves the lifecycle lock serialises a
-// new client bringup behind an in-flight teardown for the same account, so a
-// slow client.Stop can never race a new client.Start for that account.
-//
-// It targets the handoff race specifically: AddPeer is launched immediately
-// after RemovePeer returns, WITHOUT waiting for the teardown goroutine to start.
-// This only passes if RemovePeer acquires the lifecycle lock synchronously
-// (before returning) and hands it to the teardown goroutine — if the goroutine
-// acquired the lock itself, AddPeer could win the lock in this window and start
-// a replacement client while the old teardown is still pending.
-func TestNetBird_AddPeer_WaitsForTeardown(t *testing.T) {
-	nb := NewNetBird(context.Background(), "test-proxy", "invalid.test", ClientConfig{
-		MgmtAddr: "http://invalid.test:9999",
-	}, nil, &mockStatusNotifier{}, &mockMgmtClient{})
-	nb.startClient = func(types.AccountID, *embed.Client) {}
-
-	accountID := types.AccountID("acct-serialize")
-	key := DomainServiceKey("svc.example")
-
-	addEntered := make(chan struct{})
-	releaseTeardown := make(chan struct{})
-	nb.SetClientLifecycle(nil, func(types.AccountID, any) {
-		// Block teardown until released. If AddPeer ever reaches createClientEntry
-		// (signalled via the mgmt client below) while we hold the lock, the lock
-		// failed to serialise and the test fails before we release.
-		<-releaseTeardown
-	})
-
-	nb.clientsMux.Lock()
-	nb.clients[accountID] = &clientEntry{
-		services: map[ServiceKey]serviceInfo{key: {serviceID: types.ServiceID("svc-1")}},
-		started:  true,
-		inbound:  struct{}{},
-	}
-	nb.clientsMux.Unlock()
-
-	// createClientEntry calls CreateProxyPeer; closing addEntered there tells us
-	// AddPeer got past the lifecycle lock and into client creation.
-	nb.mgmtClient = &signalMgmtClient{entered: addEntered}
-
-	require.NoError(t, nb.RemovePeer(context.Background(), accountID, key))
-
-	// Launch AddPeer with NO synchronisation against the teardown goroutine.
-	addReturned := make(chan struct{})
-	go func() {
-		_ = nb.AddPeer(context.Background(), accountID, DomainServiceKey("svc2.example"), "key-2", types.ServiceID("svc-2"))
-		close(addReturned)
-	}()
-
-	select {
-	case <-addEntered:
-		t.Fatal("AddPeer entered client creation while teardown held the lifecycle lock — handoff race not closed")
-	case <-addReturned:
-		t.Fatal("AddPeer completed while teardown held the lifecycle lock — not serialised")
-	case <-time.After(300 * time.Millisecond):
-	}
-
-	close(releaseTeardown)
-	select {
-	case <-addReturned:
-	case <-time.After(2 * time.Second):
-		t.Fatal("AddPeer never completed after teardown released the lifecycle lock")
-	}
-}
-
 // TestNotifyClientReady_UsesBackgroundCtx pins the contract that the
 // post-Start hooks (readyHandler + statusNotifier.NotifyStatus) run on
 // a fresh context.Background() rather than inheriting the AddPeer
--- a/proxy/lifecycle.go
+++ b/proxy/lifecycle.go
@@ -114,10 +114,6 @@ type Config struct {
 	MaxDialTimeout time.Duration
 	// MaxSessionIdleTimeout caps the per-service session idle timeout.
 	MaxSessionIdleTimeout time.Duration
-	// MappingBatchWatchdog bounds how long a single mapping batch may spend
-	// being applied before the receive loop reconnects to resync. Zero falls
-	// back to the internal default.
-	MappingBatchWatchdog time.Duration

 	// GeoDataDir is the directory containing GeoLite2 MMDB files.
 	GeoDataDir string
@@ -168,7 +164,6 @@ func New(ctx context.Context, cfg Config) *Server {
 		Private:                  cfg.Private,
 		MaxDialTimeout:           cfg.MaxDialTimeout,
 		MaxSessionIdleTimeout:    cfg.MaxSessionIdleTimeout,
-		MappingBatchWatchdog:     cfg.MappingBatchWatchdog,
 		GeoDataDir:               cfg.GeoDataDir,
 		CrowdSecAPIURL:           cfg.CrowdSecAPIURL,
 		CrowdSecAPIKey:           cfg.CrowdSecAPIKey,
--- a/proxy/mapping_stall_test.go
+++ b/proxy/mapping_stall_test.go
@@ -1,282 +0,0 @@
-package proxy
-
-import (
-	"context"
-	"sync"
-	"sync/atomic"
-	"testing"
-	"time"
-
-	log "github.com/sirupsen/logrus"
-	"github.com/stretchr/testify/assert"
-	"google.golang.org/grpc"
-	"google.golang.org/grpc/metadata"
-
-	"github.com/netbirdio/netbird/proxy/internal/roundtrip"
-	"github.com/netbirdio/netbird/proxy/internal/types"
-	"github.com/netbirdio/netbird/shared/management/proto"
-)
-
-// blockingMgmtClient implements roundtrip's managementClient interface.
-// CreateProxyPeer parks until release is closed, signalling entry on entered.
-// This reproduces the confirmed real-world stall: createClientEntry calls
-// CreateProxyPeer synchronously while holding clientsMux, and the proxy's
-// receive loop calls that path synchronously inside processMappings.
-type blockingMgmtClient struct {
-	entered chan struct{}
-	once    sync.Once
-}
-
-func (b *blockingMgmtClient) CreateProxyPeer(ctx context.Context, _ *proto.CreateProxyPeerRequest, _ ...grpc.CallOption) (*proto.CreateProxyPeerResponse, error) {
-	b.once.Do(func() { close(b.entered) })
-	// Park until the caller's context is cancelled. In production this ctx is
-	// the gRPC mapping-stream context with no per-call timeout, so a slow or
-	// unresponsive CreateProxyPeer parks the receive loop here indefinitely.
-	<-ctx.Done()
-	return nil, ctx.Err()
-}
-
-// gatedMappingStream is a mock GetMappingUpdate client stream that hands out a
-// pre-seeded list of messages, then records how many times Recv advanced. It
-// lets the test observe whether the single-threaded receive loop ever gets
-// past the first (blocking) batch to pull the second message.
-type gatedMappingStream struct {
-	grpc.ClientStream
-	messages []*proto.GetMappingUpdateResponse
-	idx      int32
-}
-
-func (g *gatedMappingStream) Recv() (*proto.GetMappingUpdateResponse, error) {
-	i := int(atomic.LoadInt32(&g.idx))
-	if i >= len(g.messages) {
-		// Block instead of returning EOF so the loop doesn't exit; we only
-		// care whether the loop ever reaches this second Recv at all.
-		select {}
-	}
-	msg := g.messages[i]
-	atomic.AddInt32(&g.idx, 1)
-	return msg, nil
-}
-
-func (g *gatedMappingStream) deliveredCount() int32 { return atomic.LoadInt32(&g.idx) }
-
-func (g *gatedMappingStream) Header() (metadata.MD, error) { return nil, nil } //nolint:nilnil
-func (g *gatedMappingStream) Trailer() metadata.MD         { return nil }
-func (g *gatedMappingStream) CloseSend() error             { return nil }
-func (g *gatedMappingStream) Context() context.Context     { return context.Background() }
-func (g *gatedMappingStream) SendMsg(any) error            { return nil }
-func (g *gatedMappingStream) RecvMsg(any) error            { return nil }
-
-// noopNotifier satisfies roundtrip's statusNotifier interface.
-type noopNotifier struct{}
-
-func (noopNotifier) NotifyStatus(context.Context, types.AccountID, types.ServiceID, bool) error {
-	return nil
-}
-
-// noopProxyClient is a proto.ProxyServiceClient that no-ops the one method the
-// teardown unwind reaches (SendStatusUpdate, via notifyError when the parked
-// AddPeer is cancelled). The embedded nil interface satisfies the rest at
-// compile time; none of those methods are called by this test.
-type noopProxyClient struct {
-	proto.ProxyServiceClient
-}
-
-func (noopProxyClient) SendStatusUpdate(context.Context, *proto.SendStatusUpdateRequest, ...grpc.CallOption) (*proto.SendStatusUpdateResponse, error) {
-	return &proto.SendStatusUpdateResponse{}, nil
-}
-
-// TestMappingStream_StallsWhenApplyBlocks proves the deadlock: the proxy's
-// mapping receive loop processes batches strictly serially, so when applying
-// one batch blocks (here: createClientEntry parked on a synchronous
-// CreateProxyPeer call, exactly as observed in production), the loop never
-// advances to Recv the next batch. Management can keep sending updates onto
-// the stream with no error and no channel overflow, yet the proxy applies
-// nothing further — it is stuck.
-func TestMappingStream_StallsWhenApplyBlocks(t *testing.T) {
-	logger := log.New()
-	logger.SetLevel(log.PanicLevel)
-
-	mgmt := &blockingMgmtClient{
-		entered: make(chan struct{}),
-	}
-
-	nb := roundtrip.NewNetBird(
-		context.Background(),
-		"proxy-test",
-		"proxy.example.com",
-		roundtrip.ClientConfig{},
-		logger,
-		noopNotifier{},
-		mgmt,
-	)
-
-	s := &Server{
-		Logger:       logger,
-		netbird:      nb,
-		mgmtClient:   noopProxyClient{},
-		routerReady:  closedChan(),
-		lastMappings: make(map[types.ServiceID]*proto.ProxyMapping),
-	}
-
-	// First batch: a CREATED mapping for a brand-new account. addMapping ->
-	// netbird.AddPeer -> createClientEntry -> CreateProxyPeer, which blocks.
-	// Empty Path keeps setupHTTPMapping a no-op (it returns early), so the
-	// ONLY blocking point is the synchronous CreateProxyPeer in AddPeer —
-	// no routers/auth need wiring. The second batch exists only to detect
-	// whether the loop ever advances past the blocked first batch.
-	stream := &gatedMappingStream{
-		messages: []*proto.GetMappingUpdateResponse{
-			{
-				Mapping: []*proto.ProxyMapping{
-					{
-						Type:      proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED,
-						Id:        "svc-1",
-						AccountId: "acct-1",
-						AuthToken: "token-1",
-					},
-				},
-			},
-			{
-				Mapping: []*proto.ProxyMapping{
-					{
-						Type:      proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED,
-						Id:        "svc-2",
-						AccountId: "acct-2",
-						AuthToken: "token-2",
-					},
-				},
-			},
-		},
-	}
-
-	ctx, cancel := context.WithCancel(context.Background())
-	// Unblock the parked apply on teardown via ctx (CreateProxyPeer returns
-	// ctx.Err()), so the wedged loop goroutine unwinds before embed.New —
-	// avoiding any dependency on collaborators this test deliberately leaves
-	// nil. The deadlock is fully proven before this fires.
-	t.Cleanup(cancel)
-
-	loopDone := make(chan struct{})
-	syncDone := false
-	go func() {
-		defer close(loopDone)
-		_ = s.handleMappingStream(ctx, stream, &syncDone, time.Time{})
-	}()
-
-	// The loop must reach the blocking apply for the first batch.
-	select {
-	case <-mgmt.entered:
-	case <-time.After(2 * time.Second):
-		t.Fatal("receive loop never reached CreateProxyPeer for the first batch")
-	}
-
-	// THE DEADLOCK: while the first batch is parked in CreateProxyPeer, the
-	// single-threaded loop cannot advance. The second batch is never pulled,
-	// even though it is already available on the stream. Give it ample time.
-	// deliveredCount is atomic; syncDone is intentionally not read here because
-	// the loop goroutine owns it (reading it from the test would race).
-	time.Sleep(500 * time.Millisecond)
-	assert.Equal(t, int32(1), stream.deliveredCount(),
-		"loop must NOT consume the second batch while the first is blocked in apply — proxy is stuck")
-
-	select {
-	case <-loopDone:
-		t.Fatal("receive loop returned while it should be wedged in apply")
-	default:
-		// Still wedged, as expected.
-	}
-}
-
-// TestMappingStream_StallsWhenRemoveBlocks proves the deadlock for the REMOVE
-// path observed in production: a mapping remove tears down the account's last
-// embedded client via netbird.RemovePeer -> client.Stop -> Engine.Stop, whose
-// jobExecutorWG.Wait() is unbounded. Because the receive loop is single-
-// threaded, a blocked remove wedges the loop: no further mapping updates of any
-// kind (create/modify/remove) are applied, while management keeps sending them
-// successfully (no send error, no channel-full). Matches the reported symptom:
-// the last log line is a remove that stops a client, then silence.
-func TestMappingStream_StallsWhenRemoveBlocks(t *testing.T) {
-	logger := log.New()
-	logger.SetLevel(log.PanicLevel)
-
-	enteredRemove := make(chan struct{})
-	blockRemove := make(chan struct{})
-	var once sync.Once
-
-	s := &Server{
-		Logger:       logger,
-		mgmtClient:   noopProxyClient{},
-		routerReady:  closedChan(),
-		lastMappings: make(map[types.ServiceID]*proto.ProxyMapping),
-		// Stand in for netbird.RemovePeer -> client.Stop hanging on
-		// Engine.Stop's unbounded jobExecutorWG.Wait(). Only the first remove
-		// blocks; later removes return immediately so the recovery assertion
-		// can observe the loop advancing.
-		removePeer: func(ctx context.Context, _ types.AccountID, _ roundtrip.ServiceKey) error {
-			first := false
-			once.Do(func() {
-				first = true
-				close(enteredRemove)
-			})
-			if !first {
-				return nil
-			}
-			select {
-			case <-blockRemove:
-			case <-ctx.Done():
-			}
-			return nil
-		},
-	}
-
-	// Batch 1 removes a service (blocks in teardown). Batch 2 is a later update
-	// that must never be applied while the remove is wedged.
-	stream := &gatedMappingStream{
-		messages: []*proto.GetMappingUpdateResponse{
-			{
-				Mapping: []*proto.ProxyMapping{
-					{Type: proto.ProxyMappingUpdateType_UPDATE_TYPE_REMOVED, Id: "svc-1", AccountId: "acct-1"},
-				},
-			},
-			{
-				Mapping: []*proto.ProxyMapping{
-					{Type: proto.ProxyMappingUpdateType_UPDATE_TYPE_REMOVED, Id: "svc-2", AccountId: "acct-1"},
-				},
-			},
-		},
-	}
-
-	loopDone := make(chan struct{})
-	syncDone := false
-	go func() {
-		defer close(loopDone)
-		_ = s.handleMappingStream(context.Background(), stream, &syncDone, time.Time{})
-	}()
-
-	select {
-	case <-enteredRemove:
-	case <-time.After(2 * time.Second):
-		t.Fatal("receive loop never reached the blocking remove for the first batch")
-	}
-
-	// THE DEADLOCK: the loop is parked in the blocked remove and cannot advance.
-	// syncDone is owned by the loop goroutine, so it is not read here.
-	time.Sleep(500 * time.Millisecond)
-	assert.Equal(t, int32(1), stream.deliveredCount(),
-		"loop must NOT consume the second batch while the first remove is blocked — proxy is stuck")
-
-	select {
-	case <-loopDone:
-		t.Fatal("receive loop returned while it should be wedged on the remove")
-	default:
-	}
-
-	// Unblock and confirm the wedge was solely the blocked remove: the loop
-	// then advances and consumes the next batch.
-	close(blockRemove)
-	assert.Eventually(t, func() bool {
-		return stream.deliveredCount() >= 2
-	}, 2*time.Second, 5*time.Millisecond,
-		"once the remove unblocks, the loop must advance and consume the next batch")
-}
--- a/proxy/server.go
+++ b/proxy/server.go
@@ -118,9 +118,6 @@ type Server struct {
 	// The mapping worker waits on this before processing updates.
 	routerReady chan struct{}

-	// removePeer defaults to netbird.RemovePeer; overridable in tests.
-	removePeer func(ctx context.Context, accountID types.AccountID, key roundtrip.ServiceKey) error
-
 	// inbound, when non-nil, manages per-account inbound listeners. Set by
 	// initPrivateInbound only when Private is true so the standalone
 	// proxy keeps its zero-overhead default path.
@@ -230,10 +227,6 @@ type Server struct {
 	// Zero means no cap (the proxy honors whatever management sends).
 	// Set via NB_PROXY_MAX_SESSION_IDLE_TIMEOUT for shared deployments.
 	MaxSessionIdleTimeout time.Duration
-	// MappingBatchWatchdog bounds how long a single mapping batch may spend
-	// in processMappings before the receive loop reconnects to resync.
-	// Zero uses defaultMappingBatchWatchdog.
-	MappingBatchWatchdog time.Duration
 }

 // clampIdleTimeout returns d capped to MaxSessionIdleTimeout when configured.
@@ -1179,30 +1172,24 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr
 			s.healthChecker.SetManagementConnected(false)
 		}

-		connected := false
-		onConnected := func() { connected = true }
-
 		var streamErr error
 		if syncSupported {
-			streamErr = s.trySyncMappings(ctx, client, &initialSyncDone, onConnected)
+			streamErr = s.trySyncMappings(ctx, client, &initialSyncDone)
 			if isSyncUnimplemented(streamErr) {
 				syncSupported = false
 				s.Logger.Info("management does not support SyncMappings, falling back to GetMappingUpdate")
-				streamErr = s.tryGetMappingUpdate(ctx, client, &initialSyncDone, onConnected)
+				streamErr = s.tryGetMappingUpdate(ctx, client, &initialSyncDone)
 			}
 		} else {
-			streamErr = s.tryGetMappingUpdate(ctx, client, &initialSyncDone, onConnected)
+			streamErr = s.tryGetMappingUpdate(ctx, client, &initialSyncDone)
 		}

 		if s.healthChecker != nil {
 			s.healthChecker.SetManagementConnected(false)
 		}

-		// Reset backoff only when a stream actually connected, so immediate
-		// connect failures still back off instead of spinning.
-		if connected {
-			bo.Reset()
-		}
+		// Stream established — reset backoff so the next failure retries quickly.
+		bo.Reset()

 		if streamErr == nil {
 			return fmt.Errorf("stream closed by server")
@@ -1234,7 +1221,7 @@ func (s *Server) proxyCapabilities() *proto.ProxyCapabilities {
 	}
 }

-func (s *Server) tryGetMappingUpdate(ctx context.Context, client proto.ProxyServiceClient, initialSyncDone *bool, onConnected func()) error {
+func (s *Server) tryGetMappingUpdate(ctx context.Context, client proto.ProxyServiceClient, initialSyncDone *bool) error {
 	connectTime := time.Now()
 	mappingClient, err := client.GetMappingUpdate(ctx, &proto.GetMappingUpdateRequest{
 		ProxyId:      s.ID,
@@ -1247,7 +1234,6 @@ func (s *Server) tryGetMappingUpdate(ctx context.Context, client proto.ProxyServ
 		return fmt.Errorf("create mapping stream: %w", err)
 	}

-	onConnected()
 	if s.healthChecker != nil {
 		s.healthChecker.SetManagementConnected(true)
 	}
@@ -1256,7 +1242,7 @@ func (s *Server) tryGetMappingUpdate(ctx context.Context, client proto.ProxyServ
 	return s.handleMappingStream(ctx, mappingClient, initialSyncDone, connectTime)
 }

-func (s *Server) trySyncMappings(ctx context.Context, client proto.ProxyServiceClient, initialSyncDone *bool, onConnected func()) error {
+func (s *Server) trySyncMappings(ctx context.Context, client proto.ProxyServiceClient, initialSyncDone *bool) error {
 	connectTime := time.Now()
 	stream, err := client.SyncMappings(ctx)
 	if err != nil {
@@ -1277,7 +1263,6 @@ func (s *Server) trySyncMappings(ctx context.Context, client proto.ProxyServiceC
 		return fmt.Errorf("send sync init: %w", err)
 	}

-	onConnected()
 	if s.healthChecker != nil {
 		s.healthChecker.SetManagementConnected(true)
 	}
@@ -1322,9 +1307,7 @@ func (s *Server) handleSyncMappingsStream(ctx context.Context, stream proto.Prox

 			batchStart := time.Now()
 			s.Logger.Debug("Received mapping update, starting processing")
-			if err := s.processMappingsGuarded(ctx, msg.GetMapping()); err != nil {
-				return err
-			}
+			s.processMappings(ctx, msg.GetMapping())
 			s.Logger.Debug("Processing mapping update completed")
 			tracker.recordBatch(ctx, s, msg.GetMapping(), msg.GetInitialSyncComplete(), batchStart)

@@ -1408,9 +1391,7 @@ func (s *Server) handleMappingStream(ctx context.Context, mappingClient proto.Pr

 			batchStart := time.Now()
 			s.Logger.Debug("Received mapping update, starting processing")
-			if err := s.processMappingsGuarded(ctx, msg.GetMapping()); err != nil {
-				return err
-			}
+			s.processMappings(ctx, msg.GetMapping())
 			s.Logger.Debug("Processing mapping update completed")
 			tracker.recordBatch(ctx, s, msg.GetMapping(), msg.GetInitialSyncComplete(), batchStart)
 		}
@@ -1475,44 +1456,6 @@ func redactMappingForLog(m *proto.ProxyMapping) *proto.ProxyMapping {
 	return c
 }

-const defaultMappingBatchWatchdog = 2 * time.Minute
-
-// mappingBatchWatchdog returns the configured batch watchdog or the default.
-func (s *Server) mappingBatchWatchdog() time.Duration {
-	if s.MappingBatchWatchdog > 0 {
-		return s.MappingBatchWatchdog
-	}
-	return defaultMappingBatchWatchdog
-}
-
-// processMappingsGuarded applies a batch under a watchdog, returning an error
-// if processing exceeds the watchdog so the caller reconnects and resyncs
-// instead of wedging silently.
-func (s *Server) processMappingsGuarded(ctx context.Context, mappings []*proto.ProxyMapping) error {
-	batchCtx, cancel := context.WithCancel(ctx)
-	defer cancel()
-
-	done := make(chan struct{})
-	go func() {
-		defer close(done)
-		s.processMappings(batchCtx, mappings)
-	}()
-
-	watchdog := s.mappingBatchWatchdog()
-	timer := time.NewTimer(watchdog)
-	defer timer.Stop()
-
-	select {
-	case <-done:
-		return nil
-	case <-ctx.Done():
-		return ctx.Err()
-	case <-timer.C:
-		s.Logger.Errorf("processing mapping batch exceeded %s, cancelling and reconnecting to resync", watchdog)
-		return fmt.Errorf("mapping batch processing stalled after %s", watchdog)
-	}
-}
-
 func (s *Server) processMappings(ctx context.Context, mappings []*proto.ProxyMapping) {
 	debug := s.Logger != nil && s.Logger.IsLevelEnabled(log.DebugLevel)
 	for _, mapping := range mappings {
@@ -2008,11 +1951,7 @@ func (s *Server) updateMapping(ctx context.Context, mapping *proto.ProxyMapping)
 func (s *Server) removeMapping(ctx context.Context, mapping *proto.ProxyMapping) {
 	accountID := types.AccountID(mapping.GetAccountId())
 	svcKey := s.serviceKeyForMapping(mapping)
-	removePeer := s.removePeer
-	if removePeer == nil {
-		removePeer = s.netbird.RemovePeer
-	}
-	if err := removePeer(ctx, accountID, svcKey); err != nil {
+	if err := s.netbird.RemovePeer(ctx, accountID, svcKey); err != nil {
 		s.Logger.WithFields(log.Fields{
 			"account_id": accountID,
 			"service_id": mapping.GetId(),