Merge remote-tracking branch 'origin/main' into netmap_progressive_alignment

[client] Fix slow wg operations (#6633 )
* [iface] Drop redundant device dump in kernel configure() wgctrl.ConfigureDevice already returns an error when the interface is missing, so the preceding wg.Device() existence check is redundant. That check dumps the entire device (all peers) on every configure() call, making it O(peers) per call and turning bulk peer insertion into O(peers^2): inserting N peers one by one re-parsed the whole growing peer list N times. Removing it keeps each peer write constant-time regardless of how many peers are already configured. * [iface] Cache WireGuard stats to collapse per-peer device dumps Each peer runs a WGWatcher that polls GetStats(), and every call dumps the whole device, so with N peers the watchers perform O(N) full dumps per poll cycle (O(N^2) work) while each keeps only its own peer's entry. Wrap the kernel and userspace configurer GetStats() in a short-TTL cache with singleflight: the staggered per-peer calls share a single device dump per window and concurrent misses collapse into one dump. The kernel and userspace WireGuard APIs have no per-peer stats query (a get always returns the whole device), so a shared cached snapshot avoids the repeated full dumps. * Ignore .claude directory
2026-07-03 13:19:58 +00:00 · 2026-07-03 09:48:36 +02:00 · 2026-07-02 20:42:43 +02:00 · 2026-07-02 17:50:18 +02:00 · 2026-07-02 17:21:06 +02:00 · 2026-07-02 15:36:51 +02:00
45 changed files with 1541 additions and 329 deletions
--- a/.github/workflows/golang-test-linux.yml
+++ b/.github/workflows/golang-test-linux.yml
@@ -158,7 +158,7 @@ jobs:
        run: git --no-pager diff --exit-code

      - name: Test
-        run: CGO_ENABLED=1 GOARCH=${{ matrix.arch }} CI=true go test -coverprofile=coverage.txt -tags devcert -timeout 10m -p 1 $(go list ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined)
+        run: CGO_ENABLED=1 GOARCH=${{ matrix.arch }} CI=true go test -coverprofile=coverage.txt -tags 'devcert privileged' -exec 'sudo --preserve-env=CI,CGO_ENABLED' -timeout 10m -p 1 $(go list ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined -e /client/testutil/privileged)

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -293,8 +293,11 @@ jobs:
          ${{ steps.goreleaser.outputs.artifacts }}
          JSON

+          # dockers_v2 artifacts have no top-level goarch field, so match the
+          # per-platform -amd64 tag suffix instead; it works for both the old
+          # dockers and the new dockers_v2 image naming.
          mapfile -t src_images < <(
-            jq -r '.[] | select(.type == "Docker Image") | select(.goarch == "amd64") | .name | select(startswith("ghcr.io/"))' /tmp/goreleaser-artifacts.json
+            jq -r '.[] | select(.type == "Docker Image") | .name | select(startswith("ghcr.io/") and endswith("-amd64"))' /tmp/goreleaser-artifacts.json
          )

          for src in "${src_images[@]}"; do
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+.claude
 .idea
 .run
 *.iml
--- a/client/android/env_list.go
+++ b/client/android/env_list.go
@@ -10,7 +10,7 @@ var (
 	EnvKeyNBForceRelay = peer.EnvKeyNBForceRelay

 	// EnvKeyNBLazyConn Exported for Android java client to configure lazy connection
-	EnvKeyNBLazyConn = lazyconn.EnvEnableLazyConn
+	EnvKeyNBLazyConn = lazyconn.EnvLazyConn

 	// EnvKeyNBInactivityThreshold Exported for Android java client to configure connection inactivity threshold
 	EnvKeyNBInactivityThreshold = lazyconn.EnvInactivityThreshold
--- a/client/cmd/root.go
+++ b/client/cmd/root.go
@@ -71,12 +71,14 @@ var (
 	extraIFaceBlackList     []string
 	anonymizeFlag           bool
 	dnsRouteInterval        time.Duration
-	lazyConnEnabled         bool
-	mtu                     uint16
-	profilesDisabled        bool
-	updateSettingsDisabled  bool
-	captureEnabled          bool
-	networksDisabled        bool
+	// lazyConnEnabled is the parse target for the deprecated --enable-lazy-connection
+	// flag. The flag is inert; the value is no longer read (use NB_LAZY_CONN instead).
+	lazyConnEnabled        bool
+	mtu                    uint16
+	profilesDisabled       bool
+	updateSettingsDisabled bool
+	captureEnabled         bool
+	networksDisabled       bool

 	rootCmd = &cobra.Command{
 		Use:          "netbird",
@@ -210,7 +212,8 @@ func init() {
 	upCmd.PersistentFlags().BoolVar(&rosenpassEnabled, enableRosenpassFlag, false, "[Experimental] Enable Rosenpass feature. If enabled, the connection will be post-quantum secured via Rosenpass.")
 	upCmd.PersistentFlags().BoolVar(&rosenpassPermissive, rosenpassPermissiveFlag, false, "[Experimental] Enable Rosenpass in permissive mode to allow this peer to accept WireGuard connections without requiring Rosenpass functionality from peers that do not have Rosenpass enabled.")
 	upCmd.PersistentFlags().BoolVar(&autoConnectDisabled, disableAutoConnectFlag, false, "Disables auto-connect feature. If enabled, then the client won't connect automatically when the service starts.")
-	upCmd.PersistentFlags().BoolVar(&lazyConnEnabled, enableLazyConnectionFlag, false, "[Experimental] Enable the lazy connection feature. If enabled, the client will establish connections on-demand. Note: this setting may be overridden by management configuration.")
+	upCmd.PersistentFlags().BoolVar(&lazyConnEnabled, enableLazyConnectionFlag, false, "Deprecated: no longer used. Lazy connections are controlled by the server and the NB_LAZY_CONN environment variable.")
+	_ = upCmd.PersistentFlags().MarkDeprecated(enableLazyConnectionFlag, "no longer used; lazy connections are controlled by the server and the NB_LAZY_CONN environment variable")

 }

--- a/client/cmd/up.go
+++ b/client/cmd/up.go
@@ -479,10 +479,6 @@ func setupSetConfigReq(customDNSAddressConverted []byte, cmd *cobra.Command, pro
 		req.DisableIpv6 = &disableIPv6
 	}

-	if cmd.Flag(enableLazyConnectionFlag).Changed {
-		req.LazyConnectionEnabled = &lazyConnEnabled
-	}
-
 	return &req
 }

@@ -600,9 +596,6 @@ func setupConfig(customDNSAddressConverted []byte, cmd *cobra.Command, configFil
 		ic.DisableIPv6 = &disableIPv6
 	}

-	if cmd.Flag(enableLazyConnectionFlag).Changed {
-		ic.LazyConnectionEnabled = &lazyConnEnabled
-	}
 	return &ic, nil
 }

@@ -718,9 +711,6 @@ func setupLoginRequest(providedSetupKey string, customDNSAddressConverted []byte
 		loginRequest.DisableIpv6 = &disableIPv6
 	}

-	if cmd.Flag(enableLazyConnectionFlag).Changed {
-		loginRequest.LazyConnectionEnabled = &lazyConnEnabled
-	}
 	return &loginRequest, nil
 }

--- a/client/iface/configurer/kernel_unix.go
+++ b/client/iface/configurer/kernel_unix.go
@@ -17,12 +17,15 @@ import (

 type KernelConfigurer struct {
 	deviceName string
+	statsCache *statsCache
 }

 func NewKernelConfigurer(deviceName string) *KernelConfigurer {
-	return &KernelConfigurer{
+	c := &KernelConfigurer{
 		deviceName: deviceName,
 	}
+	c.statsCache = newStatsCache(statsCacheTTL, c.fetchStats)
+	return c
 }

 func (c *KernelConfigurer) ConfigureInterface(privateKey string, port int) error {
@@ -246,12 +249,6 @@ func (c *KernelConfigurer) configure(config wgtypes.Config) error {
 		}
 	}()

-	// validate if device with name exists
-	_, err = wg.Device(c.deviceName)
-	if err != nil {
-		return err
-	}
-
 	return wg.ConfigureDevice(c.deviceName, config)
 }

@@ -300,6 +297,14 @@ func (c *KernelConfigurer) FullStats() (*Stats, error) {
 }

 func (c *KernelConfigurer) GetStats() (map[string]WGStats, error) {
+	return c.statsCache.get()
+}
+
+func (c *KernelConfigurer) LastActivities() map[string]monotime.Time {
+	return nil
+}
+
+func (c *KernelConfigurer) fetchStats() (map[string]WGStats, error) {
 	stats := make(map[string]WGStats)
 	wg, err := wgctrl.New()
 	if err != nil {
@@ -326,7 +331,3 @@ func (c *KernelConfigurer) GetStats() (map[string]WGStats, error) {
 	}
 	return stats, nil
 }
-
-func (c *KernelConfigurer) LastActivities() map[string]monotime.Time {
-	return nil
-}
--- a/client/iface/configurer/stats_cache.go
+++ b/client/iface/configurer/stats_cache.go
@@ -0,0 +1,52 @@
+package configurer
+
+import (
+	"sync"
+	"time"
+
+	"golang.org/x/sync/singleflight"
+)
+
+const statsCacheTTL = 1 * time.Second
+
+type statsCache struct {
+	ttl   time.Duration
+	fetch func() (map[string]WGStats, error)
+
+	mu       sync.RWMutex
+	value    map[string]WGStats
+	expireAt time.Time
+
+	sf singleflight.Group
+}
+
+func newStatsCache(ttl time.Duration, fetch func() (map[string]WGStats, error)) *statsCache {
+	return &statsCache{ttl: ttl, fetch: fetch}
+}
+
+func (c *statsCache) get() (map[string]WGStats, error) {
+	c.mu.RLock()
+	if c.value != nil && time.Now().Before(c.expireAt) {
+		value := c.value
+		c.mu.RUnlock()
+		return value, nil
+	}
+	c.mu.RUnlock()
+
+	value, err, _ := c.sf.Do("stats", func() (interface{}, error) {
+		res, err := c.fetch()
+		if err != nil {
+			return nil, err
+		}
+
+		c.mu.Lock()
+		c.value = res
+		c.expireAt = time.Now().Add(c.ttl)
+		c.mu.Unlock()
+		return res, nil
+	})
+	if err != nil {
+		return nil, err
+	}
+	return value.(map[string]WGStats), nil
+}
--- a/client/iface/configurer/stats_cache_test.go
+++ b/client/iface/configurer/stats_cache_test.go
@@ -0,0 +1,70 @@
+package configurer
+
+import (
+	"errors"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestStatsCache_CachesWithinTTL(t *testing.T) {
+	var calls atomic.Int64
+	c := newStatsCache(50*time.Millisecond, func() (map[string]WGStats, error) {
+		calls.Add(1)
+		return map[string]WGStats{"p": {}}, nil
+	})
+
+	for i := 0; i < 10; i++ {
+		_, err := c.get()
+		require.NoError(t, err)
+	}
+	require.Equal(t, int64(1), calls.Load(), "within TTL only one underlying fetch")
+
+	time.Sleep(60 * time.Millisecond)
+	_, err := c.get()
+	require.NoError(t, err)
+	require.Equal(t, int64(2), calls.Load(), "after TTL expiry a fresh fetch happens")
+}
+
+func TestStatsCache_SingleFlight(t *testing.T) {
+	var calls atomic.Int64
+	release := make(chan struct{})
+	c := newStatsCache(time.Minute, func() (map[string]WGStats, error) {
+		calls.Add(1)
+		<-release
+		return map[string]WGStats{}, nil
+	})
+
+	const n = 50
+	var wg sync.WaitGroup
+	wg.Add(n)
+	for i := 0; i < n; i++ {
+		go func() {
+			defer wg.Done()
+			_, _ = c.get()
+		}()
+	}
+	time.Sleep(20 * time.Millisecond)
+	close(release)
+	wg.Wait()
+
+	require.Equal(t, int64(1), calls.Load(), "concurrent misses collapse into one fetch")
+}
+
+func TestStatsCache_ErrorNotCached(t *testing.T) {
+	var calls atomic.Int64
+	wantErr := errors.New("dump failed")
+	c := newStatsCache(time.Minute, func() (map[string]WGStats, error) {
+		calls.Add(1)
+		return nil, wantErr
+	})
+
+	_, err := c.get()
+	require.ErrorIs(t, err, wantErr)
+	_, err = c.get()
+	require.ErrorIs(t, err, wantErr)
+	require.Equal(t, int64(2), calls.Load(), "errors are not cached; each call retries")
+}
--- a/client/iface/configurer/usp.go
+++ b/client/iface/configurer/usp.go
@@ -40,6 +40,7 @@ type WGUSPConfigurer struct {
 	device           *device.Device
 	deviceName       string
 	activityRecorder *bind.ActivityRecorder
+	statsCache       *statsCache

 	uapiListener net.Listener
 }
@@ -50,16 +51,19 @@ func NewUSPConfigurer(device *device.Device, deviceName string, activityRecorder
 		deviceName:       deviceName,
 		activityRecorder: activityRecorder,
 	}
+	wgCfg.statsCache = newStatsCache(statsCacheTTL, wgCfg.fetchStats)
 	wgCfg.startUAPI()
 	return wgCfg
 }

 func NewUSPConfigurerNoUAPI(device *device.Device, deviceName string, activityRecorder *bind.ActivityRecorder) *WGUSPConfigurer {
-	return &WGUSPConfigurer{
+	wgCfg := &WGUSPConfigurer{
 		device:           device,
 		deviceName:       deviceName,
 		activityRecorder: activityRecorder,
 	}
+	wgCfg.statsCache = newStatsCache(statsCacheTTL, wgCfg.fetchStats)
+	return wgCfg
 }

 func (c *WGUSPConfigurer) ConfigureInterface(privateKey string, port int) error {
@@ -348,6 +352,10 @@ func (t *WGUSPConfigurer) Close() {
 }

 func (t *WGUSPConfigurer) GetStats() (map[string]WGStats, error) {
+	return t.statsCache.get()
+}
+
+func (t *WGUSPConfigurer) fetchStats() (map[string]WGStats, error) {
 	ipc, err := t.device.IpcGet()
 	if err != nil {
 		return nil, fmt.Errorf("ipc get: %w", err)
--- a/client/internal/auth/auth.go
+++ b/client/internal/auth/auth.go
@@ -322,7 +322,6 @@ func (a *Auth) setSystemInfoFlags(info *system.Info) {
 		a.config.BlockLANAccess,
 		a.config.BlockInbound,
 		a.config.DisableIPv6,
-		a.config.LazyConnectionEnabled,
 		a.config.EnableSSHRoot,
 		a.config.EnableSSHSFTP,
 		a.config.EnableSSHLocalPortForwarding,
--- a/client/internal/conn_mgr.go
+++ b/client/internal/conn_mgr.go
@@ -16,6 +16,16 @@ import (
 	"github.com/netbirdio/netbird/route"
 )

+// lazyForce is the resolved local decision for lazy connections, layered above the
+// management feature flag. lazyForceNone defers to management.
+type lazyForce int
+
+const (
+	lazyForceNone lazyForce = iota
+	lazyForceOn
+	lazyForceOff
+)
+
 // ConnMgr coordinates both lazy connections (established on-demand) and permanent peer connections.
 //
 // The connection manager is responsible for:
@@ -28,7 +38,7 @@ type ConnMgr struct {
 	peerStore        *peerstore.Store
 	statusRecorder   *peer.Status
 	iface            lazyconn.WGIface
-	enabledLocally   bool
+	force            lazyForce
 	rosenpassEnabled bool

 	lazyConnMgr *manager.Manager
@@ -43,28 +53,34 @@ func NewConnMgr(engineConfig *EngineConfig, statusRecorder *peer.Status, peerSto
 		peerStore:        peerStore,
 		statusRecorder:   statusRecorder,
 		iface:            iface,
+		force:            resolveLazyForce(engineConfig.LazyConnection),
 		rosenpassEnabled: engineConfig.RosenpassEnabled,
 	}
-	if engineConfig.LazyConnectionEnabled || lazyconn.IsLazyConnEnabledByEnv() {
-		e.enabledLocally = true
-	}
 	return e
 }

-// Start initializes the connection manager and starts the lazy connection manager if enabled by env var or cmd line option.
+// Start initializes the connection manager. It starts the lazy connection manager when a
+// local override forces it on; with no local override it waits for the management feature flag.
 func (e *ConnMgr) Start(ctx context.Context) {
 	if e.lazyConnMgr != nil {
 		log.Errorf("lazy connection manager is already started")
 		return
 	}

-	if !e.enabledLocally {
-		log.Infof("lazy connection manager is disabled")
+	switch e.force {
+	case lazyForceOff:
+		log.Infof("lazy connection manager is disabled by local override (%s or MDM policy)", lazyconn.EnvLazyConn)
+		e.statusRecorder.UpdateLazyConnection(false)
+		return
+	case lazyForceNone:
+		log.Infof("lazy connection manager is managed by the management feature flag")
+		e.statusRecorder.UpdateLazyConnection(false)
 		return
 	}

 	if e.rosenpassEnabled {
 		log.Warnf("rosenpass connection manager is enabled, lazy connection manager will not be started")
+		e.statusRecorder.UpdateLazyConnection(false)
 		return
 	}

@@ -76,8 +92,8 @@ func (e *ConnMgr) Start(ctx context.Context) {
 // If enabled, it initializes the lazy connection manager and start it. Do not need to call Start() again.
 // If disabled, then it closes the lazy connection manager and open the connections to all peers.
 func (e *ConnMgr) UpdatedRemoteFeatureFlag(ctx context.Context, enabled bool) error {
-	// do not disable lazy connection manager if it was enabled by env var
-	if e.enabledLocally {
+	// a local override (NB_LAZY_CONN or local config) takes precedence over management
+	if e.force != lazyForceNone {
 		return nil
 	}

@@ -89,6 +105,7 @@ func (e *ConnMgr) UpdatedRemoteFeatureFlag(ctx context.Context, enabled bool) er

 		if e.rosenpassEnabled {
 			log.Infof("rosenpass connection manager is enabled, lazy connection manager will not be started")
+			e.statusRecorder.UpdateLazyConnection(false)
 			return nil
 		}

@@ -98,6 +115,7 @@ func (e *ConnMgr) UpdatedRemoteFeatureFlag(ctx context.Context, enabled bool) er
 		return e.addPeersToLazyConnManager()
 	} else {
 		if e.lazyConnMgr == nil {
+			e.statusRecorder.UpdateLazyConnection(false)
 			return nil
 		}
 		log.Infof("lazy connection manager is disabled by management feature flag")
@@ -309,6 +327,25 @@ func (e *ConnMgr) isStartedWithLazyMgr() bool {
 	return e.lazyConnMgr != nil && e.lazyCtxCancel != nil
 }

+// resolveLazyForce determines the local override. NB_LAZY_CONN takes precedence; when it
+// is unset the MDM policy override (mdmState) applies. Either wins in both directions over
+// the management feature flag; StateUnset for both defers to management.
+func resolveLazyForce(mdmState lazyconn.State) lazyForce {
+	state := lazyconn.EnvState()
+	if state == lazyconn.StateUnset {
+		state = mdmState
+	}
+
+	switch state {
+	case lazyconn.StateOn:
+		return lazyForceOn
+	case lazyconn.StateOff:
+		return lazyForceOff
+	default:
+		return lazyForceNone
+	}
+}
+
 func inactivityThresholdEnv() *time.Duration {
 	envValue := os.Getenv(lazyconn.EnvInactivityThreshold)
 	if envValue == "" {
--- a/client/internal/conn_mgr_test.go
+++ b/client/internal/conn_mgr_test.go
@@ -0,0 +1,40 @@
+package internal
+
+import (
+	"os"
+	"testing"
+
+	"github.com/netbirdio/netbird/client/internal/lazyconn"
+)
+
+func TestResolveLazyForce(t *testing.T) {
+	tests := []struct {
+		name   string
+		env    string
+		envSet bool
+		mdm    lazyconn.State
+		want   lazyForce
+	}{
+		{name: "env unset, mdm unset -> defer to management", mdm: lazyconn.StateUnset, want: lazyForceNone},
+		{name: "env on -> force on", env: "on", envSet: true, mdm: lazyconn.StateUnset, want: lazyForceOn},
+		{name: "env off -> force off", env: "off", envSet: true, mdm: lazyconn.StateUnset, want: lazyForceOff},
+		{name: "env unset, mdm on -> force on", mdm: lazyconn.StateOn, want: lazyForceOn},
+		{name: "env unset, mdm off -> force off", mdm: lazyconn.StateOff, want: lazyForceOff},
+		{name: "env on beats mdm off", env: "on", envSet: true, mdm: lazyconn.StateOff, want: lazyForceOn},
+		{name: "env off beats mdm on", env: "off", envSet: true, mdm: lazyconn.StateOn, want: lazyForceOff},
+		{name: "unrecognized env, mdm on -> mdm wins", env: "auto", envSet: true, mdm: lazyconn.StateOn, want: lazyForceOn},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Setenv(lazyconn.EnvLazyConn, tt.env)
+			if !tt.envSet {
+				os.Unsetenv(lazyconn.EnvLazyConn)
+			}
+
+			if got := resolveLazyForce(tt.mdm); got != tt.want {
+				t.Fatalf("resolveLazyForce(%v) = %v, want %v", tt.mdm, got, tt.want)
+			}
+		})
+	}
+}
--- a/client/internal/connect.go
+++ b/client/internal/connect.go
@@ -27,6 +27,7 @@ import (
 	"github.com/netbirdio/netbird/client/iface/device"
 	"github.com/netbirdio/netbird/client/iface/netstack"
 	"github.com/netbirdio/netbird/client/internal/dns"
+	"github.com/netbirdio/netbird/client/internal/lazyconn"
 	"github.com/netbirdio/netbird/client/internal/listener"
 	"github.com/netbirdio/netbird/client/internal/metrics"
 	"github.com/netbirdio/netbird/client/internal/peer"
@@ -601,7 +602,7 @@ func createEngineConfig(key wgtypes.Key, config *profilemanager.Config, peerConf
 		BlockInbound:        config.BlockInbound,
 		DisableIPv6:         config.DisableIPv6,

-		LazyConnectionEnabled: config.LazyConnectionEnabled,
+		LazyConnection: lazyconn.ParseState(config.LazyConnection),

 		MTU:     selectMTU(config.MTU, peerConfig.Mtu),
 		LogPath: logPath,
@@ -675,7 +676,6 @@ func loginToManagement(ctx context.Context, client mgm.Client, pubSSHKey []byte,
 		config.BlockLANAccess,
 		config.BlockInbound,
 		config.DisableIPv6,
-		config.LazyConnectionEnabled,
 		config.EnableSSHRoot,
 		config.EnableSSHSFTP,
 		config.EnableSSHLocalPortForwarding,
--- a/client/internal/debug/debug.go
+++ b/client/internal/debug/debug.go
@@ -681,7 +681,7 @@ func (g *BundleGenerator) addCommonConfigFields(configContent *strings.Builder)
 		configContent.WriteString(fmt.Sprintf("ClientCertKeyPath: %s\n", g.internalConfig.ClientCertKeyPath))
 	}

-	configContent.WriteString(fmt.Sprintf("LazyConnectionEnabled: %v\n", g.internalConfig.LazyConnectionEnabled))
+	configContent.WriteString(fmt.Sprintf("LazyConnection: %q\n", g.internalConfig.LazyConnection))
 	configContent.WriteString(fmt.Sprintf("MTU: %d\n", g.internalConfig.MTU))
 }

--- a/client/internal/debug/debug_test.go
+++ b/client/internal/debug/debug_test.go
@@ -885,7 +885,7 @@ func TestAddConfig_AllFieldsCovered(t *testing.T) {
 		DNSRouteInterval:              5 * time.Second,
 		ClientCertPath:                "/tmp/cert",
 		ClientCertKeyPath:             "/tmp/key",
-		LazyConnectionEnabled:         true,
+		LazyConnection:                "on",
 		MTU:                           1280,
 	}

--- a/client/internal/engine.go
+++ b/client/internal/engine.go
@@ -40,6 +40,7 @@ import (
 	"github.com/netbirdio/netbird/client/internal/dnsfwd"
 	"github.com/netbirdio/netbird/client/internal/expose"
 	"github.com/netbirdio/netbird/client/internal/ingressgw"
+	"github.com/netbirdio/netbird/client/internal/lazyconn"
 	"github.com/netbirdio/netbird/client/internal/metrics"
 	"github.com/netbirdio/netbird/client/internal/netflow"
 	nftypes "github.com/netbirdio/netbird/client/internal/netflow/types"
@@ -147,7 +148,9 @@ type EngineConfig struct {
 	BlockInbound        bool
 	DisableIPv6         bool

-	LazyConnectionEnabled bool
+	// LazyConnection is the MDM-sourced lazy-connection override; StateUnset defers to
+	// the env var and management feature flag.
+	LazyConnection lazyconn.State

 	MTU uint16

@@ -217,6 +220,12 @@ type Engine struct {
 	// networkSerial is the latest CurrentSerial (state ID) of the network sent by the Management service
 	networkSerial uint64

+	// forwardingRules holds the ingress forward rules applied for the current target.
+	// Wholesale sections (incl. forward rules) run only on the first pass of a target;
+	// it is stashed here so the final, peer-converged pass can build the lazy-connection
+	// exclude list without recomputing them on every bounded peer pass.
+	forwardingRules []firewallManager.ForwardRule
+
 	networkMonitor *networkmonitor.NetworkMonitor

 	sshServer sshServer
@@ -771,7 +780,15 @@ func (e *Engine) blockLanAccess() {

 // modifyPeers updates peers that have been modified (e.g. IP address has been changed).
 // It closes the existing connection, removes it from the peerConns map, and creates a new one.
-func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
+// maxPeersPerSyncPass is the default per-pass cap on how many peers each of
+// removePeers/modifyPeers/addNewPeers applies, so syncMsgMux is held only for a
+// batch at a time and other subsystems can interleave between passes. It is
+// passed in (not read globally) so tests can exercise the multi-pass path.
+const maxPeersPerSyncPass = 300
+
+// modifyPeers re-applies up to maxBatch changed peers per call. It returns true
+// when more changed peers remained than the cap, so the caller re-runs.
+func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch int) (bool, error) {

 	// first, check if peers have been modified
 	var modified []*mgmProto.RemotePeerConfig
@@ -801,26 +818,32 @@ func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
 		}
 	}

+	more := false
+	if len(modified) > maxBatch {
+		modified = modified[:maxBatch]
+		more = true
+	}
+
 	// second, close all modified connections and remove them from the state map
 	for _, p := range modified {
-		err := e.removePeer(p.GetWgPubKey())
-		if err != nil {
-			return err
+		if err := e.removePeer(p.GetWgPubKey()); err != nil {
+			return false, err
 		}
 	}
 	// third, add the peer connections again
 	for _, p := range modified {
-		err := e.addNewPeer(p)
-		if err != nil {
-			return err
+		if err := e.addNewPeer(p); err != nil {
+			return false, err
 		}
 	}
-	return nil
+	return more, nil
 }

 // removePeers finds and removes peers that do not exist anymore in the network map received from the Management Service.
 // It also removes peers that have been modified (e.g. change of IP address). They will be added again in addPeers method.
-func (e *Engine) removePeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
+// removePeers removes up to maxBatch peers per call. It returns true when more
+// peers remained to remove than the cap, so the caller re-runs.
+func (e *Engine) removePeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch int) (bool, error) {
 	newPeers := make([]string, 0, len(peersUpdate))
 	for _, p := range peersUpdate {
 		newPeers = append(newPeers, p.GetWgPubKey())
@@ -828,14 +851,19 @@ func (e *Engine) removePeers(peersUpdate []*mgmProto.RemotePeerConfig) error {

 	toRemove := util.SliceDiff(e.peerStore.PeersPubKey(), newPeers)

+	more := false
+	if len(toRemove) > maxBatch {
+		toRemove = toRemove[:maxBatch]
+		more = true
+	}
+
 	for _, p := range toRemove {
-		err := e.removePeer(p)
-		if err != nil {
-			return err
+		if err := e.removePeer(p); err != nil {
+			return false, err
 		}
 		log.Infof("removed peer %s", p)
 	}
-	return nil
+	return more, nil
 }

 func (e *Engine) removeAllPeers() error {
@@ -914,19 +942,17 @@ func (e *Engine) phase(name string) func() {
 	}
 }

-func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
-	started := time.Now()
-	defer func() {
-		duration := time.Since(started)
-		log.Infof("sync finished in %s", duration)
-		e.clientMetrics.RecordSyncDuration(e.ctx, duration)
-	}()
+// applySyncPass applies one bounded pass of the sync update under syncMsgMux and
+// returns true if more peers remained than the per-pass cap. It is driven by the
+// mapStateManager, which re-invokes it (releasing the lock between passes) until
+// the update is fully applied.
+func (e *Engine) applySyncPass(update *mgmProto.SyncResponse, firstPass bool) (bool, error) {
 	e.syncMsgMux.Lock()
 	defer e.syncMsgMux.Unlock()

 	// Check context INSIDE lock to ensure atomicity with shutdown
 	if e.ctx.Err() != nil {
-		return e.ctx.Err()
+		return false, e.ctx.Err()
 	}

 	if update.NetworkMap != nil && update.NetworkMap.PeerConfig != nil {
@@ -937,7 +963,7 @@ func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
 	err := e.updateNetbirdConfig(update.GetNetbirdConfig())
 	done()
 	if err != nil {
-		return err
+		return false, err
 	}

 	// Posture checks are bound to the network map presence:
@@ -947,28 +973,25 @@ func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
 	//                                        leave the previously applied checks untouched
 	nm := update.GetNetworkMap()
 	if nm == nil {
-		return nil
+		return false, nil
 	}

 	done = e.phase("checks")
 	err = e.updateChecksIfNew(update.Checks)
 	done()
 	if err != nil {
-		return err
+		return false, err
 	}

-	done = e.phase("persist")
-	e.persistSyncResponse(update)
-	done()
-
 	// only apply new changes and ignore old ones
-	if err := e.updateNetworkMap(nm); err != nil {
-		return err
+	more, err := e.updateNetworkMap(nm, maxPeersPerSyncPass, firstPass)
+	if err != nil {
+		return false, err
 	}

 	e.statusRecorder.PublishEvent(cProto.SystemEvent_INFO, cProto.SystemEvent_SYSTEM, "Network map updated", "", nil)

-	return nil
+	return more, nil
 }

 // updateNetbirdConfig applies the management-provided NetBird configuration:
@@ -1016,6 +1039,13 @@ func (e *Engine) updateNetbirdConfig(wCfg *mgmProto.NetbirdConfig) error {
 // (not syncMsgMux) is held for the whole Set so the store cannot be cleared (disabled /
 // engine close) mid-call and have this write resurrect a file that was just removed.
 func (e *Engine) persistSyncResponse(update *mgmProto.SyncResponse) {
+	// Only persist updates that carry a network map. Config-only updates (e.g. relay
+	// token rotation, STUN/TURN) have a nil NetworkMap; persisting them would overwrite
+	// the last full map on disk and break restore-on-restart.
+	if update.GetNetworkMap() == nil {
+		return
+	}
+
 	e.syncRespMux.RLock()
 	defer e.syncRespMux.RUnlock()

@@ -1130,7 +1160,6 @@ func (e *Engine) applyInfoFlags(info *system.Info) {
 		e.config.BlockLANAccess,
 		e.config.BlockInbound,
 		e.config.DisableIPv6,
-		e.config.LazyConnectionEnabled,
 		e.config.EnableSSHRoot,
 		e.config.EnableSSHSFTP,
 		e.config.EnableSSHLocalPortForwarding,
@@ -1304,7 +1333,24 @@ func (e *Engine) receiveManagementEvents() {
 		}
 		e.applyInfoFlags(info)

-		err := e.mgmClient.Sync(e.ctx, info, e.handleSync)
+		// The map-state manager converges the latest update in the background in
+		// bounded passes; the stream callback only hands it the newest target.
+		persist := func(u *mgmProto.SyncResponse) {
+			done := e.phase("persist")
+			e.persistSyncResponse(u)
+			done()
+		}
+		manager := newMapStateManager(e.applySyncPass, persist, func(d time.Duration) {
+			log.Infof("sync finished in %s", d)
+			e.clientMetrics.RecordSyncDuration(e.ctx, d)
+		})
+		e.shutdownWg.Add(1)
+		go func() {
+			defer e.shutdownWg.Done()
+			manager.run(e.ctx)
+		}()
+
+		err := e.mgmClient.Sync(e.ctx, info, manager.SetTarget)
 		if err != nil {
 			// happens if management is unavailable for a long time.
 			// We want to cancel the operation of the whole client
@@ -1355,21 +1401,107 @@ func (e *Engine) updateTURNs(turns []*mgmProto.ProtectedHostConfig) error {
 	return nil
 }

-func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
+// updateNetworkMap applies the wholesale parts (config, routes, ACL, DNS) in full
+// and up to maxBatch peers per phase. It returns true when more peers remained
+// than the cap, so the caller re-runs until convergence.
+func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap, maxBatch int, firstPass bool) (bool, error) {
 	// intentionally leave it before checking serial because for now it can happen that peer IP changed but serial didn't
 	if networkMap.GetPeerConfig() != nil {
 		err := e.updateConfig(networkMap.GetPeerConfig())
 		if err != nil {
-			return err
+			return false, err
 		}
 	}

 	serial := networkMap.GetSerial()
 	if e.networkSerial > serial {
 		log.Debugf("received outdated NetworkMap with serial %d, ignoring", serial)
-		return nil
+		return false, nil
 	}

+	// Wholesale sections (firewall/ACL, DNS, routes, forward rules) are applied
+	// up-front and only once per target: they are cheap, local, idempotent and must
+	// be in place before peers come up (fail-closed). On the bounded re-runs that only
+	// drain the remaining peer batches they are skipped — the applied forward rules are
+	// reused from e.forwardingRules for the lazy-exclude finalize.
+	if firstPass {
+		e.applyWholesale(networkMap, serial)
+	}
+
+	log.Debugf("got peers update from Management Service, total peers to connect to = %d", len(networkMap.GetRemotePeers()))
+
+	doneOffline := e.phase("offline_peers")
+	e.updateOfflinePeers(networkMap.GetOfflinePeers())
+	doneOffline()
+
+	// Filter out own peer from the remote peers list
+	localPubKey := e.config.WgPrivateKey.PublicKey().String()
+	remotePeers := make([]*mgmProto.RemotePeerConfig, 0, len(networkMap.GetRemotePeers()))
+	for _, p := range networkMap.GetRemotePeers() {
+		if p.GetWgPubKey() != localPubKey {
+			remotePeers = append(remotePeers, p)
+		}
+	}
+
+	// No special case for cleanup: when management signals RemotePeersIsEmpty (e.g. our
+	// peer was deleted), remotePeers is already empty, so the bounded diff below removes
+	// every peer in batches — same path as a normal update, no unbounded removeAllPeers
+	// held under syncMsgMux in one shot.
+	doneRemoved := e.phase("removed_peers")
+	removeMore, err := e.removePeers(remotePeers, maxBatch)
+	doneRemoved()
+	if err != nil {
+		return false, err
+	}
+
+	doneModified := e.phase("modified_peers")
+	modifyMore, err := e.modifyPeers(remotePeers, maxBatch)
+	doneModified()
+	if err != nil {
+		return false, err
+	}
+
+	doneAdded := e.phase("added_peers")
+	addMore, err := e.addNewPeers(remotePeers, maxBatch)
+	doneAdded()
+	if err != nil {
+		return false, err
+	}
+
+	// needMore signals the caller to re-run when a peer phase hit its per-pass cap.
+	needMore := removeMore || modifyMore || addMore
+
+	e.statusRecorder.FinishPeerListModifications()
+
+	e.updatePeerSSHHostKeys(remotePeers)
+
+	if err := e.updateSSHClientConfig(remotePeers); err != nil {
+		log.Warnf("failed to update SSH client config: %v", err)
+	}
+
+	e.updateSSHServerAuth(networkMap.GetSshAuth())
+
+	// Set the exclude list only once peers have fully converged (this pass added
+	// the last batch). It needs all target peers present in the store, and
+	// ExcludePeer has replace-semantics — a partial set mid-convergence would be wrong.
+	if !needMore {
+		doneLazy := e.phase("lazy_exclude")
+		excludedLazyPeers := e.toExcludedLazyPeers(e.forwardingRules, remotePeers)
+		e.connMgr.SetExcludeList(e.ctx, excludedLazyPeers)
+		doneLazy()
+	}
+
+	e.networkSerial = serial
+
+	return needMore, nil
+}
+
+// applyWholesale applies the cheap, local, idempotent map sections — lazy feature
+// flag, firewall/legacy management, DNS, routes, ACL filtering, DNS forwarder and
+// ingress forward rules — that must be in place before peers come up. It runs once
+// per target (first pass only); the resulting forward rules are stashed in
+// e.forwardingRules for the lazy-exclude finalize on the peer-converged pass.
+func (e *Engine) applyWholesale(networkMap *mgmProto.NetworkMap, serial uint64) {
 	if err := e.connMgr.UpdatedRemoteFeatureFlag(e.ctx, networkMap.GetPeerConfig().GetLazyConnectionEnabled()); err != nil {
 		log.Errorf("failed to update lazy connection feature flag: %v", err)
 	}
@@ -1442,84 +1574,7 @@ func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
 		log.Errorf("failed to update forward rules, err: %v", err)
 	}
 	done()
-
-	log.Debugf("got peers update from Management Service, total peers to connect to = %d", len(networkMap.GetRemotePeers()))
-
-	done = e.phase("offline_peers")
-	e.updateOfflinePeers(networkMap.GetOfflinePeers())
-	done()
-
-	remotePeers, err := e.reconcilePeers(networkMap)
-	if err != nil {
-		return err
-	}
-
-	// must set the exclude list after the peers are added. Without it the manager can not figure out the peers parameters from the store
-	done = e.phase("lazy_exclude")
-	excludedLazyPeers := e.toExcludedLazyPeers(forwardingRules, remotePeers)
-	e.connMgr.SetExcludeList(e.ctx, excludedLazyPeers)
-	done()
-
-	e.networkSerial = serial
-
-	return nil
-}
-
-// reconcilePeers applies the remote peer list from the network map (removing,
-// modifying and adding peers, then updating SSH config) and returns the remote
-// peers with our own peer filtered out, for use by later sync steps.
-func (e *Engine) reconcilePeers(networkMap *mgmProto.NetworkMap) ([]*mgmProto.RemotePeerConfig, error) {
-	// Filter out own peer from the remote peers list
-	localPubKey := e.config.WgPrivateKey.PublicKey().String()
-	remotePeers := make([]*mgmProto.RemotePeerConfig, 0, len(networkMap.GetRemotePeers()))
-	for _, p := range networkMap.GetRemotePeers() {
-		if p.GetWgPubKey() != localPubKey {
-			remotePeers = append(remotePeers, p)
-		}
-	}
-
-	// cleanup request, most likely our peer has been deleted
-	if networkMap.GetRemotePeersIsEmpty() {
-		err := e.removeAllPeers()
-		e.statusRecorder.FinishPeerListModifications()
-		if err != nil {
-			return nil, err
-		}
-		return remotePeers, nil
-	}
-
-	done := e.phase("removed_peers")
-	err := e.removePeers(remotePeers)
-	done()
-	if err != nil {
-		return nil, err
-	}
-
-	done = e.phase("modified_peers")
-	err = e.modifyPeers(remotePeers)
-	done()
-	if err != nil {
-		return nil, err
-	}
-
-	done = e.phase("added_peers")
-	err = e.addNewPeers(remotePeers)
-	done()
-	if err != nil {
-		return nil, err
-	}
-
-	e.statusRecorder.FinishPeerListModifications()
-
-	e.updatePeerSSHHostKeys(remotePeers)
-
-	if err := e.updateSSHClientConfig(remotePeers); err != nil {
-		log.Warnf("failed to update SSH client config: %v", err)
-	}
-
-	e.updateSSHServerAuth(networkMap.GetSshAuth())
-
-	return remotePeers, nil
+	e.forwardingRules = forwardingRules
 }

 func toDNSFeatureFlag(networkMap *mgmProto.NetworkMap) bool {
@@ -1699,14 +1754,23 @@ func addrToString(addr netip.Addr) string {
 }

 // addNewPeers adds peers that were not know before but arrived from the Management service with the update
-func (e *Engine) addNewPeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
+// addNewPeers adds up to maxBatch not-yet-present peers per call. It returns true
+// when more new peers remained than the cap, so the caller re-runs.
+func (e *Engine) addNewPeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch int) (bool, error) {
+	added := 0
 	for _, p := range peersUpdate {
-		err := e.addNewPeer(p)
-		if err != nil {
-			return err
+		if _, ok := e.peerStore.PeerConn(p.GetWgPubKey()); ok {
+			continue // already present (cheap skip), does not count toward the cap
 		}
+		if added >= maxBatch {
+			return true, nil // at least one more new peer remains
+		}
+		if err := e.addNewPeer(p); err != nil {
+			return false, err
+		}
+		added++
 	}
-	return nil
+	return false, nil
 }

 // addNewPeer add peer if connection doesn't exist
@@ -1999,7 +2063,6 @@ func (e *Engine) readInitialSettings() ([]*route.Route, *nbdns.Config, bool, err
 		e.config.BlockLANAccess,
 		e.config.BlockInbound,
 		e.config.DisableIPv6,
-		e.config.LazyConnectionEnabled,
 		e.config.EnableSSHRoot,
 		e.config.EnableSSHSFTP,
 		e.config.EnableSSHLocalPortForwarding,
--- a/client/internal/engine_privileged_test.go
+++ b/client/internal/engine_privileged_test.go
@@ -124,7 +124,7 @@ func TestEngine_SSH(t *testing.T) {
 		RemotePeersIsEmpty: false,
 	}

-	err = engine.updateNetworkMap(networkMap)
+	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
 	require.NoError(t, err)

 	assert.Nil(t, engine.sshServer)
@@ -146,7 +146,7 @@ func TestEngine_SSH(t *testing.T) {
 		RemotePeersIsEmpty: false,
 	}

-	err = engine.updateNetworkMap(networkMap)
+	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
 	require.NoError(t, err)

 	time.Sleep(250 * time.Millisecond)
@@ -159,7 +159,7 @@ func TestEngine_SSH(t *testing.T) {
 		RemotePeersIsEmpty: false,
 	}

-	err = engine.updateNetworkMap(networkMap)
+	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
 	require.NoError(t, err)

 	// time.Sleep(250 * time.Millisecond)
@@ -174,7 +174,7 @@ func TestEngine_SSH(t *testing.T) {
 		RemotePeersIsEmpty: false,
 	}

-	err = engine.updateNetworkMap(networkMap)
+	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
 	require.NoError(t, err)

 	assert.Nil(t, engine.sshServer)
--- a/client/internal/engine_test.go
+++ b/client/internal/engine_test.go
@@ -437,7 +437,7 @@ func TestEngine_UpdateNetworkMap(t *testing.T) {

 	for _, c := range []testCase{case1, case2, case3, case4, case5, case6} {
 		t.Run(c.name, func(t *testing.T) {
-			err = engine.updateNetworkMap(c.networkMap)
+			_, err = engine.updateNetworkMap(c.networkMap, maxPeersPerSyncPass, true)
 			if err != nil {
 				t.Fatal(err)
 				return
@@ -464,6 +464,47 @@ func TestEngine_UpdateNetworkMap(t *testing.T) {
 			}
 		})
 	}
+
+	// chunked apply: with a per-pass cap smaller than the number of peers, a
+	// single updateNetworkMap applies one batch and reports more==true; the
+	// caller re-runs until convergence. (engine currently holds 0 peers.)
+	t.Run("chunked add converges over multiple passes", func(t *testing.T) {
+		nm := &mgmtProto.NetworkMap{
+			Serial:      6,
+			RemotePeers: []*mgmtProto.RemotePeerConfig{peer1, peer2, peer3},
+		}
+
+		more, err := engine.updateNetworkMap(nm, 1, true)
+		require.NoError(t, err)
+		require.True(t, more, "pass 1 should signal more")
+		require.Len(t, engine.peerStore.PeersPubKey(), 1)
+
+		more, err = engine.updateNetworkMap(nm, 1, false)
+		require.NoError(t, err)
+		require.True(t, more, "pass 2 should signal more")
+		require.Len(t, engine.peerStore.PeersPubKey(), 2)
+
+		more, err = engine.updateNetworkMap(nm, 1, false)
+		require.NoError(t, err)
+		require.False(t, more, "pass 3 should converge")
+		require.Len(t, engine.peerStore.PeersPubKey(), 3)
+	})
+
+	t.Run("chunked remove converges over multiple passes", func(t *testing.T) {
+		nm := &mgmtProto.NetworkMap{
+			Serial:      7,
+			RemotePeers: []*mgmtProto.RemotePeerConfig{peer1}, // remove peer2, peer3
+		}
+
+		more, err := engine.updateNetworkMap(nm, 1, true)
+		require.NoError(t, err)
+		require.True(t, more, "pass 1 should signal more (2 to remove, cap 1)")
+
+		more, err = engine.updateNetworkMap(nm, 1, false)
+		require.NoError(t, err)
+		require.False(t, more, "pass 2 should converge")
+		require.Len(t, engine.peerStore.PeersPubKey(), 1)
+	})
 }

 func TestEngine_UpdateNetworkMapWithRoutes(t *testing.T) {
@@ -634,7 +675,7 @@ func TestEngine_UpdateNetworkMapWithRoutes(t *testing.T) {
 				}
 			}()

-			err = engine.updateNetworkMap(testCase.networkMap)
+			_, err = engine.updateNetworkMap(testCase.networkMap, maxPeersPerSyncPass, true)
 			assert.NoError(t, err, "shouldn't return error")
 			assert.Equal(t, testCase.expectedSerial, input.inputSerial, "serial should match")
 			assert.Len(t, input.clientRoutes, testCase.expectedLen, "clientRoutes len should match")
@@ -838,7 +879,7 @@ func TestEngine_UpdateNetworkMapWithDNSUpdate(t *testing.T) {
 				}
 			}()

-			err = engine.updateNetworkMap(testCase.networkMap)
+			_, err = engine.updateNetworkMap(testCase.networkMap, maxPeersPerSyncPass, true)
 			assert.NoError(t, err, "shouldn't return error")
 			assert.Equal(t, testCase.expectedSerial, input.inputSerial, "serial should match")
 			assert.Len(t, input.inputNSGroups, testCase.expectedZonesLen, "zones len should match")
--- a/client/internal/lazyconn/env.go
+++ b/client/internal/lazyconn/env.go
@@ -3,24 +3,57 @@ package lazyconn
 import (
 	"os"
 	"strconv"
+	"strings"

 	log "github.com/sirupsen/logrus"
 )

 const (
-	EnvEnableLazyConn      = "NB_ENABLE_EXPERIMENTAL_LAZY_CONN"
+	EnvLazyConn            = "NB_LAZY_CONN"
 	EnvInactivityThreshold = "NB_LAZY_CONN_INACTIVITY_THRESHOLD"
 )

-func IsLazyConnEnabledByEnv() bool {
-	val := os.Getenv(EnvEnableLazyConn)
-	if val == "" {
-		return false
-	}
-	enabled, err := strconv.ParseBool(val)
-	if err != nil {
-		log.Warnf("failed to parse %s: %v", EnvEnableLazyConn, err)
-		return false
-	}
-	return enabled
+// State is the tri-state local override for lazy connections read from the environment.
+type State int
+
+const (
+	// StateUnset means no local override; defer to the management feature flag.
+	StateUnset State = iota
+	// StateOn forces lazy connections on, overriding management.
+	StateOn
+	// StateOff forces lazy connections off, overriding management.
+	StateOff
+)
+
+// EnvState reads NB_LAZY_CONN and returns the local override state.
+func EnvState() State {
+	return ParseState(os.Getenv(EnvLazyConn))
+}
+
+// ParseState interprets a lazy-connection override value (from the environment or an MDM
+// policy). It accepts the on/off aliases plus any value strconv.ParseBool understands
+// (true/false/1/0). An empty or unrecognized value returns StateUnset so that the
+// management feature flag remains in control.
+func ParseState(raw string) State {
+	if raw == "" {
+		return StateUnset
+	}
+
+	normalized := strings.ToLower(strings.TrimSpace(raw))
+	switch normalized {
+	case "on":
+		return StateOn
+	case "off":
+		return StateOff
+	}
+
+	enabled, err := strconv.ParseBool(normalized)
+	if err != nil {
+		log.Warnf("failed to parse lazy connection value %q (from %s env or MDM policy): %v", raw, EnvLazyConn, err)
+		return StateUnset
+	}
+	if enabled {
+		return StateOn
+	}
+	return StateOff
 }
--- a/client/internal/lazyconn/env_test.go
+++ b/client/internal/lazyconn/env_test.go
@@ -0,0 +1,45 @@
+package lazyconn
+
+import (
+	"os"
+	"testing"
+)
+
+func TestEnvState(t *testing.T) {
+	tests := []struct {
+		value string
+		set   bool
+		want  State
+	}{
+		{set: false, want: StateUnset},
+		{value: "", set: true, want: StateUnset},
+		{value: "on", set: true, want: StateOn},
+		{value: "ON", set: true, want: StateOn},
+		{value: "true", set: true, want: StateOn},
+		{value: "1", set: true, want: StateOn},
+		{value: " on ", set: true, want: StateOn},
+		{value: "off", set: true, want: StateOff},
+		{value: "OFF", set: true, want: StateOff},
+		{value: "false", set: true, want: StateOff},
+		{value: "0", set: true, want: StateOff},
+		{value: "auto", set: true, want: StateUnset},
+		{value: "garbage", set: true, want: StateUnset},
+	}
+
+	for _, tt := range tests {
+		name := tt.value
+		if !tt.set {
+			name = "unset"
+		}
+		t.Run(name, func(t *testing.T) {
+			t.Setenv(EnvLazyConn, tt.value)
+			if !tt.set {
+				os.Unsetenv(EnvLazyConn)
+			}
+
+			if got := EnvState(); got != tt.want {
+				t.Fatalf("EnvState() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
--- a/client/internal/mapsync.go
+++ b/client/internal/mapsync.go
@@ -0,0 +1,214 @@
+package internal
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	log "github.com/sirupsen/logrus"
+
+	mgmProto "github.com/netbirdio/netbird/shared/management/proto"
+)
+
+// mapStateManager is the single read/write point between the management stream
+// (writes) and the convergence loop (reads/applies).
+//
+// The stream calls SetTarget with the latest full SyncResponse — the complete
+// desired state. A single background goroutine (run) applies it to the engine in
+// bounded passes via apply() until converged, releasing syncMsgMux between passes
+// so other subsystems interleave. If a newer update arrives mid-flight, the loop
+// coalesces: it keeps converging toward the latest target and the intermediate one
+// is SKIPPED — never applied on its own (logged, no onConverged).
+//
+// Convergence is a single comparison: appliedGen == targetGen. targetGen
+// increments on every SetTarget (an internal generation counter, so it also covers
+// config-only updates that carry no network-map serial).
+//
+// onConverged fires once for each — and only each — map that is actually processed
+// (i.e. converged as the target). Skipped/superseded maps and dropped-on-error maps
+// do NOT fire it. So "sync finished in X" / RecordSyncDuration always corresponds
+// to a real, completed alignment.
+type mapStateManager struct {
+	// apply performs one bounded apply pass and reports whether more passes are needed.
+	// firstPass is true on the first pass of a given target, so the caller can run
+	// wholesale (firewall/routes/DNS/forward-rules) once per target and skip it on the
+	// re-runs that only drain the bounded peer batches. The manager owns this signal
+	// because it owns the convergence boundary; the engine need not track serials for it.
+	apply func(update *mgmProto.SyncResponse, firstPass bool) (bool, error)
+	// onConverged is called once per processed map, with the elapsed time since that
+	// map was received (for the sync-duration metric / "sync finished" log).
+	onConverged func(time.Duration)
+	// persist snapshots an update to disk for restore-on-restart. Called once per
+	// update received from management (in SetTarget), including ones later coalesced
+	// or skipped from apply, so the on-disk state mirrors what management last sent.
+	// The impl skips config-only updates (nil NetworkMap). May be nil.
+	persist func(*mgmProto.SyncResponse)
+
+	mu          sync.Mutex
+	target      *mgmProto.SyncResponse
+	targetGen   uint64
+	appliedGen  uint64
+	targetSetAt time.Time
+
+	wake chan struct{}
+}
+
+func newMapStateManager(apply func(update *mgmProto.SyncResponse, firstPass bool) (bool, error), persist func(*mgmProto.SyncResponse), onConverged func(time.Duration)) *mapStateManager {
+	return &mapStateManager{
+		apply:       apply,
+		persist:     persist,
+		onConverged: onConverged,
+		wake:        make(chan struct{}, 1),
+	}
+}
+
+// SetTarget records the latest update as the desired state and wakes the loop.
+// It returns immediately; convergence happens in the background. Serial-based
+// staleness of the network map is still enforced inside apply (updateNetworkMap).
+func (m *mapStateManager) SetTarget(update *mgmProto.SyncResponse) error {
+	m.mu.Lock()
+	// A target that has not settled yet (targetGen > appliedGen) is being superseded
+	// before it converged: we coalesce to the latest map and never apply this one on
+	// its own. It is SKIPPED — logged here, and it will not fire onConverged.
+	if m.target != nil && m.targetGen > m.appliedGen {
+		log.Debugf("sync map (gen %d) superseded before convergence, skipping", m.targetGen)
+	}
+	m.target = m.mergeTarget(m.target, update)
+	// Bump an internal generation counter, NOT the map serial: config-only updates
+	// (relay token rotation, STUN/TURN) arrive with NetworkMap == nil and carry no
+	// serial, yet must still be applied. Every SetTarget is therefore a distinct
+	// target regardless of payload. Map-serial staleness is enforced separately
+	// inside apply (updateNetworkMap).
+	m.targetGen++
+	m.targetSetAt = time.Now()
+	m.mu.Unlock()
+
+	select {
+	case m.wake <- struct{}{}:
+	default:
+	}
+
+	// Persist every update received from management — once per update (not per apply
+	// pass), and including ones that get coalesced/skipped from apply, so the on-disk
+	// state always reflects the latest map management sent. Done after waking the loop
+	// so convergence can start in parallel with the disk write. The persist impl skips
+	// config-only updates (nil NetworkMap).
+	if m.persist != nil {
+		m.persist(update)
+	}
+	return nil
+}
+
+// mergeTarget combines the currently pending target with a freshly received update
+// and returns the new desired state. It is called under m.mu from SetTarget and is
+// the single seam where the replace-vs-squash decision lives.
+//
+// Today management always sends a FULL map (the complete desired state), so the
+// update simply replaces whatever was pending — prev is ignored. When management
+// starts sending incremental/delta updates, squash `update` onto `prev` here; the
+// rest of the manager (generation tracking, convergence, signaling) is unaffected
+// because it already treats target as "the complete desired state, whatever it is".
+func (m *mapStateManager) mergeTarget(prev, update *mgmProto.SyncResponse) *mgmProto.SyncResponse {
+	// Nothing pending to preserve (no prev, or prev already fully applied): plain replace.
+	if prev == nil || update == nil || m.targetGen == m.appliedGen {
+		return update
+	}
+
+	// prev still has unapplied state (targetGen > appliedGen). In the sync protocol a
+	// nil component means "no change", so if `update` omits a component that prev
+	// carried, carry prev's forward — otherwise coalescing an update that superseded a
+	// not-yet-applied one would silently drop the map or config it uniquely brought.
+	// A present component in `update` is newer and wins. Management may send map-only
+	// updates (nil config) and config-only updates (nil map); both are handled here.
+	// A nil component in `update` means "no change", so fill it in from prev — otherwise
+	// coalescing an update that superseded a not-yet-applied one would drop the map or
+	// config it uniquely carried. A present component in `update` is newer and wins.
+	// We mutate `update` in place: it is a fresh per-message allocation from the sync
+	// stream (see receiveUpdatesEvents — not reused), and persisting this squashed target
+	// is correct, since it is the current full (superset) desired state.
+	if update.GetNetworkMap() == nil && prev.GetNetworkMap() != nil {
+		update.NetworkMap = prev.GetNetworkMap()
+		update.Checks = prev.Checks // checks travel with the map
+	}
+	if update.GetNetbirdConfig() == nil && prev.GetNetbirdConfig() != nil {
+		update.NetbirdConfig = prev.GetNetbirdConfig()
+	}
+	return update
+}
+
+// run drives convergence until ctx is done. It is meant to run in its own goroutine.
+func (m *mapStateManager) run(ctx context.Context) {
+	// passGen is the generation of the most recent apply() call (0 = none). A pass is
+	// the first for its target when its generation differs from the previous one —
+	// true on a fresh target and on a coalesced switch to a newer target mid-flight.
+	var passGen uint64
+	for {
+		m.mu.Lock()
+		target, tg, ag := m.target, m.targetGen, m.appliedGen
+		m.mu.Unlock()
+
+		// Fully converged (or nothing yet): block until a new target arrives.
+		if target == nil || ag == tg {
+			select {
+			case <-ctx.Done():
+				return
+			case <-m.wake:
+				continue
+			}
+		}
+
+		firstPass := tg != passGen
+		passGen = tg
+		more, err := m.apply(target, firstPass)
+		if err != nil {
+			if ctx.Err() != nil {
+				return
+			}
+			// Log and DROP this target — do not retry it. A deterministic failure
+			// (e.g. a malformed peer in the map) would otherwise spin every pass
+			// making no progress. Management is the source of truth and re-delivers
+			// the full map on the next sync, so dropping is safe; peers already
+			// applied this convergence stay (idempotent diffs) and the remainder is
+			// reconciled by the next target. Mirrors the legacy handleSync path,
+			// where the apply error was logged by the gRPC client and the update
+			// dropped. No onConverged: this target did not converge.
+			log.Errorf("apply sync pass, dropping update: %v", err)
+			m.settle(tg, false)
+			continue
+		}
+
+		if more {
+			// keep converging the current target; syncMsgMux was released by apply
+			// between passes so other subsystems interleave.
+			continue
+		}
+
+		// This pass converged. Mark applied and signal this one map.
+		m.settle(tg, true)
+		// if a newer target arrived mid-pass, settle is a no-op (targetGen != tg) and
+		// ag<tg next iteration -> apply it; this generation was skipped (logged in
+		// SetTarget) and is not signaled.
+	}
+}
+
+// settle marks generation tg as processed so the loop goes idle instead of
+// re-applying the same target. It is a no-op when a newer target arrived during the
+// pass (targetGen != tg), leaving appliedGen behind so that target re-applies — the
+// just-finished generation was already counted as skipped.
+//
+// When signal is true (the pass converged) it fires onConverged once for this map;
+// when false (the target was dropped on error) it does not — the map did not converge.
+func (m *mapStateManager) settle(tg uint64, signal bool) {
+	m.mu.Lock()
+	if m.targetGen != tg {
+		m.mu.Unlock()
+		return
+	}
+	m.appliedGen = tg
+	setAt := m.targetSetAt
+	m.mu.Unlock()
+
+	if signal && m.onConverged != nil {
+		m.onConverged(time.Since(setAt))
+	}
+}
--- a/client/internal/mapsync_test.go
+++ b/client/internal/mapsync_test.go
@@ -0,0 +1,281 @@
+package internal
+
+import (
+	"context"
+	"errors"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+
+	mgmProto "github.com/netbirdio/netbird/shared/management/proto"
+)
+
+// mergeTarget fills components missing from the incoming update with the pending
+// (not-yet-applied) prev's, in place, so a coalesced/superseded update does not drop
+// the map or config it uniquely carried.
+func TestMapStateManager_MergeTargetPreservesPendingState(t *testing.T) {
+	m := newMapStateManager(nil, nil, nil)
+
+	// config-only update while a full map is still converging (targetGen > appliedGen):
+	// the pending map (+ checks) is filled into the update in place
+	m.targetGen, m.appliedGen = 5, 4
+	prev := &mgmProto.SyncResponse{NetworkMap: &mgmProto.NetworkMap{Serial: 5}}
+	update := &mgmProto.SyncResponse{NetbirdConfig: &mgmProto.NetbirdConfig{}}
+	merged := m.mergeTarget(prev, update)
+	require.Same(t, update, merged, "merges in place, returns the update")
+	require.EqualValues(t, 5, merged.GetNetworkMap().GetSerial(), "pending map preserved")
+	require.NotNil(t, merged.GetNetbirdConfig(), "new config kept")
+
+	// symmetric: map-only update while a config-only update is pending -> keep the config
+	m.targetGen, m.appliedGen = 5, 4
+	prev = &mgmProto.SyncResponse{NetbirdConfig: &mgmProto.NetbirdConfig{}}
+	update = &mgmProto.SyncResponse{NetworkMap: &mgmProto.NetworkMap{Serial: 7}}
+	merged = m.mergeTarget(prev, update)
+	require.EqualValues(t, 7, merged.GetNetworkMap().GetSerial(), "new map kept")
+	require.NotNil(t, merged.GetNetbirdConfig(), "pending config preserved")
+
+	// prev already applied (targetGen == appliedGen): plain replace, no fill-in
+	m.targetGen, m.appliedGen = 5, 5
+	prev = &mgmProto.SyncResponse{NetworkMap: &mgmProto.NetworkMap{Serial: 5}}
+	update = &mgmProto.SyncResponse{NetbirdConfig: &mgmProto.NetbirdConfig{}}
+	merged = m.mergeTarget(prev, update)
+	require.Same(t, update, merged)
+	require.Nil(t, merged.GetNetworkMap(), "no map grafted when prev already applied")
+
+	// nothing to carry (update has a map, prev has no config): plain replace
+	m.targetGen, m.appliedGen = 5, 4
+	prev = &mgmProto.SyncResponse{NetworkMap: &mgmProto.NetworkMap{Serial: 5}}
+	update = &mgmProto.SyncResponse{NetworkMap: &mgmProto.NetworkMap{Serial: 6}}
+	require.Same(t, update, m.mergeTarget(prev, update))
+}
+
+// converges over the bounded passes (apply returns more until the 3rd pass),
+// fires onConverged exactly once, then blocks (no further apply) until a new target.
+func TestMapStateManager_ConvergesThenStops(t *testing.T) {
+	var passes int32
+	var firstPasses int32
+	converged := make(chan struct{}, 1)
+
+	apply := func(_ *mgmProto.SyncResponse, firstPass bool) (bool, error) {
+		n := atomic.AddInt32(&passes, 1)
+		if firstPass {
+			atomic.AddInt32(&firstPasses, 1)
+		}
+		return n < 3, nil // more on pass 1 and 2, converge on pass 3
+	}
+	m := newMapStateManager(apply, nil, func(time.Duration) { converged <- struct{}{} })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+
+	select {
+	case <-converged:
+	case <-time.After(2 * time.Second):
+		t.Fatal("manager did not converge")
+	}
+	require.EqualValues(t, 3, atomic.LoadInt32(&passes))
+	require.EqualValues(t, 1, atomic.LoadInt32(&firstPasses), "firstPass true only on pass 1, false on re-runs of the same target")
+
+	// once converged the loop blocks: no further apply calls
+	time.Sleep(100 * time.Millisecond)
+	require.EqualValues(t, 3, atomic.LoadInt32(&passes), "apply must not run after convergence")
+}
+
+// persist runs once per received update (not per apply pass), regardless of how many
+// bounded passes that target takes to converge.
+func TestMapStateManager_PersistsOncePerUpdate(t *testing.T) {
+	var passes, persists int32
+	converged := make(chan struct{}, 1)
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		n := atomic.AddInt32(&passes, 1)
+		return n < 3, nil // 3 passes for one target
+	}
+	persist := func(*mgmProto.SyncResponse) { atomic.AddInt32(&persists, 1) }
+	m := newMapStateManager(apply, persist, func(time.Duration) { converged <- struct{}{} })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	select {
+	case <-converged:
+	case <-time.After(2 * time.Second):
+		t.Fatal("did not converge")
+	}
+	require.EqualValues(t, 3, atomic.LoadInt32(&passes))
+	require.EqualValues(t, 1, atomic.LoadInt32(&persists), "persist once per update, not per pass")
+}
+
+// every update received from management is persisted — even one that is coalesced /
+// skipped from apply before it ever converges.
+func TestMapStateManager_PersistsEveryUpdateIncludingSkipped(t *testing.T) {
+	release := make(chan struct{})
+	var persists int32
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		<-release // hold the first apply so the second update coalesces/skips
+		return false, nil
+	}
+	persist := func(*mgmProto.SyncResponse) { atomic.AddInt32(&persists, 1) }
+	m := newMapStateManager(apply, persist, nil)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{})) // map1 -> apply blocks
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{})) // map2 supersedes map1 (skipped from apply)
+	close(release)
+
+	// both updates persisted even though map1 is skipped from apply
+	require.Eventually(t, func() bool { return atomic.LoadInt32(&persists) == 2 }, 2*time.Second, 10*time.Millisecond)
+}
+
+// each map that is actually processed (converged before the next arrives) fires
+// onConverged exactly once — mirroring the legacy per-message handleSync timing.
+func TestMapStateManager_SignalsEachProcessedMap(t *testing.T) {
+	converged := make(chan struct{}, 8)
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		return false, nil // converge in one pass
+	}
+	m := newMapStateManager(apply, nil, func(time.Duration) { converged <- struct{}{} })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	const maps = 3
+	for i := 0; i < maps; i++ {
+		require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+		select { // wait for this map to converge before sending the next (no coalescing)
+		case <-converged:
+		case <-time.After(2 * time.Second):
+			t.Fatalf("map %d not signaled", i)
+		}
+	}
+
+	// no extra signals once the stream goes quiet
+	select {
+	case <-converged:
+		t.Fatal("unexpected extra onConverged")
+	case <-time.After(100 * time.Millisecond):
+	}
+}
+
+// a map superseded before it converges is skipped: only the latest (processed) map
+// fires onConverged, not the skipped one.
+func TestMapStateManager_SkippedMapNotSignaled(t *testing.T) {
+	release := make(chan struct{})
+	var applies, converged atomic.Int32
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		applies.Add(1)
+		<-release // hold the first apply in-flight so we can queue a newer target
+		return false, nil
+	}
+	m := newMapStateManager(apply, nil, func(time.Duration) { converged.Add(1) })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	// map1 is picked up; its apply blocks on release
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	require.Eventually(t, func() bool { return applies.Load() >= 1 }, 2*time.Second, 5*time.Millisecond)
+
+	// map2 supersedes map1 before it settled -> map1 is skipped
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	close(release) // let both applies proceed
+
+	// only the processed (latest) map signals; the skipped one does not
+	require.Eventually(t, func() bool { return converged.Load() == 1 }, 2*time.Second, 10*time.Millisecond)
+	time.Sleep(150 * time.Millisecond)
+	require.EqualValues(t, 1, converged.Load(), "skipped map must not fire onConverged")
+	require.EqualValues(t, 2, applies.Load(), "both targets entered apply (map1 once, map2 once)")
+}
+
+// an apply error drops the target: no retry of the same target, no onConverged,
+// the loop goes idle — and a fresh target is still applied afterwards.
+func TestMapStateManager_DropsTargetOnError(t *testing.T) {
+	applied := make(chan struct{}, 8)
+	var failNext atomic.Bool
+	failNext.Store(true)
+
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		applied <- struct{}{}
+		if failNext.Load() {
+			return false, errors.New("boom")
+		}
+		return false, nil // converge in one pass
+	}
+	var converged atomic.Int32
+	m := newMapStateManager(apply, nil, func(time.Duration) { converged.Add(1) })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	// first target errors -> applied once, then dropped (no retry, no onConverged)
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	select {
+	case <-applied:
+	case <-time.After(2 * time.Second):
+		t.Fatal("errored target not applied")
+	}
+	select {
+	case <-applied:
+		t.Fatal("errored target must not be retried")
+	case <-time.After(150 * time.Millisecond):
+	}
+	require.EqualValues(t, 0, converged.Load(), "onConverged must not fire on error")
+
+	// a new target is still processed normally and converges
+	failNext.Store(false)
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	select {
+	case <-applied:
+	case <-time.After(2 * time.Second):
+		t.Fatal("new target after error not applied")
+	}
+	require.Eventually(t, func() bool { return converged.Load() == 1 }, 2*time.Second, 10*time.Millisecond)
+}
+
+// a new target after convergence triggers a fresh apply; an idle (converged)
+// manager does not apply on its own.
+func TestMapStateManager_ReappliesOnNewTarget(t *testing.T) {
+	applied := make(chan struct{}, 8)
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		applied <- struct{}{}
+		return false, nil // converge in one pass
+	}
+	m := newMapStateManager(apply, nil, nil)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	select {
+	case <-applied:
+	case <-time.After(2 * time.Second):
+		t.Fatal("first target not applied")
+	}
+
+	// converged → must stay idle (no spurious apply)
+	select {
+	case <-applied:
+		t.Fatal("unexpected apply while idle/converged")
+	case <-time.After(150 * time.Millisecond):
+	}
+
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	select {
+	case <-applied:
+	case <-time.After(2 * time.Second):
+		t.Fatal("new target not applied")
+	}
+}
--- a/client/internal/profilemanager/config.go
+++ b/client/internal/profilemanager/config.go
@@ -101,8 +101,6 @@ type ConfigInput struct {

 	DNSLabels domain.List

-	LazyConnectionEnabled *bool
-
 	MTU *uint16
 }

@@ -180,7 +178,9 @@ type Config struct {

 	ClientCertKeyPair *tls.Certificate `json:"-"`

-	LazyConnectionEnabled bool
+	// LazyConnection is the MDM-managed lazy-connection override ("on"/"off"/"").
+	// Runtime-only: re-derived from MDM policy on each load, never persisted.
+	LazyConnection string `json:"-"`

 	MTU uint16

@@ -632,12 +632,6 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
 		updated = true
 	}

-	if input.LazyConnectionEnabled != nil && *input.LazyConnectionEnabled != config.LazyConnectionEnabled {
-		log.Infof("switching lazy connection to %t", *input.LazyConnectionEnabled)
-		config.LazyConnectionEnabled = *input.LazyConnectionEnabled
-		updated = true
-	}
-
 	if input.MTU != nil && *input.MTU != config.MTU {
 		log.Infof("updating MTU to %d (old value %d)", *input.MTU, config.MTU)
 		config.MTU = *input.MTU
@@ -728,6 +722,15 @@ func (config *Config) applyMDMPolicy(policy *mdm.Policy) {
 			log.Warnf("MDM wireguard port %d out of range [1,65535]; keeping previous value", v)
 		}
 	}
+
+	if v, ok := policy.GetBool(mdm.KeyLazyConnection); ok {
+		state := "off"
+		if v {
+			state = "on"
+		}
+		config.LazyConnection = state
+		logApplied(mdm.KeyLazyConnection, state)
+	}
 }

 // parseURL parses and validates the URL for the named service. The URL
--- a/client/internal/profilemanager/config_mdm_test.go
+++ b/client/internal/profilemanager/config_mdm_test.go
@@ -130,6 +130,37 @@ func TestApply_MDMBoolKeysOverrideOnDiskValue(t *testing.T) {
 	assert.True(t, cfg.Policy().HasKey(mdm.KeyRosenpassEnabled))
 }

+func TestApply_MDMLazyConnection(t *testing.T) {
+	cases := []struct {
+		name string
+		raw  any
+		want string
+	}{
+		{"native true", true, "on"},
+		{"native false", false, "off"},
+		{"string on", "on", "on"},
+		{"string off", "off", "off"},
+		{"string yes", "yes", "on"},
+		{"string no", "no", "off"},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			withMDMPolicy(t, mdm.NewPolicy(map[string]any{
+				mdm.KeyLazyConnection: c.raw,
+			}))
+
+			cfg, err := UpdateOrCreateConfig(ConfigInput{
+				ConfigPath: filepath.Join(t.TempDir(), "config.json"),
+			})
+			require.NoError(t, err)
+			require.NotNil(t, cfg)
+
+			assert.Equal(t, c.want, cfg.LazyConnection)
+			assert.True(t, cfg.Policy().HasKey(mdm.KeyLazyConnection))
+		})
+	}
+}
+
 func TestApply_MDMPreSharedKeyRedactionSentinelRejected(t *testing.T) {
 	const maskSentinel = "**********"

--- a/client/ios/NetBirdSDK/env_list.go
+++ b/client/ios/NetBirdSDK/env_list.go
@@ -38,7 +38,7 @@ func GetEnvKeyNBForceRelay() string {

 // GetEnvKeyNBLazyConn Exports the environment variable for the iOS client
 func GetEnvKeyNBLazyConn() string {
-	return lazyconn.EnvEnableLazyConn
+	return lazyconn.EnvLazyConn
 }

 // GetEnvKeyNBInactivityThreshold Exports the environment variable for the iOS client
--- a/client/mdm/canonical_loaders.go
+++ b/client/mdm/canonical_loaders.go
@@ -27,6 +27,7 @@ var allKeys = []string{
 	KeyWireguardPort,
 	KeySplitTunnelMode,
 	KeySplitTunnelApps,
+	KeyLazyConnection,
 }

 // canonicalKey maps the lowercase form of a managed-config value name to
--- a/client/mdm/policy.go
+++ b/client/mdm/policy.go
@@ -11,6 +11,7 @@ package mdm
 import (
 	"sort"
 	"strconv"
+	"strings"

 	log "github.com/sirupsen/logrus"
 )
@@ -41,6 +42,11 @@ const (
 	// construction — only one mode can be set at a time.
 	KeySplitTunnelMode = "splitTunnelMode"
 	KeySplitTunnelApps = "splitTunnelApps"
+
+	// KeyLazyConnection forces the lazy-connection feature on or off, overriding
+	// the management feature flag. Read as a bool (native bool, or on/off,
+	// true/false, 1/0, yes/no); absent = defer to management.
+	KeyLazyConnection = "lazyConnection"
 )

 // Split-tunnel mode literals (KeySplitTunnelMode values).
@@ -62,12 +68,13 @@ var boolStringLiterals = map[string]bool{
 	"true":  true,
 	"1":     true,
 	"yes":   true,
+	"on":    true,
 	"false": false,
 	"0":     false,
 	"no":    false,
+	"off":   false,
 }

-
 // Policy holds MDM-managed settings read from the platform source. A nil or
 // empty Policy means no enforcement is active.
 type Policy struct {
@@ -150,7 +157,8 @@ func (p *Policy) GetString(key string) (string, bool) {
 }

 // GetBool returns the managed value for key coerced to bool, and whether the
-// key was set. Accepts native bool and string literals "true"/"false"/"1"/"0".
+// key was set. Accepts native bool and string literals (true/false, 1/0,
+// yes/no, on/off), case-insensitively and trimmed of surrounding whitespace.
 func (p *Policy) GetBool(key string) (bool, bool) {
 	if p == nil {
 		return false, false
@@ -163,7 +171,7 @@ func (p *Policy) GetBool(key string) (bool, bool) {
 	case bool:
 		return t, true
 	case string:
-		b, known := boolStringLiterals[t]
+		b, known := boolStringLiterals[strings.ToLower(strings.TrimSpace(t))]
 		return b, known
 	case int:
 		return t != 0, true
--- a/client/mdm/policy_test.go
+++ b/client/mdm/policy_test.go
@@ -31,8 +31,8 @@ func TestPolicy_Empty(t *testing.T) {

 func TestPolicy_HasKey(t *testing.T) {
 	p := NewPolicy(map[string]any{
-		KeyManagementURL:    "https://corp.example.com",
-		KeyDisableProfiles:  true,
+		KeyManagementURL:   "https://corp.example.com",
+		KeyDisableProfiles: true,
 	})
 	assert.False(t, p.IsEmpty())
 	assert.True(t, p.HasKey(KeyManagementURL))
@@ -53,8 +53,8 @@ func TestPolicy_ManagedKeysSorted(t *testing.T) {
 func TestPolicy_GetString(t *testing.T) {
 	p := NewPolicy(map[string]any{
 		KeyManagementURL:   "https://corp.example.com",
-		KeyDisableProfiles: true,            // wrong type for GetString
-		KeyPreSharedKey:        "",              // empty rejected
+		KeyDisableProfiles: true, // wrong type for GetString
+		KeyPreSharedKey:    "",   // empty rejected
 	})
 	v, ok := p.GetString(KeyManagementURL)
 	assert.True(t, ok)
@@ -85,6 +85,11 @@ func TestPolicy_GetBool(t *testing.T) {
 		{"string 0", "0", false, true},
 		{"string yes", "yes", true, true},
 		{"string no", "no", false, true},
+		{"string on", "on", true, true},
+		{"string off", "off", false, true},
+		{"mixed case On", "On", true, true},
+		{"upper TRUE", "TRUE", true, true},
+		{"padded yes", "  yes  ", true, true},
 		{"int nonzero", 1, true, true},
 		{"int zero", 0, false, true},
 		{"int64 nonzero", int64(2), true, true},
--- a/client/server/mdm.go
+++ b/client/server/mdm.go
@@ -152,7 +152,6 @@ func (s *Server) restartEngineForMDMLocked() error {
 	s.config = config
 	s.statusRecorder.UpdateManagementAddress(config.ManagementURL.String())
 	s.statusRecorder.UpdateRosenpass(config.RosenpassEnabled, config.RosenpassPermissive)
-	s.statusRecorder.UpdateLazyConnection(config.LazyConnectionEnabled)

 	ctx, cancel := context.WithCancel(s.rootCtx)
 	s.actCancel = cancel
@@ -305,7 +304,6 @@ func setConfigRequestHasConfigOverrides(msg *proto.SetConfigRequest) bool {
 		msg.DisableFirewall != nil ||
 		msg.BlockLanAccess != nil ||
 		msg.DisableNotifications != nil ||
-		msg.LazyConnectionEnabled != nil ||
 		msg.BlockInbound != nil ||
 		msg.DisableIpv6 != nil ||
 		msg.EnableSSHRoot != nil ||
@@ -348,7 +346,6 @@ func loginRequestHasConfigOverrides(msg *proto.LoginRequest) bool {
 		msg.BlockLanAccess != nil ||
 		msg.DisableNotifications != nil ||
 		len(msg.DnsLabels) > 0 || msg.CleanDNSLabels ||
-		msg.LazyConnectionEnabled != nil ||
 		msg.BlockInbound != nil
 }

--- a/client/server/server.go
+++ b/client/server/server.go
@@ -214,7 +214,6 @@ func (s *Server) Start() error {

 	s.statusRecorder.UpdateManagementAddress(config.ManagementURL.String())
 	s.statusRecorder.UpdateRosenpass(config.RosenpassEnabled, config.RosenpassPermissive)
-	s.statusRecorder.UpdateLazyConnection(config.LazyConnectionEnabled)

 	if s.sessionWatcher == nil {
 		s.sessionWatcher = internal.NewSessionWatcher(s.rootCtx, s.statusRecorder)
@@ -463,7 +462,6 @@ func (s *Server) setConfigInputFromRequest(msg *proto.SetConfigRequest) (profile
 	config.DisableFirewall = msg.DisableFirewall
 	config.BlockLANAccess = msg.BlockLanAccess
 	config.DisableNotifications = msg.DisableNotifications
-	config.LazyConnectionEnabled = msg.LazyConnectionEnabled
 	config.BlockInbound = msg.BlockInbound
 	config.DisableIPv6 = msg.DisableIpv6
 	config.EnableSSHRoot = msg.EnableSSHRoot
@@ -1647,7 +1645,6 @@ func (s *Server) GetConfig(ctx context.Context, req *proto.GetConfigRequest) (*p
 		ServerSSHAllowed:              *cfg.ServerSSHAllowed,
 		RosenpassEnabled:              cfg.RosenpassEnabled,
 		RosenpassPermissive:           cfg.RosenpassPermissive,
-		LazyConnectionEnabled:         cfg.LazyConnectionEnabled,
 		BlockInbound:                  cfg.BlockInbound,
 		DisableNotifications:          disableNotifications,
 		NetworkMonitor:                networkMonitor,
--- a/client/server/setconfig_test.go
+++ b/client/server/setconfig_test.go
@@ -69,43 +69,41 @@ func TestSetConfig_AllFieldsSaved(t *testing.T) {
 	disableFirewall := true
 	blockLANAccess := true
 	disableNotifications := true
-	lazyConnectionEnabled := true
 	blockInbound := true
 	disableIPv6 := true
 	mtu := int64(1280)
 	sshJWTCacheTTL := int32(300)

 	req := &proto.SetConfigRequest{
-		ProfileName:           profName,
-		Username:              currUser.Username,
-		ManagementUrl:         "https://new-api.netbird.io:443",
-		AdminURL:              "https://new-admin.netbird.io",
-		RosenpassEnabled:      &rosenpassEnabled,
-		RosenpassPermissive:   &rosenpassPermissive,
-		ServerSSHAllowed:      &serverSSHAllowed,
-		InterfaceName:         &interfaceName,
-		WireguardPort:         &wireguardPort,
-		OptionalPreSharedKey:  &preSharedKey,
-		DisableAutoConnect:    &disableAutoConnect,
-		NetworkMonitor:        &networkMonitor,
-		DisableClientRoutes:   &disableClientRoutes,
-		DisableServerRoutes:   &disableServerRoutes,
-		DisableDns:            &disableDNS,
-		DisableFirewall:       &disableFirewall,
-		BlockLanAccess:        &blockLANAccess,
-		DisableNotifications:  &disableNotifications,
-		LazyConnectionEnabled: &lazyConnectionEnabled,
-		BlockInbound:          &blockInbound,
-		DisableIpv6:           &disableIPv6,
-		NatExternalIPs:        []string{"1.2.3.4", "5.6.7.8"},
-		CleanNATExternalIPs:   false,
-		CustomDNSAddress:      []byte("1.1.1.1:53"),
-		ExtraIFaceBlacklist:   []string{"eth1", "eth2"},
-		DnsLabels:             []string{"label1", "label2"},
-		CleanDNSLabels:        false,
-		DnsRouteInterval:      durationpb.New(2 * time.Minute),
-		Mtu:                   &mtu,
-		SshJWTCacheTTL:        &sshJWTCacheTTL,
+		ProfileName:          profName,
+		Username:             currUser.Username,
+		ManagementUrl:        "https://new-api.netbird.io:443",
+		AdminURL:             "https://new-admin.netbird.io",
+		RosenpassEnabled:     &rosenpassEnabled,
+		RosenpassPermissive:  &rosenpassPermissive,
+		ServerSSHAllowed:     &serverSSHAllowed,
+		InterfaceName:        &interfaceName,
+		WireguardPort:        &wireguardPort,
+		OptionalPreSharedKey: &preSharedKey,
+		DisableAutoConnect:   &disableAutoConnect,
+		NetworkMonitor:       &networkMonitor,
+		DisableClientRoutes:  &disableClientRoutes,
+		DisableServerRoutes:  &disableServerRoutes,
+		DisableDns:           &disableDNS,
+		DisableFirewall:      &disableFirewall,
+		BlockLanAccess:       &blockLANAccess,
+		DisableNotifications: &disableNotifications,
+		BlockInbound:         &blockInbound,
+		DisableIpv6:          &disableIPv6,
+		NatExternalIPs:       []string{"1.2.3.4", "5.6.7.8"},
+		CleanNATExternalIPs:  false,
+		CustomDNSAddress:     []byte("1.1.1.1:53"),
+		ExtraIFaceBlacklist:  []string{"eth1", "eth2"},
+		DnsLabels:            []string{"label1", "label2"},
+		CleanDNSLabels:       false,
+		DnsRouteInterval:     durationpb.New(2 * time.Minute),
+		Mtu:                  &mtu,
+		SshJWTCacheTTL:       &sshJWTCacheTTL,
 	}

 	_, err = s.SetConfig(ctx, req)
@@ -140,7 +138,6 @@ func TestSetConfig_AllFieldsSaved(t *testing.T) {
 	require.Equal(t, blockLANAccess, cfg.BlockLANAccess)
 	require.NotNil(t, cfg.DisableNotifications)
 	require.Equal(t, disableNotifications, *cfg.DisableNotifications)
-	require.Equal(t, lazyConnectionEnabled, cfg.LazyConnectionEnabled)
 	require.Equal(t, blockInbound, cfg.BlockInbound)
 	require.Equal(t, disableIPv6, cfg.DisableIPv6)
 	require.Equal(t, []string{"1.2.3.4", "5.6.7.8"}, cfg.NATExternalIPs)
@@ -164,13 +161,14 @@ func verifyAllFieldsCovered(t *testing.T, req *proto.SetConfigRequest) {
 	t.Helper()

 	metadataFields := map[string]bool{
-		"state":               true, // protobuf internal
-		"sizeCache":           true, // protobuf internal
-		"unknownFields":       true, // protobuf internal
-		"Username":            true, // metadata
-		"ProfileName":         true, // metadata
-		"CleanNATExternalIPs": true, // control flag for clearing
-		"CleanDNSLabels":      true, // control flag for clearing
+		"state":                 true, // protobuf internal
+		"sizeCache":             true, // protobuf internal
+		"unknownFields":         true, // protobuf internal
+		"Username":              true, // metadata
+		"ProfileName":           true, // metadata
+		"CleanNATExternalIPs":   true, // control flag for clearing
+		"CleanDNSLabels":        true, // control flag for clearing
+		"LazyConnectionEnabled": true, // deprecated: proto field retained for compat, no longer applied
 	}

 	expectedFields := map[string]bool{
@@ -190,7 +188,6 @@ func verifyAllFieldsCovered(t *testing.T, req *proto.SetConfigRequest) {
 		"DisableFirewall":               true,
 		"BlockLanAccess":                true,
 		"DisableNotifications":          true,
-		"LazyConnectionEnabled":         true,
 		"BlockInbound":                  true,
 		"DisableIpv6":                   true,
 		"NatExternalIPs":                true,
@@ -252,7 +249,6 @@ func TestCLIFlags_MappedToSetConfig(t *testing.T) {
 		"block-lan-access":                  "BlockLanAccess",
 		"block-inbound":                     "BlockInbound",
 		"disable-ipv6":                      "DisableIpv6",
-		"enable-lazy-connection":            "LazyConnectionEnabled",
 		"external-ip-map":                   "NatExternalIPs",
 		"dns-resolver-address":              "CustomDNSAddress",
 		"extra-iface-blacklist":             "ExtraIFaceBlacklist",
@@ -269,7 +265,8 @@ func TestCLIFlags_MappedToSetConfig(t *testing.T) {

 	// SetConfigRequest fields that don't have CLI flags (settable only via UI or other means).
 	fieldsWithoutCLIFlags := map[string]bool{
-		"DisableNotifications": true, // Only settable via UI
+		"DisableNotifications":  true, // Only settable via UI
+		"LazyConnectionEnabled": true, // deprecated: no longer settable (managed by server + NB_LAZY_CONN)
 	}

 	// Get all SetConfigRequest fields to verify our map is complete.
--- a/client/system/info.go
+++ b/client/system/info.go
@@ -74,8 +74,6 @@ type Info struct {
 	BlockInbound        bool
 	DisableIPv6         bool

-	LazyConnectionEnabled bool
-
 	EnableSSHRoot                 bool
 	EnableSSHSFTP                 bool
 	EnableSSHLocalPortForwarding  bool
@@ -87,7 +85,7 @@ func (i *Info) SetFlags(
 	rosenpassEnabled, rosenpassPermissive bool,
 	serverSSHAllowed *bool,
 	disableClientRoutes, disableServerRoutes,
-	disableDNS, disableFirewall, blockLANAccess, blockInbound, disableIPv6, lazyConnectionEnabled bool,
+	disableDNS, disableFirewall, blockLANAccess, blockInbound, disableIPv6 bool,
 	enableSSHRoot, enableSSHSFTP, enableSSHLocalPortForwarding, enableSSHRemotePortForwarding *bool,
 	disableSSHAuth *bool,
 ) {
@@ -105,8 +103,6 @@ func (i *Info) SetFlags(
 	i.BlockInbound = blockInbound
 	i.DisableIPv6 = disableIPv6

-	i.LazyConnectionEnabled = lazyConnectionEnabled
-
 	if enableSSHRoot != nil {
 		i.EnableSSHRoot = *enableSSHRoot
 	}
--- a/client/ui/client_ui.go
+++ b/client/ui/client_ui.go
@@ -266,7 +266,6 @@ type serviceClient struct {
 	mAllowSSH          *systray.MenuItem
 	mAutoConnect       *systray.MenuItem
 	mEnableRosenpass   *systray.MenuItem
-	mLazyConnEnabled   *systray.MenuItem
 	mBlockInbound      *systray.MenuItem
 	mNotifications     *systray.MenuItem
 	mAdvancedSettings  *systray.MenuItem
@@ -336,11 +335,11 @@ type serviceClient struct {
 	// mNetworks + mExitNode submenu items. Combines features.DisableNetworks
 	// AND s.connected — both must be true for the menus to be active.
 	// Zero value (false) matches the Disable() call at AddMenuItem time.
-	networksMenuEnabled  bool
-	showNetworks         bool
-	wNetworks            fyne.Window
-	wProfiles            fyne.Window
-	wQuickActions        fyne.Window
+	networksMenuEnabled bool
+	showNetworks        bool
+	wNetworks           fyne.Window
+	wProfiles           fyne.Window
+	wQuickActions       fyne.Window

 	eventManager *event.Manager

@@ -1094,7 +1093,6 @@ func (s *serviceClient) onTrayReady() {
 	s.mAllowSSH = s.mSettings.AddSubMenuItemCheckbox("Allow SSH", allowSSHMenuDescr, false)
 	s.mAutoConnect = s.mSettings.AddSubMenuItemCheckbox("Connect on Startup", autoConnectMenuDescr, false)
 	s.mEnableRosenpass = s.mSettings.AddSubMenuItemCheckbox("Enable Quantum-Resistance", quantumResistanceMenuDescr, false)
-	s.mLazyConnEnabled = s.mSettings.AddSubMenuItemCheckbox("Enable Lazy Connections", lazyConnMenuDescr, false)
 	s.mBlockInbound = s.mSettings.AddSubMenuItemCheckbox("Block Inbound Connections", blockInboundMenuDescr, false)
 	s.mNotifications = s.mSettings.AddSubMenuItemCheckbox("Notifications", notificationsMenuDescr, false)
 	s.mSettings.AddSeparator()
@@ -1578,7 +1576,6 @@ func protoConfigToConfig(cfg *proto.GetConfigResponse) *profilemanager.Config {
 	config.RosenpassEnabled = cfg.RosenpassEnabled
 	config.RosenpassPermissive = cfg.RosenpassPermissive
 	config.DisableNotifications = &cfg.DisableNotifications
-	config.LazyConnectionEnabled = cfg.LazyConnectionEnabled
 	config.BlockInbound = cfg.BlockInbound
 	config.NetworkMonitor = &cfg.NetworkMonitor
 	config.DisableDNS = cfg.DisableDns
@@ -1682,12 +1679,6 @@ func (s *serviceClient) loadSettings() {
 		s.mEnableRosenpass.Uncheck()
 	}

-	if cfg.LazyConnectionEnabled {
-		s.mLazyConnEnabled.Check()
-	} else {
-		s.mLazyConnEnabled.Uncheck()
-	}
-
 	if cfg.BlockInbound {
 		s.mBlockInbound.Check()
 	} else {
@@ -1833,7 +1824,6 @@ func (s *serviceClient) updateConfig() error {
 	disableAutoStart := !s.mAutoConnect.Checked()
 	sshAllowed := s.mAllowSSH.Checked()
 	rosenpassEnabled := s.mEnableRosenpass.Checked()
-	lazyConnectionEnabled := s.mLazyConnEnabled.Checked()
 	blockInbound := s.mBlockInbound.Checked()
 	notificationsDisabled := !s.mNotifications.Checked()

@@ -1856,14 +1846,13 @@ func (s *serviceClient) updateConfig() error {
 	}

 	req := proto.SetConfigRequest{
-		ProfileName:           activeProf.ID.String(),
-		Username:              currUser.Username,
-		DisableAutoConnect:    &disableAutoStart,
-		ServerSSHAllowed:      &sshAllowed,
-		RosenpassEnabled:      &rosenpassEnabled,
-		LazyConnectionEnabled: &lazyConnectionEnabled,
-		BlockInbound:          &blockInbound,
-		DisableNotifications:  &notificationsDisabled,
+		ProfileName:          activeProf.ID.String(),
+		Username:             currUser.Username,
+		DisableAutoConnect:   &disableAutoStart,
+		ServerSSHAllowed:     &sshAllowed,
+		RosenpassEnabled:     &rosenpassEnabled,
+		BlockInbound:         &blockInbound,
+		DisableNotifications: &notificationsDisabled,
 	}

 	if _, err := conn.SetConfig(s.ctx, &req); err != nil {
--- a/client/ui/const.go
+++ b/client/ui/const.go
@@ -4,7 +4,6 @@ const (
 	allowSSHMenuDescr          = "Allow SSH connections"
 	autoConnectMenuDescr       = "Connect automatically when the service starts"
 	quantumResistanceMenuDescr = "Enable post-quantum security via Rosenpass"
-	lazyConnMenuDescr          = "[Experimental] Enable lazy connections"
 	blockInboundMenuDescr      = "Block inbound connections to the local machine and routed networks"
 	notificationsMenuDescr     = "Enable notifications"
 	advancedSettingsMenuDescr  = "Advanced settings of the application"
--- a/client/ui/event_handler.go
+++ b/client/ui/event_handler.go
@@ -43,8 +43,6 @@ func (h *eventHandler) listen(ctx context.Context) {
 			h.handleAutoConnectClick()
 		case <-h.client.mEnableRosenpass.ClickedCh:
 			h.handleRosenpassClick()
-		case <-h.client.mLazyConnEnabled.ClickedCh:
-			h.handleLazyConnectionClick()
 		case <-h.client.mBlockInbound.ClickedCh:
 			h.handleBlockInboundClick()
 		case <-h.client.mAdvancedSettings.ClickedCh:
@@ -152,15 +150,6 @@ func (h *eventHandler) handleRosenpassClick() {
 	}
 }

-func (h *eventHandler) handleLazyConnectionClick() {
-	h.toggleCheckbox(h.client.mLazyConnEnabled)
-	if err := h.updateConfigWithErr(); err != nil {
-		h.toggleCheckbox(h.client.mLazyConnEnabled) // revert checkbox state on error
-		log.Errorf("failed to update config: %v", err)
-		h.client.notifier.Send("Error", "Failed to update lazy connection settings")
-	}
-}
-
 func (h *eventHandler) handleBlockInboundClick() {
 	h.toggleCheckbox(h.client.mBlockInbound)
 	if err := h.updateConfigWithErr(); err != nil {
--- a/e2e/agentnetwork/vllm_test.go
+++ b/e2e/agentnetwork/vllm_test.go
@@ -0,0 +1,171 @@
+//go:build e2e
+
+package agentnetwork
+
+import (
+	"context"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/netbirdio/netbird/e2e/harness"
+	"github.com/netbirdio/netbird/shared/management/http/api"
+)
+
+// TestVLLMProvider proves the proxy supports a self-hosted vLLM backend. vLLM is
+// OpenAI-compatible, so it uses the "vllm" catalog entry (KindCustom) and is
+// reached over plain HTTP — no TLS anywhere on the path:
+//
+//	client --tunnel--> netbird proxy --http--> vllm (:8000, OpenAI-compatible)
+//
+// The mock vLLM server answers /v1/chat/completions with an OpenAI-shaped
+// completion carrying a non-zero usage block. The test asserts the chat returns
+// 200 with the completion, that the request is recorded in the access log by its
+// session id, and that vLLM's usage block is metered into a consumption row —
+// which together prove request routing, response parsing, and token accounting
+// all work for a self-hosted OpenAI-compatible provider.
+//
+// It needs no external credentials (the mock ignores auth), so it always runs.
+func TestVLLMProvider(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
+	defer cancel()
+
+	vllm, err := harness.StartVLLM(ctx, srv)
+	require.NoError(t, err, "start mock vLLM server")
+	t.Cleanup(func() { _ = vllm.Terminate(context.Background()) })
+
+	grp, err := srv.API().Groups.Create(ctx, api.PostApiGroupsJSONRequestBody{Name: "e2e-vllm"})
+	require.NoError(t, err, "create group")
+	t.Cleanup(func() { _ = srv.API().Groups.Delete(context.Background(), grp.Id) })
+
+	ephemeral := false
+	sk, err := srv.API().SetupKeys.Create(ctx, api.PostApiSetupKeysJSONRequestBody{
+		Name:       "e2e-vllm-client",
+		Type:       "reusable",
+		ExpiresIn:  86400,
+		UsageLimit: 0,
+		AutoGroups: []string{grp.Id},
+		Ephemeral:  &ephemeral,
+	})
+	require.NoError(t, err, "mint setup key")
+	require.NotEmpty(t, sk.Key, "setup key plaintext")
+
+	// vLLM provider pointed at the mock over plain HTTP. The mock ignores auth,
+	// so a dummy key satisfies the "Bearer ${API_KEY}" template. The served model
+	// is enumerated so the router dispatches this model string to this provider.
+	dummyKey := "sk-vllm-e2e"
+	prov, err := srv.CreateProvider(ctx, api.AgentNetworkProviderRequest{
+		Name:             "vllm",
+		ProviderId:       "vllm",
+		UpstreamUrl:      vllm.URL,
+		ApiKey:           &dummyKey,
+		Enabled:          ptr(true),
+		BootstrapCluster: ptr(harness.AgentNetworkCluster),
+		Models: &[]api.AgentNetworkProviderModel{
+			{Id: harness.VLLMModel, InputPer1k: 0.001, OutputPer1k: 0.002},
+		},
+	})
+	require.NoError(t, err, "create vllm provider")
+	t.Cleanup(func() { _ = srv.DeleteProvider(context.Background(), prov.Id) })
+
+	// Token limit far above the handful of tokens this test drives, so it never
+	// blocks but switches on usage metering — the switch that makes consumption
+	// rows get recorded.
+	enabled := true
+	pol, err := srv.CreatePolicy(ctx, api.AgentNetworkPolicyRequest{
+		Name:                   "e2e-vllm-allow",
+		Enabled:                &enabled,
+		SourceGroups:           []string{grp.Id},
+		DestinationProviderIds: []string{prov.Id},
+		Limits: &api.AgentNetworkPolicyLimits{
+			TokenLimit: api.AgentNetworkPolicyTokenLimit{
+				Enabled:       true,
+				GroupCap:      10_000_000,
+				UserCap:       10_000_000,
+				WindowSeconds: 60,
+			},
+		},
+	})
+	require.NoError(t, err, "create policy")
+	t.Cleanup(func() { _ = srv.DeletePolicy(context.Background(), pol.Id) })
+
+	settings, err := srv.GetSettings(ctx)
+	require.NoError(t, err, "read settings")
+	require.NotEmpty(t, settings.Endpoint, "endpoint must be assigned")
+
+	proxyToken, err := srv.CreateProxyTokenCLI(ctx, "e2e-vllm-proxy")
+	require.NoError(t, err, "mint proxy token")
+	px, err := harness.StartProxy(ctx, srv, proxyToken)
+	require.NoError(t, err, "start proxy")
+	t.Cleanup(func() { _ = px.Terminate(context.Background()) })
+
+	cl, err := harness.StartClient(ctx, srv, sk.Key)
+	require.NoError(t, err, "start client")
+	t.Cleanup(func() { _ = cl.Terminate(context.Background()) })
+
+	require.NoError(t, cl.WaitConnected(ctx, 90*time.Second), "client must connect to management")
+	if err := cl.WaitProxyPeer(ctx, 180*time.Second); err != nil {
+		t.Fatalf("client did not see the proxy peer: %v\n=== proxy logs ===\n%s", err, px.Logs(context.Background()))
+	}
+	proxyIP, err := cl.ResolveProxyIP(ctx, settings.Endpoint)
+	require.NoError(t, err, "resolve endpoint to proxy IP")
+
+	before, _ := srv.ListAccessLogs(ctx)
+	sessionID := "e2e-session-vllm"
+
+	// Retry to absorb tunnel/DNS jitter on the first call.
+	var code int
+	var body string
+	deadline := time.Now().Add(90 * time.Second)
+	for time.Now().Before(deadline) {
+		c, b, cerr := cl.Chat(ctx, settings.Endpoint, proxyIP, harness.WireChat, harness.VLLMModel, "Reply with exactly: pong", sessionID)
+		if cerr == nil {
+			code, body = c, b
+			if code == 200 {
+				break
+			}
+		}
+		time.Sleep(5 * time.Second)
+	}
+	require.Equal(t, 200, code,
+		"chat through the vLLM provider must return 200; body: %s\n=== vllm logs ===\n%s\n=== proxy logs ===\n%s",
+		body, vllm.Logs(context.Background()), px.Logs(context.Background()))
+	require.True(t, strings.Contains(body, "chat.completion"),
+		"body should be an OpenAI-compatible chat completion; got: %s", body)
+
+	// The request must surface as an access-log row carrying our session id.
+	require.Eventually(t, func() bool {
+		logs, lerr := srv.ListAccessLogs(ctx)
+		return lerr == nil && logs.TotalRecords > before.TotalRecords
+	}, 30*time.Second, 2*time.Second, "an access-log row should be ingested for the vLLM provider")
+
+	require.Eventually(t, func() bool {
+		logs, lerr := srv.ListAccessLogs(ctx)
+		if lerr != nil {
+			return false
+		}
+		for _, r := range logs.Data {
+			if r.SessionId != nil && *r.SessionId == sessionID {
+				return true
+			}
+		}
+		return false
+	}, 30*time.Second, 2*time.Second, "session id %q must be recorded in an access-log row", sessionID)
+
+	// vLLM's usage block (prompt_tokens=11, completion_tokens=2) must be parsed
+	// and metered into a consumption row with positive token counts.
+	require.Eventually(t, func() bool {
+		rows, lerr := srv.ListConsumption(ctx)
+		if lerr != nil {
+			return false
+		}
+		for _, r := range rows {
+			if r.TokensInput > 0 && r.TokensOutput > 0 {
+				return true
+			}
+		}
+		return false
+	}, 60*time.Second, 3*time.Second, "vLLM usage must be metered into a consumption row")
+}
--- a/e2e/harness/vllm.go
+++ b/e2e/harness/vllm.go
@@ -0,0 +1,113 @@
+//go:build e2e
+
+package harness
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"time"
+
+	"github.com/docker/docker/api/types/container"
+	"github.com/testcontainers/testcontainers-go"
+	"github.com/testcontainers/testcontainers-go/wait"
+)
+
+const (
+	vllmImage = "nginx:alpine"
+	vllmAlias = "vllm"
+	vllmPort  = "8000/tcp"
+	// VLLMModel is the served model id the mock advertises and echoes back. It
+	// matches a real small model commonly served by vLLM so the provider's
+	// enumerated model and the client's request line up.
+	VLLMModel = "Qwen/Qwen2.5-0.5B-Instruct"
+)
+
+// vllmNginxConf emulates a vLLM OpenAI-compatible server over plain HTTP (vLLM's
+// default: no TLS, port 8000). It answers /v1/models with a one-model list and
+// any chat/completions path with a canned OpenAI-shaped chat completion carrying
+// a non-zero usage block, so the proxy's OpenAI parser records real token
+// consumption. Running actual vLLM in CI is infeasible (GPU + multi-GB model
+// download), so this stands in for the wire contract the proxy depends on.
+const vllmNginxConf = `pid /tmp/nginx.pid;
+events {}
+http {
+  server {
+    listen 8000;
+    location = /v1/models {
+      default_type application/json;
+      return 200 '{"object":"list","data":[{"id":"Qwen/Qwen2.5-0.5B-Instruct","object":"model","owned_by":"vllm"}]}';
+    }
+    location / {
+      default_type application/json;
+      return 200 '{"id":"chatcmpl-e2e-vllm","object":"chat.completion","created":1700000000,"model":"Qwen/Qwen2.5-0.5B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"pong"},"finish_reason":"stop"}],"usage":{"prompt_tokens":11,"completion_tokens":2,"total_tokens":13}}';
+    }
+  }
+}
+`
+
+// VLLM is a mock vLLM OpenAI-compatible server on the combined server's network,
+// reachable at http://vllm:8000. A "vllm" provider points at it to exercise the
+// proxy's support for self-hosted OpenAI-compatible backends.
+type VLLM struct {
+	container testcontainers.Container
+	workDir   string
+	// URL is the upstream URL the vllm provider points at (http://<alias>:8000).
+	URL string
+}
+
+// StartVLLM runs the mock vLLM server on the shared network over plain HTTP.
+func StartVLLM(ctx context.Context, c *Combined) (*VLLM, error) {
+	workDir, err := os.MkdirTemp("/tmp", "nb-e2e-vllm-*")
+	if err != nil {
+		return nil, fmt.Errorf("create vllm work dir: %w", err)
+	}
+	// Widen so the (non-root worker) nginx container can traverse the bind mount.
+	if err := os.Chmod(workDir, 0o755); err != nil { //nolint:gosec // throwaway e2e config dir
+		return nil, fmt.Errorf("chmod vllm dir: %w", err)
+	}
+	if err := os.WriteFile(filepath.Join(workDir, "nginx.conf"), []byte(vllmNginxConf), 0o644); err != nil { //nolint:gosec // non-secret e2e config
+		return nil, fmt.Errorf("write nginx conf: %w", err)
+	}
+
+	req := testcontainers.ContainerRequest{
+		Image:          vllmImage,
+		ExposedPorts:   []string{vllmPort},
+		Networks:       []string{c.network.Name},
+		NetworkAliases: map[string][]string{c.network.Name: {vllmAlias}},
+		Cmd:            []string{"nginx", "-c", "/conf/nginx.conf", "-g", "daemon off;"},
+		HostConfigModifier: func(hc *container.HostConfig) {
+			hc.Binds = append(hc.Binds, workDir+":/conf:ro")
+		},
+		WaitingFor: wait.ForListeningPort(vllmPort).WithStartupTimeout(60 * time.Second),
+	}
+
+	ctr, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
+		ContainerRequest: req,
+		Started:          true,
+	})
+	if err != nil {
+		_ = os.RemoveAll(workDir)
+		return nil, fmt.Errorf("start vllm container: %w", err)
+	}
+
+	return &VLLM{container: ctr, workDir: workDir, URL: "http://" + vllmAlias + ":8000"}, nil
+}
+
+// Logs returns the vLLM container logs, for diagnostics on failure.
+func (v *VLLM) Logs(ctx context.Context) string {
+	return containerLogs(ctx, v.container)
+}
+
+// Terminate stops the vLLM container and cleans its work dir.
+func (v *VLLM) Terminate(ctx context.Context) error {
+	var err error
+	if v.container != nil {
+		err = v.container.Terminate(ctx)
+	}
+	if v.workDir != "" {
+		_ = os.RemoveAll(v.workDir)
+	}
+	return err
+}
--- a/infrastructure_files/getting-started.sh
+++ b/infrastructure_files/getting-started.sh
@@ -351,11 +351,6 @@ initialize_default_values() {
  NETBIRD_STUN_PORT=3478

  # Docker images
-  # Record whether the operator explicitly pinned the server/proxy images via
-  # env vars, so the agent-network preset can pick its own defaults without
-  # clobbering an explicit override.
-  NETBIRD_SERVER_IMAGE_EXPLICIT=${NETBIRD_SERVER_IMAGE:+true}
-  NETBIRD_PROXY_IMAGE_EXPLICIT=${NETBIRD_PROXY_IMAGE:+true}
  DASHBOARD_IMAGE=${DASHBOARD_IMAGE:-"netbirdio/dashboard:latest"}
  # Combined server replaces separate signal, relay, and management containers
  NETBIRD_SERVER_IMAGE=${NETBIRD_SERVER_IMAGE:-"netbirdio/netbird-server:latest"}
@@ -415,15 +410,6 @@ apply_agent_network_preset() {
  ENABLE_PROXY="true"
  ENABLE_CROWDSEC="false"

-  # Agent-network ships dedicated server/proxy images. Honor an explicit
-  # env override; otherwise pin the agent-network builds.
-  if [[ "${NETBIRD_SERVER_IMAGE_EXPLICIT}" != "true" ]]; then
-    NETBIRD_SERVER_IMAGE="netbirdio/netbird-server:0.74.0-rc.2"
-  fi
-  if [[ "${NETBIRD_PROXY_IMAGE_EXPLICIT}" != "true" ]]; then
-    NETBIRD_PROXY_IMAGE="netbirdio/reverse-proxy:0.74.0-rc.2"
-  fi
-
  if [[ -n "${NETBIRD_LETSENCRYPT_EMAIL}" ]]; then
    TRAEFIK_ACME_EMAIL="${NETBIRD_LETSENCRYPT_EMAIL}"
  else
--- a/management/internals/modules/agentnetwork/catalog/catalog.go
+++ b/management/internals/modules/agentnetwork/catalog/catalog.go
@@ -627,6 +627,21 @@ var providers = []Provider{
 		},
 		Models: []Model{},
 	},
+	{
+		// vLLM is an OpenAI-compatible self-hosted server. It behaves like
+		// the generic custom entry; it gets its own catalog id purely so it
+		// surfaces as a named "vLLM" choice in the provider picker.
+		ID:                 "vllm",
+		Kind:               KindCustom,
+		Name:               "vLLM",
+		Description:        "Self-hosted vLLM (OpenAI-compatible)",
+		DefaultHost:        "",
+		AuthHeaderName:     "Authorization",
+		AuthHeaderTemplate: "Bearer ${API_KEY}",
+		DefaultContentType: "application/json",
+		BrandColor:         "#30A2FF",
+		Models:             []Model{},
+	},
 	{
 		ID:                 "custom",
 		Kind:               KindCustom,
--- a/management/internals/shared/grpc/conversion.go
+++ b/management/internals/shared/grpc/conversion.go
@@ -47,16 +47,13 @@ func init() {
 	precomputedDeprecatedRemotePeersConstraint = constraint
 }

+// toNetbirdConfig converts the server configuration to the wire representation. It returns
+// nil when no server config is set (the fan-out network-map path) because clients treat any
+// non-nil config as authoritative: a config without a relay section is interpreted as relay
+// disabled and wipes the clients' relay URLs.
 func toNetbirdConfig(config *nbconfig.Config, turnCredentials *Token, relayToken *Token, extraSettings *types.ExtraSettings, settings *types.Settings) *proto.NetbirdConfig {
 	if config == nil {
-		if settings == nil {
-			return nil
-		}
-		return &proto.NetbirdConfig{
-			Metrics: &proto.MetricsConfig{
-				Enabled: settings.MetricsPushEnabled,
-			},
-		}
+		return nil
 	}

 	var stuns []*proto.HostConfig
--- a/management/internals/shared/grpc/conversion_test.go
+++ b/management/internals/shared/grpc/conversion_test.go
@@ -8,11 +8,13 @@ import (
 	"time"

 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"

 	nbdns "github.com/netbirdio/netbird/dns"
 	"github.com/netbirdio/netbird/management/internals/controllers/network_map"
 	"github.com/netbirdio/netbird/management/internals/controllers/network_map/controller/cache"
 	nbconfig "github.com/netbirdio/netbird/management/internals/server/config"
+	"github.com/netbirdio/netbird/management/server/types"
 )

 func TestToProtocolDNSConfigWithCache(t *testing.T) {
@@ -263,3 +265,39 @@ func TestEncodeSessionExpiresAt(t *testing.T) {
 		assert.True(t, got.AsTime().Equal(deadline))
 	})
 }
+
+// TestToNetbirdConfig_RelayInvariant guards against the v0.74.0 relay-wipe regression.
+// Clients treat any non-nil NetbirdConfig as authoritative and interpret a missing relay
+// section as relay disabled, wiping their relay URLs. toNetbirdConfig must therefore
+// return nil when no server config is set (the fan-out network-map path) instead of a
+// partial config, and a result built from a relay-enabled config must carry the relay
+// section.
+func TestToNetbirdConfig_RelayInvariant(t *testing.T) {
+	settings := &types.Settings{MetricsPushEnabled: true}
+
+	t.Run("nil server config returns nil config", func(t *testing.T) {
+		nbCfg := toNetbirdConfig(nil, nil, nil, nil, settings)
+		assert.Nil(t, nbCfg, "fan-out updates must not carry a partial NetbirdConfig even when settings are present")
+	})
+
+	t.Run("relay-enabled config carries relay section", func(t *testing.T) {
+		cfg := &nbconfig.Config{
+			Stuns: []*nbconfig.Host{{Proto: nbconfig.UDP, URI: "stun:stun.example.com:3478"}},
+			TURNConfig: &nbconfig.TURNConfig{
+				Turns: []*nbconfig.Host{{Proto: nbconfig.UDP, URI: "turn:turn.example.com:3478", Username: "user", Password: "pass"}},
+			},
+			Relay:  &nbconfig.Relay{Addresses: []string{"rels://relay.example.com:443"}},
+			Signal: &nbconfig.Host{Proto: nbconfig.HTTP, URI: "signal.example.com:10000"},
+		}
+		relayToken := &Token{Payload: "token-payload", Signature: "token-signature"}
+
+		nbCfg := toNetbirdConfig(cfg, nil, relayToken, nil, settings)
+		require.NotNil(t, nbCfg)
+		require.NotNil(t, nbCfg.Relay, "non-nil NetbirdConfig must include the relay section")
+		assert.Equal(t, cfg.Relay.Addresses, nbCfg.Relay.Urls, "relay URLs should match the server config")
+		assert.Equal(t, relayToken.Payload, nbCfg.Relay.TokenPayload, "relay token payload should be set")
+		assert.Equal(t, relayToken.Signature, nbCfg.Relay.TokenSignature, "relay token signature should be set")
+		require.NotNil(t, nbCfg.Metrics)
+		assert.True(t, nbCfg.Metrics.Enabled, "metrics flag should carry the settings value")
+	})
+}
--- a/management/server/peer_test.go
+++ b/management/server/peer_test.go
@@ -1048,11 +1048,7 @@ func testUpdateAccountPeers(t *testing.T) {

 			for _, channel := range peerChannels {
 				update := <-channel
-				assert.NotNil(t, update.Update.NetbirdConfig)
-				assert.Nil(t, update.Update.NetbirdConfig.Stuns)
-				assert.Nil(t, update.Update.NetbirdConfig.Turns)
-				assert.Nil(t, update.Update.NetbirdConfig.Signal)
-				assert.Nil(t, update.Update.NetbirdConfig.Relay)
+				assert.Nil(t, update.Update.NetbirdConfig, "fan-out updates must not carry a NetbirdConfig; clients treat a config without relay as relay disabled and wipe their relay URLs")
 				assert.Equal(t, tc.peers, len(update.Update.NetworkMap.RemotePeers))
 				assert.Equal(t, tc.peers*2, len(update.Update.NetworkMap.FirewallRules))
 			}
--- a/shared/management/client/grpc.go
+++ b/shared/management/client/grpc.go
@@ -33,10 +33,15 @@ const ConnectTimeout = 10 * time.Second
 const healthCheckTimeout = 5 * time.Second

 const (
-	// EnvMaxRecvMsgSize overrides the default gRPC max receive message size (4 MB)
+	// EnvMaxRecvMsgSize overrides the default gRPC max receive message size
 	// for the management client connection. Value is in bytes.
 	EnvMaxRecvMsgSize = "NB_MANAGEMENT_GRPC_MAX_MSG_SIZE"

+	// defaultMaxRecvMsgSize is the max gRPC receive message size used for the
+	// management client connection when EnvMaxRecvMsgSize is unset or invalid.
+	// It overrides the gRPC library default of 4 MB.
+	defaultMaxRecvMsgSize = 1024 * 1024 * 16
+
 	errMsgMgmtPublicKey    = "failed getting Management Service public key: %s"
 	errMsgNoMgmtConnection = "no connection to management"
 )
@@ -84,22 +89,22 @@ type ExposeResponse struct {
 }

 // MaxRecvMsgSize returns the configured max gRPC receive message size from
-// the environment, or 0 if unset (which uses the gRPC default of 4 MB).
+// the environment, or defaultMaxRecvMsgSize (16 MB) if unset or invalid.
 func MaxRecvMsgSize() int {
 	val := os.Getenv(EnvMaxRecvMsgSize)
 	if val == "" {
-		return 0
+		return defaultMaxRecvMsgSize
 	}

 	size, err := strconv.Atoi(val)
 	if err != nil {
 		log.Warnf("invalid %s value %q, using default: %v", EnvMaxRecvMsgSize, val, err)
-		return 0
+		return defaultMaxRecvMsgSize
 	}

 	if size <= 0 {
 		log.Warnf("invalid %s value %d, must be positive, using default", EnvMaxRecvMsgSize, size)
-		return 0
+		return defaultMaxRecvMsgSize
 	}

 	return size
@@ -1030,8 +1035,6 @@ func infoToMetaData(info *system.Info) *proto.PeerSystemMeta {
 			BlockLANAccess:      info.BlockLANAccess,
 			BlockInbound:        info.BlockInbound,
 			DisableIPv6:         info.DisableIPv6,
-
-			LazyConnectionEnabled: info.LazyConnectionEnabled,
 		},

 		Capabilities: peerCapabilities(*info),
--- a/shared/management/client/grpc_test.go
+++ b/shared/management/client/grpc_test.go
@@ -21,11 +21,11 @@ func TestMaxRecvMsgSize(t *testing.T) {
 		envValue string
 		expected int
 	}{
-		{name: "unset returns 0", envValue: "", expected: 0},
+		{name: "unset returns default", envValue: "", expected: defaultMaxRecvMsgSize},
 		{name: "valid value", envValue: "10485760", expected: 10485760},
-		{name: "non-numeric returns 0", envValue: "abc", expected: 0},
-		{name: "negative returns 0", envValue: "-1", expected: 0},
-		{name: "zero returns 0", envValue: "0", expected: 0},
+		{name: "non-numeric returns default", envValue: "abc", expected: defaultMaxRecvMsgSize},
+		{name: "negative returns default", envValue: "-1", expected: defaultMaxRecvMsgSize},
+		{name: "zero returns default", envValue: "0", expected: defaultMaxRecvMsgSize},
 	}

 	for _, tt := range tests {
Author	SHA1	Message	Date
riccardom	9d613189f6	Merge remote-tracking branch 'origin/main' into netmap_progressive_alignment	2026-07-03 09:48:36 +02:00
Zoltan Papp	4b3dd9103d	[client] Fix slow wg operations (#6633 ) * [iface] Drop redundant device dump in kernel configure() wgctrl.ConfigureDevice already returns an error when the interface is missing, so the preceding wg.Device() existence check is redundant. That check dumps the entire device (all peers) on every configure() call, making it O(peers) per call and turning bulk peer insertion into O(peers^2): inserting N peers one by one re-parsed the whole growing peer list N times. Removing it keeps each peer write constant-time regardless of how many peers are already configured. * [iface] Cache WireGuard stats to collapse per-peer device dumps Each peer runs a WGWatcher that polls GetStats(), and every call dumps the whole device, so with N peers the watchers perform O(N) full dumps per poll cycle (O(N^2) work) while each keeps only its own peer's entry. Wrap the kernel and userspace configurer GetStats() in a short-TTL cache with singleflight: the staggered per-peer calls share a single device dump per window and concurrent misses collapse into one dump. The kernel and userspace WireGuard APIs have no per-peer stats query (a get always returns the whole device), so a shared cached snapshot avoids the repeated full dumps. * Ignore .claude directory	2026-07-02 20:42:43 +02:00
Riccardo Manfrin	8e3b284f4b	[client] Increase mgmt grpc buff size to 16MB (#6641 )	2026-07-02 17:50:18 +02:00
Maycon Santos	21aa933584	[misc] Fix GHCR image push after dockers_v2 migration (#6653 )	2026-07-02 17:21:06 +02:00
Misha Bragin	1dfa85a917	[management] Add vLLM e2e test (#6649 ) * Add vLLM to Agent Network * Add vllm e2e test	2026-07-02 15:36:51 +02:00
Maycon Santos	859fe19fff	[management] return nil when config is not set (#6642 ) * [management] return nil when config is not set * [management] add relay invariant test and enforce config behavior	2026-07-02 14:55:55 +02:00
Misha Bragin	e40cb294f6	[management] Add vLLM to Agent Network (#6643 )	2026-07-02 14:45:24 +02:00
Maycon Santos	e203e0f42a	[self-hosted] Remove unused server/proxy image override logic in getting-started.sh (#6636 )	2026-07-02 14:20:23 +02:00
riccardom	7673067605	Allow also nm only (empty config) syncs to be squashed into target state [changing behavior!] This makes the debug bundle contain the SUPERSET of the map with nm updates and config updates	2026-07-02 12:54:09 +02:00
Zoltan Papp	167be3a30f	[ci] Run privileged client tests natively with sudo on Linux (#6635 ) Restore the pre-split native, sudo-based run for the Linux Client / Unit job: build with the privileged tag and run under sudo, matching the darwin job. Excludes the dockertest harness (client/testutil/privileged) so it does not recurse into a container spawn. The Docker privileged job is kept as-is.	2026-07-02 12:15:57 +02:00
Viktor Liu	1d8b5f6e5c	[client] Make lazy connections opt-out via NB_LAZY_CONN (#6617 )	2026-07-02 10:58:16 +02:00
riccardom	79567fe347	RemotePeers is already created as an empty list we add peer only if not IsEmpty	2026-07-01 00:02:58 +02:00
riccardom	cf8d92fbb0	Fixup case of config only map superseeding nm map	2026-06-30 23:43:44 +02:00
riccardom	b70fc4015b	Merge remote-tracking branch 'origin/main' into netmap_progressive_alignment # Conflicts: # client/internal/engine.go	2026-06-30 23:29:15 +02:00
riccardom	4988b6726e	Aligns new tests to signature	2026-06-28 17:25:17 +02:00
riccardom	2552830184	Prevents skipping of intermediate map updates potentially not applied by moving the persistence from applySync to the map state manager	2026-06-28 17:23:34 +02:00
riccardom	3b8fc688f4	Do the wholesale (firewall/routes/dns) once only	2026-06-28 17:23:34 +02:00
riccardom	d82d62e818	Adds explicit merge call for future map updates	2026-06-28 17:20:00 +02:00
riccardom	0bf964dad7	Do not process intermediate one if new ones are fresher just use the freshest	2026-06-28 17:20:00 +02:00
riccardom	297dcb3e24	Always run onConverged for every map that is processed	2026-06-28 17:20:00 +02:00
riccardom	bc22926fe0	Drop in case of error, will reconcile with next update	2026-06-28 17:20:00 +02:00
riccardom	d3f2ef9adb	Comment why not serial	2026-06-28 17:20:00 +02:00
riccardom	5bec1e8f03	Adds map state manager	2026-06-28 17:20:00 +02:00
riccardom	74bb5c613e	Allows to specify max batch for tests	2026-06-28 17:20:00 +02:00
riccardom	29dde908ae	Modifies handleSync to support progressive peers conns convergence	2026-06-28 17:19:27 +02:00