Set logger in ConnectedIP test to avoid nil deref

Add logs
Revert "[client] Drop signaling-side ICE candidate filter, drop overlay STUN at mux read-side instead (#6142)"
2026-06-25 09:19:57 +00:00 · 2026-06-25 10:15:07 +02:00 · 2026-06-24 21:46:14 +02:00 · 2026-06-24 21:29:05 +02:00 · 2026-06-24 15:35:58 +02:00 · 2026-06-24 15:04:22 +02:00
37 changed files with 952 additions and 216 deletions
--- a/client/cmd/debug.go
+++ b/client/cmd/debug.go
@@ -130,7 +130,7 @@ func debugConfigDump(cmd *cobra.Command, _ []string) error {

 	client := proto.NewDaemonServiceClient(conn)
 	resp, err := client.GetConfig(cmd.Context(), &proto.GetConfigRequest{
-		ProfileName: activeProf.Name,
+		ProfileName: string(activeProf.ID),
 		Username:    currUser.Username,
 	})
 	if err != nil {
--- a/client/iface/bind/ice_bind.go
+++ b/client/iface/bind/ice_bind.go
@@ -41,6 +41,7 @@ type ICEBind struct {
 	*wgConn.StdNetBind

 	transportNet transport.Net
+	filterFn     udpmux.FilterFn
 	address      wgaddr.Address
 	mtu          uint16

@@ -60,11 +61,12 @@ type ICEBind struct {
 	ipv6Conn *net.UDPConn
 }

-func NewICEBind(transportNet transport.Net, address wgaddr.Address, mtu uint16) *ICEBind {
+func NewICEBind(transportNet transport.Net, filterFn udpmux.FilterFn, address wgaddr.Address, mtu uint16) *ICEBind {
 	b, _ := wgConn.NewStdNetBind().(*wgConn.StdNetBind)
 	ib := &ICEBind{
 		StdNetBind:       b,
 		transportNet:     transportNet,
+		filterFn:         filterFn,
 		address:          address,
 		mtu:              mtu,
 		endpoints:        make(map[netip.Addr]net.Conn),
@@ -263,6 +265,7 @@ func (s *ICEBind) createOrUpdateMux() {
 		udpmux.UniversalUDPMuxParams{
 			UDPConn:   muxConn,
 			Net:       s.transportNet,
+			FilterFn:  s.filterFn,
 			WGAddress: s.address,
 			MTU:       s.mtu,
 		},
--- a/client/iface/bind/ice_bind_test.go
+++ b/client/iface/bind/ice_bind_test.go
@@ -289,7 +289,7 @@ func setupICEBind(t *testing.T) *ICEBind {
 		IP:      netip.MustParseAddr("100.64.0.1"),
 		Network: netip.MustParsePrefix("100.64.0.0/10"),
 	}
-	return NewICEBind(transportNet, address, 1280)
+	return NewICEBind(transportNet, nil, address, 1280)
 }

 func createDualStackConns(t *testing.T) (*net.UDPConn, *net.UDPConn) {
--- a/client/iface/device/device_kernel_unix.go
+++ b/client/iface/device/device_kernel_unix.go
@@ -32,6 +32,8 @@ type TunKernelDevice struct {
 	link       *wgLink
 	udpMuxConn net.PacketConn
 	udpMux     *udpmux.UniversalUDPMuxDefault
+
+	filterFn udpmux.FilterFn
 }

 func NewKernelDevice(name string, address wgaddr.Address, wgPort int, key string, mtu uint16, transportNet transport.Net) *TunKernelDevice {
@@ -102,6 +104,7 @@ func (t *TunKernelDevice) Up() (*udpmux.UniversalUDPMuxDefault, error) {
 	bindParams := udpmux.UniversalUDPMuxParams{
 		UDPConn:   nbnet.WrapPacketConn(rawSock),
 		Net:       t.transportNet,
+		FilterFn:  t.filterFn,
 		WGAddress: t.address,
 		MTU:       t.mtu,
 	}
--- a/client/iface/iface.go
+++ b/client/iface/iface.go
@@ -63,6 +63,7 @@ type WGIFaceOpts struct {
 	MTU          uint16
 	MobileArgs   *device.MobileIFaceArguments
 	TransportNet transport.Net
+	FilterFn     udpmux.FilterFn
 	DisableDNS   bool
 }

--- a/client/iface/iface_new.go
+++ b/client/iface/iface_new.go
@@ -11,7 +11,7 @@ import (

 // NewWGIFace Creates a new WireGuard interface instance
 func NewWGIFace(opts WGIFaceOpts) (*WGIface, error) {
-	iceBind := bind.NewICEBind(opts.TransportNet, opts.Address, opts.MTU)
+	iceBind := bind.NewICEBind(opts.TransportNet, opts.FilterFn, opts.Address, opts.MTU)

 	var tun WGTunDevice
 	if netstack.IsEnabled() {
--- a/client/iface/iface_new_android.go
+++ b/client/iface/iface_new_android.go
@@ -9,7 +9,7 @@ import (

 // NewWGIFace Creates a new WireGuard interface instance
 func NewWGIFace(opts WGIFaceOpts) (*WGIface, error) {
-	iceBind := bind.NewICEBind(opts.TransportNet, opts.Address, opts.MTU)
+	iceBind := bind.NewICEBind(opts.TransportNet, opts.FilterFn, opts.Address, opts.MTU)

 	if netstack.IsEnabled() {
 		wgIFace := &WGIface{
--- a/client/iface/iface_new_ios.go
+++ b/client/iface/iface_new_ios.go
@@ -10,7 +10,7 @@ import (

 // NewWGIFace Creates a new WireGuard interface instance
 func NewWGIFace(opts WGIFaceOpts) (*WGIface, error) {
-	iceBind := bind.NewICEBind(opts.TransportNet, opts.Address, opts.MTU)
+	iceBind := bind.NewICEBind(opts.TransportNet, opts.FilterFn, opts.Address, opts.MTU)

 	wgIFace := &WGIface{
 		tun:            device.NewTunDevice(opts.IFaceName, opts.Address, opts.WGPort, opts.WGPrivKey, opts.MTU, iceBind, opts.MobileArgs.TunFd),
--- a/client/iface/iface_new_linux.go
+++ b/client/iface/iface_new_linux.go
@@ -14,7 +14,7 @@ import (
 // NewWGIFace Creates a new WireGuard interface instance
 func NewWGIFace(opts WGIFaceOpts) (*WGIface, error) {
 	if netstack.IsEnabled() {
-		iceBind := bind.NewICEBind(opts.TransportNet, opts.Address, opts.MTU)
+		iceBind := bind.NewICEBind(opts.TransportNet, opts.FilterFn, opts.Address, opts.MTU)
 		return &WGIface{
 			tun:            device.NewNetstackDevice(opts.IFaceName, opts.Address, opts.WGPort, opts.WGPrivKey, opts.MTU, iceBind, netstack.ListenAddr()),
 			userspaceBind:  true,
@@ -30,7 +30,7 @@ func NewWGIFace(opts WGIFaceOpts) (*WGIface, error) {
 	}

 	if device.ModuleTunIsLoaded() {
-		iceBind := bind.NewICEBind(opts.TransportNet, opts.Address, opts.MTU)
+		iceBind := bind.NewICEBind(opts.TransportNet, opts.FilterFn, opts.Address, opts.MTU)
 		return &WGIface{
 			tun:            device.NewTunDevice(opts.IFaceName, opts.Address, opts.WGPort, opts.WGPrivKey, opts.MTU, iceBind),
 			userspaceBind:  true,
--- a/client/iface/udpmux/universal.go
+++ b/client/iface/udpmux/universal.go
@@ -8,6 +8,8 @@ import (
 	"context"
 	"fmt"
 	"net"
+	"net/netip"
+	"sync"
 	"time"

 	log "github.com/sirupsen/logrus"
@@ -20,6 +22,10 @@ import (
 	"github.com/netbirdio/netbird/client/iface/wgaddr"
 )

+// FilterFn is a function that filters out candidates based on the address.
+// If it returns true, the address is to be filtered. It also returns the prefix of the matching route.
+type FilterFn func(address netip.Addr) (bool, netip.Prefix, error)
+
 // UniversalUDPMuxDefault handles STUN and TURN servers packets by wrapping the original UDPConn
 // It then passes packets to the UDPMux that does the actual connection muxing.
 type UniversalUDPMuxDefault struct {
@@ -37,6 +43,7 @@ type UniversalUDPMuxParams struct {
 	UDPConn               net.PacketConn
 	XORMappedAddrCacheTTL time.Duration
 	Net                   transport.Net
+	FilterFn              FilterFn
 	WGAddress             wgaddr.Address
 	MTU                   uint16
 }
@@ -61,6 +68,7 @@ func NewUniversalUDPMuxDefault(params UniversalUDPMuxParams) *UniversalUDPMuxDef
 		PacketConn: params.UDPConn,
 		mux:        m,
 		logger:     params.Logger,
+		filterFn:   params.FilterFn,
 		address:    params.WGAddress,
 	}

@@ -107,12 +115,15 @@ func (m *UniversalUDPMuxDefault) ReadFromConn(ctx context.Context) {
 	}
 }

-// UDPConn is a wrapper around UDPMux conn that overrides WriteTo to drop packets destined for the overlay subnet.
+// UDPConn is a wrapper around UDPMux conn that overrides WriteTo to refuse, when a filterFn is set, packets destined for the NetBird network or a routed network
 type UDPConn struct {
 	net.PacketConn
-	mux     *UniversalUDPMuxDefault
-	logger  logging.LeveledLogger
-	address wgaddr.Address
+	mux      *UniversalUDPMuxDefault
+	logger   logging.LeveledLogger
+	filterFn FilterFn
+	// TODO: reset cache on route changes
+	addrCache sync.Map
+	address   wgaddr.Address
 }

 // GetPacketConn returns the underlying PacketConn
@@ -121,18 +132,67 @@ func (u *UDPConn) GetPacketConn() net.PacketConn {
 }

 func (u *UDPConn) WriteTo(b []byte, addr net.Addr) (int, error) {
-	udpAddr, ok := addr.(*net.UDPAddr)
-	if !ok {
+	if u.filterFn == nil {
 		return u.PacketConn.WriteTo(b, addr)
 	}
-	dst := udpAddr.AddrPort().Addr().Unmap()
-	if (u.address.Network.IsValid() && u.address.Network.Contains(dst)) || (u.address.IPv6Net.IsValid() && u.address.IPv6Net.Contains(dst)) {
-		log.Warnf("address %s is part of the NetBird network %s, refusing to write", addr, u.address)
-		return 0, fmt.Errorf("address %s is part of the NetBird network %s, refusing to write", addr, u.address)
+
+	if isRouted, found := u.addrCache.Load(addr.String()); found {
+		return u.handleCachedAddress(isRouted.(bool), b, addr)
+	}
+
+	return u.handleUncachedAddress(b, addr)
+}
+
+func (u *UDPConn) handleCachedAddress(isRouted bool, b []byte, addr net.Addr) (int, error) {
+	if isRouted {
+		return 0, fmt.Errorf("address %s is part of a routed network, refusing to write", addr)
 	}
 	return u.PacketConn.WriteTo(b, addr)
 }

+func (u *UDPConn) handleUncachedAddress(b []byte, addr net.Addr) (int, error) {
+	if err := u.performFilterCheck(addr); err != nil {
+		return 0, err
+	}
+	return u.PacketConn.WriteTo(b, addr)
+}
+
+func (u *UDPConn) performFilterCheck(addr net.Addr) error {
+	host, err := getHostFromAddr(addr)
+	if err != nil {
+		log.Errorf("Failed to get host from address %s: %v", addr, err)
+		return nil
+	}
+
+	a, err := netip.ParseAddr(host)
+	if err != nil {
+		log.Errorf("Failed to parse address %s: %v", addr, err)
+		return nil
+	}
+
+	if u.address.Network.Contains(a) {
+		log.Warnf("address %s is part of the NetBird network %s, refusing to write", addr, u.address)
+		return fmt.Errorf("address %s is part of the NetBird network %s, refusing to write", addr, u.address)
+	}
+
+	if isRouted, prefix, err := u.filterFn(a); err != nil {
+		log.Errorf("Failed to check if address %s is routed: %v", addr, err)
+	} else {
+		u.addrCache.Store(addr.String(), isRouted)
+		if isRouted {
+			// Extra log, as the error only shows up with ICE logging enabled
+			log.Infof("address %s is part of routed network %s, refusing to write", addr, prefix)
+			return fmt.Errorf("address %s is part of routed network %s, refusing to write", addr, prefix)
+		}
+	}
+	return nil
+}
+
+func getHostFromAddr(addr net.Addr) (string, error) {
+	host, _, err := net.SplitHostPort(addr.String())
+	return host, err
+}
+
 // GetSharedConn returns the shared udp conn
 func (m *UniversalUDPMuxDefault) GetSharedConn() net.PacketConn {
 	return m.params.UDPConn
@@ -165,13 +225,6 @@ func (m *UniversalUDPMuxDefault) HandleSTUNMessage(msg *stun.Message, addr net.A
 		return nil
 	}

-	src := udpAddr.AddrPort().Addr().Unmap()
-	wg := m.params.WGAddress
-	if (wg.Network.IsValid() && wg.Network.Contains(src)) || (wg.IPv6Net.IsValid() && wg.IPv6Net.Contains(src)) {
-		log.Debugf("dropping STUN message from overlay source %s", udpAddr)
-		return nil
-	}
-
 	if m.isXORMappedResponse(msg, udpAddr.String()) {
 		err := m.handleXORMappedResponse(udpAddr, msg)
 		if err != nil {
--- a/client/iface/wgproxy/proxy_linux_test.go
+++ b/client/iface/wgproxy/proxy_linux_test.go
@@ -66,7 +66,7 @@ func seedProxyForProxyCloseByRemoteConn() ([]proxyInstance, error) {
 	if err != nil {
 		return nil, err
 	}
-	iceBind := bind.NewICEBind(nil, wgAddress, 1280)
+	iceBind := bind.NewICEBind(nil, nil, wgAddress, 1280)
 	endpointAddress := &net.UDPAddr{
 		IP:   net.IPv4(10, 0, 0, 1),
 		Port: 1234,
--- a/client/iface/wgproxy/proxy_seed_test.go
+++ b/client/iface/wgproxy/proxy_seed_test.go
@@ -22,7 +22,7 @@ func seedProxyForProxyCloseByRemoteConn() ([]proxyInstance, error) {
 	if err != nil {
 		return nil, err
 	}
-	iceBind := bind.NewICEBind(nil, wgAddress, 1280)
+	iceBind := bind.NewICEBind(nil, nil, wgAddress, 1280)
 	endpointAddress := &net.UDPAddr{
 		IP:   net.IPv4(10, 0, 0, 1),
 		Port: 1234,
--- a/client/internal/dns/mgmt/mgmt.go
+++ b/client/internal/dns/mgmt/mgmt.go
@@ -51,13 +51,20 @@ type cachedRecord struct {
 }

 // Resolver caches critical NetBird infrastructure domains.
-// records, refreshing, mgmtDomain and serverDomains are all guarded by mutex.
+// records, refreshing, failedResolves, mgmtDomain and serverDomains are all
+// guarded by mutex.
 type Resolver struct {
 	records       map[dns.Question]*cachedRecord
 	mgmtDomain    *domain.Domain
 	serverDomains *dnsconfig.ServerDomains
 	mutex         sync.RWMutex

+	// failedResolves records when a domain's resolve last failed in
+	// addNewDomains, so a domain that keeps failing is retried only once
+	// refreshBackoff has elapsed rather than on every server-domains update.
+	// Entries are cleared on success and pruned to the current server-domains set.
+	failedResolves map[domain.Domain]time.Time
+
 	chain            ChainResolver
 	chainMaxPriority int
 	refreshGroup     singleflight.Group
@@ -76,9 +83,10 @@ type Resolver struct {
 // NewResolver creates a new management domains cache resolver.
 func NewResolver() *Resolver {
 	return &Resolver{
-		records:    make(map[dns.Question]*cachedRecord),
-		refreshing: make(map[dns.Question]*atomic.Bool),
-		cacheTTL:   resolveCacheTTL(),
+		records:        make(map[dns.Question]*cachedRecord),
+		refreshing:     make(map[dns.Question]*atomic.Bool),
+		failedResolves: make(map[domain.Domain]time.Time),
+		cacheTTL:       resolveCacheTTL(),
 	}
 }

@@ -173,7 +181,9 @@ func (m *Resolver) continueToNext(w dns.ResponseWriter, r *dns.Msg) {

 // AddDomain resolves a domain and stores its A/AAAA records in the cache.
 // A family that resolves NODATA (nil err, zero records) evicts any stale
-// entry for that qtype.
+// entry for that qtype. When one family hard-errors while the other succeeds,
+// the resolved family is still cached but AddDomain returns an error so the
+// caller retries the incomplete resolve rather than treating it as complete.
 func (m *Resolver) AddDomain(ctx context.Context, d domain.Domain) error {
 	dnsName := strings.ToLower(dns.Fqdn(d.PunycodeString()))

@@ -203,6 +213,10 @@ func (m *Resolver) AddDomain(ctx context.Context, d domain.Domain) error {
 	log.Debugf("added/updated domain=%s with %d A records and %d AAAA records",
 		d.SafeString(), len(aRecords), len(aaaaRecords))

+	if errA != nil || errAAAA != nil {
+		return fmt.Errorf("resolve %s: incomplete, a family failed: %w", d.SafeString(), errors.Join(errA, errAAAA))
+	}
+
 	return nil
 }

@@ -462,6 +476,7 @@ func (m *Resolver) RemoveDomain(d domain.Domain) error {
 	delete(m.records, qAAAA)
 	delete(m.refreshing, qA)
 	delete(m.refreshing, qAAAA)
+	delete(m.failedResolves, d)

 	log.Debugf("removed domain=%s from cache", d.SafeString())
 	return nil
@@ -505,6 +520,7 @@ func (m *Resolver) UpdateFromServerDomains(ctx context.Context, serverDomains dn
 		allDomains := m.extractDomainsFromServerDomains(updatedServerDomains)
 		currentDomains := m.GetCachedDomains()
 		removedDomains = m.removeStaleDomains(currentDomains, allDomains)
+		m.pruneFailedResolves(allDomains)
 	}

 	m.addNewDomains(ctx, newDomains)
@@ -577,13 +593,85 @@ func (m *Resolver) isManagementDomain(domain domain.Domain) bool {
 	return m.mgmtDomain != nil && domain == *m.mgmtDomain
 }

-// addNewDomains resolves and caches all domains from the update
+// addNewDomains resolves and caches domains that are not yet in the cache,
+// running the lookups concurrently. Domains already cached are skipped and left
+// to the stale-while-revalidate refresh path, so a sync never re-resolves them
+// synchronously: once NetBird owns the OS resolver the resolve runs through the
+// handler chain and would otherwise dial the managed upstreams under the engine
+// sync lock on every update.
 func (m *Resolver) addNewDomains(ctx context.Context, newDomains domain.List) {
+	var wg sync.WaitGroup
+	seen := make(map[domain.Domain]struct{}, len(newDomains))
 	for _, newDomain := range newDomains {
-		if err := m.AddDomain(ctx, newDomain); err != nil {
-			log.Warnf("failed to add/update domain=%s: %v", newDomain.SafeString(), err)
-		} else {
-			log.Debugf("added/updated management cache domain=%s", newDomain.SafeString())
+		if _, dup := seen[newDomain]; dup {
+			continue
+		}
+		seen[newDomain] = struct{}{}
+
+		if !m.needsResolve(newDomain) {
+			continue
+		}
+
+		wg.Add(1)
+		go func(d domain.Domain) {
+			defer wg.Done()
+			if err := m.AddDomain(ctx, d); err != nil {
+				m.markResolveFailed(d)
+				log.Warnf("failed to add/update domain=%s: %v", d.SafeString(), err)
+				return
+			}
+			m.clearResolveFailed(d)
+			log.Debugf("added/updated management cache domain=%s", d.SafeString())
+		}(newDomain)
+	}
+	wg.Wait()
+}
+
+// needsResolve reports whether d should be resolved now. A recent failed or
+// incomplete resolve gates retries on the backoff even when one family is
+// already cached, so a transiently-failed family is retried instead of being
+// treated as fully resolved. Otherwise a domain with any cached record is left
+// to the stale-while-revalidate refresh path.
+func (m *Resolver) needsResolve(d domain.Domain) bool {
+	dnsName := strings.ToLower(dns.Fqdn(d.PunycodeString()))
+
+	m.mutex.RLock()
+	defer m.mutex.RUnlock()
+
+	if failedAt, ok := m.failedResolves[d]; ok {
+		return time.Since(failedAt) >= refreshBackoff
+	}
+
+	for _, qtype := range []uint16{dns.TypeA, dns.TypeAAAA} {
+		q := dns.Question{Name: dnsName, Qtype: qtype, Qclass: dns.ClassINET}
+		if _, ok := m.records[q]; ok {
+			return false
+		}
+	}
+	return true
+}
+
+func (m *Resolver) markResolveFailed(d domain.Domain) {
+	m.mutex.Lock()
+	m.failedResolves[d] = time.Now()
+	m.mutex.Unlock()
+}
+
+func (m *Resolver) clearResolveFailed(d domain.Domain) {
+	m.mutex.Lock()
+	delete(m.failedResolves, d)
+	m.mutex.Unlock()
+}
+
+// pruneFailedResolves drops failure markers for domains no longer present in
+// the server-domains set, keeping the map bounded to the current set (a
+// failed-only domain has no cached record, so RemoveDomain never sees it).
+func (m *Resolver) pruneFailedResolves(domains domain.List) {
+	m.mutex.Lock()
+	defer m.mutex.Unlock()
+	for d := range m.failedResolves {
+		if !slices.Contains(domains, d) {
+			delete(m.failedResolves, d)
 		}
 	}
 }
--- a/client/internal/dns/mgmt/mgmt_refresh_test.go
+++ b/client/internal/dns/mgmt/mgmt_refresh_test.go
@@ -21,6 +21,7 @@ type fakeChain struct {
 	mu       sync.Mutex
 	calls    map[string]int
 	answers  map[string][]dns.RR
+	qErr     map[string]error
 	err      error
 	hasRoot  bool
 	onLookup func()
@@ -30,6 +31,7 @@ func newFakeChain() *fakeChain {
 	return &fakeChain{
 		calls:   map[string]int{},
 		answers: map[string][]dns.RR{},
+		qErr:    map[string]error{},
 		hasRoot: true,
 	}
 }
@@ -47,6 +49,9 @@ func (f *fakeChain) ResolveInternal(ctx context.Context, msg *dns.Msg, maxPriori
 	f.calls[key]++
 	answers := f.answers[key]
 	err := f.err
+	if err == nil {
+		err = f.qErr[key]
+	}
 	onLookup := f.onLookup
 	f.mu.Unlock()

@@ -75,6 +80,12 @@ func (f *fakeChain) setAnswer(name string, qtype uint16, ip string) {
 	}
 }

+func (f *fakeChain) setErr(name string, qtype uint16, err error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.qErr[name+"|"+dns.TypeToString[qtype]] = err
+}
+
 func (f *fakeChain) callCount(name string, qtype uint16) int {
 	f.mu.Lock()
 	defer f.mu.Unlock()
--- a/client/internal/dns/mgmt/mgmt_resolve_test.go
+++ b/client/internal/dns/mgmt/mgmt_resolve_test.go
@@ -0,0 +1,183 @@
+package mgmt
+
+import (
+	"context"
+	"errors"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/miekg/dns"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	dnsconfig "github.com/netbirdio/netbird/client/internal/dns/config"
+	"github.com/netbirdio/netbird/shared/management/domain"
+)
+
+// A domain already in the cache must not be re-resolved on a subsequent server
+// domains update; it is left to the stale-while-revalidate refresh path.
+func TestResolver_UpdateFromServerDomains_SkipsCached(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("signal.example.com.", dns.TypeA, "10.0.0.2")
+	r.SetChainResolver(chain, 50)
+
+	sd := dnsconfig.ServerDomains{Signal: domain.Domain("signal.example.com")}
+
+	_, err := r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	require.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"first update must resolve the domain")
+
+	_, err = r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	assert.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"cached domain must not be re-resolved on a subsequent update")
+}
+
+// New domains in a single update must resolve concurrently rather than serially.
+func TestResolver_AddNewDomains_ResolvesConcurrently(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+
+	var inflight, maxInflight atomic.Int32
+	chain.onLookup = func() {
+		n := inflight.Add(1)
+		for {
+			old := maxInflight.Load()
+			if n <= old || maxInflight.CompareAndSwap(old, n) {
+				break
+			}
+		}
+		time.Sleep(50 * time.Millisecond)
+		inflight.Add(-1)
+	}
+
+	relays := []domain.Domain{"a.example.com", "b.example.com", "c.example.com", "d.example.com"}
+	for _, d := range relays {
+		chain.setAnswer(dns.Fqdn(string(d)), dns.TypeA, "10.0.0.2")
+	}
+	r.SetChainResolver(chain, 50)
+
+	start := time.Now()
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: relays})
+	require.NoError(t, err)
+	elapsed := time.Since(start)
+
+	assert.GreaterOrEqual(t, int(maxInflight.Load()), 2, "domains must resolve concurrently")
+	// Serial resolution of 4 domains would take at least 4*50ms; concurrent is far less.
+	assert.Less(t, elapsed, 300*time.Millisecond, "resolution should not be serial")
+}
+
+// A domain that fails to resolve must not be retried on every update; the
+// failure backoff suppresses re-resolution until it expires.
+func TestResolver_UpdateFromServerDomains_BacksOffFailures(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.err = errors.New("resolve boom")
+	r.SetChainResolver(chain, 50)
+
+	sd := dnsconfig.ServerDomains{Signal: domain.Domain("signal.example.com")}
+
+	_, err := r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	require.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"first update must attempt the resolve")
+
+	_, err = r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	assert.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"failed resolve must back off and not retry on the next update")
+}
+
+// A domain listed under more than one server-domain type (e.g. STUN and TURN on
+// the same host) must be resolved once per update, not once per occurrence.
+func TestResolver_AddNewDomains_DedupesDuplicateDomains(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("dup.example.com.", dns.TypeA, "10.0.0.9")
+	r.SetChainResolver(chain, 50)
+
+	sd := dnsconfig.ServerDomains{
+		Stuns: []domain.Domain{"dup.example.com"},
+		Turns: []domain.Domain{"dup.example.com"},
+	}
+
+	_, err := r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	assert.Equal(t, 1, chain.callCount("dup.example.com.", dns.TypeA),
+		"a domain appearing under multiple server-domain types must resolve once")
+}
+
+// A failure marker must be dropped once its domain leaves the server-domains set
+// so the map stays bounded to the current set.
+func TestResolver_UpdateFromServerDomains_PrunesFailedResolves(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.err = errors.New("resolve boom")
+	r.SetChainResolver(chain, 50)
+
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Signal: domain.Domain("gone.example.com")})
+	require.NoError(t, err)
+	r.mutex.RLock()
+	_, marked := r.failedResolves[domain.Domain("gone.example.com")]
+	r.mutex.RUnlock()
+	require.True(t, marked, "failed resolve must be recorded")
+
+	_, err = r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Signal: domain.Domain("other.example.com")})
+	require.NoError(t, err)
+	r.mutex.RLock()
+	_, stillMarked := r.failedResolves[domain.Domain("gone.example.com")]
+	r.mutex.RUnlock()
+	assert.False(t, stillMarked, "failure marker for a domain no longer in the set must be pruned")
+}
+
+// When one family hard-errors while the other resolves, the domain is cached
+// for the working family but recorded as incomplete so the failed family is
+// retried under backoff instead of being treated as fully resolved forever.
+func TestResolver_AddNewDomains_RetriesPartialFamilyFailure(t *testing.T) {
+	d := domain.Domain("relay.example.com")
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("relay.example.com.", dns.TypeA, "10.0.0.2")
+	chain.setErr("relay.example.com.", dns.TypeAAAA, errors.New("servfail"))
+	r.SetChainResolver(chain, 50)
+
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: []domain.Domain{d}})
+	require.NoError(t, err)
+
+	r.mutex.RLock()
+	_, aCached := r.records[dns.Question{Name: "relay.example.com.", Qtype: dns.TypeA, Qclass: dns.ClassINET}]
+	_, marked := r.failedResolves[d]
+	r.mutex.RUnlock()
+	require.True(t, aCached, "the working family must still be cached")
+	require.True(t, marked, "a partial failure must be recorded so the failed family is retried")
+
+	assert.False(t, r.needsResolve(d), "within the backoff window the domain is not retried")
+
+	r.mutex.Lock()
+	r.failedResolves[d] = time.Now().Add(-2 * refreshBackoff)
+	r.mutex.Unlock()
+	assert.True(t, r.needsResolve(d), "after the backoff elapses the domain is retried to pick up the missing family")
+}
+
+// A family that returns NODATA (legitimately absent, e.g. an IPv4-only host) is
+// not a failure: the domain must not be marked for retry, otherwise it would be
+// re-resolved on every sync.
+func TestResolver_AddNewDomains_NodataIsNotFailure(t *testing.T) {
+	d := domain.Domain("v4only.example.com")
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("v4only.example.com.", dns.TypeA, "10.0.0.2")
+	r.SetChainResolver(chain, 50)
+
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: []domain.Domain{d}})
+	require.NoError(t, err)
+
+	r.mutex.RLock()
+	_, marked := r.failedResolves[d]
+	r.mutex.RUnlock()
+	assert.False(t, marked, "a NODATA family must not be recorded as a failure")
+	assert.False(t, r.needsResolve(d), "an IPv4-only host must not be re-resolved on later syncs")
+}
--- a/client/internal/engine.go
+++ b/client/internal/engine.go
@@ -14,6 +14,7 @@ import (
 	"sort"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"

 	"github.com/hashicorp/go-multierror"
@@ -53,6 +54,7 @@ import (
 	"github.com/netbirdio/netbird/client/internal/relay"
 	"github.com/netbirdio/netbird/client/internal/rosenpass"
 	"github.com/netbirdio/netbird/client/internal/routemanager"
+	"github.com/netbirdio/netbird/client/internal/routemanager/systemops"
 	"github.com/netbirdio/netbird/client/internal/statemanager"
 	"github.com/netbirdio/netbird/client/internal/syncstore"
 	"github.com/netbirdio/netbird/client/internal/updater"
@@ -88,6 +90,13 @@ var ErrResetConnection = fmt.Errorf("reset connection")

 var ErrEngineAlreadyStarted = errors.New("engine already started")

+// engineRestartCount and engineLastRestart track client-restart cadence across
+// engine recreations so a restart loop is distinguishable from rare restarts.
+var (
+	engineRestartCount atomic.Int64
+	engineLastRestart  atomic.Int64
+)
+
 type EngineConfig struct {
 	WgPort      int
 	WgIfaceName string
@@ -909,14 +918,23 @@ func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
 	if e.ctx.Err() != nil {
 		return e.ctx.Err()
 	}
+	serial := update.GetNetworkMap().GetSerial()
+	if nm := update.GetNetworkMap(); nm != nil {
+		log.Infof("sync update: serial=%d remotePeers=%d offlinePeers=%d routes=%d firewallRules=%d checks=%d configPresent=%v remotePeersEmpty=%v",
+			nm.GetSerial(), len(nm.GetRemotePeers()), len(nm.GetOfflinePeers()), len(nm.GetRoutes()),
+			len(nm.GetFirewallRules()), len(update.GetChecks()), update.GetNetbirdConfig() != nil, nm.GetRemotePeersIsEmpty())
+	} else {
+		log.Infof("sync update: config-only (no network map), configPresent=%v", update.GetNetbirdConfig() != nil)
+	}

 	if update.NetworkMap != nil && update.NetworkMap.PeerConfig != nil {
 		e.handleAutoUpdateVersion(update.NetworkMap.PeerConfig.AutoUpdate)
 	}
-
+	startTime := time.Now()
 	if err := e.updateNetbirdConfig(update.GetNetbirdConfig()); err != nil {
 		return err
 	}
+	log.Infof("netbird config updated in %s, serial=%d", time.Since(startTime), serial)

 	// Posture checks are bound to the network map presence:
 	//   NetworkMap != nil, checks present -> apply the received checks
@@ -927,17 +945,21 @@ func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
 	if nm == nil {
 		return nil
 	}
-
+	startTime = time.Now()
 	if err := e.updateChecksIfNew(update.Checks); err != nil {
 		return err
 	}
+	log.Infof("checks updated in %s, serial=%d", time.Since(startTime), serial)

+	startTime = time.Now()
 	e.persistSyncResponse(update)
-
+	log.Infof("sync response persisted in %s, serial=%d", time.Since(startTime), serial)
 	// only apply new changes and ignore old ones
+	startTime = time.Now()
 	if err := e.updateNetworkMap(nm); err != nil {
 		return err
 	}
+	log.Infof("network map updated in %s, serial=%d", time.Since(startTime), serial)

 	e.statusRecorder.PublishEvent(cProto.SystemEvent_INFO, cProto.SystemEvent_SYSTEM, "Network map updated", "", nil)

@@ -1357,44 +1379,56 @@ func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {

 	dnsConfig := toDNSConfig(protoDNSConfig, e.wgInterface.Address())

+	startTime := time.Now()
 	if err := e.dnsServer.UpdateDNSServer(serial, dnsConfig); err != nil {
 		log.Errorf("failed to update dns server, err: %v", err)
 	}
+	log.Infof("updated dns server in %v, serial=%d", time.Since(startTime), serial)

 	e.routeManager.SetDNSForwarderPort(dnsConfig.ForwarderPort)

 	// apply routes first, route related actions might depend on routing being enabled
+	startTime = time.Now()
 	routes := toRoutes(networkMap.GetRoutes())
 	serverRoutes, clientRoutes := e.routeManager.ClassifyRoutes(routes)
-
+	log.Infof("updated routes in %v, serial=%d", time.Since(startTime), serial)
 	// lazy mgr needs to be aware of which routes are available before they are applied
 	if e.connMgr != nil {
 		e.connMgr.UpdateRouteHAMap(clientRoutes)
 		log.Debugf("updated lazy connection manager with %d HA groups", len(clientRoutes))
 	}

+	startTime = time.Now()
 	dnsRouteFeatureFlag := toDNSFeatureFlag(networkMap)
 	if err := e.routeManager.UpdateRoutes(serial, serverRoutes, clientRoutes, dnsRouteFeatureFlag); err != nil {
 		log.Errorf("failed to update routes: %v", err)
 	}
+	log.Infof("updated routes in %v, serial=%d", time.Since(startTime), serial)

+	startTime = time.Now()
 	if e.acl != nil {
 		e.acl.ApplyFiltering(networkMap, dnsRouteFeatureFlag)
 	}
+	log.Infof("updated filtering in %v, serial=%d", time.Since(startTime), serial)

+	startTime = time.Now()
 	fwdEntries := toRouteDomains(e.config.WgPrivateKey.PublicKey().String(), routes)
 	e.updateDNSForwarder(dnsRouteFeatureFlag, fwdEntries)
+	log.Infof("updated DNS forwarder in %v, serial=%d", time.Since(startTime), serial)

+	startTime = time.Now()
 	// Ingress forward rules
 	forwardingRules, err := e.updateForwardRules(networkMap.GetForwardingRules())
 	if err != nil {
 		log.Errorf("failed to update forward rules, err: %v", err)
 	}
+	log.Infof("updated forward rules in %v, serial=%d", time.Since(startTime), serial)

 	log.Debugf("got peers update from Management Service, total peers to connect to = %d", len(networkMap.GetRemotePeers()))

+	startTime = time.Now()
 	e.updateOfflinePeers(networkMap.GetOfflinePeers())
-
+	log.Infof("updated offline peers in %v, serial=%d", time.Since(startTime), serial)
 	// Filter out own peer from the remote peers list
 	localPubKey := e.config.WgPrivateKey.PublicKey().String()
 	remotePeers := make([]*mgmProto.RemotePeerConfig, 0, len(networkMap.GetRemotePeers()))
@@ -1412,20 +1446,24 @@ func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
 			return err
 		}
 	} else {
+		startTime = time.Now()
 		err := e.removePeers(remotePeers)
 		if err != nil {
 			return err
 		}
-
+		log.Infof("removed peers in %v, serial=%d", time.Since(startTime), serial)
+		startTime = time.Now()
 		err = e.modifyPeers(remotePeers)
 		if err != nil {
 			return err
 		}
-
+		log.Infof("modified peers in %v, serial=%d", time.Since(startTime), serial)
+		startTime = time.Now()
 		err = e.addNewPeers(remotePeers)
 		if err != nil {
 			return err
 		}
+		log.Infof("added peers in %v, serial=%d", time.Since(startTime), serial)

 		e.statusRecorder.FinishPeerListModifications()

@@ -1438,9 +1476,11 @@ func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
 		e.updateSSHServerAuth(networkMap.GetSshAuth())
 	}

+	startTime = time.Now()
 	// must set the exclude list after the peers are added. Without it the manager can not figure out the peers parameters from the store
 	excludedLazyPeers := e.toExcludedLazyPeers(forwardingRules, remotePeers)
 	e.connMgr.SetExcludeList(e.ctx, excludedLazyPeers)
+	log.Infof("updated lazy connection manager exclude list in %v, serial=%d", time.Since(startTime), serial)

 	e.networkSerial = serial

@@ -1955,6 +1995,7 @@ func (e *Engine) newWgIface() (*iface.WGIface, error) {
 		WGPrivKey:    e.config.WgPrivateKey.String(),
 		MTU:          e.config.MTU,
 		TransportNet: transportNet,
+		FilterFn:     e.addrViaRoutes,
 		DisableDNS:   e.config.DisableDNS,
 	}

@@ -2171,7 +2212,14 @@ func (e *Engine) triggerClientRestart() {
 		return
 	}

-	log.Info("restarting engine")
+	// Cadence survives engine recreation (package-level), so a restart loop shows
+	// as a fast-climbing count with a short gap, distinct from rare intentional restarts.
+	n := engineRestartCount.Add(1)
+	var sinceLast time.Duration
+	if prev := engineLastRestart.Swap(time.Now().UnixNano()); prev != 0 {
+		sinceLast = time.Since(time.Unix(0, prev))
+	}
+	log.Infof("restarting engine (restart #%d, %s since previous)", n, sinceLast.Round(time.Second))
 	CtxGetState(e.ctx).Set(StatusConnecting)
 	_ = CtxGetState(e.ctx).Wrap(ErrResetConnection)
 	log.Infof("cancelling client context, engine will be recreated")
@@ -2202,6 +2250,21 @@ func (e *Engine) startNetworkMonitor() {
 	}()
 }

+// addrViaRoutes reports whether addr is covered by one of the client routes
+// currently held by the route manager. On a match it returns true together
+// with the prefix reported by systemops.IsAddrRouted; otherwise it returns
+// false and the zero netip.Prefix. The error result is always nil.
+func (e *Engine) addrViaRoutes(addr netip.Addr) (bool, netip.Prefix, error) {
+	var vpnRoutes []netip.Prefix
+	// Only the first route of each entry contributes its Network.
+	// NOTE(review): presumably every route in an entry shares the same network — confirm.
+	for _, routes := range e.routeManager.GetClientRoutes() {
+		if len(routes) > 0 && routes[0] != nil {
+			vpnRoutes = append(vpnRoutes, routes[0].Network)
+		}
+	}
+
+	if isVpn, prefix := systemops.IsAddrRouted(addr, vpnRoutes); isVpn {
+		return true, prefix, nil
+	}
+
+	return false, netip.Prefix{}, nil
+}
+
 func (e *Engine) stopDNSServer() {
 	if e.dnsServer == nil {
 		return
--- a/client/internal/peer/worker_ice.go
+++ b/client/internal/peer/worker_ice.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"net"
+	"net/netip"
 	"strconv"
 	"sync"
 	"time"
@@ -164,6 +165,10 @@ func (w *WorkerICE) OnRemoteCandidate(candidate ice.Candidate, haRoutes route.HA
 		return
 	}

+	if candidateViaRoutes(candidate, haRoutes) {
+		return
+	}
+
 	if err := w.agent.AddRemoteCandidate(candidate); err != nil {
 		w.log.Errorf("error while handling remote candidate")
 		return
@@ -461,7 +466,7 @@ func (w *WorkerICE) createForwardedCandidate(srflxCandidate ice.Candidate, mappi
 }

 func (w *WorkerICE) onICESelectedCandidatePair(agent *icemaker.ThreadSafeAgent, c1, c2 ice.Candidate) {
-	w.log.Debugf("selected candidate pair [local <-> remote] -> [%s <-> %s], peer %s", c1.String(), c2.String(),
+	w.log.Infof("selected candidate pair [local <-> remote] -> [%s <-> %s], peer %s", c1.String(), c2.String(),
 		w.config.Key)

 	pairStat, ok := agent.GetSelectedCandidatePairStats()
@@ -584,6 +589,34 @@ func extraSrflxCandidate(candidate ice.Candidate) (*ice.CandidateServerReflexive
 	return ec, nil
 }

+// candidateViaRoutes reports whether the candidate's address lies inside one
+// of the networks in clientRoutes. A candidate address that cannot be parsed
+// as an IP is logged and treated as not routed (false).
+func candidateViaRoutes(candidate ice.Candidate, clientRoutes route.HAMap) bool {
+	addr, err := netip.ParseAddr(candidate.Address())
+	if err != nil {
+		log.Errorf("Failed to parse IP address %s: %v", candidate.Address(), err)
+		return false
+	}
+
+	// Collect one network per map entry, taken from the entry's first route.
+	var routePrefixes []netip.Prefix
+	for _, routes := range clientRoutes {
+		if len(routes) > 0 && routes[0] != nil {
+			routePrefixes = append(routePrefixes, routes[0].Network)
+		}
+	}
+
+	for _, prefix := range routePrefixes {
+		// default route is handled by route exclusion / ip rules
+		if prefix.Bits() == 0 {
+			continue
+		}
+
+		// The first containing prefix decides the result; remaining prefixes are not checked.
+		if prefix.Contains(addr) {
+			log.Debugf("Ignoring candidate [%s], its address is part of routed network %s", candidate.String(), prefix)
+			return true
+		}
+	}
+	return false
+}
+
 func isRelayCandidate(candidate ice.Candidate) bool {
 	return candidate.Type() == ice.CandidateTypeRelay
 }
--- a/client/internal/routemanager/systemops/systemops_generic.go
+++ b/client/internal/routemanager/systemops/systemops_generic.go
@@ -121,12 +121,9 @@ func (r *SysOps) addRouteToNonVPNIntf(prefix netip.Prefix, vpnIntf wgIface, init
 		return Nexthop{}, vars.ErrRouteNotAllowed
 	}

-	// BSDs blackhole a /32 added inside a directly-connected subnet; Linux/Windows need it to beat the wt0 route.
-	switch runtime.GOOS {
-	case "darwin", "freebsd", "netbsd", "openbsd", "dragonfly":
-		if isLocal, subnet := r.isPrefixInLocalSubnets(prefix); isLocal {
-			return Nexthop{}, fmt.Errorf("prefix %s is part of local subnet %s: %w", prefix, subnet, vars.ErrRouteNotAllowed)
-		}
+	// Check if the prefix is part of any local subnets
+	if isLocal, subnet := r.isPrefixInLocalSubnets(prefix); isLocal {
+		return Nexthop{}, fmt.Errorf("prefix %s is part of local subnet %s: %w", prefix, subnet, vars.ErrRouteNotAllowed)
 	}

 	// Determine the exit interface and next hop for the prefix, so we can add a specific route
--- a/management/internals/shared/grpc/server.go
+++ b/management/internals/shared/grpc/server.go
@@ -1205,7 +1205,7 @@ func (s *Server) SyncMeta(ctx context.Context, req *proto.EncryptedMessage) (*pr
 		return nil, msg
 	}

-	err = s.accountManager.SyncPeerMeta(ctx, peerKey.String(), extractPeerMeta(ctx, syncMetaReq.GetMeta()))
+	err = s.accountManager.SyncPeerMeta(ctx, peerKey.String(), extractPeerMeta(ctx, syncMetaReq.GetMeta()), realIP)
 	if err != nil {
 		return nil, mapError(ctx, err)
 	}
@@ -1254,7 +1254,10 @@ func (s *Server) Logout(ctx context.Context, req *proto.EncryptedMessage) (*prot
 func toProtocolChecks(ctx context.Context, postureChecks []*posture.Checks) []*proto.Checks {
 	protoChecks := make([]*proto.Checks, 0, len(postureChecks))
 	for _, postureCheck := range postureChecks {
-		protoChecks = append(protoChecks, toProtocolCheck(postureCheck))
+		check := toProtocolCheck(postureCheck)
+		if check != nil {
+			protoChecks = append(protoChecks, check)
+		}
 	}

 	return protoChecks
@@ -1278,5 +1281,9 @@ func toProtocolCheck(postureCheck *posture.Checks) *proto.Checks {
 		}
 	}

+	if len(protoCheck.Files) == 0 {
+		return nil
+	}
+
 	return protoCheck
 }
--- a/management/server/account.go
+++ b/management/server/account.go
@@ -1889,12 +1889,12 @@ func domainIsUpToDate(domain string, domainCategory string, userAuth auth.UserAu
 // concurrent stream that started earlier loses the optimistic-lock race
 // in MarkPeerConnected and bails without writing.
 func (am *DefaultAccountManager) SyncAndMarkPeer(ctx context.Context, accountID string, peerPubKey string, meta nbpeer.PeerSystemMeta, realIP net.IP, syncTime time.Time) (*nbpeer.Peer, *types.NetworkMap, []*posture.Checks, int64, error) {
-	peer, netMap, postureChecks, dnsfwdPort, err := am.SyncPeer(ctx, types.PeerSync{WireGuardPubKey: peerPubKey, Meta: meta}, accountID)
+	peer, netMap, postureChecks, dnsfwdPort, err := am.SyncPeer(ctx, types.PeerSync{WireGuardPubKey: peerPubKey, Meta: meta, RealIP: realIP}, accountID)
 	if err != nil {
 		return nil, nil, nil, 0, fmt.Errorf("error syncing peer: %w", err)
 	}

-	if err := am.MarkPeerConnected(ctx, peerPubKey, realIP, accountID, syncTime.UnixNano(), netMap); err != nil {
+	if err := am.MarkPeerConnected(ctx, peerPubKey, accountID, syncTime.UnixNano(), netMap); err != nil {
 		log.WithContext(ctx).Warnf("failed marking peer as connected %s %v", peerPubKey, err)
 	}

@@ -1914,13 +1914,13 @@ func (am *DefaultAccountManager) OnPeerDisconnected(ctx context.Context, account
 	return nil
 }

-func (am *DefaultAccountManager) SyncPeerMeta(ctx context.Context, peerPubKey string, meta nbpeer.PeerSystemMeta) error {
+func (am *DefaultAccountManager) SyncPeerMeta(ctx context.Context, peerPubKey string, meta nbpeer.PeerSystemMeta, realIP net.IP) error {
 	accountID, err := am.Store.GetAccountIDByPeerPubKey(ctx, peerPubKey)
 	if err != nil {
 		return err
 	}

-	_, _, _, _, err = am.SyncPeer(ctx, types.PeerSync{WireGuardPubKey: peerPubKey, Meta: meta, UpdateAccountPeers: true}, accountID)
+	_, _, _, _, err = am.SyncPeer(ctx, types.PeerSync{WireGuardPubKey: peerPubKey, Meta: meta, RealIP: realIP, UpdateAccountPeers: true}, accountID)
 	if err != nil {
 		return err
 	}
--- a/management/server/account/manager.go
+++ b/management/server/account/manager.go
@@ -62,7 +62,7 @@ type Manager interface {
 	GetUserFromUserAuth(ctx context.Context, userAuth auth.UserAuth) (*types.User, error)
 	ListUsers(ctx context.Context, accountID string) ([]*types.User, error)
 	GetPeers(ctx context.Context, accountID, userID, nameFilter, ipFilter string) ([]*nbpeer.Peer, error)
-	MarkPeerConnected(ctx context.Context, peerKey string, realIP net.IP, accountID string, sessionStartedAt int64, nmap *types.NetworkMap) error
+	MarkPeerConnected(ctx context.Context, peerKey string, accountID string, sessionStartedAt int64, nmap *types.NetworkMap) error
 	MarkPeerDisconnected(ctx context.Context, peerKey string, accountID string, sessionStartedAt int64) error
 	DeletePeer(ctx context.Context, accountID, peerID, userID string) error
 	UpdatePeer(ctx context.Context, accountID, userID string, p *nbpeer.Peer) (*nbpeer.Peer, error)
@@ -123,7 +123,7 @@ type Manager interface {
 	GetValidatedPeers(ctx context.Context, accountID string) (map[string]struct{}, map[string]string, error)
 	SyncAndMarkPeer(ctx context.Context, accountID string, peerPubKey string, meta nbpeer.PeerSystemMeta, realIP net.IP, syncTime time.Time) (*nbpeer.Peer, *types.NetworkMap, []*posture.Checks, int64, error)
 	OnPeerDisconnected(ctx context.Context, accountID string, peerPubKey string, streamStartTime time.Time) error
-	SyncPeerMeta(ctx context.Context, peerPubKey string, meta nbpeer.PeerSystemMeta) error
+	SyncPeerMeta(ctx context.Context, peerPubKey string, meta nbpeer.PeerSystemMeta, realIP net.IP) error
 	FindExistingPostureCheck(accountID string, checks *posture.ChecksDefinition) (*posture.Checks, error)
 	GetAccountIDForPeerKey(ctx context.Context, peerKey string) (string, error)
 	GetAccountSettings(ctx context.Context, accountID string, userID string) (*types.Settings, error)
--- a/management/server/account/manager_mock.go
+++ b/management/server/account/manager_mock.go
@@ -1323,17 +1323,17 @@ func (mr *MockManagerMockRecorder) ExtendPeerSession(ctx, peerPubKey, userID int
 }

 // MarkPeerConnected mocks base method.
-func (m *MockManager) MarkPeerConnected(ctx context.Context, peerKey string, realIP net.IP, accountID string, sessionStartedAt int64, nmap *types.NetworkMap) error {
+func (m *MockManager) MarkPeerConnected(ctx context.Context, peerKey string, accountID string, sessionStartedAt int64, nmap *types.NetworkMap) error {
 	m.ctrl.T.Helper()
-	ret := m.ctrl.Call(m, "MarkPeerConnected", ctx, peerKey, realIP, accountID, sessionStartedAt, nmap)
+	ret := m.ctrl.Call(m, "MarkPeerConnected", ctx, peerKey, accountID, sessionStartedAt, nmap)
 	ret0, _ := ret[0].(error)
 	return ret0
 }

 // MarkPeerConnected indicates an expected call of MarkPeerConnected.
-func (mr *MockManagerMockRecorder) MarkPeerConnected(ctx, peerKey, realIP, accountID, sessionStartedAt, nmap interface{}) *gomock.Call {
+func (mr *MockManagerMockRecorder) MarkPeerConnected(ctx, peerKey, accountID, sessionStartedAt, nmap interface{}) *gomock.Call {
 	mr.mock.ctrl.T.Helper()
-	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MarkPeerConnected", reflect.TypeOf((*MockManager)(nil).MarkPeerConnected), ctx, peerKey, realIP, accountID, sessionStartedAt, nmap)
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MarkPeerConnected", reflect.TypeOf((*MockManager)(nil).MarkPeerConnected), ctx, peerKey, accountID, sessionStartedAt, nmap)
 }

 // MarkPeerDisconnected mocks base method.
@@ -1586,17 +1586,17 @@ func (mr *MockManagerMockRecorder) SyncPeer(ctx, sync, accountID interface{}) *g
 }

 // SyncPeerMeta mocks base method.
-func (m *MockManager) SyncPeerMeta(ctx context.Context, peerPubKey string, meta peer.PeerSystemMeta) error {
+func (m *MockManager) SyncPeerMeta(ctx context.Context, peerPubKey string, meta peer.PeerSystemMeta, realIP net.IP) error {
 	m.ctrl.T.Helper()
-	ret := m.ctrl.Call(m, "SyncPeerMeta", ctx, peerPubKey, meta)
+	ret := m.ctrl.Call(m, "SyncPeerMeta", ctx, peerPubKey, meta, realIP)
 	ret0, _ := ret[0].(error)
 	return ret0
 }

 // SyncPeerMeta indicates an expected call of SyncPeerMeta.
-func (mr *MockManagerMockRecorder) SyncPeerMeta(ctx, peerPubKey, meta interface{}) *gomock.Call {
+func (mr *MockManagerMockRecorder) SyncPeerMeta(ctx, peerPubKey, meta, realIP interface{}) *gomock.Call {
 	mr.mock.ctrl.T.Helper()
-	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SyncPeerMeta", reflect.TypeOf((*MockManager)(nil).SyncPeerMeta), ctx, peerPubKey, meta)
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SyncPeerMeta", reflect.TypeOf((*MockManager)(nil).SyncPeerMeta), ctx, peerPubKey, meta, realIP)
 }

 // SyncUserJWTGroups mocks base method.
--- a/management/server/account_test.go
+++ b/management/server/account_test.go
@@ -1836,7 +1836,7 @@ func TestDefaultAccountManager_UpdatePeer_PeerLoginExpiration(t *testing.T) {
 	accountID, err := manager.GetAccountIDByUserID(context.Background(), auth.UserAuth{UserId: userID})
 	require.NoError(t, err, "unable to get the account")

-	err = manager.MarkPeerConnected(context.Background(), key.PublicKey().String(), nil, accountID, time.Now().UTC().UnixNano(), nil)
+	err = manager.MarkPeerConnected(context.Background(), key.PublicKey().String(), accountID, time.Now().UTC().UnixNano(), nil)
 	require.NoError(t, err, "unable to mark peer connected")

 	_, err = manager.UpdateAccountSettings(context.Background(), accountID, userID, &types.Settings{
@@ -1907,7 +1907,7 @@ func TestDefaultAccountManager_MarkPeerConnected_PeerLoginExpiration(t *testing.
 	require.NoError(t, err, "unable to get the account")

 	// when we mark peer as connected, the peer login expiration routine should trigger
-	err = manager.MarkPeerConnected(context.Background(), key.PublicKey().String(), nil, accountID, time.Now().UTC().UnixNano(), nil)
+	err = manager.MarkPeerConnected(context.Background(), key.PublicKey().String(), accountID, time.Now().UTC().UnixNano(), nil)
 	require.NoError(t, err, "unable to mark peer connected")

 	failed := waitTimeout(wg, time.Second)
@@ -1916,6 +1916,117 @@ func TestDefaultAccountManager_MarkPeerConnected_PeerLoginExpiration(t *testing.
 	}
 }

+// TestDefaultAccountManager_MarkPeerDisconnected_SchedulesInactivityExpiration
+// checks that MarkPeerDisconnected calls the inactivity-expiry scheduler when
+// both the peer and the account settings have inactivity expiration enabled.
+// The test fails if no Schedule call is observed within one second.
+func TestDefaultAccountManager_MarkPeerDisconnected_SchedulesInactivityExpiration(t *testing.T) {
+	manager, _, err := createManager(t)
+	require.NoError(t, err, "unable to create account manager")
+
+	accountID, err := manager.GetAccountIDByUserID(context.Background(), auth.UserAuth{UserId: userID})
+	require.NoError(t, err, "unable to create an account")
+
+	key, err := wgtypes.GenerateKey()
+	require.NoError(t, err, "unable to generate WireGuard key")
+	peerPubKey := key.PublicKey().String()
+
+	_, _, _, _, err = manager.AddPeer(context.Background(), "", "", userID, &nbpeer.Peer{
+		Key:                         peerPubKey,
+		Meta:                        nbpeer.PeerSystemMeta{Hostname: "test-peer"},
+		InactivityExpirationEnabled: true,
+	}, false)
+	require.NoError(t, err, "unable to add peer")
+
+	_, err = manager.UpdateAccountSettings(context.Background(), accountID, userID, &types.Settings{
+		PeerLoginExpiration:             time.Hour,
+		PeerLoginExpirationEnabled:      true,
+		PeerInactivityExpiration:        time.Hour,
+		PeerInactivityExpirationEnabled: true,
+		Extra:                           &types.ExtraSettings{},
+	})
+	require.NoError(t, err, "expecting to update account settings successfully but got error")
+
+	// Establish a session so the matching-token disconnect is actually applied.
+	streamStartTime := time.Now().UTC()
+	err = manager.MarkPeerConnected(context.Background(), peerPubKey, accountID, streamStartTime.UnixNano(), nil)
+	require.NoError(t, err, "unable to mark peer connected")
+
+	// Install the mock only now, so the assertion observes the disconnect, not
+	// the earlier connect.
+	// The channel has capacity 1 and the send is non-blocking, so repeated
+	// Schedule calls never block the code under test.
+	scheduled := make(chan struct{}, 1)
+	manager.peerInactivityExpiry = &MockScheduler{
+		CancelFunc: func(ctx context.Context, IDs []string) {},
+		ScheduleFunc: func(ctx context.Context, in time.Duration, ID string, job func() (nextRunIn time.Duration, reschedule bool)) {
+			select {
+			case scheduled <- struct{}{}:
+			default:
+			}
+		},
+	}
+
+	// Disconnect with the same session timestamp that was used for the connect.
+	err = manager.MarkPeerDisconnected(context.Background(), peerPubKey, accountID, streamStartTime.UnixNano())
+	require.NoError(t, err, "unable to mark peer disconnected")
+
+	select {
+	case <-scheduled:
+		// expected: disconnect re-armed the inactivity expiry timer
+	case <-time.After(time.Second):
+		t.Fatal("expected inactivity expiration to be rescheduled when an eligible peer disconnects")
+	}
+}
+
+// TestDefaultAccountManager_MarkPeerDisconnected_SkipsInactivityExpirationWhenDisabled
+// checks that MarkPeerDisconnected does not call the inactivity-expiry
+// scheduler when the account setting PeerInactivityExpirationEnabled is false,
+// even though the peer itself has InactivityExpirationEnabled set. The test
+// passes if no Schedule call is observed within 200ms of the disconnect.
+func TestDefaultAccountManager_MarkPeerDisconnected_SkipsInactivityExpirationWhenDisabled(t *testing.T) {
+	manager, _, err := createManager(t)
+	require.NoError(t, err, "unable to create account manager")
+
+	accountID, err := manager.GetAccountIDByUserID(context.Background(), auth.UserAuth{UserId: userID})
+	require.NoError(t, err, "unable to create an account")
+
+	key, err := wgtypes.GenerateKey()
+	require.NoError(t, err, "unable to generate WireGuard key")
+	peerPubKey := key.PublicKey().String()
+
+	_, _, _, _, err = manager.AddPeer(context.Background(), "", "", userID, &nbpeer.Peer{
+		Key:                         peerPubKey,
+		Meta:                        nbpeer.PeerSystemMeta{Hostname: "test-peer"},
+		InactivityExpirationEnabled: true,
+	}, false)
+	require.NoError(t, err, "unable to add peer")
+
+	// Peer is eligible (SSO + inactivity enabled) but the account-level setting
+	// stays disabled, so disconnect must not schedule anything.
+	_, err = manager.UpdateAccountSettings(context.Background(), accountID, userID, &types.Settings{
+		PeerLoginExpiration:             time.Hour,
+		PeerLoginExpirationEnabled:      true,
+		PeerInactivityExpiration:        time.Hour,
+		PeerInactivityExpirationEnabled: false,
+		Extra:                           &types.ExtraSettings{},
+	})
+	require.NoError(t, err, "expecting to update account settings successfully but got error")
+
+	// Connect first so the disconnect below carries a matching session timestamp.
+	streamStartTime := time.Now().UTC()
+	err = manager.MarkPeerConnected(context.Background(), peerPubKey, accountID, streamStartTime.UnixNano(), nil)
+	require.NoError(t, err, "unable to mark peer connected")
+
+	// The mock is installed after the connect, so only Schedule calls made from
+	// this point on are recorded.
+	scheduled := make(chan struct{}, 1)
+	manager.peerInactivityExpiry = &MockScheduler{
+		CancelFunc: func(ctx context.Context, IDs []string) {},
+		ScheduleFunc: func(ctx context.Context, in time.Duration, ID string, job func() (nextRunIn time.Duration, reschedule bool)) {
+			select {
+			case scheduled <- struct{}{}:
+			default:
+			}
+		},
+	}
+
+	err = manager.MarkPeerDisconnected(context.Background(), peerPubKey, accountID, streamStartTime.UnixNano())
+	require.NoError(t, err, "unable to mark peer disconnected")
+
+	select {
+	case <-scheduled:
+		t.Fatal("inactivity expiration must not be scheduled while the account-level setting is disabled")
+	case <-time.After(200 * time.Millisecond):
+		// expected: nothing scheduled
+	}
+}
+
 func TestDefaultAccountManager_OnPeerDisconnected_LastSeenCheck(t *testing.T) {
 	manager, _, err := createManager(t)
 	require.NoError(t, err, "unable to create account manager")
@@ -1935,7 +2046,7 @@ func TestDefaultAccountManager_OnPeerDisconnected_LastSeenCheck(t *testing.T) {

 	t.Run("disconnect peer when session token matches", func(t *testing.T) {
 		streamStartTime := time.Now().UTC()
-		err = manager.MarkPeerConnected(context.Background(), peerPubKey, nil, accountID, streamStartTime.UnixNano(), nil)
+		err = manager.MarkPeerConnected(context.Background(), peerPubKey, accountID, streamStartTime.UnixNano(), nil)
 		require.NoError(t, err, "unable to mark peer connected")

 		peer, err := manager.Store.GetPeerByPeerPubKey(context.Background(), store.LockingStrengthNone, peerPubKey)
@@ -1956,7 +2067,7 @@ func TestDefaultAccountManager_OnPeerDisconnected_LastSeenCheck(t *testing.T) {
 	t.Run("skip disconnect when stored session is newer (zombie stream protection)", func(t *testing.T) {
 		// Newer stream wins on connect (sets SessionStartedAt = now ns).
 		streamStartTime := time.Now().UTC()
-		err = manager.MarkPeerConnected(context.Background(), peerPubKey, nil, accountID, streamStartTime.UnixNano(), nil)
+		err = manager.MarkPeerConnected(context.Background(), peerPubKey, accountID, streamStartTime.UnixNano(), nil)
 		require.NoError(t, err, "unable to mark peer connected")

 		peer, err := manager.Store.GetPeerByPeerPubKey(context.Background(), store.LockingStrengthNone, peerPubKey)
@@ -1980,7 +2091,7 @@ func TestDefaultAccountManager_OnPeerDisconnected_LastSeenCheck(t *testing.T) {

 	t.Run("skip stale connect when stored session is newer (blocked goroutine protection)", func(t *testing.T) {
 		node2SyncTime := time.Now().UTC()
-		err = manager.MarkPeerConnected(context.Background(), peerPubKey, nil, accountID, node2SyncTime.UnixNano(), nil)
+		err = manager.MarkPeerConnected(context.Background(), peerPubKey, accountID, node2SyncTime.UnixNano(), nil)
 		require.NoError(t, err, "node 2 should connect peer")

 		peer, err := manager.Store.GetPeerByPeerPubKey(context.Background(), store.LockingStrengthNone, peerPubKey)
@@ -1990,7 +2101,7 @@ func TestDefaultAccountManager_OnPeerDisconnected_LastSeenCheck(t *testing.T) {
 			"SessionStartedAt should equal node2SyncTime token")

 		node1StaleSyncTime := node2SyncTime.Add(-1 * time.Minute)
-		err = manager.MarkPeerConnected(context.Background(), peerPubKey, nil, accountID, node1StaleSyncTime.UnixNano(), nil)
+		err = manager.MarkPeerConnected(context.Background(), peerPubKey, accountID, node1StaleSyncTime.UnixNano(), nil)
 		require.NoError(t, err, "stale connect should not return error")

 		peer, err = manager.Store.GetPeerByPeerPubKey(context.Background(), store.LockingStrengthNone, peerPubKey)
@@ -2052,7 +2163,7 @@ func TestDefaultAccountManager_MarkPeerConnected_ConcurrentRace(t *testing.T) {
 			defer done.Done()
 			ready.Done()
 			start.Wait()
-			errs <- manager.MarkPeerConnected(context.Background(), peerPubKey, nil, accountID, token, nil)
+			errs <- manager.MarkPeerConnected(context.Background(), peerPubKey, accountID, token, nil)
 		}()
 	}

@@ -2093,7 +2204,7 @@ func TestDefaultAccountManager_UpdateAccountSettings_PeerLoginExpiration(t *test
 	account, err := manager.Store.GetAccount(context.Background(), accountID)
 	require.NoError(t, err, "unable to get the account")

-	err = manager.MarkPeerConnected(context.Background(), key.PublicKey().String(), nil, accountID, time.Now().UTC().UnixNano(), nil)
+	err = manager.MarkPeerConnected(context.Background(), key.PublicKey().String(), accountID, time.Now().UTC().UnixNano(), nil)
 	require.NoError(t, err, "unable to mark peer connected")

 	wg := &sync.WaitGroup{}
--- a/management/server/mock_server/account_mock.go
+++ b/management/server/mock_server/account_mock.go
@@ -39,7 +39,7 @@ type MockAccountManager struct {
 	GetUserFromUserAuthFunc               func(ctx context.Context, userAuth auth.UserAuth) (*types.User, error)
 	ListUsersFunc                         func(ctx context.Context, accountID string) ([]*types.User, error)
 	GetPeersFunc                          func(ctx context.Context, accountID, userID, nameFilter, ipFilter string) ([]*nbpeer.Peer, error)
-	MarkPeerConnectedFunc                 func(ctx context.Context, peerKey string, realIP net.IP, accountID string, sessionStartedAt int64, nmap *types.NetworkMap) error
+	MarkPeerConnectedFunc                 func(ctx context.Context, peerKey string, accountID string, sessionStartedAt int64, nmap *types.NetworkMap) error
 	MarkPeerDisconnectedFunc              func(ctx context.Context, peerKey string, accountID string, sessionStartedAt int64) error
 	SyncAndMarkPeerFunc                   func(ctx context.Context, accountID string, peerPubKey string, meta nbpeer.PeerSystemMeta, realIP net.IP, syncTime time.Time) (*nbpeer.Peer, *types.NetworkMap, []*posture.Checks, int64, error)
 	DeletePeerFunc                        func(ctx context.Context, accountID, peerKey, userID string) error
@@ -114,7 +114,7 @@ type MockAccountManager struct {
 	GetIdpManagerFunc                     func() idp.Manager
 	UpdateIntegratedValidatorFunc         func(ctx context.Context, accountID, userID, validator string, groups []string) error
 	GroupValidationFunc                   func(ctx context.Context, accountId string, groups []string) (bool, error)
-	SyncPeerMetaFunc                      func(ctx context.Context, peerPubKey string, meta nbpeer.PeerSystemMeta) error
+	SyncPeerMetaFunc                      func(ctx context.Context, peerPubKey string, meta nbpeer.PeerSystemMeta, realIP net.IP) error
 	FindExistingPostureCheckFunc          func(accountID string, checks *posture.ChecksDefinition) (*posture.Checks, error)
 	GetAccountIDForPeerKeyFunc            func(ctx context.Context, peerKey string) (string, error)
 	GetAccountByIDFunc                    func(ctx context.Context, accountID string, userID string) (*types.Account, error)
@@ -345,9 +345,9 @@ func (am *MockAccountManager) GetAccountIDByUserID(ctx context.Context, userAuth
 }

 // MarkPeerConnected mock implementation of MarkPeerConnected from server.AccountManager interface
-func (am *MockAccountManager) MarkPeerConnected(ctx context.Context, peerKey string, realIP net.IP, accountID string, sessionStartedAt int64, nmap *types.NetworkMap) error {
+func (am *MockAccountManager) MarkPeerConnected(ctx context.Context, peerKey string, accountID string, sessionStartedAt int64, nmap *types.NetworkMap) error {
 	if am.MarkPeerConnectedFunc != nil {
-		return am.MarkPeerConnectedFunc(ctx, peerKey, realIP, accountID, sessionStartedAt, nmap)
+		return am.MarkPeerConnectedFunc(ctx, peerKey, accountID, sessionStartedAt, nmap)
 	}
 	return status.Errorf(codes.Unimplemented, "method MarkPeerConnected is not implemented")
 }
@@ -975,9 +975,9 @@ func (am *MockAccountManager) GroupValidation(ctx context.Context, accountId str
 }

 // SyncPeerMeta mocks SyncPeerMeta of the AccountManager interface
-func (am *MockAccountManager) SyncPeerMeta(ctx context.Context, peerPubKey string, meta nbpeer.PeerSystemMeta) error {
+func (am *MockAccountManager) SyncPeerMeta(ctx context.Context, peerPubKey string, meta nbpeer.PeerSystemMeta, realIP net.IP) error {
 	if am.SyncPeerMetaFunc != nil {
-		return am.SyncPeerMetaFunc(ctx, peerPubKey, meta)
+		return am.SyncPeerMetaFunc(ctx, peerPubKey, meta, realIP)
 	}
 	return status.Errorf(codes.Unimplemented, "method SyncPeerMeta is not implemented")
 }
--- a/management/server/peer.go
+++ b/management/server/peer.go
@@ -74,7 +74,7 @@ func (am *DefaultAccountManager) GetPeers(ctx context.Context, accountID, userID
 //
 // Disconnects use MarkPeerDisconnected and require the session to match
 // exactly; see PeerStatus.SessionStartedAt for the protocol.
-func (am *DefaultAccountManager) MarkPeerConnected(ctx context.Context, peerPubKey string, realIP net.IP, accountID string, sessionStartedAt int64, nmap *types.NetworkMap) error {
+func (am *DefaultAccountManager) MarkPeerConnected(ctx context.Context, peerPubKey string, accountID string, sessionStartedAt int64, nmap *types.NetworkMap) error {
 	start := time.Now()
 	defer func() {
 		am.metrics.AccountManagerMetrics().RecordPeerStatusUpdateDuration(telemetry.PeerStatusConnect, time.Since(start))
@@ -102,10 +102,6 @@ func (am *DefaultAccountManager) MarkPeerConnected(ctx context.Context, peerPubK
 	}
 	am.metrics.AccountManagerMetrics().CountPeerStatusUpdate(telemetry.PeerStatusConnect, telemetry.PeerStatusApplied)

-	if am.geo != nil && realIP != nil {
-		am.updatePeerLocationIfChanged(ctx, accountID, peer, realIP)
-	}
-
 	if err = am.schedulePeerExpirations(ctx, accountID, peer); err != nil {
 		return err
 	}
@@ -192,27 +188,40 @@ func (am *DefaultAccountManager) MarkPeerDisconnected(ctx context.Context, peerP
 		}
 	}

+	if peer.AddedWithSSOLogin() && peer.InactivityExpirationEnabled {
+		settings, err := am.Store.GetAccountSettings(ctx, store.LockingStrengthNone, accountID)
+		if err != nil {
+			log.WithContext(ctx).Warnf("failed getting account settings to schedule inactivity expiration for peer %s: %v", peer.ID, err)
+		} else if settings.PeerInactivityExpirationEnabled {
+			am.checkAndSchedulePeerInactivityExpiration(ctx, accountID)
+		}
+	}
+
 	return nil
 }

-// updatePeerLocationIfChanged refreshes the geolocation on a separate
-// row update, only when the connection IP actually changed. Geo lookups
-// are expensive so we skip same-IP reconnects.
-func (am *DefaultAccountManager) updatePeerLocationIfChanged(ctx context.Context, accountID string, peer *nbpeer.Peer, realIP net.IP) {
+// resolvePeerLocation looks up the geo location for realIP, returning nil when
+// there is nothing to apply: geo disabled, no real IP, the IP is unchanged from
+// what the peer already has, or the lookup failed. Geo lookups are skipped on
+// same-IP reconnects since they are comparatively expensive. The returned value
+// is applied by Peer.UpdateMetaIfNew so the change is persisted by its peer save.
+func (am *DefaultAccountManager) resolvePeerLocation(ctx context.Context, peer *nbpeer.Peer, realIP net.IP) *nbpeer.Location {
+	if am.geo == nil || realIP == nil {
+		return nil
+	}
 	if peer.Location.ConnectionIP != nil && peer.Location.ConnectionIP.Equal(realIP) {
-		return
+		return nil
 	}
 	location, err := am.geo.Lookup(realIP)
 	if err != nil {
 		log.WithContext(ctx).Warnf("failed to get location for peer %s realip: [%s]: %v", peer.ID, realIP.String(), err)
-		return
+		return nil
 	}
-	peer.Location.ConnectionIP = realIP
-	peer.Location.CountryCode = location.Country.ISOCode
-	peer.Location.CityName = location.City.Names.En
-	peer.Location.GeoNameID = location.City.GeonameID
-	if err := am.Store.SavePeerLocation(ctx, accountID, peer); err != nil {
-		log.WithContext(ctx).Warnf("could not store location for peer %s: %s", peer.ID, err)
+	return &nbpeer.Location{
+		ConnectionIP: realIP,
+		CountryCode:  location.Country.ISOCode,
+		CityName:     location.City.Names.En,
+		GeoNameID:    location.City.GeonameID,
 	}
 }

@@ -980,7 +989,8 @@ func getPeerIPDNSLabel(ip netip.Addr, peerHostName string) (string, error) {
 // SyncPeer checks whether peer is eligible for receiving NetworkMap (authenticated) and returns its NetworkMap if eligible
 func (am *DefaultAccountManager) SyncPeer(ctx context.Context, sync types.PeerSync, accountID string) (*nbpeer.Peer, *types.NetworkMap, []*posture.Checks, int64, error) {
 	var peer *nbpeer.Peer
-	var updated, versionChanged, ipv6CapabilityChanged bool
+	var ipv6CapabilityChanged bool
+	var metaDiff nbpeer.MetaDiff
 	var err error

 	settings, err := am.Store.GetAccountSettings(ctx, store.LockingStrengthNone, accountID)
@@ -1010,9 +1020,10 @@ func (am *DefaultAccountManager) SyncPeer(ctx context.Context, sync types.PeerSy
 		}

 		oldHasIPv6Cap := peer.HasCapability(nbpeer.PeerCapabilityIPv6Overlay)
-		updated, versionChanged = peer.UpdateMetaIfNew(ctx, sync.Meta)
+		newLocation := am.resolvePeerLocation(ctx, peer, sync.RealIP)
+		metaDiff = peer.UpdateMetaIfNew(ctx, sync.Meta, newLocation)
 		ipv6CapabilityChanged = oldHasIPv6Cap != peer.HasCapability(nbpeer.PeerCapabilityIPv6Overlay)
-		if updated {
+		if metaDiff.Updated() {
 			am.metrics.AccountManagerMetrics().CountPeerMetUpdate()
 			log.WithContext(ctx).Tracef("peer %s metadata updated", peer.ID)
 			if err = transaction.SavePeer(ctx, accountID, peer); err != nil {
@@ -1040,9 +1051,10 @@ func (am *DefaultAccountManager) SyncPeer(ctx context.Context, sync types.PeerSy
 		return nil, nil, nil, 0, err
 	}

-	if isStatusChanged || sync.UpdateAccountPeers || ipv6CapabilityChanged || (updated && (len(resPostureChecks) > 0 || versionChanged)) {
+	metaDiffAffectsPosture := posture.AffectsPosture(&metaDiff, resPostureChecks)
+	if isStatusChanged || sync.UpdateAccountPeers || ipv6CapabilityChanged || metaDiffAffectsPosture || metaDiff.VersionChanged || metaDiff.Hostname {
 		changedPeerIDs := []string{peer.ID}
-		affectedPeerIDs := am.syncPeerAffectedPeers(ctx, accountID, peer.ID, nmap, peerNotValid, updated, len(resPostureChecks) > 0)
+		affectedPeerIDs := am.syncPeerAffectedPeers(ctx, accountID, peer.ID, nmap, peerNotValid, metaDiffAffectsPosture)
 		if err = am.networkMapController.OnPeersUpdated(ctx, accountID, changedPeerIDs, affectedPeerIDs); err != nil {
 			return nil, nil, nil, 0, fmt.Errorf("notify network map controller of peer update: %w", err)
 		}
@@ -1059,8 +1071,8 @@ func (am *DefaultAccountManager) SyncPeer(ctx context.Context, sync types.PeerSy
 // metadata change that flips a posture result removes this peer from others'
 // maps asymmetrically; that case (and an invalid peer, whose map is empty) falls
 // back to the resolver.
-func (am *DefaultAccountManager) syncPeerAffectedPeers(ctx context.Context, accountID, peerID string, nmap *types.NetworkMap, peerNotValid, metaUpdated, hasPostureChecks bool) []string {
-	if peerNotValid || (metaUpdated && hasPostureChecks) {
+func (am *DefaultAccountManager) syncPeerAffectedPeers(ctx context.Context, accountID, peerID string, nmap *types.NetworkMap, peerNotValid, metaChangeAffectedPosture bool) []string {
+	if peerNotValid || metaChangeAffectedPosture {
 		return am.resolveAffectedPeersForPeerChanges(ctx, am.Store, accountID, []string{peerID})
 	}
 	return affectedPeerIDsFromNetworkMap(nmap, peerID)
--- a/management/server/peer/peer.go
+++ b/management/server/peer/peer.go
@@ -256,14 +256,18 @@ func (p *Peer) Copy() *Peer {
 	}
 }

-// UpdateMetaIfNew updates peer's system metadata if new information is provided
-// returns true if meta was updated, false otherwise
-func (p *Peer) UpdateMetaIfNew(ctx context.Context, meta PeerSystemMeta) (updated, versionChanged bool) {
+// UpdateMetaIfNew updates peer's system metadata and connection geo location if
+// new information is provided. newLocation is the geo location resolved from the
+// peer's current connection IP, or nil when there is nothing to apply (geo
+// disabled, no real IP, unchanged IP, or a failed lookup); the caller owns the expensive
+// lookup and the same-IP guard. It returns a MetaDiff describing what changed;
+// diff.Updated() reports whether the peer needs to be persisted.
+func (p *Peer) UpdateMetaIfNew(ctx context.Context, meta PeerSystemMeta, newLocation *Location) MetaDiff {
 	if meta.isEmpty() {
-		return updated, versionChanged
+		return MetaDiff{}
 	}

-	versionChanged = p.Meta.WtVersion != meta.WtVersion
+	versionChanged := p.Meta.WtVersion != meta.WtVersion

 	// Avoid overwriting UIVersion if the update was triggered sole by the CLI client
 	if meta.UIVersion == "" {
@@ -272,97 +276,177 @@ func (p *Peer) UpdateMetaIfNew(ctx context.Context, meta PeerSystemMeta) (update

 	oldVersion := p.Meta.WtVersion

-	diff := metaDiff(p.Meta, meta)
-	if len(diff) != 0 {
+	diff := diffMeta(p.Meta, meta)
+	if diff.Any() {
 		p.Meta = meta
-		updated = true
+	}
+	diff.VersionChanged = versionChanged
+
+	locationInfo := ""
+	if newLocation != nil {
+		p.Location = *newLocation
+		diff.LocationChanged = true
+		locationInfo = fmt.Sprintf("location changed to %s, ", newLocation.ConnectionIP)
 	}

 	versionInfo := ""
-	if versionChanged {
+	if diff.VersionChanged {
 		versionInfo = fmt.Sprintf("version changed: %s -> %s, ", oldVersion, meta.WtVersion)
 	}

-	if len(diff) > 0 || versionChanged {
+	if diff.Any() || diff.VersionChanged || diff.LocationChanged {
 		log.WithContext(ctx).
-			Debugf("peer meta updated, %s%d field(s) changed: %s", versionInfo, len(diff), strings.Join(diff, ", "))
+			Debugf("peer meta updated, %s%s%d field(s) changed: %s", versionInfo, locationInfo, len(diff.Changed), strings.Join(diff.Changed, ", "))
 	}

-	return updated, versionChanged
+	return diff
+}
+
+// MetaDiff records which PeerSystemMeta fields differ between two metas. Each bool
+// maps to a single struct field, except Environment, which is split into Cloud and
+// Platform. Changed holds the human-readable `field: <old> -> <new>` entries so the
+// existing log line and isEqual can be derived from the same comparison.
+//
+// VersionChanged and LocationChanged sit outside the per-meta-field set:
+// VersionChanged tracks the client's WtVersion specifically (compared before
+// the UIVersion fixup, to signal client upgrades) and LocationChanged tracks the
+// peer's connection geo location, which lives on Peer rather than PeerSystemMeta.
+// Neither contributes an entry to Changed, so the field-coverage accounting stays
+// driven purely by the PeerSystemMeta comparison.
+type MetaDiff struct {
+	Hostname            bool
+	GoOS                bool
+	Kernel              bool
+	KernelVersion       bool
+	Core                bool
+	Platform            bool
+	OS                  bool
+	OSVersion           bool
+	WtVersion           bool
+	UIVersion           bool
+	SystemSerialNumber  bool
+	SystemProductName   bool
+	SystemManufacturer  bool
+	EnvironmentCloud    bool
+	EnvironmentPlatform bool
+	Flags               bool
+	Capabilities        bool
+	NetworkAddresses    bool
+	Files               bool
+
+	VersionChanged  bool
+	LocationChanged bool
+
+	Changed []string
+}
+
+// Any reports whether any PeerSystemMeta field changed.
+func (d MetaDiff) Any() bool {
+	return len(d.Changed) != 0
+}
+
+// Updated reports whether the peer needs to be persisted: any meta field changed
+// or the geo location changed. The version flag alone does not imply a write,
+// since a version change is also reflected in the WtVersion meta field.
+func (d MetaDiff) Updated() bool {
+	return d.Any() || d.LocationChanged
 }

-// metaDiff returns a human-readable list of the fields that differ between the
-// old and new meta, each formatted as `field: <old> -> <new>`. It is the single
-// source of truth for meta comparison: isEqual reports equality as an empty
-// diff, so the log line can never disagree with the change decision. Slices are
-// cloned before sorting, so callers' meta is not mutated.
 func metaDiff(oldMeta, newMeta PeerSystemMeta) []string {
-	var diff []string
+	return diffMeta(oldMeta, newMeta).Changed
+}
+
+// diffMeta compares two metas field by field, returning both a per-field flag set
+// (for callers that need to know exactly what changed, e.g. matching against
+// posture checks) and the human-readable Changed list. It is the single source of
+// truth for meta comparison: isEqual reports equality as an empty diff, so the log
+// line, the change decision, and the flags can never disagree.
+func diffMeta(oldMeta, newMeta PeerSystemMeta) MetaDiff {
+	var d MetaDiff
 	add := func(field string, oldVal, newVal any) {
-		diff = append(diff, fmt.Sprintf("%s: %v -> %v", field, oldVal, newVal))
+		d.Changed = append(d.Changed, fmt.Sprintf("%s: %v -> %v", field, oldVal, newVal))
 	}

 	if oldMeta.Hostname != newMeta.Hostname {
+		d.Hostname = true
 		add("hostname", oldMeta.Hostname, newMeta.Hostname)
 	}
 	if oldMeta.GoOS != newMeta.GoOS {
+		d.GoOS = true
 		add("goos", oldMeta.GoOS, newMeta.GoOS)
 	}
 	if oldMeta.Kernel != newMeta.Kernel {
+		d.Kernel = true
 		add("kernel", oldMeta.Kernel, newMeta.Kernel)
 	}
 	if oldMeta.KernelVersion != newMeta.KernelVersion {
+		d.KernelVersion = true
 		add("kernel_version", oldMeta.KernelVersion, newMeta.KernelVersion)
 	}
 	if oldMeta.Core != newMeta.Core {
+		d.Core = true
 		add("core", oldMeta.Core, newMeta.Core)
 	}
 	if oldMeta.Platform != newMeta.Platform {
+		d.Platform = true
 		add("platform", oldMeta.Platform, newMeta.Platform)
 	}
 	if oldMeta.OS != newMeta.OS {
+		d.OS = true
 		add("os", oldMeta.OS, newMeta.OS)
 	}
 	if oldMeta.OSVersion != newMeta.OSVersion {
+		d.OSVersion = true
 		add("os_version", oldMeta.OSVersion, newMeta.OSVersion)
 	}
 	if oldMeta.WtVersion != newMeta.WtVersion {
+		d.WtVersion = true
 		add("wt_version", oldMeta.WtVersion, newMeta.WtVersion)
 	}
 	if oldMeta.UIVersion != newMeta.UIVersion {
+		d.UIVersion = true
 		add("ui_version", oldMeta.UIVersion, newMeta.UIVersion)
 	}
 	if oldMeta.SystemSerialNumber != newMeta.SystemSerialNumber {
+		d.SystemSerialNumber = true
 		add("system_serial_number", oldMeta.SystemSerialNumber, newMeta.SystemSerialNumber)
 	}
 	if oldMeta.SystemProductName != newMeta.SystemProductName {
+		d.SystemProductName = true
 		add("system_product_name", oldMeta.SystemProductName, newMeta.SystemProductName)
 	}
 	if oldMeta.SystemManufacturer != newMeta.SystemManufacturer {
+		d.SystemManufacturer = true
 		add("system_manufacturer", oldMeta.SystemManufacturer, newMeta.SystemManufacturer)
 	}
 	if oldMeta.Environment.Cloud != newMeta.Environment.Cloud {
+		d.EnvironmentCloud = true
 		add("environment_cloud", oldMeta.Environment.Cloud, newMeta.Environment.Cloud)
 	}
 	if oldMeta.Environment.Platform != newMeta.Environment.Platform {
+		d.EnvironmentPlatform = true
 		add("environment_platform", oldMeta.Environment.Platform, newMeta.Environment.Platform)
 	}
 	if !oldMeta.Flags.isEqual(newMeta.Flags) {
+		d.Flags = true
 		add("flags", fmt.Sprintf("%+v", oldMeta.Flags), fmt.Sprintf("%+v", newMeta.Flags))
 	}
 	if !capabilitiesEqual(oldMeta.Capabilities, newMeta.Capabilities) {
+		d.Capabilities = true
 		add("capabilities", oldMeta.Capabilities, newMeta.Capabilities)
 	}

 	if !sameMultiset(oldMeta.NetworkAddresses, newMeta.NetworkAddresses) {
+		d.NetworkAddresses = true
 		add("network_addresses", fmt.Sprintf("%v", oldMeta.NetworkAddresses), fmt.Sprintf("%v", newMeta.NetworkAddresses))
 	}

 	if !sameMultiset(oldMeta.Files, newMeta.Files) {
+		d.Files = true
 		add("files", fmt.Sprintf("%v", oldMeta.Files), fmt.Sprintf("%v", newMeta.Files))
 	}

-	return diff
+	return d
 }

 // sameMultiset reports whether two slices contain the same elements with the
--- a/management/server/posture/checks.go
+++ b/management/server/posture/checks.go
@@ -7,6 +7,7 @@ import (
 	"regexp"

 	"github.com/hashicorp/go-version"
+
 	nbpeer "github.com/netbirdio/netbird/management/server/peer"
 	"github.com/netbirdio/netbird/shared/management/http/api"
 	"github.com/netbirdio/netbird/shared/management/status"
@@ -51,6 +52,34 @@ type Checks struct {
 	Checks ChecksDefinition `gorm:"serializer:json"`
 }

+// AffectsPosture reports whether the peer metadata changes described by diff can
+// alter the outcome of any of the given posture checks. It maps each check kind to
+// the metadata fields it inspects, so an unrelated change (e.g. a hostname update)
+// does not force a posture re-evaluation.
+func AffectsPosture(diff *nbpeer.MetaDiff, checks []*Checks) bool {
+	if diff == nil {
+		return false
+	}
+	for _, c := range checks {
+		if c.Checks.ProcessCheck != nil && diff.Files {
+			return true
+		}
+		if c.Checks.OSVersionCheck != nil && (diff.OSVersion || diff.OS || diff.KernelVersion) {
+			return true
+		}
+		if c.Checks.NBVersionCheck != nil && diff.WtVersion {
+			return true
+		}
+		if c.Checks.GeoLocationCheck != nil && diff.LocationChanged {
+			return true
+		}
+		if c.Checks.PeerNetworkRangeCheck != nil && diff.NetworkAddresses {
+			return true
+		}
+	}
+	return false
+}
+
 // ChecksDefinition contains definition of actual check
 type ChecksDefinition struct {
 	NBVersionCheck        *NBVersionCheck        `json:",omitempty"`
--- a/management/server/store/sql_store.go
+++ b/management/server/store/sql_store.go
@@ -581,28 +581,6 @@ func (s *SqlStore) MarkPeerDisconnectedIfSameSession(ctx context.Context, accoun
 	return result.RowsAffected > 0, nil
 }

-func (s *SqlStore) SavePeerLocation(ctx context.Context, accountID string, peerWithLocation *nbpeer.Peer) error {
-	// To maintain data integrity, we create a copy of the peer's location to prevent unintended updates to other fields.
-	var peerCopy nbpeer.Peer
-	// Since the location field has been migrated to JSON serialization,
-	// updating the struct ensures the correct data format is inserted into the database.
-	peerCopy.Location = peerWithLocation.Location
-
-	result := s.db.Model(&nbpeer.Peer{}).
-		Where(accountAndIDQueryCondition, accountID, peerWithLocation.ID).
-		Updates(peerCopy)
-
-	if result.Error != nil {
-		return status.Errorf(status.Internal, "failed to save peer locations to store: %v", result.Error)
-	}
-
-	if result.RowsAffected == 0 {
-		return status.Errorf(status.NotFound, peerNotFoundFMT, peerWithLocation.ID)
-	}
-
-	return nil
-}
-
 // ApproveAccountPeers marks all peers that currently require approval in the given account as approved.
 func (s *SqlStore) ApproveAccountPeers(ctx context.Context, accountID string) (int, error) {
 	result := s.db.Model(&nbpeer.Peer{}).
--- a/management/server/store/sql_store_test.go
+++ b/management/server/store/sql_store_test.go
@@ -618,56 +618,6 @@ func TestSqlStore_SavePeerStatus(t *testing.T) {
 	assert.WithinDurationf(t, newStatus.LastSeen, actual.LastSeen.UTC(), time.Millisecond, "LastSeen should be equal")
 }

-func TestSqlStore_SavePeerLocation(t *testing.T) {
-	store, cleanUp, err := NewTestStoreFromSQL(context.Background(), "../testdata/store.sql", t.TempDir())
-	t.Cleanup(cleanUp)
-	assert.NoError(t, err)
-
-	account, err := store.GetAccount(context.Background(), "bf1c8084-ba50-4ce7-9439-34653001fc3b")
-	require.NoError(t, err)
-
-	peer := &nbpeer.Peer{
-		AccountID: account.Id,
-		ID:        "testpeer",
-		Location: nbpeer.Location{
-			ConnectionIP: net.ParseIP("0.0.0.0"),
-			CountryCode:  "YY",
-			CityName:     "City",
-			GeoNameID:    1,
-		},
-		CreatedAt: time.Now().UTC(),
-		Meta:      nbpeer.PeerSystemMeta{},
-	}
-	// error is expected as peer is not in store yet
-	err = store.SavePeerLocation(context.Background(), account.Id, peer)
-	assert.Error(t, err)
-
-	account.Peers[peer.ID] = peer
-	err = store.SaveAccount(context.Background(), account)
-	require.NoError(t, err)
-
-	peer.Location.ConnectionIP = net.ParseIP("35.1.1.1")
-	peer.Location.CountryCode = "DE"
-	peer.Location.CityName = "Berlin"
-	peer.Location.GeoNameID = 2950159
-
-	err = store.SavePeerLocation(context.Background(), account.Id, account.Peers[peer.ID])
-	assert.NoError(t, err)
-
-	account, err = store.GetAccount(context.Background(), account.Id)
-	require.NoError(t, err)
-
-	actual := account.Peers[peer.ID].Location
-	assert.Equal(t, peer.Location, actual)
-
-	peer.ID = "non-existing-peer"
-	err = store.SavePeerLocation(context.Background(), account.Id, peer)
-	assert.Error(t, err)
-	parsedErr, ok := status.FromError(err)
-	require.True(t, ok)
-	require.Equal(t, status.NotFound, parsedErr.Type(), "should return not found error")
-}
-
 func Test_TestGetAccountByPrivateDomain(t *testing.T) {
 	if runtime.GOOS == "windows" {
 		t.Skip("The SQLite store is not properly supported by Windows yet")
--- a/management/server/store/store.go
+++ b/management/server/store/store.go
@@ -185,7 +185,6 @@ type Store interface {
 	// recorded by the database. Returns true when the update happened,
 	// false when a newer session has taken over.
 	MarkPeerDisconnectedIfSameSession(ctx context.Context, accountID, peerID string, sessionStartedAt int64) (bool, error)
-	SavePeerLocation(ctx context.Context, accountID string, peer *nbpeer.Peer) error
 	ApproveAccountPeers(ctx context.Context, accountID string) (int, error)
 	DeletePeer(ctx context.Context, accountID string, peerID string) error

--- a/management/server/store/store_mock.go
+++ b/management/server/store/store_mock.go
@@ -2968,20 +2968,6 @@ func (mr *MockStoreMockRecorder) SavePeer(ctx, accountID, peer interface{}) *gom
 	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SavePeer", reflect.TypeOf((*MockStore)(nil).SavePeer), ctx, accountID, peer)
 }

-// SavePeerLocation mocks base method.
-func (m *MockStore) SavePeerLocation(ctx context.Context, accountID string, peer *peer.Peer) error {
-	m.ctrl.T.Helper()
-	ret := m.ctrl.Call(m, "SavePeerLocation", ctx, accountID, peer)
-	ret0, _ := ret[0].(error)
-	return ret0
-}
-
-// SavePeerLocation indicates an expected call of SavePeerLocation.
-func (mr *MockStoreMockRecorder) SavePeerLocation(ctx, accountID, peer interface{}) *gomock.Call {
-	mr.mock.ctrl.T.Helper()
-	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SavePeerLocation", reflect.TypeOf((*MockStore)(nil).SavePeerLocation), ctx, accountID, peer)
-}
-
 // SavePeerStatus mocks base method.
 func (m *MockStore) SavePeerStatus(ctx context.Context, accountID, peerID string, status peer.PeerStatus) error {
 	m.ctrl.T.Helper()
--- a/management/server/types/peer.go
+++ b/management/server/types/peer.go
@@ -12,6 +12,9 @@ type PeerSync struct {
 	WireGuardPubKey string
 	// Meta is the system information passed by peer, must be always present
 	Meta nbpeer.PeerSystemMeta
+	// RealIP is the peer's connection IP, used to refresh its geo location.
+	// May be nil when the request has no associated connection IP.
+	RealIP net.IP
 	// UpdateAccountPeers indicate updating account peers,
 	// which occurs when the peer's metadata is updated
 	UpdateAccountPeers bool
--- a/shared/relay/client/client.go
+++ b/shared/relay/client/client.go
@@ -196,6 +196,11 @@ type Client struct {
 // Transport returns the negotiated relay transport of the current connection,
 // or an empty string when not connected.
 func (c *Client) Transport() string {
+	c.log.Warnf("start to call Transport")
+	defer func() {
+		c.log.Warnf("end of call Transport")
+	}()
+	c.log.Warnf("query Transport:")
 	c.mu.Lock()
 	defer c.mu.Unlock()
 	return c.transport
@@ -244,6 +249,7 @@ func NewClientWithServerIP(serverURL string, serverIP netip.Addr, authTokenStore
 // Connect establishes a connection to the relay server. It blocks until the connection is established or an error occurs.
 func (c *Client) Connect(ctx context.Context) error {
 	c.log.Infof("connecting to relay server")
+	defer c.log.Warnf("end of call Connect to relay server")
 	c.readLoopMutex.Lock()
 	defer c.readLoopMutex.Unlock()

@@ -286,6 +292,8 @@ func (c *Client) Connect(ctx context.Context) error {
 // todo: what should happen if call with the same peerID with multiple times?
 func (c *Client) OpenConn(ctx context.Context, dstPeerID string) (net.Conn, error) {
 	peerID := messages.HashID(dstPeerID)
+	c.log.Infof("open conn to peer: %v", dstPeerID)
+	defer c.log.Warnf("end of OpenConn to peer: %v", dstPeerID)

 	c.mu.Lock()
 	if !c.serviceIsRunning {
@@ -354,6 +362,9 @@ func (c *Client) ServerInstanceURL() (string, error) {
 // extracted from the underlying socket's RemoteAddr. Zero value if not
 // connected or if the address is not an IP literal.
 func (c *Client) ConnectedIP() netip.Addr {
+	c.log.Infof("run ConnectedIP")
+	defer c.log.Warnf("end of Connected IP")
+
 	c.mu.Lock()
 	conn := c.relayConn
 	c.mu.Unlock()
@@ -376,6 +387,8 @@ func (c *Client) SetOnDisconnectListener(fn func(string)) {

 // HasConns returns true if there are connections.
 func (c *Client) HasConns() bool {
+	c.log.Infof("run HasConns")
+	defer c.log.Warnf("end of HasConns")
 	c.mu.Lock()
 	defer c.mu.Unlock()
 	return len(c.conns) > 0
--- a/shared/relay/client/client_serverip_test.go
+++ b/shared/relay/client/client_serverip_test.go
@@ -8,6 +8,7 @@ import (
 	"testing"
 	"time"

+	log "github.com/sirupsen/logrus"
 	"go.opentelemetry.io/otel"

 	"github.com/netbirdio/netbird/client/iface"
@@ -252,7 +253,7 @@ func TestClient_ConnectedIPParsesRemoteAddr(t *testing.T) {
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			c := &Client{relayConn: stubConn{remote: staticAddr{s: tt.s}}}
+			c := &Client{log: log.WithField("relay", tt.name), relayConn: stubConn{remote: staticAddr{s: tt.s}}}
 			got := c.ConnectedIP()
 			var gotStr string
 			if got.IsValid() {
--- a/shared/signal/client/grpc.go
+++ b/shared/signal/client/grpc.go
@@ -78,6 +78,23 @@ type GrpcClient struct {
 	// transport-alive but no longer delivering messages. It is the source of
 	// truth IsHealthy reads, and is cleared once any frame is received again.
 	receiveStalled atomic.Bool
+	// receiveHandoffBlocked is set while the receive loop is parked handing a
+	// message to a busy decryption worker. The loop stops calling Recv (and
+	// markReceived) in that window, so the stream looks silent though it is
+	// healthy. The watchdog reads this to avoid misreading self-inflicted
+	// receive backpressure as a dead stream: reconnecting cannot help, since the
+	// new stream feeds the same worker, and only triggers a reconnect storm.
+	receiveHandoffBlocked atomic.Bool
+	// lastDecrypt holds the Unix-nano timestamp of the last message the decryption
+	// worker pulled off its queue. Diagnostic only: it lets a stall log show
+	// whether the worker was draining (busy) or idle when the stream went silent.
+	lastDecrypt atomic.Int64
+	// handoffWaitTotal, handoffWaitMax (nanos) and handoffWaitCount accumulate the
+	// time the receive loop spent blocked handing messages to the worker. This is
+	// time not spent reading the stream, so it quantifies receive backpressure.
+	handoffWaitTotal atomic.Int64
+	handoffWaitMax   atomic.Int64
+	handoffWaitCount atomic.Int64
 }

 // NewClient creates a new Signal client
@@ -353,6 +370,8 @@ func (c *GrpcClient) SendToStream(msg *proto.EncryptedMessage) error {

 // decryptMessage decrypts the body of the msg using Wireguard private key and Remote peer's public key
 func (c *GrpcClient) decryptMessage(msg *proto.EncryptedMessage) (*proto.Message, error) {
+	c.lastDecrypt.Store(time.Now().UnixNano())
+
 	remoteKey, err := wgtypes.ParseKey(msg.GetKey())
 	if err != nil {
 		return nil, err
@@ -439,6 +458,22 @@ func (c *GrpcClient) idleSinceReceive() time.Duration {
 	return time.Since(time.Unix(0, c.lastReceived.Load()))
 }

+// idleSinceDecrypt returns how long since the worker last pulled a message.
+// Diagnostic only: distinguishes a busy/wedged worker from an idle one.
+func (c *GrpcClient) idleSinceDecrypt() time.Duration {
+	return time.Since(time.Unix(0, c.lastDecrypt.Load()))
+}
+
+// receiveAlive reports whether the receive stream shows liveness: it delivered a
+// frame within the inactivity threshold, or the receive loop is currently parked
+// handing a message to a busy decryption worker. In the latter case the loop has
+// stopped calling Recv, so the stream looks silent while being healthy, and
+// reconnecting would not help, so the watchdog must treat it as alive.
+func (c *GrpcClient) receiveAlive() bool {
+	return c.idleSinceReceive() < receiveInactivityThreshold ||
+		c.receiveHandoffBlocked.Load()
+}
+
 // watchReceiveStream guards against a receive stream that is transport-alive but
 // no longer delivering messages. While the stream is idle past
 // receiveInactivityThreshold it sends a self-addressed probe that the Signal
@@ -450,18 +485,55 @@ func (c *GrpcClient) watchReceiveStream(ctx context.Context, cancelStream contex
 	defer ticker.Stop()

 	var probeSentAt time.Time
+	var holdLogged bool
+	var statTicks int
+	var lastStatTotal int64
 	for {
 		select {
 		case <-ctx.Done():
 			return
 		case <-ticker.C:
-			if c.idleSinceReceive() < receiveInactivityThreshold {
+			// Periodic backpressure summary so time lost to the worker handoff is
+			// visible even when no stall fires. Emitted ~once a minute and only
+			// when the wait grew, to stay quiet on a healthy stream.
+			if statTicks++; statTicks >= int(time.Minute/receiveWatchdogInterval) {
+				statTicks = 0
+				if total, max, count := c.handoffWaitStats(); int64(total) > lastStatTotal {
+					log.Infof("signal receive backpressure: handoffWaitTotal=%s (+%s last min) handoffWaitMax=%s handoffMsgs=%d",
+						total.Round(time.Second), (total - time.Duration(lastStatTotal)).Round(time.Millisecond),
+						max.Round(time.Millisecond), count)
+					lastStatTotal = int64(total)
+				}
+			}
+
+			if c.receiveAlive() {
+				// Attribute the case that matters in the field: silent past the
+				// threshold but held because the receive loop is parked on the
+				// worker handoff (backpressure), not a dead stream. Log once per
+				// hold episode so a persistent worker stall is visible at info.
+				if c.idleSinceReceive() >= receiveInactivityThreshold && c.receiveHandoffBlocked.Load() {
+					if !holdLogged {
+						total, max, count := c.handoffWaitStats()
+						log.Infof("signal receive idle %s, loop blocked on worker handoff (idleDecrypt=%s queueDepth=%d connState=%s handoffWaitTotal=%s handoffWaitMax=%s handoffMsgs=%d); holding stream",
+							c.idleSinceReceive().Round(time.Second), c.idleSinceDecrypt().Round(time.Second),
+							c.decryptionWorker.QueueLen(), c.signalConn.GetState(),
+							total.Round(time.Second), max.Round(time.Millisecond), count)
+						holdLogged = true
+					}
+				} else {
+					holdLogged = false
+				}
 				probeSentAt = time.Time{}
 				continue
 			}
+			holdLogged = false

 			if !probeSentAt.IsZero() && time.Since(probeSentAt) >= receiveProbeTimeout {
-				log.Warnf("signal receive stream stalled: no messages for %s and probe did not return, reconnecting", c.idleSinceReceive().Round(time.Second))
+				total, max, count := c.handoffWaitStats()
+				log.Warnf("signal receive stream stalled, reconnecting: idleRecv=%s idleDecrypt=%s handoffBlocked=%v queueDepth=%d connState=%s handoffWaitTotal=%s handoffWaitMax=%s handoffMsgs=%d probe did not return",
+					c.idleSinceReceive().Round(time.Second), c.idleSinceDecrypt().Round(time.Second),
+					c.receiveHandoffBlocked.Load(), c.decryptionWorker.QueueLen(), c.signalConn.GetState(),
+					total.Round(time.Second), max.Round(time.Millisecond), count)
 				c.receiveStalled.Store(true)
 				c.notifyDisconnected(errReceiveStreamStalled)
 				cancelStream()
@@ -517,12 +589,37 @@ func (c *GrpcClient) receive(stream proto.SignalExchange_ConnectStreamClient) er
 			continue
 		}

+		// The handoff blocks while the worker is busy, which parks this loop and
+		// stops Recv. Flag it so the watchdog does not read the resulting silence
+		// as a dead stream, and account the wait as receive backpressure.
+		handoffStart := time.Now()
+		c.receiveHandoffBlocked.Store(true)
 		if err := c.decryptionWorker.AddMsg(c.ctx, msg); err != nil {
 			log.Errorf("failed to add message to decryption worker: %v", err)
 		}
+		c.receiveHandoffBlocked.Store(false)
+		c.recordHandoffWait(time.Since(handoffStart))
 	}
 }

+// recordHandoffWait accumulates the time the receive loop was blocked handing a
+// message to the worker.
+func (c *GrpcClient) recordHandoffWait(d time.Duration) {
+	c.handoffWaitTotal.Add(int64(d))
+	c.handoffWaitCount.Add(1)
+	for {
+		cur := c.handoffWaitMax.Load()
+		if int64(d) <= cur || c.handoffWaitMax.CompareAndSwap(cur, int64(d)) {
+			break
+		}
+	}
+}
+
+// handoffWaitStats returns cumulative receive-loop handoff backpressure.
+func (c *GrpcClient) handoffWaitStats() (total, max time.Duration, count int64) {
+	return time.Duration(c.handoffWaitTotal.Load()), time.Duration(c.handoffWaitMax.Load()), c.handoffWaitCount.Load()
+}
+
 func (c *GrpcClient) startEncryptionWorker(handler func(msg *proto.Message) error) {
 	if c.decryptionWorker != nil {
 		return
--- a/shared/signal/client/watchdog_test.go
+++ b/shared/signal/client/watchdog_test.go
@@ -82,3 +82,27 @@ func TestReceiveProbeRoundTrips(t *testing.T) {
 		t.Fatal("self-addressed heartbeat did not round-trip back through the signal server")
 	}
 }
+
+// TestReceiveAliveTreatsHandoffBlockAsLiveness reproduces the false positive
+// where a busy decryption worker parks the receive loop on the worker handoff,
+// so Recv (and markReceived) stops firing even though the stream is healthy.
+// With the receive stream silent past the inactivity threshold but the loop
+// blocked on handoff, the watchdog must consider the stream alive rather than
+// tear it down (reconnecting feeds the same worker and would not help).
+func TestReceiveAliveTreatsHandoffBlockAsLiveness(t *testing.T) {
+	c := &GrpcClient{}
+
+	// Receive stream silent and the loop not blocked on handoff: genuinely stalled.
+	c.lastReceived.Store(time.Now().Add(-2 * receiveInactivityThreshold).UnixNano())
+	require.False(t, c.receiveAlive(), "silent stream with the receive loop idle must be treated as stalled")
+
+	// Receive stream silent but the loop is parked handing a message to a busy
+	// worker: self-inflicted backpressure, not a dead stream, must not tear down.
+	c.receiveHandoffBlocked.Store(true)
+	require.True(t, c.receiveAlive(), "a receive loop blocked on worker handoff must keep the stream alive")
+
+	// Handoff drained, loop back to reading, a frame just arrived: alive via the receive path.
+	c.receiveHandoffBlocked.Store(false)
+	c.markReceived()
+	require.True(t, c.receiveAlive(), "a freshly received frame must keep the stream alive")
+}
--- a/shared/signal/client/worker.go
+++ b/shared/signal/client/worker.go
@@ -32,6 +32,13 @@ func (w *Worker) AddMsg(ctx context.Context, msg *proto.EncryptedMessage) error
 	return nil
 }

+// QueueLen returns the number of messages buffered for decryption. Diagnostic
+// only: a non-empty queue while the receive stream is silent indicates the
+// receive loop is parked on the handoff rather than the stream being dead.
+func (w *Worker) QueueLen() int {
+	return len(w.encryptedMsgPool)
+}
+
 func (w *Worker) Work(ctx context.Context) {
 	for {
 		select {
Author	SHA1	Message	Date
Zoltán Papp	566d21c2c3	Set logger in ConnectedIP test to avoid nil deref	2026-06-25 10:15:07 +02:00
Zoltán Papp	858e2d1c34	Add logs	2026-06-24 21:46:14 +02:00
Zoltán Papp	35ed69bfe7	Revert "[client] Drop signaling-side ICE candidate filter, drop overlay STUN at mux read-side instead (#6142 )" This reverts commit `b57f714350`. Restores the signaling-side ICE candidate filter and the engine/worker_ice STUN gating that #6142 removed, re-adding the mux read-side changes in udpmux/universal.go to their pre-#6142 form.	2026-06-24 21:29:05 +02:00
mlsmaycon	8446713d28	temp timing logs	2026-06-24 15:35:58 +02:00
Viktor Liu	12f2e69af2	Log signal stall, ICE pair selection, restart cadence, sync content, and receive backpressure to attribute the regression	2026-06-24 15:04:22 +02:00
Viktor Liu	4cb2c62f2a	Keep signal stream alive while receive loop is blocked on worker handoff	2026-06-24 12:44:04 +02:00
Viktor Liu	17b2044596	[client] Skip re-resolving cached management cache domains (#6518 )	2026-06-23 17:55:57 +02:00
Bethuel Mmbaga	07101c59ac	[management] Reschedule inactivity expiration when a peer disconnects (#6523 )	2026-06-23 17:44:32 +03:00
Riccardo Manfrin	51b6f6291b	Fixup debug config (#6514 )	2026-06-22 22:01:49 +02:00
Pascal Fischer	2ebf26006a	[management] empty file check in nmap on other posturechecks (#6511 )	2026-06-22 19:54:38 +02:00
Pascal Fischer	211a26019a	[management] validate meta change against posture checks (#6510 )	2026-06-22 19:42:04 +02:00