[client] Wait for signal receive watchdog to stop before reconnect

The per-stream watchReceiveStream goroutine was started fire-and-forget and never joined. On reconnect, a lingering watchdog could still flip shared client state (receiveStalled, the disconnect notifier) on the freshly established stream, since cancelStream only cancels its own stream context. Track the watchdog with a WaitGroup and wait for it to exit (after cancelling its stream) before the operation returns, so each reconnect starts with no stale watchdog.
Keep signal stream alive while receive loop is blocked on worker handoff
2026-07-02 20:59:56 +00:00 · 2026-06-28 16:03:01 +02:00 · 2026-06-24 12:44:04 +02:00 · 2026-06-23 17:55:57 +02:00 · 2026-06-23 17:44:32 +03:00 · 2026-06-22 22:01:49 +02:00
8 changed files with 470 additions and 13 deletions
--- a/client/cmd/debug.go
+++ b/client/cmd/debug.go
@@ -130,7 +130,7 @@ func debugConfigDump(cmd *cobra.Command, _ []string) error {

 	client := proto.NewDaemonServiceClient(conn)
 	resp, err := client.GetConfig(cmd.Context(), &proto.GetConfigRequest{
-		ProfileName: activeProf.Name,
+		ProfileName: string(activeProf.ID),
 		Username:    currUser.Username,
 	})
 	if err != nil {
--- a/client/internal/dns/mgmt/mgmt.go
+++ b/client/internal/dns/mgmt/mgmt.go
@@ -51,13 +51,20 @@ type cachedRecord struct {
 }

 // Resolver caches critical NetBird infrastructure domains.
-// records, refreshing, mgmtDomain and serverDomains are all guarded by mutex.
+// records, refreshing, failedResolves, mgmtDomain and serverDomains are all
+// guarded by mutex.
 type Resolver struct {
 	records       map[dns.Question]*cachedRecord
 	mgmtDomain    *domain.Domain
 	serverDomains *dnsconfig.ServerDomains
 	mutex         sync.RWMutex

+	// failedResolves records the last failed initial resolve per domain so a
+	// domain that never resolves isn't retried on every server-domains update
+	// until refreshBackoff elapses. Entries are cleared on success and pruned
+	// to the current server-domains set.
+	failedResolves map[domain.Domain]time.Time
+
 	chain            ChainResolver
 	chainMaxPriority int
 	refreshGroup     singleflight.Group
@@ -76,9 +83,10 @@ type Resolver struct {
 // NewResolver creates a new management domains cache resolver.
 func NewResolver() *Resolver {
 	return &Resolver{
-		records:    make(map[dns.Question]*cachedRecord),
-		refreshing: make(map[dns.Question]*atomic.Bool),
-		cacheTTL:   resolveCacheTTL(),
+		records:        make(map[dns.Question]*cachedRecord),
+		refreshing:     make(map[dns.Question]*atomic.Bool),
+		failedResolves: make(map[domain.Domain]time.Time),
+		cacheTTL:       resolveCacheTTL(),
 	}
 }

@@ -173,7 +181,9 @@ func (m *Resolver) continueToNext(w dns.ResponseWriter, r *dns.Msg) {

 // AddDomain resolves a domain and stores its A/AAAA records in the cache.
 // A family that resolves NODATA (nil err, zero records) evicts any stale
-// entry for that qtype.
+// entry for that qtype. When one family hard-errors while the other succeeds,
+// the resolved family is still cached but AddDomain returns an error so the
+// caller retries the incomplete resolve rather than treating it as complete.
 func (m *Resolver) AddDomain(ctx context.Context, d domain.Domain) error {
 	dnsName := strings.ToLower(dns.Fqdn(d.PunycodeString()))

@@ -203,6 +213,10 @@ func (m *Resolver) AddDomain(ctx context.Context, d domain.Domain) error {
 	log.Debugf("added/updated domain=%s with %d A records and %d AAAA records",
 		d.SafeString(), len(aRecords), len(aaaaRecords))

+	if errA != nil || errAAAA != nil {
+		return fmt.Errorf("resolve %s: incomplete, a family failed: %w", d.SafeString(), errors.Join(errA, errAAAA))
+	}
+
 	return nil
 }

@@ -462,6 +476,7 @@ func (m *Resolver) RemoveDomain(d domain.Domain) error {
 	delete(m.records, qAAAA)
 	delete(m.refreshing, qA)
 	delete(m.refreshing, qAAAA)
+	delete(m.failedResolves, d)

 	log.Debugf("removed domain=%s from cache", d.SafeString())
 	return nil
@@ -505,6 +520,7 @@ func (m *Resolver) UpdateFromServerDomains(ctx context.Context, serverDomains dn
 		allDomains := m.extractDomainsFromServerDomains(updatedServerDomains)
 		currentDomains := m.GetCachedDomains()
 		removedDomains = m.removeStaleDomains(currentDomains, allDomains)
+		m.pruneFailedResolves(allDomains)
 	}

 	m.addNewDomains(ctx, newDomains)
@@ -577,13 +593,85 @@ func (m *Resolver) isManagementDomain(domain domain.Domain) bool {
 	return m.mgmtDomain != nil && domain == *m.mgmtDomain
 }

-// addNewDomains resolves and caches all domains from the update
+// addNewDomains resolves and caches domains that are not yet in the cache,
+// running the lookups concurrently. Domains already cached are skipped and left
+// to the stale-while-revalidate refresh path, so a sync never re-resolves them
+// synchronously: once NetBird owns the OS resolver the resolve runs through the
+// handler chain and would otherwise dial the managed upstreams under the engine
+// sync lock on every update.
 func (m *Resolver) addNewDomains(ctx context.Context, newDomains domain.List) {
+	var wg sync.WaitGroup
+	seen := make(map[domain.Domain]struct{}, len(newDomains))
 	for _, newDomain := range newDomains {
-		if err := m.AddDomain(ctx, newDomain); err != nil {
-			log.Warnf("failed to add/update domain=%s: %v", newDomain.SafeString(), err)
-		} else {
-			log.Debugf("added/updated management cache domain=%s", newDomain.SafeString())
+		if _, dup := seen[newDomain]; dup {
+			continue
+		}
+		seen[newDomain] = struct{}{}
+
+		if !m.needsResolve(newDomain) {
+			continue
+		}
+
+		wg.Add(1)
+		go func(d domain.Domain) {
+			defer wg.Done()
+			if err := m.AddDomain(ctx, d); err != nil {
+				m.markResolveFailed(d)
+				log.Warnf("failed to add/update domain=%s: %v", d.SafeString(), err)
+				return
+			}
+			m.clearResolveFailed(d)
+			log.Debugf("added/updated management cache domain=%s", d.SafeString())
+		}(newDomain)
+	}
+	wg.Wait()
+}
+
+// needsResolve reports whether d should be resolved now. A recent failed or
+// incomplete resolve gates retries on the backoff even when one family is
+// already cached, so a transiently-failed family is retried instead of being
+// treated as fully resolved. Otherwise a domain with any cached record is left
+// to the stale-while-revalidate refresh path.
+func (m *Resolver) needsResolve(d domain.Domain) bool {
+	dnsName := strings.ToLower(dns.Fqdn(d.PunycodeString()))
+
+	m.mutex.RLock()
+	defer m.mutex.RUnlock()
+
+	if failedAt, ok := m.failedResolves[d]; ok {
+		return time.Since(failedAt) >= refreshBackoff
+	}
+
+	for _, qtype := range []uint16{dns.TypeA, dns.TypeAAAA} {
+		q := dns.Question{Name: dnsName, Qtype: qtype, Qclass: dns.ClassINET}
+		if _, ok := m.records[q]; ok {
+			return false
+		}
+	}
+	return true
+}
+
+func (m *Resolver) markResolveFailed(d domain.Domain) {
+	m.mutex.Lock()
+	m.failedResolves[d] = time.Now()
+	m.mutex.Unlock()
+}
+
+func (m *Resolver) clearResolveFailed(d domain.Domain) {
+	m.mutex.Lock()
+	delete(m.failedResolves, d)
+	m.mutex.Unlock()
+}
+
+// pruneFailedResolves drops failure markers for domains no longer present in
+// the server-domains set, keeping the map bounded to the current set (a
+// failed-only domain has no cached record, so RemoveDomain never sees it).
+func (m *Resolver) pruneFailedResolves(domains domain.List) {
+	m.mutex.Lock()
+	defer m.mutex.Unlock()
+	for d := range m.failedResolves {
+		if !slices.Contains(domains, d) {
+			delete(m.failedResolves, d)
 		}
 	}
 }
--- a/client/internal/dns/mgmt/mgmt_refresh_test.go
+++ b/client/internal/dns/mgmt/mgmt_refresh_test.go
@@ -21,6 +21,7 @@ type fakeChain struct {
 	mu       sync.Mutex
 	calls    map[string]int
 	answers  map[string][]dns.RR
+	qErr     map[string]error
 	err      error
 	hasRoot  bool
 	onLookup func()
@@ -30,6 +31,7 @@ func newFakeChain() *fakeChain {
 	return &fakeChain{
 		calls:   map[string]int{},
 		answers: map[string][]dns.RR{},
+		qErr:    map[string]error{},
 		hasRoot: true,
 	}
 }
@@ -47,6 +49,9 @@ func (f *fakeChain) ResolveInternal(ctx context.Context, msg *dns.Msg, maxPriori
 	f.calls[key]++
 	answers := f.answers[key]
 	err := f.err
+	if err == nil {
+		err = f.qErr[key]
+	}
 	onLookup := f.onLookup
 	f.mu.Unlock()

@@ -75,6 +80,12 @@ func (f *fakeChain) setAnswer(name string, qtype uint16, ip string) {
 	}
 }

+func (f *fakeChain) setErr(name string, qtype uint16, err error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.qErr[name+"|"+dns.TypeToString[qtype]] = err
+}
+
 func (f *fakeChain) callCount(name string, qtype uint16) int {
 	f.mu.Lock()
 	defer f.mu.Unlock()
--- a/client/internal/dns/mgmt/mgmt_resolve_test.go
+++ b/client/internal/dns/mgmt/mgmt_resolve_test.go
@@ -0,0 +1,183 @@
+package mgmt
+
+import (
+	"context"
+	"errors"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/miekg/dns"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	dnsconfig "github.com/netbirdio/netbird/client/internal/dns/config"
+	"github.com/netbirdio/netbird/shared/management/domain"
+)
+
+// A domain already in the cache must not be re-resolved on a subsequent server
+// domains update; it is left to the stale-while-revalidate refresh path.
+func TestResolver_UpdateFromServerDomains_SkipsCached(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("signal.example.com.", dns.TypeA, "10.0.0.2")
+	r.SetChainResolver(chain, 50)
+
+	sd := dnsconfig.ServerDomains{Signal: domain.Domain("signal.example.com")}
+
+	_, err := r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	require.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"first update must resolve the domain")
+
+	_, err = r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	assert.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"cached domain must not be re-resolved on a subsequent update")
+}
+
+// New domains in a single update must resolve concurrently rather than serially.
+func TestResolver_AddNewDomains_ResolvesConcurrently(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+
+	var inflight, maxInflight atomic.Int32
+	chain.onLookup = func() {
+		n := inflight.Add(1)
+		for {
+			old := maxInflight.Load()
+			if n <= old || maxInflight.CompareAndSwap(old, n) {
+				break
+			}
+		}
+		time.Sleep(50 * time.Millisecond)
+		inflight.Add(-1)
+	}
+
+	relays := []domain.Domain{"a.example.com", "b.example.com", "c.example.com", "d.example.com"}
+	for _, d := range relays {
+		chain.setAnswer(dns.Fqdn(string(d)), dns.TypeA, "10.0.0.2")
+	}
+	r.SetChainResolver(chain, 50)
+
+	start := time.Now()
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: relays})
+	require.NoError(t, err)
+	elapsed := time.Since(start)
+
+	assert.GreaterOrEqual(t, int(maxInflight.Load()), 2, "domains must resolve concurrently")
+	// Serial resolution of 4 domains would take at least 4*50ms; concurrent is far less.
+	assert.Less(t, elapsed, 300*time.Millisecond, "resolution should not be serial")
+}
+
+// A domain that fails to resolve must not be retried on every update; the
+// failure backoff suppresses re-resolution until it expires.
+func TestResolver_UpdateFromServerDomains_BacksOffFailures(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.err = errors.New("resolve boom")
+	r.SetChainResolver(chain, 50)
+
+	sd := dnsconfig.ServerDomains{Signal: domain.Domain("signal.example.com")}
+
+	_, err := r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	require.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"first update must attempt the resolve")
+
+	_, err = r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	assert.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"failed resolve must back off and not retry on the next update")
+}
+
+// A domain listed under more than one server-domain type (e.g. STUN and TURN on
+// the same host) must be resolved once per update, not once per occurrence.
+func TestResolver_AddNewDomains_DedupesDuplicateDomains(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("dup.example.com.", dns.TypeA, "10.0.0.9")
+	r.SetChainResolver(chain, 50)
+
+	sd := dnsconfig.ServerDomains{
+		Stuns: []domain.Domain{"dup.example.com"},
+		Turns: []domain.Domain{"dup.example.com"},
+	}
+
+	_, err := r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	assert.Equal(t, 1, chain.callCount("dup.example.com.", dns.TypeA),
+		"a domain appearing under multiple server-domain types must resolve once")
+}
+
+// A failure marker must be dropped once its domain leaves the server-domains set
+// so the map stays bounded to the current set.
+func TestResolver_UpdateFromServerDomains_PrunesFailedResolves(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.err = errors.New("resolve boom")
+	r.SetChainResolver(chain, 50)
+
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Signal: domain.Domain("gone.example.com")})
+	require.NoError(t, err)
+	r.mutex.RLock()
+	_, marked := r.failedResolves[domain.Domain("gone.example.com")]
+	r.mutex.RUnlock()
+	require.True(t, marked, "failed resolve must be recorded")
+
+	_, err = r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Signal: domain.Domain("other.example.com")})
+	require.NoError(t, err)
+	r.mutex.RLock()
+	_, stillMarked := r.failedResolves[domain.Domain("gone.example.com")]
+	r.mutex.RUnlock()
+	assert.False(t, stillMarked, "failure marker for a domain no longer in the set must be pruned")
+}
+
+// When one family hard-errors while the other resolves, the domain is cached
+// for the working family but recorded as incomplete so the failed family is
+// retried under backoff instead of being treated as fully resolved forever.
+func TestResolver_AddNewDomains_RetriesPartialFamilyFailure(t *testing.T) {
+	d := domain.Domain("relay.example.com")
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("relay.example.com.", dns.TypeA, "10.0.0.2")
+	chain.setErr("relay.example.com.", dns.TypeAAAA, errors.New("servfail"))
+	r.SetChainResolver(chain, 50)
+
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: []domain.Domain{d}})
+	require.NoError(t, err)
+
+	r.mutex.RLock()
+	_, aCached := r.records[dns.Question{Name: "relay.example.com.", Qtype: dns.TypeA, Qclass: dns.ClassINET}]
+	_, marked := r.failedResolves[d]
+	r.mutex.RUnlock()
+	require.True(t, aCached, "the working family must still be cached")
+	require.True(t, marked, "a partial failure must be recorded so the failed family is retried")
+
+	assert.False(t, r.needsResolve(d), "within the backoff window the domain is not retried")
+
+	r.mutex.Lock()
+	r.failedResolves[d] = time.Now().Add(-2 * refreshBackoff)
+	r.mutex.Unlock()
+	assert.True(t, r.needsResolve(d), "after the backoff elapses the domain is retried to pick up the missing family")
+}
+
+// A family that returns NODATA (legitimately absent, e.g. an IPv4-only host) is
+// not a failure: the domain must not be marked for retry, otherwise it would be
+// re-resolved on every sync.
+func TestResolver_AddNewDomains_NodataIsNotFailure(t *testing.T) {
+	d := domain.Domain("v4only.example.com")
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("v4only.example.com.", dns.TypeA, "10.0.0.2")
+	r.SetChainResolver(chain, 50)
+
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: []domain.Domain{d}})
+	require.NoError(t, err)
+
+	r.mutex.RLock()
+	_, marked := r.failedResolves[d]
+	r.mutex.RUnlock()
+	assert.False(t, marked, "a NODATA family must not be recorded as a failure")
+	assert.False(t, r.needsResolve(d), "an IPv4-only host must not be re-resolved on later syncs")
+}
--- a/management/server/account_test.go
+++ b/management/server/account_test.go
@@ -1916,6 +1916,117 @@ func TestDefaultAccountManager_MarkPeerConnected_PeerLoginExpiration(t *testing.
 	}
 }

+func TestDefaultAccountManager_MarkPeerDisconnected_SchedulesInactivityExpiration(t *testing.T) {
+	manager, _, err := createManager(t)
+	require.NoError(t, err, "unable to create account manager")
+
+	accountID, err := manager.GetAccountIDByUserID(context.Background(), auth.UserAuth{UserId: userID})
+	require.NoError(t, err, "unable to create an account")
+
+	key, err := wgtypes.GenerateKey()
+	require.NoError(t, err, "unable to generate WireGuard key")
+	peerPubKey := key.PublicKey().String()
+
+	_, _, _, _, err = manager.AddPeer(context.Background(), "", "", userID, &nbpeer.Peer{
+		Key:                         peerPubKey,
+		Meta:                        nbpeer.PeerSystemMeta{Hostname: "test-peer"},
+		InactivityExpirationEnabled: true,
+	}, false)
+	require.NoError(t, err, "unable to add peer")
+
+	_, err = manager.UpdateAccountSettings(context.Background(), accountID, userID, &types.Settings{
+		PeerLoginExpiration:             time.Hour,
+		PeerLoginExpirationEnabled:      true,
+		PeerInactivityExpiration:        time.Hour,
+		PeerInactivityExpirationEnabled: true,
+		Extra:                           &types.ExtraSettings{},
+	})
+	require.NoError(t, err, "expecting to update account settings successfully but got error")
+
+	// Establish a session so the matching-token disconnect is actually applied.
+	streamStartTime := time.Now().UTC()
+	err = manager.MarkPeerConnected(context.Background(), peerPubKey, accountID, streamStartTime.UnixNano(), nil)
+	require.NoError(t, err, "unable to mark peer connected")
+
+	// Install the mock only now, so the assertion observes the disconnect, not
+	// the earlier connect.
+	scheduled := make(chan struct{}, 1)
+	manager.peerInactivityExpiry = &MockScheduler{
+		CancelFunc: func(ctx context.Context, IDs []string) {},
+		ScheduleFunc: func(ctx context.Context, in time.Duration, ID string, job func() (nextRunIn time.Duration, reschedule bool)) {
+			select {
+			case scheduled <- struct{}{}:
+			default:
+			}
+		},
+	}
+
+	err = manager.MarkPeerDisconnected(context.Background(), peerPubKey, accountID, streamStartTime.UnixNano())
+	require.NoError(t, err, "unable to mark peer disconnected")
+
+	select {
+	case <-scheduled:
+		// expected: disconnect re-armed the inactivity expiry timer
+	case <-time.After(time.Second):
+		t.Fatal("expected inactivity expiration to be rescheduled when an eligible peer disconnects")
+	}
+}
+
+func TestDefaultAccountManager_MarkPeerDisconnected_SkipsInactivityExpirationWhenDisabled(t *testing.T) {
+	manager, _, err := createManager(t)
+	require.NoError(t, err, "unable to create account manager")
+
+	accountID, err := manager.GetAccountIDByUserID(context.Background(), auth.UserAuth{UserId: userID})
+	require.NoError(t, err, "unable to create an account")
+
+	key, err := wgtypes.GenerateKey()
+	require.NoError(t, err, "unable to generate WireGuard key")
+	peerPubKey := key.PublicKey().String()
+
+	_, _, _, _, err = manager.AddPeer(context.Background(), "", "", userID, &nbpeer.Peer{
+		Key:                         peerPubKey,
+		Meta:                        nbpeer.PeerSystemMeta{Hostname: "test-peer"},
+		InactivityExpirationEnabled: true,
+	}, false)
+	require.NoError(t, err, "unable to add peer")
+
+	// Peer is eligible (SSO + inactivity enabled) but the account-level setting
+	// stays disabled, so disconnect must not schedule anything.
+	_, err = manager.UpdateAccountSettings(context.Background(), accountID, userID, &types.Settings{
+		PeerLoginExpiration:             time.Hour,
+		PeerLoginExpirationEnabled:      true,
+		PeerInactivityExpiration:        time.Hour,
+		PeerInactivityExpirationEnabled: false,
+		Extra:                           &types.ExtraSettings{},
+	})
+	require.NoError(t, err, "expecting to update account settings successfully but got error")
+
+	streamStartTime := time.Now().UTC()
+	err = manager.MarkPeerConnected(context.Background(), peerPubKey, accountID, streamStartTime.UnixNano(), nil)
+	require.NoError(t, err, "unable to mark peer connected")
+
+	scheduled := make(chan struct{}, 1)
+	manager.peerInactivityExpiry = &MockScheduler{
+		CancelFunc: func(ctx context.Context, IDs []string) {},
+		ScheduleFunc: func(ctx context.Context, in time.Duration, ID string, job func() (nextRunIn time.Duration, reschedule bool)) {
+			select {
+			case scheduled <- struct{}{}:
+			default:
+			}
+		},
+	}
+
+	err = manager.MarkPeerDisconnected(context.Background(), peerPubKey, accountID, streamStartTime.UnixNano())
+	require.NoError(t, err, "unable to mark peer disconnected")
+
+	select {
+	case <-scheduled:
+		t.Fatal("inactivity expiration must not be scheduled while the account-level setting is disabled")
+	case <-time.After(200 * time.Millisecond):
+		// expected: nothing scheduled
+	}
+}
+
 func TestDefaultAccountManager_OnPeerDisconnected_LastSeenCheck(t *testing.T) {
 	manager, _, err := createManager(t)
 	require.NoError(t, err, "unable to create account manager")
--- a/management/server/peer.go
+++ b/management/server/peer.go
@@ -188,6 +188,15 @@ func (am *DefaultAccountManager) MarkPeerDisconnected(ctx context.Context, peerP
 		}
 	}

+	if peer.AddedWithSSOLogin() && peer.InactivityExpirationEnabled {
+		settings, err := am.Store.GetAccountSettings(ctx, store.LockingStrengthNone, accountID)
+		if err != nil {
+			log.WithContext(ctx).Warnf("failed getting account settings to schedule inactivity expiration for peer %s: %v", peer.ID, err)
+		} else if settings.PeerInactivityExpirationEnabled {
+			am.checkAndSchedulePeerInactivityExpiration(ctx, accountID)
+		}
+	}
+
 	return nil
 }

--- a/shared/signal/client/grpc.go
+++ b/shared/signal/client/grpc.go
@@ -78,6 +78,14 @@ type GrpcClient struct {
 	// transport-alive but no longer delivering messages. It is the source of
 	// truth IsHealthy reads, and is cleared once any frame is received again.
 	receiveStalled atomic.Bool
+	// receiveHandoffBlocked is set while the receive loop is parked handing a
+	// message to a busy decryption worker. The loop stops calling Recv (and
+	// markReceived) in that window, so the stream looks silent though it is
+	// healthy. The watchdog reads this to avoid misreading self-inflicted
+	// receive backpressure as a dead stream: reconnecting cannot help, since the
+	// new stream feeds the same worker, and only triggers a reconnect storm.
+	receiveHandoffBlocked atomic.Bool
+	watchdogWg            sync.WaitGroup
 }

 // NewClient creates a new Signal client
@@ -193,10 +201,18 @@ func (c *GrpcClient) Receive(ctx context.Context, msgHandler func(msg *proto.Mes
 		// Guard the receive direction: the transport can stay healthy while the
 		// server stops delivering messages. The watchdog reconnects via cancelStream.
 		c.markReceived()
-		go c.watchReceiveStream(streamCtx, cancelStream)
+		c.watchdogWg.Add(1)
+		go func() {
+			defer c.watchdogWg.Done()
+			c.watchReceiveStream(streamCtx, cancelStream)
+		}()

 		// start receiving messages from the Signal stream (from other peers through signal)
 		err = c.receive(stream)
+
+		cancelStream()
+		c.watchdogWg.Wait()
+
 		if err != nil {
 			// Check the parent context, not streamCtx: a watchdog-triggered
 			// cancelStream must reconnect, only a parent cancel is shutdown.
@@ -439,6 +455,16 @@ func (c *GrpcClient) idleSinceReceive() time.Duration {
 	return time.Since(time.Unix(0, c.lastReceived.Load()))
 }

+// receiveAlive reports whether the receive stream shows liveness: it delivered a
+// frame within the inactivity threshold, or the receive loop is currently parked
+// handing a message to a busy decryption worker. In the latter case the loop has
+// stopped calling Recv, so the stream looks silent while being healthy, and
+// reconnecting would not help, so the watchdog must treat it as alive.
+func (c *GrpcClient) receiveAlive() bool {
+	return c.idleSinceReceive() < receiveInactivityThreshold ||
+		c.receiveHandoffBlocked.Load()
+}
+
 // watchReceiveStream guards against a receive stream that is transport-alive but
 // no longer delivering messages. While the stream is idle past
 // receiveInactivityThreshold it sends a self-addressed probe that the Signal
@@ -455,7 +481,7 @@ func (c *GrpcClient) watchReceiveStream(ctx context.Context, cancelStream contex
 		case <-ctx.Done():
 			return
 		case <-ticker.C:
-			if c.idleSinceReceive() < receiveInactivityThreshold {
+			if c.receiveAlive() {
 				probeSentAt = time.Time{}
 				continue
 			}
@@ -517,9 +543,14 @@ func (c *GrpcClient) receive(stream proto.SignalExchange_ConnectStreamClient) er
 			continue
 		}

+		// The handoff blocks while the worker is busy, which parks this loop and
+		// stops Recv. Flag it so the watchdog does not read the resulting silence
+		// as a dead stream.
+		c.receiveHandoffBlocked.Store(true)
 		if err := c.decryptionWorker.AddMsg(c.ctx, msg); err != nil {
 			log.Errorf("failed to add message to decryption worker: %v", err)
 		}
+		c.receiveHandoffBlocked.Store(false)
 	}
 }

--- a/shared/signal/client/watchdog_test.go
+++ b/shared/signal/client/watchdog_test.go
@@ -82,3 +82,27 @@ func TestReceiveProbeRoundTrips(t *testing.T) {
 		t.Fatal("self-addressed heartbeat did not round-trip back through the signal server")
 	}
 }
+
+// TestReceiveAliveTreatsHandoffBlockAsLiveness reproduces the false positive
+// where a busy decryption worker parks the receive loop on the worker handoff,
+// so Recv (and markReceived) stops firing even though the stream is healthy.
+// With the receive stream silent past the inactivity threshold but the loop
+// blocked on handoff, the watchdog must consider the stream alive rather than
+// tear it down (reconnecting feeds the same worker and would not help).
+func TestReceiveAliveTreatsHandoffBlockAsLiveness(t *testing.T) {
+	c := &GrpcClient{}
+
+	// Receive stream silent and the loop not blocked on handoff: genuinely stalled.
+	c.lastReceived.Store(time.Now().Add(-2 * receiveInactivityThreshold).UnixNano())
+	require.False(t, c.receiveAlive(), "silent stream with the receive loop idle must be treated as stalled")
+
+	// Receive stream silent but the loop is parked handing a message to a busy
+	// worker: self-inflicted backpressure, not a dead stream, must not tear down.
+	c.receiveHandoffBlocked.Store(true)
+	require.True(t, c.receiveAlive(), "a receive loop blocked on worker handoff must keep the stream alive")
+
+	// Handoff drained, loop back to reading, a frame just arrived: alive via the receive path.
+	c.receiveHandoffBlocked.Store(false)
+	c.markReceived()
+	require.True(t, c.receiveAlive(), "a freshly received frame must keep the stream alive")
+}
Author	SHA1	Message	Date
Zoltan Papp	4a4d506221	[client] Wait for signal receive watchdog to stop before reconnect. The per-stream watchReceiveStream goroutine was started fire-and-forget and never joined. On reconnect, a lingering watchdog could still flip shared client state (receiveStalled, the disconnect notifier) on the freshly established stream, since cancelStream only cancels its own stream context. Track the watchdog with a WaitGroup and wait for it to exit (after cancelling its stream) before the operation returns, so each reconnect starts with no stale watchdog.	2026-06-28 16:03:01 +02:00
Viktor Liu	4cb2c62f2a	Keep signal stream alive while receive loop is blocked on worker handoff	2026-06-24 12:44:04 +02:00
Viktor Liu	17b2044596	[client] Skip re-resolving cached management cache domains (#6518)	2026-06-23 17:55:57 +02:00
Bethuel Mmbaga	07101c59ac	[management] Reschedule inactivity expiration when a peer disconnects (#6523)	2026-06-23 17:44:32 +03:00
Riccardo Manfrin	51b6f6291b	Fixup debug config (#6514)	2026-06-22 22:01:49 +02:00