[client] Fix tray flicker and stuck Connecting during management retry
The status snapshot tore down on every management retry because state.Status() blanks the status when an error is wrapped, and the SubscribeStatus stream propagated that as FailedPrecondition. The UI treated any stream error as "daemon not running" and flickered the tray to Not running between retries.

Disconnect was also unresponsive: Down set Idle before the retry goroutine exited, which then overwrote it with Set(Connecting) on the next attempt; the backoff sleep (up to 15s) wasn't context-aware, so the goroutine kept running long after actCancel.

- buildStatusResponse falls back to the underlying status (via new state.CurrentStatus) instead of breaking the stream on wrapped errors.
- UI only flips to DaemonUnavailable on codes.Unavailable / non-status errors, so a live daemon returning FailedPrecondition is not reported as down.
- connect retry uses backoff.WithContext so actCancel interrupts the inter-attempt sleep (see the sketch below), and skips Wrap(err) when the dial fails due to ctx cancellation.
- Down sets Idle after waiting for giveUpChan, so the retry goroutine can no longer race the disconnect.
- Tray hides Connect during Connecting and keeps Disconnect enabled so the user can abort an in-flight connection attempt.
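For context on the backoff.WithContext bullet, here is a minimal, self-contained sketch (not NetBird code; the operation, intervals, and timings are invented) showing why only a context-aware backoff lets cancellation interrupt the inter-attempt sleep:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())

	bOff := backoff.NewExponentialBackOff()
	bOff.MaxInterval = 15 * time.Second // long sleeps between attempts, as in the client

	// Simulate Down()/actCancel firing while the retry loop is asleep.
	go func() {
		time.Sleep(500 * time.Millisecond)
		cancel()
	}()

	operation := func() error {
		return errors.New("management dial failed") // always fails, so Retry keeps sleeping
	}

	// Plain backoff.Retry(operation, bOff) would sleep out the full interval
	// after cancel(); the context-wrapped variant returns as soon as ctx is done.
	err := backoff.Retry(operation, backoff.WithContext(bOff, ctx))
	fmt.Println("retry loop exited:", err) // prints "context canceled" shortly after cancel()
}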
@@ -258,6 +258,15 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 		log.Debugf("connecting to the Management service %s", c.config.ManagementURL.Host)
 		mgmClient, err := mgm.NewClient(engineCtx, c.config.ManagementURL.Host, myPrivateKey, mgmTlsEnabled)
 		if err != nil {
+			// On daemon shutdown / Down() the parent context is cancelled
+			// and the dial fails with "context canceled". Wrapping that
+			// into state would leave the snapshot stuck at Connecting+err
+			// until the backoff loop wakes up — instead let the operation
+			// return cleanly so the deferred state.Set(StatusIdle) takes
+			// effect on the next iteration.
+			if c.ctx.Err() != nil {
+				return nil
+			}
 			return wrapErr(gstatus.Errorf(codes.FailedPrecondition, "failed connecting to Management Service : %s", err))
 		}
 		mgmNotifier := statusRecorderToMgmConnStateNotifier(c.statusRecorder)
@@ -426,7 +435,11 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 	}

 	c.statusRecorder.ClientStart()
-	err = backoff.Retry(operation, backOff)
+	// Wrap the backoff with c.ctx so Down()/actCancel propagates into the
+	// inter-attempt sleep — otherwise a 15s MaxInterval can keep the retry
+	// loop alive long after the caller asked to give up, leaving the
+	// status stream stuck at Connecting.
+	err = backoff.Retry(operation, backoff.WithContext(backOff, c.ctx))
 	if err != nil {
 		log.Debugf("exiting client retry loop due to unrecoverable error: %s", err)
 		if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) {
@@ -57,6 +57,17 @@ func (c *contextState) Status() (StatusType, error) {
 	return c.status, nil
 }

+// CurrentStatus returns the last status set via Set, ignoring any wrapped
+// error. Use when the status is needed for reporting purposes (e.g. the
+// status snapshot stream) and a transient wrapped error from a retry loop
+// shouldn't blank out the underlying status.
+func (c *contextState) CurrentStatus() StatusType {
+	c.mutex.Lock()
+	defer c.mutex.Unlock()
+
+	return c.status
+}
+
 func (c *contextState) Wrap(err error) error {
 	c.mutex.Lock()
 	defer c.mutex.Unlock()
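To make the Status/CurrentStatus split concrete, a stripped-down sketch of the semantics the doc comment describes (the struct shape and the exact "blanking" behaviour of Status are assumptions for illustration, not the literal NetBird code):

package state // hypothetical; mirrors the shape of the client's contextState

import "sync"

type StatusType string

type contextState struct {
	mutex  sync.Mutex
	status StatusType
	err    error
}

// Status models the pre-existing accessor: a wrapped error takes
// precedence, so the caller never sees the underlying status while a
// retry loop holds a transient error.
func (c *contextState) Status() (StatusType, error) {
	c.mutex.Lock()
	defer c.mutex.Unlock()
	if c.err != nil {
		return "", c.err // status is "blanked" while err is set
	}
	return c.status, nil
}

// CurrentStatus is the new reporting accessor: it ignores the wrapped
// error, so a snapshot can still say Connecting mid-retry.
func (c *contextState) CurrentStatus() StatusType {
	c.mutex.Lock()
	defer c.mutex.Unlock()
	return c.status
}

buildStatusResponse (further down) uses exactly this pairing: try Status(), and on error keep reporting CurrentStatus() while the failure itself travels in FullStatus.ManagementState.Error.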
@@ -846,9 +846,6 @@ func (s *Server) Down(ctx context.Context, _ *proto.DownRequest) (*proto.DownRes
 		return nil, err
 	}

-	state := internal.CtxGetState(s.rootCtx)
-	state.Set(internal.StatusIdle)
-
 	s.mutex.Unlock()

 	// Wait for the connectWithRetryRuns goroutine to finish with a short timeout.
@@ -863,6 +860,12 @@ func (s *Server) Down(ctx context.Context, _ *proto.DownRequest) (*proto.DownRes
 		}
 	}

+	// Set Idle only after the retry goroutine has exited (or timed out).
+	// Setting it earlier races with the goroutine's own Set(StatusConnecting)
+	// at the top of each retry attempt, which would leave the snapshot
+	// stuck at Connecting long after the user asked to disconnect.
+	internal.CtxGetState(s.rootCtx).Set(internal.StatusIdle)
+
 	return &proto.DownResponse{}, nil
 }
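The ordering matters because the retry goroutine calls Set(StatusConnecting) at the top of every attempt. A sketch of the fixed sequence, continuing the contextState sketch above (the channel name, timeout, and the Set/StatusIdle definitions are assumptions; the real code waits on giveUpChan):

const StatusIdle StatusType = "Idle"

func (c *contextState) Set(s StatusType) {
	c.mutex.Lock()
	defer c.mutex.Unlock()
	c.status = s
	c.err = nil
}

// raceFreeDown writes the final Idle state only after the retry goroutine
// has stopped (or a grace period expired), so the goroutine's own
// Set(Connecting) can no longer land after Idle and stick.
// (Requires "time" in the sketch's imports.)
func raceFreeDown(state *contextState, retryDone <-chan struct{}) {
	select {
	case <-retryDone: // retry goroutine exited; ctx cancellation propagated
	case <-time.After(5 * time.Second): // don't hang Down forever
	}
	state.Set(StatusIdle) // safe: no concurrent writer left to overwrite it
}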
@@ -1123,9 +1126,16 @@ func (s *Server) Status(
 // state. Shared between the unary Status RPC and the SubscribeStatus
 // stream so both paths return identical snapshots.
 func (s *Server) buildStatusResponse(msg *proto.StatusRequest) (*proto.StatusResponse, error) {
-	status, err := internal.CtxGetState(s.rootCtx).Status()
+	state := internal.CtxGetState(s.rootCtx)
+	status, err := state.Status()
 	if err != nil {
-		return nil, err
+		// state.Status() blanks the status when err is set (e.g. management
+		// retry loop wrapped a connection error). The underlying status is
+		// still meaningful and the failure is already surfaced via
+		// FullStatus.ManagementState.Error, so don't propagate err — that
+		// would tear down the SubscribeStatus stream and cause the UI to
+		// mark the daemon as unreachable on every retry.
+		status = state.CurrentStatus()
 	}

 	if status == internal.StatusNeedsLogin && s.isSessionActive.Load() {
@@ -11,6 +11,8 @@ import (

 	"github.com/cenkalti/backoff/v4"
 	log "github.com/sirupsen/logrus"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"

 	"github.com/netbirdio/netbird/client/proto"
 )
@@ -185,6 +187,23 @@ func (s *Peers) Get(ctx context.Context) (Status, error) {
 	return statusFromProto(resp), nil
 }

+// isDaemonUnreachable reports whether a gRPC stream error indicates the
+// daemon socket itself is not answering (process down, socket missing,
+// permission denied) versus the daemon responding with an application-level
+// error code. Only the former should flip the tray to "Not running" — a
+// daemon that returns FailedPrecondition (e.g. while it's retrying the
+// management connection) is alive and shouldn't be reported as down.
+func isDaemonUnreachable(err error) bool {
+	if err == nil {
+		return false
+	}
+	st, ok := status.FromError(err)
+	if !ok {
+		return true
+	}
+	return st.Code() == codes.Unavailable
+}
+
 // statusStreamLoop subscribes to the daemon's SubscribeStatus stream and
 // re-emits each FullStatus snapshot on the Wails event bus. The first
 // message is the current snapshot; subsequent messages fire on
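A quick demonstration of the classification for the three error shapes the comment distinguishes (the helper is copied from the diff above; the errors are fabricated examples):

package main

import (
	"errors"
	"fmt"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

func isDaemonUnreachable(err error) bool {
	if err == nil {
		return false
	}
	st, ok := status.FromError(err)
	if !ok {
		return true
	}
	return st.Code() == codes.Unavailable
}

func main() {
	// Transport failure: gRPC reports a dead socket as codes.Unavailable.
	fmt.Println(isDaemonUnreachable(status.Error(codes.Unavailable, "connection refused"))) // true
	// Application-level error: the daemon answered, so it is not "down".
	fmt.Println(isDaemonUnreachable(status.Error(codes.FailedPrecondition, "connecting to management"))) // false
	// Not a gRPC status at all (e.g. a local dial error): treat as unreachable.
	fmt.Println(isDaemonUnreachable(errors.New("dial unix: no such file"))) // true
}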
@@ -225,7 +244,9 @@ func (s *Peers) statusStreamLoop(ctx context.Context) {
 	}
 	stream, err := cli.SubscribeStatus(ctx, &proto.StatusRequest{GetFullPeerStatus: true})
 	if err != nil {
-		emitUnavailable()
+		if isDaemonUnreachable(err) {
+			emitUnavailable()
+		}
 		return fmt.Errorf("subscribe status: %w", err)
 	}
 	for {
@@ -234,7 +255,9 @@ func (s *Peers) statusStreamLoop(ctx context.Context) {
 			if ctx.Err() != nil {
 				return ctx.Err()
 			}
-			emitUnavailable()
+			if isDaemonUnreachable(err) {
+				emitUnavailable()
+			}
 			return fmt.Errorf("status stream recv: %w", err)
 		}
 		unavailable = false
@@ -479,6 +479,7 @@ func (t *Tray) applyStatus(st services.Status) {
 		strings.EqualFold(st.Status, statusSessionExpired) ||
 		strings.EqualFold(st.Status, statusLoginFailed)
 	daemonUnavailable := strings.EqualFold(st.Status, services.StatusDaemonUnavailable)
+	connecting := strings.EqualFold(st.Status, statusConnecting)
 	if t.statusItem != nil {
 		// When the daemon needs re-authentication the status row turns
 		// into the actionable Login entry — Connect would only fail.
@@ -496,12 +497,19 @@ func (t *Tray) applyStatus(st services.Status) {
 		t.applyStatusIndicator(st.Status)
 	}
 	if t.upItem != nil {
-		t.upItem.SetHidden(connected || needsLogin || daemonUnavailable)
-		t.upItem.SetEnabled(!connected && !needsLogin && !daemonUnavailable)
+		// Hide Connect whenever an Up action would be a no-op or would
+		// only fail: tunnel already up, daemon mid-connect (Disconnect
+		// takes over the slot so the user can abort), login required,
+		// or daemon socket unreachable.
+		t.upItem.SetHidden(connected || connecting || needsLogin || daemonUnavailable)
+		t.upItem.SetEnabled(!connected && !connecting && !needsLogin && !daemonUnavailable)
 	}
 	if t.downItem != nil {
-		t.downItem.SetHidden(!connected)
-		t.downItem.SetEnabled(connected)
+		// Disconnect is the abort path while the daemon is still
+		// retrying the management dial — without it the user has no
+		// way to stop the loop short of killing the daemon.
+		t.downItem.SetHidden(!connected && !connecting)
+		t.downItem.SetEnabled(connected || connecting)
 	}
 	// Exit Node and Resources surface tunnel-routed state, so only
 	// expose them while the tunnel is up. Settings/Debug-Bundle just
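The two menu slots are now driven by four booleans; a table-driven sketch (status names and expectations inferred from the conditions above, not taken from the tray code) makes the resulting matrix easy to audit:

package main

import "fmt"

func main() {
	cases := []struct {
		name                                          string
		connected, connecting, needsLogin, daemonDown bool
	}{
		{"Idle", false, false, false, false},
		{"Connecting", false, true, false, false},
		{"Connected", true, false, false, false},
		{"NeedsLogin", false, false, true, false},
		{"DaemonUnavailable", false, false, false, true},
	}
	for _, c := range cases {
		// Same expressions as applyStatus: Connect hides whenever Up would
		// be a no-op or fail; Disconnect shows while connected or connecting.
		upShown := !(c.connected || c.connecting || c.needsLogin || c.daemonDown)
		downShown := c.connected || c.connecting
		fmt.Printf("%-18s Connect:%-6t Disconnect:%t\n", c.name, upShown, downShown)
	}
}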