[client] Fix tray flicker and stuck Connecting during management retry

The SubscribeStatus stream was torn down on every management retry: state.Status() blanks the status when an error is wrapped, and the stream propagated that error as FailedPrecondition. The UI treated any stream error as "daemon not running" and flickered the tray to Not running between retries.

Disconnect was also unresponsive: Down set Idle before the retry goroutine exited, and the goroutine then overwrote it with Set(Connecting) on its next attempt. The backoff sleep (up to 15s) wasn't context-aware, so the goroutine kept running long after actCancel.

- buildStatusResponse falls back to the underlying status (via new state.CurrentStatus) instead of breaking the stream on wrapped errors.
- UI only flips to DaemonUnavailable on codes.Unavailable / non-status errors, so a live daemon returning FailedPrecondition is not reported as down.
- connect retry uses backoff.WithContext so actCancel interrupts the inter-attempt sleep, and skips Wrap(err) when the dial fails due to ctx cancellation.
- Down sets Idle after waiting for giveUpChan, so the retry goroutine can no longer race the disconnect.
- Tray hides Connect during Connecting and keeps Disconnect enabled so the user can abort an in-flight connection attempt.
2026-05-12 19:59:56 +00:00 · 2026-05-12 20:38:30 +02:00
parent 100d25a062
commit e3efaa5e59
5 changed files with 77 additions and 12 deletions
--- a/client/server/server.go
+++ b/client/server/server.go
@@ -846,9 +846,6 @@ func (s *Server) Down(ctx context.Context, _ *proto.DownRequest) (*proto.DownRes
 		return nil, err
 	}

-	state := internal.CtxGetState(s.rootCtx)
-	state.Set(internal.StatusIdle)
-
 	s.mutex.Unlock()

 	// Wait for the connectWithRetryRuns goroutine to finish with a short timeout.
@@ -863,6 +860,12 @@ func (s *Server) Down(ctx context.Context, _ *proto.DownRequest) (*proto.DownRes
 		}
 	}

+	// Set Idle only after the retry goroutine has exited (or timed out).
+	// Setting it earlier races with the goroutine's own Set(StatusConnecting)
+	// at the top of each retry attempt, which would leave the snapshot
+	// stuck at Connecting long after the user asked to disconnect.
+	internal.CtxGetState(s.rootCtx).Set(internal.StatusIdle)
+
 	return &proto.DownResponse{}, nil
 }

@@ -1123,9 +1126,16 @@ func (s *Server) Status(
 // state. Shared between the unary Status RPC and the SubscribeStatus
 // stream so both paths return identical snapshots.
 func (s *Server) buildStatusResponse(msg *proto.StatusRequest) (*proto.StatusResponse, error) {
-	status, err := internal.CtxGetState(s.rootCtx).Status()
+	state := internal.CtxGetState(s.rootCtx)
+	status, err := state.Status()
 	if err != nil {
-		return nil, err
+		// state.Status() blanks the status when err is set (e.g. management
+		// retry loop wrapped a connection error). The underlying status is
+		// still meaningful and the failure is already surfaced via
+		// FullStatus.ManagementState.Error, so don't propagate err — that
+		// would tear down the SubscribeStatus stream and cause the UI to
+		// mark the daemon as unreachable on every retry.
+		status = state.CurrentStatus()
 	}

 	if status == internal.StatusNeedsLogin && s.isSessionActive.Load() {