diff --git a/client/internal/connect.go b/client/internal/connect.go index 8c0e9b1ba..a5d12d896 100644 --- a/client/internal/connect.go +++ b/client/internal/connect.go @@ -258,6 +258,15 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan log.Debugf("connecting to the Management service %s", c.config.ManagementURL.Host) mgmClient, err := mgm.NewClient(engineCtx, c.config.ManagementURL.Host, myPrivateKey, mgmTlsEnabled) if err != nil { + // On daemon shutdown / Down() the parent context is cancelled + // and the dial fails with "context canceled". Wrapping that + // into state would leave the snapshot stuck at Connecting+err + // until the backoff loop wakes up — instead let the operation + // return cleanly so the deferred state.Set(StatusIdle) takes + // effect on the next iteration. + if c.ctx.Err() != nil { + return nil + } return wrapErr(gstatus.Errorf(codes.FailedPrecondition, "failed connecting to Management Service : %s", err)) } mgmNotifier := statusRecorderToMgmConnStateNotifier(c.statusRecorder) @@ -426,7 +435,11 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan } c.statusRecorder.ClientStart() - err = backoff.Retry(operation, backOff) + // Wrap the backoff with c.ctx so Down()/actCancel propagates into the + // inter-attempt sleep — otherwise a 15s MaxInterval can keep the retry + // loop alive long after the caller asked to give up, leaving the + // status stream stuck at Connecting. + err = backoff.Retry(operation, backoff.WithContext(backOff, c.ctx)) if err != nil { log.Debugf("exiting client retry loop due to unrecoverable error: %s", err) if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) { diff --git a/client/internal/state.go b/client/internal/state.go index 041cb73f8..20db33ecc 100644 --- a/client/internal/state.go +++ b/client/internal/state.go @@ -57,6 +57,17 @@ func (c *contextState) Status() (StatusType, error) { return c.status, nil } +// CurrentStatus returns the last status set via Set, ignoring any wrapped +// error. Use when the status is needed for reporting purposes (e.g. the +// status snapshot stream) and a transient wrapped error from a retry loop +// shouldn't blank out the underlying status. +func (c *contextState) CurrentStatus() StatusType { + c.mutex.Lock() + defer c.mutex.Unlock() + + return c.status +} + func (c *contextState) Wrap(err error) error { c.mutex.Lock() defer c.mutex.Unlock() diff --git a/client/server/server.go b/client/server/server.go index 532fbfaca..1daec9973 100644 --- a/client/server/server.go +++ b/client/server/server.go @@ -846,9 +846,6 @@ func (s *Server) Down(ctx context.Context, _ *proto.DownRequest) (*proto.DownRes return nil, err } - state := internal.CtxGetState(s.rootCtx) - state.Set(internal.StatusIdle) - s.mutex.Unlock() // Wait for the connectWithRetryRuns goroutine to finish with a short timeout. @@ -863,6 +860,12 @@ func (s *Server) Down(ctx context.Context, _ *proto.DownRequest) (*proto.DownRes } } + // Set Idle only after the retry goroutine has exited (or timed out). + // Setting it earlier races with the goroutine's own Set(StatusConnecting) + // at the top of each retry attempt, which would leave the snapshot + // stuck at Connecting long after the user asked to disconnect. + internal.CtxGetState(s.rootCtx).Set(internal.StatusIdle) + return &proto.DownResponse{}, nil } @@ -1123,9 +1126,16 @@ func (s *Server) Status( // state. Shared between the unary Status RPC and the SubscribeStatus // stream so both paths return identical snapshots. func (s *Server) buildStatusResponse(msg *proto.StatusRequest) (*proto.StatusResponse, error) { - status, err := internal.CtxGetState(s.rootCtx).Status() + state := internal.CtxGetState(s.rootCtx) + status, err := state.Status() if err != nil { - return nil, err + // state.Status() blanks the status when err is set (e.g. management + // retry loop wrapped a connection error). The underlying status is + // still meaningful and the failure is already surfaced via + // FullStatus.ManagementState.Error, so don't propagate err — that + // would tear down the SubscribeStatus stream and cause the UI to + // mark the daemon as unreachable on every retry. + status = state.CurrentStatus() } if status == internal.StatusNeedsLogin && s.isSessionActive.Load() { diff --git a/client/ui/services/peers.go b/client/ui/services/peers.go index 2fa2a1b12..b658cf9aa 100644 --- a/client/ui/services/peers.go +++ b/client/ui/services/peers.go @@ -11,6 +11,8 @@ import ( "github.com/cenkalti/backoff/v4" log "github.com/sirupsen/logrus" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" "github.com/netbirdio/netbird/client/proto" ) @@ -185,6 +187,23 @@ func (s *Peers) Get(ctx context.Context) (Status, error) { return statusFromProto(resp), nil } +// isDaemonUnreachable reports whether a gRPC stream error indicates the +// daemon socket itself is not answering (process down, socket missing, +// permission denied) versus the daemon responding with an application-level +// error code. Only the former should flip the tray to "Not running" — a +// daemon that returns FailedPrecondition (e.g. while it's retrying the +// management connection) is alive and shouldn't be reported as down. +func isDaemonUnreachable(err error) bool { + if err == nil { + return false + } + st, ok := status.FromError(err) + if !ok { + return true + } + return st.Code() == codes.Unavailable +} + // statusStreamLoop subscribes to the daemon's SubscribeStatus stream and // re-emits each FullStatus snapshot on the Wails event bus. The first // message is the current snapshot; subsequent messages fire on @@ -225,7 +244,9 @@ func (s *Peers) statusStreamLoop(ctx context.Context) { } stream, err := cli.SubscribeStatus(ctx, &proto.StatusRequest{GetFullPeerStatus: true}) if err != nil { - emitUnavailable() + if isDaemonUnreachable(err) { + emitUnavailable() + } return fmt.Errorf("subscribe status: %w", err) } for { @@ -234,7 +255,9 @@ func (s *Peers) statusStreamLoop(ctx context.Context) { if ctx.Err() != nil { return ctx.Err() } - emitUnavailable() + if isDaemonUnreachable(err) { + emitUnavailable() + } return fmt.Errorf("status stream recv: %w", err) } unavailable = false diff --git a/client/ui/tray.go b/client/ui/tray.go index 3b0c8c0db..916a92278 100644 --- a/client/ui/tray.go +++ b/client/ui/tray.go @@ -479,6 +479,7 @@ func (t *Tray) applyStatus(st services.Status) { strings.EqualFold(st.Status, statusSessionExpired) || strings.EqualFold(st.Status, statusLoginFailed) daemonUnavailable := strings.EqualFold(st.Status, services.StatusDaemonUnavailable) + connecting := strings.EqualFold(st.Status, statusConnecting) if t.statusItem != nil { // When the daemon needs re-authentication the status row turns // into the actionable Login entry — Connect would only fail. @@ -496,12 +497,19 @@ func (t *Tray) applyStatus(st services.Status) { t.applyStatusIndicator(st.Status) } if t.upItem != nil { - t.upItem.SetHidden(connected || needsLogin || daemonUnavailable) - t.upItem.SetEnabled(!connected && !needsLogin && !daemonUnavailable) + // Hide Connect whenever an Up action would be a no-op or would + // only fail: tunnel already up, daemon mid-connect (Disconnect + // takes over the slot so the user can abort), login required, + // or daemon socket unreachable. + t.upItem.SetHidden(connected || connecting || needsLogin || daemonUnavailable) + t.upItem.SetEnabled(!connected && !connecting && !needsLogin && !daemonUnavailable) } if t.downItem != nil { - t.downItem.SetHidden(!connected) - t.downItem.SetEnabled(connected) + // Disconnect is the abort path while the daemon is still + // retrying the management dial — without it the user has no + // way to stop the loop short of killing the daemon. + t.downItem.SetHidden(!connected && !connecting) + t.downItem.SetEnabled(connected || connecting) } // Exit Node and Resources surface tunnel-routed state, so only // expose them while the tunnel is up. Settings/Debug-Bundle just