From 780e9f57a574fd7df2cd0114db6c7fe307b0bef7 Mon Sep 17 00:00:00 2001 From: Viktor Liu Date: Mon, 9 Feb 2026 01:44:46 +0800 Subject: [PATCH] Improve mgmt backoff --- go.mod | 1 - go.sum | 2 -- proxy/server.go | 68 ++++++++++++++++++++++++------------------------- 3 files changed, 33 insertions(+), 38 deletions(-) diff --git a/go.mod b/go.mod index af19bb44e..8329574f6 100644 --- a/go.mod +++ b/go.mod @@ -40,7 +40,6 @@ require ( github.com/c-robinson/iplib v1.0.3 github.com/caddyserver/certmagic v0.21.3 github.com/cilium/ebpf v0.15.0 - github.com/cloudflare/backoff v0.0.0-20240920015135-e46b80a3a7d0 github.com/coder/websocket v1.8.13 github.com/coreos/go-iptables v0.7.0 github.com/coreos/go-oidc/v3 v3.14.1 diff --git a/go.sum b/go.sum index 61f97b662..17e5c8ffa 100644 --- a/go.sum +++ b/go.sum @@ -107,8 +107,6 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cilium/ebpf v0.15.0 h1:7NxJhNiBT3NG8pZJ3c+yfrVdHY8ScgKD27sScgjLMMk= github.com/cilium/ebpf v0.15.0/go.mod h1:DHp1WyrLeiBh19Cf/tfiSMhqheEiK8fXFZ4No0P1Hso= -github.com/cloudflare/backoff v0.0.0-20240920015135-e46b80a3a7d0 h1:pRcxfaAlK0vR6nOeQs7eAEvjJzdGXl8+KaBlcvpQTyQ= -github.com/cloudflare/backoff v0.0.0-20240920015135-e46b80a3a7d0/go.mod h1:rzgs2ZOiguV6/NpiDgADjRLPNyZlApIWxKpkT+X8SdY= github.com/coder/websocket v1.8.13 h1:f3QZdXy7uGVz+4uCJy2nTZyM0yTBj8yANEHhqlXZ9FE= github.com/coder/websocket v1.8.13/go.mod h1:LNVeNrXQZfe5qhS9ALED3uA+l5pPqvwXg3CKoDBB2gs= github.com/containerd/containerd v1.7.29 h1:90fWABQsaN9mJhGkoVnuzEY+o1XDPbg9BTC9QTAHnuE= diff --git a/proxy/server.go b/proxy/server.go index 13ab8a4f0..2b89da61d 100644 --- a/proxy/server.go +++ b/proxy/server.go @@ -21,7 +21,7 @@ import ( "path/filepath" "time" - "github.com/cloudflare/backoff" + backoff "github.com/cenkalti/backoff/v4" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/credentials" @@ -295,12 +295,21 @@ func (s *Server) ListenAndServe(ctx context.Context, addr string) (err error) { } func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.ProxyServiceClient) { - b := backoff.New(0, 0) - initialSyncDone := false - for { - s.Logger.Debug("Getting mapping updates from management server") + bo := &backoff.ExponentialBackOff{ + InitialInterval: 800 * time.Millisecond, + RandomizationFactor: 1, + Multiplier: 1.7, + MaxInterval: 10 * time.Second, + MaxElapsedTime: 0, // retry indefinitely until context is canceled + Stop: backoff.Stop, + Clock: backoff.SystemClock, + } + + initialSyncDone := false + + operation := func() error { + s.Logger.Debug("connecting to management mapping stream") - // Mark management as disconnected while we're attempting to reconnect. if s.healthChecker != nil { s.healthChecker.SetManagementConnected(false) } @@ -312,47 +321,36 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr Address: s.ProxyURL, }) if err != nil { - s.Logger.WithError(err).Warn("Could not get mapping updates, will retry") - backoffDuration := b.Duration() - s.Logger.WithFields(log.Fields{ - "backoff": backoffDuration, - "error": err, - }).Error("Unable to create mapping client to management server, retrying connection after backoff") - time.Sleep(backoffDuration) - continue + return fmt.Errorf("create mapping stream: %w", err) } - // Mark management as connected once stream is established. if s.healthChecker != nil { s.healthChecker.SetManagementConnected(true) } - s.Logger.Debug("Got mapping updates client from management server") + s.Logger.Debug("management mapping stream established") - err = s.handleMappingStream(ctx, mappingClient, &initialSyncDone) + // Stream established — reset backoff so the next failure retries quickly. + bo.Reset() + + streamErr := s.handleMappingStream(ctx, mappingClient, &initialSyncDone) if s.healthChecker != nil { s.healthChecker.SetManagementConnected(false) } - backoffDuration := b.Duration() - switch { - case errors.Is(err, context.Canceled), - errors.Is(err, context.DeadlineExceeded): - // Context is telling us that it is time to quit so gracefully exit here. - // No need to log the error as it is a parent context causing this return. - s.Logger.Debugf("Got context error, will exit loop: %v", err) - return - case err != nil: - // Log the error and then retry the connection. - s.Logger.WithFields(log.Fields{ - "backoff": backoffDuration, - "error": err, - }).Error("Error processing mapping stream from management server, retrying connection after backoff") - default: - // TODO: should this really be at error level? Maybe, if you start getting lots of these this could be an indication of connectivity issues. - s.Logger.WithField("backoff", backoffDuration).Error("Management mapping connection terminated by the server, retrying connection after backoff") + if streamErr == nil { + return fmt.Errorf("stream closed by server") } - time.Sleep(backoffDuration) + + return fmt.Errorf("mapping stream: %w", streamErr) + } + + notify := func(err error, next time.Duration) { + s.Logger.Warnf("management connection failed, retrying in %s: %v", next.Truncate(time.Millisecond), err) + } + + if err := backoff.RetryNotify(operation, backoff.WithContext(bo, ctx), notify); err != nil { + s.Logger.WithError(err).Debug("management mapping worker exiting") } }