Files
netbird/shared/relay/client/guard.go
Zoltan Papp 8fc4265995 [relay] evict foreign client cache on disconnect (#6015)
* [relay] evict foreign client cache on disconnect

When a foreign relay's TCP connection drops, the manager's
onServerDisconnected handler only triggered reconnect logic for the
home server; the disconnected foreign entry stayed in the relayClients
cache. Subsequent OpenConn calls reused the closed client until the
60-second cleanup tick evicted it, breaking peer connectivity through
that relay for up to a minute.

Evict the foreign entry from the cache on disconnect so the next
OpenConn dials a fresh client.

Also:
- Make the reconnect backoff cap configurable via WithMaxBackoffInterval
  ManagerOption; the previous hard-coded 60s constant forced
  TestAutoReconnect to sleep ~61s. Test now polls Ready() and finishes
  in ~2s.
- Add NB_HOME_RELAY_SERVERS env var that overrides the relay URL list
  received from management, so a peer can be pinned to a specific home
  relay (used by the netbird-conn-lab Edge 4 reproducer).

* [client] treat empty NB_HOME_RELAY_SERVERS as unset

Returning (urls=[], ok=true) when the env var contained only separators or
whitespace caused callers to wipe the mgmt-provided relay list, leaving the
peer with no relays. Treat a parsed-empty result the same as an unset env.
2026-04-28 15:04:48 +02:00

156 lines
3.9 KiB
Go

package client
import (
"context"
"time"
"github.com/cenkalti/backoff/v4"
log "github.com/sirupsen/logrus"
)
const defaultMaxBackoffInterval = 60 * time.Second
// Guard manage the reconnection tries to the Relay server in case of disconnection event.
type Guard struct {
// OnNewRelayClient is a channel that is used to notify the relay manager about a new relay client instance.
OnNewRelayClient chan *Client
OnReconnected chan struct{}
serverPicker *ServerPicker
// maxBackoffInterval caps the exponential backoff between reconnect
// attempts.
maxBackoffInterval time.Duration
}
// NewGuard creates a new guard for the relay client. A non-positive
// maxBackoffInterval falls back to defaultMaxBackoffInterval.
func NewGuard(sp *ServerPicker, maxBackoffInterval time.Duration) *Guard {
if maxBackoffInterval <= 0 {
maxBackoffInterval = defaultMaxBackoffInterval
}
g := &Guard{
OnNewRelayClient: make(chan *Client, 1),
OnReconnected: make(chan struct{}, 1),
serverPicker: sp,
maxBackoffInterval: maxBackoffInterval,
}
return g
}
// StartReconnectTrys is called when the relay client is disconnected from the relay server.
// It attempts to reconnect to the relay server. The function first tries a quick reconnect
// to the same server that was used before, if the server URL is still valid. If the quick
// reconnect fails, it starts a ticker to periodically attempt server picking until it
// succeeds or the context is done.
//
// Parameters:
// - ctx: The context to control the lifecycle of the reconnection attempts.
// - relayClient: The relay client instance that was disconnected.
// todo prevent multiple reconnection instances. In the current usage it should not happen, but it is better to prevent
func (g *Guard) StartReconnectTrys(ctx context.Context, relayClient *Client) {
// try to reconnect to the same server
if ok := g.tryToQuickReconnect(ctx, relayClient); ok {
g.notifyReconnected()
return
}
// start a ticker to pick a new server
ticker := g.exponentTicker(ctx)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if err := g.retry(ctx); err != nil {
log.Errorf("failed to pick new Relay server: %s", err)
continue
}
return
case <-ctx.Done():
return
}
}
}
func (g *Guard) tryToQuickReconnect(parentCtx context.Context, rc *Client) bool {
if rc == nil {
return false
}
if !g.isServerURLStillValid(rc) {
return false
}
if cancelled := waiteBeforeRetry(parentCtx); !cancelled {
return false
}
log.Infof("try to reconnect to Relay server: %s", rc.connectionURL)
if err := rc.Connect(parentCtx); err != nil {
log.Errorf("failed to reconnect to relay server: %s", err)
return false
}
return true
}
func (g *Guard) retry(ctx context.Context) error {
log.Infof("try to pick up a new Relay server")
relayClient, err := g.serverPicker.PickServer(ctx)
if err != nil {
return err
}
// prevent to work with a deprecated Relay client instance
g.drainRelayClientChan()
g.OnNewRelayClient <- relayClient
return nil
}
func (g *Guard) drainRelayClientChan() {
select {
case <-g.OnNewRelayClient:
default:
}
}
func (g *Guard) isServerURLStillValid(rc *Client) bool {
for _, url := range g.serverPicker.ServerURLs.Load().([]string) {
if url == rc.connectionURL {
return true
}
}
return false
}
func (g *Guard) notifyReconnected() {
select {
case g.OnReconnected <- struct{}{}:
default:
}
}
func (g *Guard) exponentTicker(ctx context.Context) *backoff.Ticker {
bo := backoff.WithContext(&backoff.ExponentialBackOff{
InitialInterval: 2 * time.Second,
Multiplier: 2,
MaxInterval: g.maxBackoffInterval,
Clock: backoff.SystemClock,
}, ctx)
return backoff.NewTicker(bo)
}
func waiteBeforeRetry(ctx context.Context) bool {
timer := time.NewTimer(1500 * time.Millisecond)
defer timer.Stop()
select {
case <-timer.C:
return true
case <-ctx.Done():
return false
}
}