netbird/client/internal/peer/guard/guard.go

package guard

import (
	"context"
	"time"

	"github.com/cenkalti/backoff/v4"
	log "github.com/sirupsen/logrus"
)

// ConnStatus represents the connection state as seen by the guard.
type ConnStatus int

const (
	// ConnStatusDisconnected means neither ICE nor Relay is connected.
	// It is the zero value of ConnStatus.
	ConnStatusDisconnected ConnStatus = iota
	// ConnStatusPartiallyConnected means Relay is connected but ICE is not.
	ConnStatusPartiallyConnected
	// ConnStatusConnected means all required connections are established.
	ConnStatusConnected
)

// connStatusFunc reports the current connection status. The guard calls it on
// every ticker tick to decide whether a reconnection attempt is needed.
type connStatusFunc func() ConnStatus

// Guard is responsible for the reconnection logic.
// It triggers sending an offer to the peer when the peer has connection issues.
// It watches these events:
// - Relay client reconnected to home server
// - Signal server connection state changed
// - ICE connection disconnected
// - Relayed connection disconnected
// - ICE candidate changes
type Guard struct {
	log                     *log.Entry     // logger used for all guard messages
	isConnectedOnAllWay     connStatusFunc // polled on every ticker tick to decide whether to invoke the callback
	timeout                 time.Duration  // MaxInterval of the backoff tickers created by the guard
	srWatcher               *SRWatcher     // supplies the listener channel that resets the reconnection ticker
	relayedConnDisconnected chan struct{}  // signal written by SetRelayedConnDisconnected; NewGuard gives it capacity 1
	iCEConnDisconnected     chan struct{}  // signal written by SetICEConnDisconnected; NewGuard gives it capacity 1
}

// NewGuard builds a Guard from the given logger, connection status function,
// maximum backoff interval and watcher. Both disconnect notification channels
// are created with a buffer of one so that a pending notification never blocks
// the notifier.
func NewGuard(log *log.Entry, isConnectedFn connStatusFunc, timeout time.Duration, srWatcher *SRWatcher) *Guard {
	guard := new(Guard)

	guard.log = log
	guard.isConnectedOnAllWay = isConnectedFn
	guard.timeout = timeout
	guard.srWatcher = srWatcher

	// One slot is enough: the channels carry "something changed" signals only.
	guard.relayedConnDisconnected = make(chan struct{}, 1)
	guard.iCEConnDisconnected = make(chan struct{}, 1)

	return guard
}

// Start logs the configured maximum retry interval and then runs the
// reconnection loop. It blocks until ctx is done. eventCallback is invoked
// whenever the loop decides that a reconnection attempt is needed.
func (g *Guard) Start(ctx context.Context, eventCallback func()) {
	maxInterval := g.timeout
	g.log.Infof("starting guard for reconnection with MaxInterval: %s", maxInterval)

	g.reconnectLoopWithRetry(ctx, eventCallback)
}

// SetRelayedConnDisconnected notifies the reconnection loop that the relayed
// connection went down. The send never blocks: if the channel cannot accept
// the value right now, the notification is dropped.
func (g *Guard) SetRelayedConnDisconnected() {
	notification := struct{}{}

	select {
	case g.relayedConnDisconnected <- notification:
		// notification queued for the reconnection loop
	default:
		// channel not ready (e.g. a notification is already pending); nothing to do
	}
}

// SetICEConnDisconnected notifies the reconnection loop that the ICE
// connection went down. The send never blocks: if the channel cannot accept
// the value right now, the notification is dropped.
func (g *Guard) SetICEConnDisconnected() {
	notification := struct{}{}

	select {
	case g.iCEConnDisconnected <- notification:
		// notification queued for the reconnection loop
	default:
		// channel not ready (e.g. a notification is already pending); nothing to do
	}
}

// reconnectLoopWithRetry periodically checks the connection status and sends offers to re-establish connectivity.
//
// Behavior depends on the connection state reported by isConnectedOnAllWay:
//   - Connected: no action, the peer is fully reachable.
//   - Disconnected (neither ICE nor Relay): retries aggressively with exponential backoff (800ms doubling
//     up to timeout), never gives up. This ensures rapid recovery when the peer has no connectivity at all.
//   - PartiallyConnected (Relay up, ICE not): retries up to 3 times with exponential backoff, then switches
//     to one attempt per hour. This limits signaling traffic when relay already provides connectivity.
//
// External events (relay/ICE disconnect, signal/relay reconnect, candidate changes) reset the retry
// counter and backoff ticker, giving ICE a fresh chance after network conditions change.
//
// The loop returns only when ctx is done. On return the ticker that is active at that moment is
// stopped, the ICE retry state is reset and the srWatcher listener is removed.
func (g *Guard) reconnectLoopWithRetry(ctx context.Context, callback func()) {
	srReconnectedChan := g.srWatcher.NewListener()
	defer g.srWatcher.RemoveListener(srReconnectedChan)

	ticker := g.initialTicker(ctx)
	// Stop through a closure: a plain `defer ticker.Stop()` binds the receiver at
	// defer time and would only ever stop the initial ticker, leaving any ticker
	// created later by resetTicker running after the loop has returned.
	defer func() { ticker.Stop() }()

	tickerChannel := ticker.C

	iceState := &iceRetryState{log: g.log}
	defer iceState.reset()

	// resetTicker replaces the current ticker with a fresh reconnect ticker and
	// clears the ICE retry state, so retries start over after an external event.
	resetTicker := func() {
		ticker.Stop()
		ticker = g.newReconnectTicker(ctx)
		tickerChannel = ticker.C
		iceState.reset()
	}

	for {
		select {
		case _, ok := <-tickerChannel:
			if !ok {
				// A backoff ticker closes its channel once it is stopped or ctx is
				// cancelled. A closed channel is always ready, so without this guard
				// the loop could run the status check (and callback) on a dead ticker.
				// A nil channel blocks forever, leaving only the event and ctx cases.
				tickerChannel = nil
				continue
			}

			switch g.isConnectedOnAllWay() {
			case ConnStatusConnected:
				// all good, nothing to do
			case ConnStatusDisconnected:
				callback()
			case ConnStatusPartiallyConnected:
				if iceState.attempt() {
					callback()
				} else {
					// Retry budget used up: stop the backoff ticker and wait on the
					// channel provided by the ICE retry state instead.
					ticker.Stop()
					tickerChannel = iceState.hourlyC()
				}
			}

		case <-g.relayedConnDisconnected:
			g.log.Debugf("Relay connection changed, reset reconnection ticker")
			resetTicker()

		case <-g.iCEConnDisconnected:
			g.log.Debugf("ICE connection changed, reset reconnection ticker")
			resetTicker()

		case <-srReconnectedChan:
			g.log.Debugf("has network changes, reset reconnection ticker")
			resetTicker()

		case <-ctx.Done():
			g.log.Debugf("context is done, stop reconnect loop")
			return
		}
	}
}

// initialTicker returns the ticker used right after the guard starts. Its first
// interval is 3 seconds, which gives the peer a chance to establish the initial
// connection; intervals then double (with 10% jitter) up to g.timeout. The
// backoff is bound to ctx.
func (g *Guard) initialTicker(ctx context.Context) *backoff.Ticker {
	expBackOff := &backoff.ExponentialBackOff{
		InitialInterval:     3 * time.Second,
		RandomizationFactor: 0.1,
		Multiplier:          2,
		MaxInterval:         g.timeout,
		Stop:                backoff.Stop,
		Clock:               backoff.SystemClock,
	}
	ctxBackOff := backoff.WithContext(expBackOff, ctx)

	return backoff.NewTicker(ctxBackOff)
}

// newReconnectTicker returns the ticker used after a reset of the reconnection
// loop. Intervals start at 800ms and double (with 10% jitter) up to g.timeout;
// the backoff is bound to ctx. The tick that fires immediately on creation is
// drained here, so the caller's first tick arrives after the first interval.
func (g *Guard) newReconnectTicker(ctx context.Context) *backoff.Ticker {
	expBackOff := &backoff.ExponentialBackOff{
		InitialInterval:     800 * time.Millisecond,
		RandomizationFactor: 0.1,
		Multiplier:          2,
		MaxInterval:         g.timeout,
		Stop:                backoff.Stop,
		Clock:               backoff.SystemClock,
	}

	reconnectTicker := backoff.NewTicker(backoff.WithContext(expBackOff, ctx))

	// Discard the tick emitted right after the ticker has been created.
	<-reconnectTicker.C

	return reconnectTicker
}