mirror of
https://github.com/netbirdio/netbird.git
synced 2026-04-24 19:26:39 +00:00
* [client] Suppress ICE signaling and periodic offers in force-relay mode When NB_FORCE_RELAY is enabled, skip WorkerICE creation entirely, suppress ICE credentials in offer/answer messages, disable the periodic ICE candidate monitor, and fix isConnectedOnAllWay to only check relay status so the guard stops sending unnecessary offers. * [client] Dynamically suppress ICE based on remote peer's offer credentials Track whether the remote peer includes ICE credentials in its offers/answers. When remote stops sending ICE credentials, skip ICE listener dispatch, suppress ICE credentials in responses, and exclude ICE from the guard connectivity check. When remote resumes sending ICE credentials, re-enable all ICE behavior. * [client] Fix nil SessionID panic and force ICE teardown on relay-only transition Fix nil pointer dereference in signalOfferAnswer when SessionID is nil (relay-only offers). Close stale ICE agent immediately when remote peer stops sending ICE credentials to avoid traffic black-hole during the ICE disconnect timeout. * [client] Add relay-only fallback check when ICE is unavailable Ensure the relay connection is supported with the peer when ICE is disabled to prevent connectivity issues. * [client] Add tri-state connection status to guard for smarter ICE retry (#5828) * [client] Add tri-state connection status to guard for smarter ICE retry Refactor isConnectedOnAllWay to return a ConnStatus enum (Connected, Disconnected, PartiallyConnected) instead of a boolean. When relay is up but ICE is not (PartiallyConnected), limit ICE offers to 3 retries with exponential backoff then fall back to hourly attempts, reducing unnecessary signaling traffic. Fully disconnected peers continue to retry aggressively. External events (relay/ICE disconnect, signal/relay reconnect) reset retry state to give ICE a fresh chance. 
* [client] Clarify guard ICE retry state and trace log trigger Split iceRetryState.attempt into shouldRetry (pure predicate) and enterHourlyMode (explicit state transition) so the caller in reconnectLoopWithRetry reads top-to-bottom. Restore the original trace-log behavior in isConnectedOnAllWay so it only logs on full disconnection, not on the new PartiallyConnected state. * [client] Extract pure evalConnStatus and add unit tests Split isConnectedOnAllWay into a thin method that snapshots state and a pure evalConnStatus helper that takes a connStatusInputs struct, so the tri-state decision logic can be exercised without constructing full Worker or Handshaker objects. Add table-driven tests covering force-relay, ICE-unavailable and fully-available code paths, plus unit tests for iceRetryState budget/hourly transitions and reset. * [client] Improve grammar in logs and refactor ICE credential checks
170 lines
5.1 KiB
Go
170 lines
5.1 KiB
Go
package guard
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
"github.com/cenkalti/backoff/v4"
|
|
log "github.com/sirupsen/logrus"
|
|
)
|
|
|
|
// ConnStatus represents the connection state as seen by the guard.
type ConnStatus int

const (
	// ConnStatusDisconnected means neither ICE nor Relay is connected.
	ConnStatusDisconnected ConnStatus = iota
	// ConnStatusPartiallyConnected means Relay is connected but ICE is not.
	ConnStatusPartiallyConnected
	// ConnStatusConnected means all required connections are established.
	ConnStatusConnected
)

// connStatusFunc reports the peer's current connection state; the guard
// calls it on every tick to decide whether to send a new offer.
type connStatusFunc func() ConnStatus
|
|
|
|
// Guard is responsible for the reconnection logic.
// It will trigger sending an offer to the peer when the connection has issues.
// Watch these events:
// - Relay client reconnected to home server
// - Signal server connection state changed
// - ICE connection disconnected
// - Relayed connection disconnected
// - ICE candidate changes
type Guard struct {
	log                     *log.Entry
	isConnectedOnAllWay     connStatusFunc // snapshot of the peer's current tri-state connectivity
	timeout                 time.Duration  // MaxInterval for the exponential backoff tickers
	srWatcher               *SRWatcher     // source of signal/relay reconnect and candidate-change events
	relayedConnDisconnected chan struct{}  // 1-buffered edge trigger: relayed connection dropped
	iCEConnDisconnected     chan struct{}  // 1-buffered edge trigger: ICE connection dropped
}
|
|
|
|
func NewGuard(log *log.Entry, isConnectedFn connStatusFunc, timeout time.Duration, srWatcher *SRWatcher) *Guard {
|
|
return &Guard{
|
|
log: log,
|
|
isConnectedOnAllWay: isConnectedFn,
|
|
timeout: timeout,
|
|
srWatcher: srWatcher,
|
|
relayedConnDisconnected: make(chan struct{}, 1),
|
|
iCEConnDisconnected: make(chan struct{}, 1),
|
|
}
|
|
}
|
|
|
|
// Start runs the reconnection loop and blocks until ctx is cancelled.
// eventCallback is invoked whenever the guard decides a new offer should be sent.
func (g *Guard) Start(ctx context.Context, eventCallback func()) {
	g.log.Infof("starting guard for reconnection with MaxInterval: %s", g.timeout)
	g.reconnectLoopWithRetry(ctx, eventCallback)
}
|
|
|
|
// SetRelayedConnDisconnected notifies the guard that the relayed connection
// dropped. Non-blocking: if a notification is already pending it is coalesced.
func (g *Guard) SetRelayedConnDisconnected() {
	select {
	case g.relayedConnDisconnected <- struct{}{}:
	default:
	}
}
|
|
|
|
// SetICEConnDisconnected notifies the guard that the ICE connection dropped.
// Non-blocking: if a notification is already pending it is coalesced.
func (g *Guard) SetICEConnDisconnected() {
	select {
	case g.iCEConnDisconnected <- struct{}{}:
	default:
	}
}
|
|
|
|
// reconnectLoopWithRetry periodically checks the connection status and sends offers to re-establish connectivity.
|
|
//
|
|
// Behavior depends on the connection state reported by isConnectedOnAllWay:
|
|
// - Connected: no action, the peer is fully reachable.
|
|
// - Disconnected (neither ICE nor Relay): retries aggressively with exponential backoff (800ms doubling
|
|
// up to timeout), never gives up. This ensures rapid recovery when the peer has no connectivity at all.
|
|
// - PartiallyConnected (Relay up, ICE not): retries up to 3 times with exponential backoff, then switches
|
|
// to one attempt per hour. This limits signaling traffic when relay already provides connectivity.
|
|
//
|
|
// External events (relay/ICE disconnect, signal/relay reconnect, candidate changes) reset the retry
|
|
// counter and backoff ticker, giving ICE a fresh chance after network conditions change.
|
|
func (g *Guard) reconnectLoopWithRetry(ctx context.Context, callback func()) {
|
|
srReconnectedChan := g.srWatcher.NewListener()
|
|
defer g.srWatcher.RemoveListener(srReconnectedChan)
|
|
|
|
ticker := g.initialTicker(ctx)
|
|
defer ticker.Stop()
|
|
|
|
tickerChannel := ticker.C
|
|
|
|
iceState := &iceRetryState{log: g.log}
|
|
defer iceState.reset()
|
|
|
|
for {
|
|
select {
|
|
case <-tickerChannel:
|
|
switch g.isConnectedOnAllWay() {
|
|
case ConnStatusConnected:
|
|
// all good, nothing to do
|
|
case ConnStatusDisconnected:
|
|
callback()
|
|
case ConnStatusPartiallyConnected:
|
|
if iceState.shouldRetry() {
|
|
callback()
|
|
} else {
|
|
iceState.enterHourlyMode()
|
|
ticker.Stop()
|
|
tickerChannel = iceState.hourlyC()
|
|
}
|
|
}
|
|
|
|
case <-g.relayedConnDisconnected:
|
|
g.log.Debugf("Relay connection changed, reset reconnection ticker")
|
|
ticker.Stop()
|
|
ticker = g.newReconnectTicker(ctx)
|
|
tickerChannel = ticker.C
|
|
iceState.reset()
|
|
|
|
case <-g.iCEConnDisconnected:
|
|
g.log.Debugf("ICE connection changed, reset reconnection ticker")
|
|
ticker.Stop()
|
|
ticker = g.newReconnectTicker(ctx)
|
|
tickerChannel = ticker.C
|
|
iceState.reset()
|
|
|
|
case <-srReconnectedChan:
|
|
g.log.Debugf("has network changes, reset reconnection ticker")
|
|
ticker.Stop()
|
|
ticker = g.newReconnectTicker(ctx)
|
|
tickerChannel = ticker.C
|
|
iceState.reset()
|
|
|
|
case <-ctx.Done():
|
|
g.log.Debugf("context is done, stop reconnect loop")
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// initialTicker give chance to the peer to establish the initial connection.
|
|
func (g *Guard) initialTicker(ctx context.Context) *backoff.Ticker {
|
|
bo := backoff.WithContext(&backoff.ExponentialBackOff{
|
|
InitialInterval: 3 * time.Second,
|
|
RandomizationFactor: 0.1,
|
|
Multiplier: 2,
|
|
MaxInterval: g.timeout,
|
|
Stop: backoff.Stop,
|
|
Clock: backoff.SystemClock,
|
|
}, ctx)
|
|
|
|
return backoff.NewTicker(bo)
|
|
}
|
|
|
|
func (g *Guard) newReconnectTicker(ctx context.Context) *backoff.Ticker {
|
|
bo := backoff.WithContext(&backoff.ExponentialBackOff{
|
|
InitialInterval: 800 * time.Millisecond,
|
|
RandomizationFactor: 0.1,
|
|
Multiplier: 2,
|
|
MaxInterval: g.timeout,
|
|
Stop: backoff.Stop,
|
|
Clock: backoff.SystemClock,
|
|
}, ctx)
|
|
|
|
ticker := backoff.NewTicker(bo)
|
|
<-ticker.C // consume the initial tick what is happening right after the ticker has been created
|
|
|
|
return ticker
|
|
}
|