mirror of
https://github.com/netbirdio/netbird.git
synced 2026-04-16 07:16:38 +00:00
[client] Fix controller re-connection (#2758)
Rethink the peer reconnection implementation
This commit is contained in:
194
client/internal/peer/guard/guard.go
Normal file
194
client/internal/peer/guard/guard.go
Normal file
@@ -0,0 +1,194 @@
|
||||
package guard
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/cenkalti/backoff/v4"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
const (
	// reconnectMaxElapsedTime bounds the exponential-backoff retry window:
	// after this much total elapsed time the backoff ticker stops firing.
	reconnectMaxElapsedTime = 30 * time.Minute
)

// isConnectedFunc reports whether the peer connection is up on every
// supported path (used by the controller-side reconnect loop).
type isConnectedFunc func() bool
|
||||
|
||||
// Guard is responsible for the reconnection logic.
// It triggers sending an offer to the peer when the connection has issues.
// It watches these events:
// - Relay client reconnected to home server
// - Signal server connection state changed
// - ICE connection disconnected
// - Relayed connection disconnected
// - ICE candidate changes
type Guard struct {
	// Reconnect is signaled (non-blocking, capacity 1) when a new offer
	// should be sent to the peer.
	Reconnect chan struct{}
	log       *log.Entry
	// isController selects the strategy in Start: periodic retry loop
	// (controller) vs. purely event-driven offers (non-controller).
	isController        bool
	isConnectedOnAllWay isConnectedFunc
	// timeout caps the interval between backoff retry attempts.
	timeout   time.Duration
	srWatcher *SRWatcher
	// Buffered (capacity 1) event channels; senders drop the event when
	// one is already pending (see the Set* methods).
	relayedConnDisconnected chan bool
	iCEConnDisconnected     chan bool
}
|
||||
|
||||
func NewGuard(log *log.Entry, isController bool, isConnectedFn isConnectedFunc, timeout time.Duration, srWatcher *SRWatcher) *Guard {
|
||||
return &Guard{
|
||||
Reconnect: make(chan struct{}, 1),
|
||||
log: log,
|
||||
isController: isController,
|
||||
isConnectedOnAllWay: isConnectedFn,
|
||||
timeout: timeout,
|
||||
srWatcher: srWatcher,
|
||||
relayedConnDisconnected: make(chan bool, 1),
|
||||
iCEConnDisconnected: make(chan bool, 1),
|
||||
}
|
||||
}
|
||||
|
||||
func (g *Guard) Start(ctx context.Context) {
|
||||
if g.isController {
|
||||
g.reconnectLoopWithRetry(ctx)
|
||||
} else {
|
||||
g.listenForDisconnectEvents(ctx)
|
||||
}
|
||||
}
|
||||
|
||||
// SetRelayedConnDisconnected reports a relayed-connection state change to
// the guard. The send is non-blocking: if an event is already queued, the
// new one is dropped (one pending event is enough to wake the loop).
func (g *Guard) SetRelayedConnDisconnected(changed bool) {
	select {
	case g.relayedConnDisconnected <- changed:
	default:
	}
}
|
||||
|
||||
// SetICEConnDisconnected reports an ICE-connection state change to the
// guard. The send is non-blocking: if an event is already queued, the new
// one is dropped (one pending event is enough to wake the loop).
func (g *Guard) SetICEConnDisconnected(changed bool) {
	select {
	case g.iCEConnDisconnected <- changed:
	default:
	}
}
|
||||
|
||||
// reconnectLoopWithRetry periodically check (max 30 min) the connection status.
|
||||
// Try to send offer while the P2P is not established or while the Relay is not connected if is it supported
|
||||
func (g *Guard) reconnectLoopWithRetry(ctx context.Context) {
|
||||
waitForInitialConnectionTry(ctx)
|
||||
|
||||
srReconnectedChan := g.srWatcher.NewListener()
|
||||
defer g.srWatcher.RemoveListener(srReconnectedChan)
|
||||
|
||||
ticker := g.prepareExponentTicker(ctx)
|
||||
defer ticker.Stop()
|
||||
|
||||
tickerChannel := ticker.C
|
||||
|
||||
g.log.Infof("start reconnect loop...")
|
||||
for {
|
||||
select {
|
||||
case t := <-tickerChannel:
|
||||
if t.IsZero() {
|
||||
g.log.Infof("retry timed out, stop periodic offer sending")
|
||||
// after backoff timeout the ticker.C will be closed. We need to a dummy channel to avoid loop
|
||||
tickerChannel = make(<-chan time.Time)
|
||||
continue
|
||||
}
|
||||
|
||||
if !g.isConnectedOnAllWay() {
|
||||
g.triggerOfferSending()
|
||||
}
|
||||
|
||||
case changed := <-g.relayedConnDisconnected:
|
||||
if !changed {
|
||||
continue
|
||||
}
|
||||
g.log.Debugf("Relay connection changed, reset reconnection ticker")
|
||||
ticker.Stop()
|
||||
ticker = g.prepareExponentTicker(ctx)
|
||||
tickerChannel = ticker.C
|
||||
|
||||
case changed := <-g.iCEConnDisconnected:
|
||||
if !changed {
|
||||
continue
|
||||
}
|
||||
g.log.Debugf("ICE connection changed, reset reconnection ticker")
|
||||
ticker.Stop()
|
||||
ticker = g.prepareExponentTicker(ctx)
|
||||
tickerChannel = ticker.C
|
||||
|
||||
case <-srReconnectedChan:
|
||||
g.log.Debugf("has network changes, reset reconnection ticker")
|
||||
ticker.Stop()
|
||||
ticker = g.prepareExponentTicker(ctx)
|
||||
tickerChannel = ticker.C
|
||||
|
||||
case <-ctx.Done():
|
||||
g.log.Debugf("context is done, stop reconnect loop")
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// listenForDisconnectEvents is used when the peer is not a controller and it should reconnect to the peer
|
||||
// when the connection is lost. It will try to establish a connection only once time if before the connection was established
|
||||
// It track separately the ice and relay connection status. Just because a lower priority connection reestablished it does not
|
||||
// mean that to switch to it. We always force to use the higher priority connection.
|
||||
func (g *Guard) listenForDisconnectEvents(ctx context.Context) {
|
||||
srReconnectedChan := g.srWatcher.NewListener()
|
||||
defer g.srWatcher.RemoveListener(srReconnectedChan)
|
||||
|
||||
g.log.Infof("start listen for reconnect events...")
|
||||
for {
|
||||
select {
|
||||
case changed := <-g.relayedConnDisconnected:
|
||||
if !changed {
|
||||
continue
|
||||
}
|
||||
g.log.Debugf("Relay connection changed, triggering reconnect")
|
||||
g.triggerOfferSending()
|
||||
case changed := <-g.iCEConnDisconnected:
|
||||
if !changed {
|
||||
continue
|
||||
}
|
||||
g.log.Debugf("ICE state changed, try to send new offer")
|
||||
g.triggerOfferSending()
|
||||
case <-srReconnectedChan:
|
||||
g.triggerOfferSending()
|
||||
case <-ctx.Done():
|
||||
g.log.Debugf("context is done, stop reconnect loop")
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (g *Guard) prepareExponentTicker(ctx context.Context) *backoff.Ticker {
|
||||
bo := backoff.WithContext(&backoff.ExponentialBackOff{
|
||||
InitialInterval: 800 * time.Millisecond,
|
||||
RandomizationFactor: 0.1,
|
||||
Multiplier: 2,
|
||||
MaxInterval: g.timeout,
|
||||
MaxElapsedTime: reconnectMaxElapsedTime,
|
||||
Stop: backoff.Stop,
|
||||
Clock: backoff.SystemClock,
|
||||
}, ctx)
|
||||
|
||||
ticker := backoff.NewTicker(bo)
|
||||
<-ticker.C // consume the initial tick what is happening right after the ticker has been created
|
||||
|
||||
return ticker
|
||||
}
|
||||
|
||||
// triggerOfferSending signals the consumer of the Reconnect channel that a
// new offer should be sent. The send is non-blocking: if a signal is
// already pending, this one is dropped.
func (g *Guard) triggerOfferSending() {
	select {
	case g.Reconnect <- struct{}{}:
	default:
	}
}
|
||||
|
||||
// Give chance to the peer to establish the initial connection.
|
||||
// With it, we can decrease to send necessary offer
|
||||
func waitForInitialConnectionTry(ctx context.Context) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-time.After(3 * time.Second):
|
||||
}
|
||||
}
|
||||
135
client/internal/peer/guard/ice_monitor.go
Normal file
135
client/internal/peer/guard/ice_monitor.go
Normal file
@@ -0,0 +1,135 @@
|
||||
package guard
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/pion/ice/v3"
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
icemaker "github.com/netbirdio/netbird/client/internal/peer/ice"
|
||||
"github.com/netbirdio/netbird/client/internal/stdnet"
|
||||
)
|
||||
|
||||
const (
	// candidatesMonitorPeriod is how often local ICE candidates are
	// re-gathered and compared against the previous set.
	candidatesMonitorPeriod = 5 * time.Minute
	// candidateGatheringTimeout bounds a single gathering round.
	candidateGatheringTimeout = 5 * time.Second
)

// ICEMonitor periodically gathers local ICE candidates and reports when
// the candidate set changes (e.g. after a network interface change).
type ICEMonitor struct {
	// ReconnectCh is created in NewICEMonitor but never written to in this
	// file; change notifications are delivered via the callback passed to
	// Start. NOTE(review): possibly dead — confirm against callers.
	ReconnectCh chan struct{}

	iFaceDiscover stdnet.ExternalIFaceDiscover
	iceConfig     icemaker.Config

	// currentCandidates is the last observed candidate set, guarded by
	// candidatesMu.
	currentCandidates []ice.Candidate
	candidatesMu      sync.Mutex
}
|
||||
|
||||
func NewICEMonitor(iFaceDiscover stdnet.ExternalIFaceDiscover, config icemaker.Config) *ICEMonitor {
|
||||
cm := &ICEMonitor{
|
||||
ReconnectCh: make(chan struct{}, 1),
|
||||
iFaceDiscover: iFaceDiscover,
|
||||
iceConfig: config,
|
||||
}
|
||||
return cm
|
||||
}
|
||||
|
||||
func (cm *ICEMonitor) Start(ctx context.Context, onChanged func()) {
|
||||
ufrag, pwd, err := icemaker.GenerateICECredentials()
|
||||
if err != nil {
|
||||
log.Warnf("Failed to generate ICE credentials: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(candidatesMonitorPeriod)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
changed, err := cm.handleCandidateTick(ctx, ufrag, pwd)
|
||||
if err != nil {
|
||||
log.Warnf("Failed to check ICE changes: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if changed {
|
||||
onChanged()
|
||||
}
|
||||
case <-ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleCandidateTick runs one gathering round: it creates a short-lived
// ICE agent, gathers host/server-reflexive candidates (bounded by
// candidateGatheringTimeout), and reports whether the local candidate set
// changed since the previous round. Returns an error if the agent cannot
// be created, the handler cannot be set, gathering fails or times out, or
// the candidates cannot be read.
func (cm *ICEMonitor) handleCandidateTick(ctx context.Context, ufrag string, pwd string) (bool, error) {
	log.Debugf("Gathering ICE candidates")

	agent, err := icemaker.NewAgent(cm.iFaceDiscover, cm.iceConfig, candidateTypesP2P(), ufrag, pwd)
	if err != nil {
		return false, fmt.Errorf("create ICE agent: %w", err)
	}
	defer func() {
		if err := agent.Close(); err != nil {
			log.Warnf("Failed to close ICE agent: %v", err)
		}
	}()

	// The handler must be installed before GatherCandidates; a nil
	// candidate marks the end of gathering.
	gatherDone := make(chan struct{})
	err = agent.OnCandidate(func(c ice.Candidate) {
		log.Tracef("Got candidate: %v", c)
		if c == nil {
			close(gatherDone)
		}
	})
	if err != nil {
		return false, fmt.Errorf("set ICE candidate handler: %w", err)
	}

	if err := agent.GatherCandidates(); err != nil {
		return false, fmt.Errorf("gather ICE candidates: %w", err)
	}

	// Bound the wait for the gathering-complete signal; ctx is shadowed
	// deliberately so the timeout applies only below this point.
	ctx, cancel := context.WithTimeout(ctx, candidateGatheringTimeout)
	defer cancel()

	select {
	case <-ctx.Done():
		return false, fmt.Errorf("wait for gathering timed out")
	case <-gatherDone:
	}

	candidates, err := agent.GetLocalCandidates()
	if err != nil {
		return false, fmt.Errorf("get local candidates: %w", err)
	}
	log.Tracef("Got candidates: %v", candidates)

	return cm.updateCandidates(candidates), nil
}
|
||||
|
||||
func (cm *ICEMonitor) updateCandidates(newCandidates []ice.Candidate) bool {
|
||||
cm.candidatesMu.Lock()
|
||||
defer cm.candidatesMu.Unlock()
|
||||
|
||||
if len(cm.currentCandidates) != len(newCandidates) {
|
||||
cm.currentCandidates = newCandidates
|
||||
return true
|
||||
}
|
||||
|
||||
for i, candidate := range cm.currentCandidates {
|
||||
if candidate.Address() != newCandidates[i].Address() {
|
||||
cm.currentCandidates = newCandidates
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// candidateTypesP2P returns the candidate types gathered by the monitor:
// host and server-reflexive only (no relay candidates).
func candidateTypesP2P() []ice.CandidateType {
	return []ice.CandidateType{ice.CandidateTypeHost, ice.CandidateTypeServerReflexive}
}
|
||||
119
client/internal/peer/guard/sr_watcher.go
Normal file
119
client/internal/peer/guard/sr_watcher.go
Normal file
@@ -0,0 +1,119 @@
|
||||
package guard
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/netbirdio/netbird/client/internal/peer/ice"
|
||||
"github.com/netbirdio/netbird/client/internal/stdnet"
|
||||
)
|
||||
|
||||
// chNotifier is the subset of the Signal client / Relay manager API the
// watcher needs: a reconnect callback hook and a readiness check.
type chNotifier interface {
	SetOnReconnectedListener(func())
	Ready() bool
}

// SRWatcher (Signal/Relay watcher) fans out "connection may have changed"
// notifications — ICE candidate changes, Relay reconnects, and Signal
// reconnects — to all registered listener channels.
type SRWatcher struct {
	signalClient chNotifier
	relayManager chNotifier

	// listeners holds the subscriber channels created by NewListener;
	// guarded by mu.
	listeners     map[chan struct{}]struct{}
	mu            sync.Mutex
	iFaceDiscover stdnet.ExternalIFaceDiscover
	iceConfig     ice.Config

	// cancelIceMonitor is non-nil while the watcher is running; calling it
	// stops the background ICE monitor goroutine.
	cancelIceMonitor context.CancelFunc
}
|
||||
|
||||
// NewSRWatcher creates a new SRWatcher. This watcher will notify the listeners when the ICE candidates change or the
|
||||
// Relay connection is reconnected or the Signal client reconnected.
|
||||
func NewSRWatcher(signalClient chNotifier, relayManager chNotifier, iFaceDiscover stdnet.ExternalIFaceDiscover, iceConfig ice.Config) *SRWatcher {
|
||||
srw := &SRWatcher{
|
||||
signalClient: signalClient,
|
||||
relayManager: relayManager,
|
||||
iFaceDiscover: iFaceDiscover,
|
||||
iceConfig: iceConfig,
|
||||
listeners: make(map[chan struct{}]struct{}),
|
||||
}
|
||||
return srw
|
||||
}
|
||||
|
||||
func (w *SRWatcher) Start() {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
|
||||
if w.cancelIceMonitor != nil {
|
||||
return
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
w.cancelIceMonitor = cancel
|
||||
|
||||
iceMonitor := NewICEMonitor(w.iFaceDiscover, w.iceConfig)
|
||||
go iceMonitor.Start(ctx, w.onICEChanged)
|
||||
w.signalClient.SetOnReconnectedListener(w.onReconnected)
|
||||
w.relayManager.SetOnReconnectedListener(w.onReconnected)
|
||||
|
||||
}
|
||||
|
||||
// Close stops the background ICE monitor and unsubscribes from Signal and
// Relay reconnect notifications. Listener channels are not closed here;
// they are removed individually via RemoveListener.
// NOTE(review): cancelIceMonitor is not reset to nil, so a Start() after
// Close() is a no-op — the watcher appears to be single-use; confirm this
// is intended.
func (w *SRWatcher) Close() {
	w.mu.Lock()
	defer w.mu.Unlock()

	if w.cancelIceMonitor == nil {
		return
	}
	w.cancelIceMonitor()
	w.signalClient.SetOnReconnectedListener(nil)
	w.relayManager.SetOnReconnectedListener(nil)
}
|
||||
|
||||
func (w *SRWatcher) NewListener() chan struct{} {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
|
||||
listenerChan := make(chan struct{}, 1)
|
||||
w.listeners[listenerChan] = struct{}{}
|
||||
return listenerChan
|
||||
}
|
||||
|
||||
func (w *SRWatcher) RemoveListener(listenerChan chan struct{}) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
delete(w.listeners, listenerChan)
|
||||
close(listenerChan)
|
||||
}
|
||||
|
||||
func (w *SRWatcher) onICEChanged() {
|
||||
if !w.signalClient.Ready() {
|
||||
return
|
||||
}
|
||||
|
||||
log.Infof("network changes detected by ICE agent")
|
||||
w.notify()
|
||||
}
|
||||
|
||||
func (w *SRWatcher) onReconnected() {
|
||||
if !w.signalClient.Ready() {
|
||||
return
|
||||
}
|
||||
if !w.relayManager.Ready() {
|
||||
return
|
||||
}
|
||||
|
||||
log.Infof("reconnected to Signal or Relay server")
|
||||
w.notify()
|
||||
}
|
||||
|
||||
// notify delivers a non-blocking signal to every registered listener.
// Listeners that still have an unconsumed pending signal are skipped —
// one buffered signal is enough to wake the consumer.
func (w *SRWatcher) notify() {
	w.mu.Lock()
	defer w.mu.Unlock()
	for listener := range w.listeners {
		select {
		case listener <- struct{}{}:
		default:
		}
	}
}
|
||||
Reference in New Issue
Block a user