[client] Fix controller re-connection (#2758)

Rethink the peer reconnection implementation
This commit is contained in:
Zoltan Papp
2024-10-24 11:43:14 +02:00
committed by GitHub
parent 869537c951
commit 4e918e55ba
29 changed files with 813 additions and 523 deletions

View File

@@ -10,7 +10,6 @@ import (
"sync"
"time"
"github.com/cenkalti/backoff/v4"
"github.com/pion/ice/v3"
log "github.com/sirupsen/logrus"
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
@@ -18,6 +17,8 @@ import (
"github.com/netbirdio/netbird/client/iface"
"github.com/netbirdio/netbird/client/iface/configurer"
"github.com/netbirdio/netbird/client/iface/wgproxy"
"github.com/netbirdio/netbird/client/internal/peer/guard"
icemaker "github.com/netbirdio/netbird/client/internal/peer/ice"
"github.com/netbirdio/netbird/client/internal/stdnet"
relayClient "github.com/netbirdio/netbird/relay/client"
"github.com/netbirdio/netbird/route"
@@ -32,8 +33,6 @@ const (
connPriorityRelay ConnPriority = 1
connPriorityICETurn ConnPriority = 1
connPriorityICEP2P ConnPriority = 2
reconnectMaxElapsedTime = 30 * time.Minute
)
type WgConfig struct {
@@ -63,7 +62,7 @@ type ConnConfig struct {
RosenpassAddr string
// ICEConfig ICE protocol configuration
ICEConfig ICEConfig
ICEConfig icemaker.Config
}
type WorkerCallbacks struct {
@@ -106,16 +105,12 @@ type Conn struct {
wgProxyICE wgproxy.Proxy
wgProxyRelay wgproxy.Proxy
// for reconnection operations
iCEDisconnected chan bool
relayDisconnected chan bool
connMonitor *ConnMonitor
reconnectCh <-chan struct{}
guard *guard.Guard
}
// NewConn creates a new not opened Conn to the remote peer.
// To establish a connection run Conn.Open
func NewConn(engineCtx context.Context, config ConnConfig, statusRecorder *Status, signaler *Signaler, iFaceDiscover stdnet.ExternalIFaceDiscover, relayManager *relayClient.Manager) (*Conn, error) {
func NewConn(engineCtx context.Context, config ConnConfig, statusRecorder *Status, signaler *Signaler, iFaceDiscover stdnet.ExternalIFaceDiscover, relayManager *relayClient.Manager, srWatcher *guard.SRWatcher) (*Conn, error) {
allowedIP, allowedNet, err := net.ParseCIDR(config.WgConfig.AllowedIps)
if err != nil {
log.Errorf("failed to parse allowedIPS: %v", err)
@@ -126,29 +121,19 @@ func NewConn(engineCtx context.Context, config ConnConfig, statusRecorder *Statu
connLog := log.WithField("peer", config.Key)
var conn = &Conn{
log: connLog,
ctx: ctx,
ctxCancel: ctxCancel,
config: config,
statusRecorder: statusRecorder,
signaler: signaler,
relayManager: relayManager,
allowedIP: allowedIP,
allowedNet: allowedNet.String(),
statusRelay: NewAtomicConnStatus(),
statusICE: NewAtomicConnStatus(),
iCEDisconnected: make(chan bool, 1),
relayDisconnected: make(chan bool, 1),
log: connLog,
ctx: ctx,
ctxCancel: ctxCancel,
config: config,
statusRecorder: statusRecorder,
signaler: signaler,
relayManager: relayManager,
allowedIP: allowedIP,
allowedNet: allowedNet.String(),
statusRelay: NewAtomicConnStatus(),
statusICE: NewAtomicConnStatus(),
}
conn.connMonitor, conn.reconnectCh = NewConnMonitor(
signaler,
iFaceDiscover,
config,
conn.relayDisconnected,
conn.iCEDisconnected,
)
rFns := WorkerRelayCallbacks{
OnConnReady: conn.relayConnectionIsReady,
OnDisconnected: conn.onWorkerRelayStateDisconnected,
@@ -159,7 +144,8 @@ func NewConn(engineCtx context.Context, config ConnConfig, statusRecorder *Statu
OnStatusChanged: conn.onWorkerICEStateDisconnected,
}
conn.workerRelay = NewWorkerRelay(connLog, config, relayManager, rFns)
ctrl := isController(config)
conn.workerRelay = NewWorkerRelay(connLog, ctrl, config, relayManager, rFns)
relayIsSupportedLocally := conn.workerRelay.RelayIsSupportedLocally()
conn.workerICE, err = NewWorkerICE(ctx, connLog, config, signaler, iFaceDiscover, statusRecorder, relayIsSupportedLocally, wFns)
@@ -174,6 +160,8 @@ func NewConn(engineCtx context.Context, config ConnConfig, statusRecorder *Statu
conn.handshaker.AddOnNewOfferListener(conn.workerICE.OnNewOffer)
}
conn.guard = guard.NewGuard(connLog, ctrl, conn.isConnectedOnAllWay, config.Timeout, srWatcher)
go conn.handshaker.Listen()
return conn, nil
@@ -184,6 +172,7 @@ func NewConn(engineCtx context.Context, config ConnConfig, statusRecorder *Statu
// be used.
func (conn *Conn) Open() {
conn.log.Debugf("open connection to peer")
conn.mu.Lock()
defer conn.mu.Unlock()
conn.opened = true
@@ -200,24 +189,19 @@ func (conn *Conn) Open() {
conn.log.Warnf("error while updating the state err: %v", err)
}
go conn.startHandshakeAndReconnect()
go conn.startHandshakeAndReconnect(conn.ctx)
}
func (conn *Conn) startHandshakeAndReconnect() {
conn.waitInitialRandomSleepTime()
func (conn *Conn) startHandshakeAndReconnect(ctx context.Context) {
conn.waitInitialRandomSleepTime(ctx)
err := conn.handshaker.sendOffer()
if err != nil {
conn.log.Errorf("failed to send initial offer: %v", err)
}
go conn.connMonitor.Start(conn.ctx)
if conn.workerRelay.IsController() {
conn.reconnectLoopWithRetry()
} else {
conn.reconnectLoopForOnDisconnectedEvent()
}
go conn.guard.Start(ctx)
go conn.listenGuardEvent(ctx)
}
// Close closes this peer Conn issuing a close event to the Conn closeCh
@@ -316,104 +300,6 @@ func (conn *Conn) GetKey() string {
return conn.config.Key
}
// reconnectLoopWithRetry periodically re-sends an offer to the remote peer while
// the connection is not established. The retry cadence is driven by the
// exponential backoff ticker built by prepareExponentTicker; the backoff is
// restarted whenever a signal arrives on conn.reconnectCh. The loop returns when
// conn.ctx is done or when the ticker channel yields a zero time value.
func (conn *Conn) reconnectLoopWithRetry() {
	// Give the peer a chance to establish the initial connection.
	// With it, we can avoid sending unnecessary offers.
	select {
	case <-conn.ctx.Done():
		return
	case <-time.After(3 * time.Second):
	}

	ticker := conn.prepareExponentTicker()
	// The ticker variable is replaced when the backoff is restarted, so the
	// deferred call must read the variable at return time. A plain
	// `defer ticker.Stop()` would only ever stop the first ticker.
	defer func() { ticker.Stop() }()

	// Additional one second grace period. Unlike time.Sleep it can be
	// interrupted by the context cancellation.
	select {
	case <-conn.ctx.Done():
		conn.log.Debugf("context is done, stop reconnect loop")
		return
	case <-time.After(1 * time.Second):
	}

	for {
		select {
		case t := <-ticker.C:
			if t.IsZero() {
				// in case the ticker has been canceled by the context, avoid a busy loop
				return
			}

			if conn.workerRelay.IsRelayConnectionSupportedWithPeer() {
				if conn.statusRelay.Get() == StatusDisconnected || conn.statusICE.Get() == StatusDisconnected {
					conn.log.Tracef("connectivity guard timedout, relay state: %s, ice state: %s", conn.statusRelay, conn.statusICE)
				}
			} else {
				if conn.statusICE.Get() == StatusDisconnected {
					conn.log.Tracef("connectivity guard timedout, ice state: %s", conn.statusICE)
				}
			}

			// check whether the peer connection is established via relay or ICE
			if conn.isConnected() {
				continue
			}

			err := conn.handshaker.sendOffer()
			if err != nil {
				conn.log.Errorf("failed to do handshake: %v", err)
			}

		case <-conn.reconnectCh:
			// restart the backoff from its initial interval
			ticker.Stop()
			ticker = conn.prepareExponentTicker()

		case <-conn.ctx.Done():
			conn.log.Debugf("context is done, stop reconnect loop")
			return
		}
	}
}
// prepareExponentTicker creates an exponential backoff ticker bound to conn.ctx.
// The interval starts at 800ms, doubles on every step and is capped by
// conn.config.Timeout; the total elapsed time is limited by
// reconnectMaxElapsedTime. The first tick is consumed before the ticker is
// returned to the caller.
func (conn *Conn) prepareExponentTicker() *backoff.Ticker {
	expBackOff := &backoff.ExponentialBackOff{
		InitialInterval:     800 * time.Millisecond,
		RandomizationFactor: 0.1,
		Multiplier:          2,
		MaxInterval:         conn.config.Timeout,
		MaxElapsedTime:      reconnectMaxElapsedTime,
		Stop:                backoff.Stop,
		Clock:               backoff.SystemClock,
	}

	ticker := backoff.NewTicker(backoff.WithContext(expBackOff, conn.ctx))

	// drop the initial tick that is emitted right after the ticker has been created
	<-ticker.C

	return ticker
}
// reconnectLoopForOnDisconnectedEvent is used when the peer is not a controller.
// It waits for disconnect notifications from the relay and the ICE side and
// sends a new offer to the remote peer for every notification that reports a
// state change. The ICE and relay notifications are tracked separately: a
// re-established lower priority connection does not mean that we switch to it,
// the higher priority connection is always preferred.
// The loop returns when conn.ctx is done.
func (conn *Conn) reconnectLoopForOnDisconnectedEvent() {
	// resendOffer performs the handshake attempt shared by both notification sources
	resendOffer := func() {
		if err := conn.handshaker.SendOffer(); err != nil {
			conn.log.Errorf("failed to do handshake: %v", err)
		}
	}

	for {
		select {
		case <-conn.ctx.Done():
			conn.log.Debugf("context is done, stop reconnect loop")
			return
		case relayChanged := <-conn.relayDisconnected:
			if relayChanged {
				conn.log.Debugf("Relay state changed, try to send new offer")
				resendOffer()
			}
		case iceChanged := <-conn.iCEDisconnected:
			if iceChanged {
				conn.log.Debugf("ICE state changed, try to send new offer")
				resendOffer()
			}
		}
	}
}
// iCEConnectionIsReady starts proxying traffic from/to local Wireguard and sets connection status to StatusConnected
func (conn *Conn) iCEConnectionIsReady(priority ConnPriority, iceConnInfo ICEConnInfo) {
conn.mu.Lock()
@@ -513,7 +399,7 @@ func (conn *Conn) onWorkerICEStateDisconnected(newState ConnStatus) {
changed := conn.statusICE.Get() != newState && newState != StatusConnecting
conn.statusICE.Set(newState)
conn.notifyReconnectLoopICEDisconnected(changed)
conn.guard.SetICEConnDisconnected(changed)
peerState := State{
PubKey: conn.config.Key,
@@ -604,7 +490,7 @@ func (conn *Conn) onWorkerRelayStateDisconnected() {
changed := conn.statusRelay.Get() != StatusDisconnected
conn.statusRelay.Set(StatusDisconnected)
conn.notifyReconnectLoopRelayDisconnected(changed)
conn.guard.SetRelayedConnDisconnected(changed)
peerState := State{
PubKey: conn.config.Key,
@@ -617,6 +503,20 @@ func (conn *Conn) onWorkerRelayStateDisconnected() {
}
}
// listenGuardEvent sends a new offer to the remote peer every time the guard
// signals on its Reconnect channel. A failed offer is only logged; the loop
// keeps running until ctx is done.
func (conn *Conn) listenGuardEvent(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case <-conn.guard.Reconnect:
			conn.log.Debugf("send offer to peer")
			err := conn.handshaker.SendOffer()
			if err != nil {
				conn.log.Errorf("failed to send offer: %v", err)
			}
		}
	}
}
func (conn *Conn) configureWGEndpoint(addr *net.UDPAddr) error {
return conn.config.WgConfig.WgInterface.UpdatePeer(
conn.config.WgConfig.RemoteKey,
@@ -693,7 +593,7 @@ func (conn *Conn) doOnConnected(remoteRosenpassPubKey []byte, remoteRosenpassAdd
}
}
func (conn *Conn) waitInitialRandomSleepTime() {
func (conn *Conn) waitInitialRandomSleepTime(ctx context.Context) {
minWait := 100
maxWait := 800
duration := time.Duration(rand.Intn(maxWait-minWait)+minWait) * time.Millisecond
@@ -702,7 +602,7 @@ func (conn *Conn) waitInitialRandomSleepTime() {
defer timeout.Stop()
select {
case <-conn.ctx.Done():
case <-ctx.Done():
case <-timeout.C:
}
}
@@ -731,11 +631,17 @@ func (conn *Conn) evalStatus() ConnStatus {
return StatusDisconnected
}
func (conn *Conn) isConnected() bool {
func (conn *Conn) isConnectedOnAllWay() (connected bool) {
conn.mu.Lock()
defer conn.mu.Unlock()
if conn.statusICE.Get() != StatusConnected && conn.statusICE.Get() != StatusConnecting {
defer func() {
if !connected {
conn.logTraceConnState()
}
}()
if conn.statusICE.Get() == StatusDisconnected {
return false
}
@@ -805,20 +711,6 @@ func (conn *Conn) removeWgPeer() error {
return conn.config.WgConfig.WgInterface.RemovePeer(conn.config.WgConfig.RemoteKey)
}
// notifyReconnectLoopRelayDisconnected passes the relay "changed" flag to the
// relayDisconnected channel without blocking. If the channel cannot accept the
// value at this moment the notification is dropped.
func (conn *Conn) notifyReconnectLoopRelayDisconnected(changed bool) {
	select {
	default:
		// the channel is not ready to take the value, skip the notification
	case conn.relayDisconnected <- changed:
	}
}
// notifyReconnectLoopICEDisconnected passes the ICE "changed" flag to the
// iCEDisconnected channel without blocking. If the channel cannot accept the
// value at this moment the notification is dropped.
func (conn *Conn) notifyReconnectLoopICEDisconnected(changed bool) {
	select {
	default:
		// the channel is not ready to take the value, skip the notification
	case conn.iCEDisconnected <- changed:
	}
}
func (conn *Conn) handleConfigurationFailure(err error, wgProxy wgproxy.Proxy) {
conn.log.Warnf("Failed to update wg peer configuration: %v", err)
if wgProxy != nil {
@@ -831,6 +723,18 @@ func (conn *Conn) handleConfigurationFailure(err error, wgProxy wgproxy.Proxy) {
}
}
// logTraceConnState writes the current connection states to the trace log.
// The relay state is included only when the relay worker reports that a relay
// connection is supported with the peer; otherwise only the ICE state is logged.
func (conn *Conn) logTraceConnState() {
	if !conn.workerRelay.IsRelayConnectionSupportedWithPeer() {
		conn.log.Tracef("connectivity guard check, ice state: %s", conn.statusICE)
		return
	}

	conn.log.Tracef("connectivity guard check, relay state: %s, ice state: %s", conn.statusRelay, conn.statusICE)
}
// isController reports whether the local peer is the controller of the
// connection. The decision is a plain string comparison of the two keys: the
// side whose LocalKey is greater than the remote Key is the controller, equal
// keys yield false.
func isController(config ConnConfig) bool {
	remoteKey, localKey := config.Key, config.LocalKey
	return remoteKey < localKey
}
// isRosenpassEnabled reports whether a remote Rosenpass public key is present.
// Only a nil slice counts as absent; an empty but non-nil slice is still
// reported as enabled.
func isRosenpassEnabled(remoteRosenpassPubKey []byte) bool {
	hasKey := remoteRosenpassPubKey != nil
	return hasKey
}