mirror of
https://github.com/netbirdio/netbird.git
synced 2026-04-18 16:26:38 +00:00
Introduce larger retries for the agent (#379)
The Management client will try reconnecting in case. of network issues or non-permanent errors. If the device was off-boarded, then the client will stop retrying.
This commit is contained in:
@@ -89,14 +89,13 @@ func NewClient(ctx context.Context, addr string, key wgtypes.Key, tlsEnabled boo
|
||||
func defaultBackoff(ctx context.Context) backoff.BackOff {
|
||||
return backoff.WithContext(&backoff.ExponentialBackOff{
|
||||
InitialInterval: 800 * time.Millisecond,
|
||||
RandomizationFactor: backoff.DefaultRandomizationFactor,
|
||||
Multiplier: backoff.DefaultMultiplier,
|
||||
RandomizationFactor: 1,
|
||||
Multiplier: 1.7,
|
||||
MaxInterval: 10 * time.Second,
|
||||
MaxElapsedTime: 12 * time.Hour, //stop after 12 hours of trying, the error will be propagated to the general retry of the client
|
||||
MaxElapsedTime: 3 * 30 * 24 * time.Hour, // 3 months
|
||||
Stop: backoff.Stop,
|
||||
Clock: backoff.SystemClock,
|
||||
}, ctx)
|
||||
|
||||
}
|
||||
|
||||
// Receive Connects to the Signal Exchange message stream and starts receiving messages.
|
||||
@@ -112,8 +111,12 @@ func (c *GrpcClient) Receive(msgHandler func(msg *proto.Message) error) error {
|
||||
c.notifyStreamDisconnected()
|
||||
|
||||
log.Debugf("signal connection state %v", c.signalConn.GetState())
|
||||
if !c.Ready() {
|
||||
return fmt.Errorf("no connection to signal")
|
||||
connState := c.signalConn.GetState()
|
||||
if connState == connectivity.Shutdown {
|
||||
return backoff.Permanent(fmt.Errorf("connection to signal has been shut down"))
|
||||
} else if !(connState == connectivity.Ready || connState == connectivity.Idle) {
|
||||
c.signalConn.WaitForStateChange(c.ctx, connState)
|
||||
return fmt.Errorf("connection to signal is not ready and in %s state", connState)
|
||||
}
|
||||
|
||||
// connect to Signal stream identifying ourselves with a public Wireguard key
|
||||
@@ -131,7 +134,8 @@ func (c *GrpcClient) Receive(msgHandler func(msg *proto.Message) error) error {
|
||||
// start receiving messages from the Signal stream (from other peers through signal)
|
||||
err = c.receive(stream, msgHandler)
|
||||
if err != nil {
|
||||
log.Warnf("disconnected from the Signal Exchange due to an error: %v", err)
|
||||
// we need this reset because after a successful connection and a consequent error, backoff lib doesn't
|
||||
// reset times and next try will start with a long delay
|
||||
backOff.Reset()
|
||||
return err
|
||||
}
|
||||
@@ -141,7 +145,7 @@ func (c *GrpcClient) Receive(msgHandler func(msg *proto.Message) error) error {
|
||||
|
||||
err := backoff.Retry(operation, backOff)
|
||||
if err != nil {
|
||||
log.Errorf("exiting Signal Service connection retry loop due to unrecoverable error: %s", err)
|
||||
log.Errorf("exiting the Signal service connection retry loop due to the unrecoverable error: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -308,13 +312,13 @@ func (c *GrpcClient) receive(stream proto.SignalExchange_ConnectStreamClient,
|
||||
for {
|
||||
msg, err := stream.Recv()
|
||||
if s, ok := status.FromError(err); ok && s.Code() == codes.Canceled {
|
||||
log.Warnf("stream canceled (usually indicates shutdown)")
|
||||
log.Debugf("stream canceled (usually indicates shutdown)")
|
||||
return err
|
||||
} else if s.Code() == codes.Unavailable {
|
||||
log.Warnf("Signal Service is unavailable")
|
||||
log.Debugf("Signal Service is unavailable")
|
||||
return err
|
||||
} else if err == io.EOF {
|
||||
log.Warnf("Signal Service stream closed by server")
|
||||
log.Debugf("Signal Service stream closed by server")
|
||||
return err
|
||||
} else if err != nil {
|
||||
return err
|
||||
|
||||
Reference in New Issue
Block a user