mirror of
https://github.com/netbirdio/netbird.git
synced 2026-04-18 16:26:38 +00:00
[client] Fix engine shutdown deadlock and sync-signal message handling races (#4891)
* Fix engine shutdown deadlock and message handling races - Release syncMsgMux before waiting for shutdownWg to prevent deadlock - Check context inside lock in handleSync and receiveSignalEvents - Prevents nil pointer access when messages arrive during engine stop
This commit is contained in:
@@ -280,7 +280,6 @@ func (e *Engine) Stop() error {
|
||||
return nil
|
||||
}
|
||||
e.syncMsgMux.Lock()
|
||||
defer e.syncMsgMux.Unlock()
|
||||
|
||||
if e.connMgr != nil {
|
||||
e.connMgr.Close()
|
||||
@@ -295,7 +294,7 @@ func (e *Engine) Stop() error {
|
||||
if os.Getenv("NB_REMOVE_BEFORE_DNS") == "true" && os.Getenv("NB_REMOVE_BEFORE_ROUTES") != "true" {
|
||||
log.Info("removing peers before dns")
|
||||
if err := e.removeAllPeers(); err != nil {
|
||||
return fmt.Errorf("failed to remove all peers: %s", err)
|
||||
log.Errorf("failed to remove all peers: %s", err)
|
||||
}
|
||||
}
|
||||
if err := e.stopSSHServer(); err != nil {
|
||||
@@ -319,7 +318,7 @@ func (e *Engine) Stop() error {
|
||||
if os.Getenv("NB_REMOVE_BEFORE_ROUTES") == "true" && os.Getenv("NB_REMOVE_BEFORE_DNS") != "true" {
|
||||
log.Info("removing peers before routes")
|
||||
if err := e.removeAllPeers(); err != nil {
|
||||
return fmt.Errorf("failed to remove all peers: %s", err)
|
||||
log.Errorf("failed to remove all peers: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -338,7 +337,7 @@ func (e *Engine) Stop() error {
|
||||
if os.Getenv("NB_REMOVE_BEFORE_DNS") != "true" && os.Getenv("NB_REMOVE_BEFORE_ROUTES") != "true" {
|
||||
log.Info("removing peers after dns and routes")
|
||||
if err := e.removeAllPeers(); err != nil {
|
||||
return fmt.Errorf("failed to remove all peers: %s", err)
|
||||
log.Errorf("failed to remove all peers: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -353,16 +352,18 @@ func (e *Engine) Stop() error {
|
||||
e.flowManager.Close()
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
|
||||
defer cancel()
|
||||
stateCtx, stateCancel := context.WithTimeout(context.Background(), 3*time.Second)
|
||||
defer stateCancel()
|
||||
|
||||
if err := e.stateManager.Stop(ctx); err != nil {
|
||||
return fmt.Errorf("failed to stop state manager: %w", err)
|
||||
if err := e.stateManager.Stop(stateCtx); err != nil {
|
||||
log.Errorf("failed to stop state manager: %v", err)
|
||||
}
|
||||
if err := e.stateManager.PersistState(context.Background()); err != nil {
|
||||
log.Errorf("failed to persist state: %v", err)
|
||||
}
|
||||
|
||||
e.syncMsgMux.Unlock()
|
||||
|
||||
timeout := e.calculateShutdownTimeout()
|
||||
log.Debugf("waiting for goroutines to finish with timeout: %v", timeout)
|
||||
shutdownCtx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
@@ -448,8 +449,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create rosenpass manager: %w", err)
|
||||
}
|
||||
err := e.rpManager.Run()
|
||||
if err != nil {
|
||||
if err := e.rpManager.Run(); err != nil {
|
||||
return fmt.Errorf("run rosenpass manager: %w", err)
|
||||
}
|
||||
}
|
||||
@@ -501,6 +501,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
|
||||
}
|
||||
|
||||
if err := e.createFirewall(); err != nil {
|
||||
e.close()
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -766,6 +767,11 @@ func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
|
||||
e.syncMsgMux.Lock()
|
||||
defer e.syncMsgMux.Unlock()
|
||||
|
||||
// Check context INSIDE lock to ensure atomicity with shutdown
|
||||
if e.ctx.Err() != nil {
|
||||
return e.ctx.Err()
|
||||
}
|
||||
|
||||
if update.GetNetbirdConfig() != nil {
|
||||
wCfg := update.GetNetbirdConfig()
|
||||
err := e.updateTURNs(wCfg.GetTurns())
|
||||
@@ -1385,6 +1391,11 @@ func (e *Engine) receiveSignalEvents() {
|
||||
e.syncMsgMux.Lock()
|
||||
defer e.syncMsgMux.Unlock()
|
||||
|
||||
// Check context INSIDE lock to ensure atomicity with shutdown
|
||||
if e.ctx.Err() != nil {
|
||||
return e.ctx.Err()
|
||||
}
|
||||
|
||||
conn, ok := e.peerStore.PeerConn(msg.Key)
|
||||
if !ok {
|
||||
return fmt.Errorf("wrongly addressed message %s", msg.Key)
|
||||
|
||||
Reference in New Issue
Block a user