mirror of
https://github.com/netbirdio/netbird.git
synced 2026-04-19 00:36:38 +00:00
[management] Replace in-memory expose tracker with SQL-backed operations (#5494)
The expose tracker used sync.Map for in-memory TTL tracking of active expose sessions, which broke and lost all sessions on restart. Replace with SQL-backed operations that reuse the existing meta_last_renewed_at column: - Add store methods: RenewEphemeralService, GetExpiredEphemeralServices, CountEphemeralServicesByPeer, EphemeralServiceExists - Move duplicate/limit checks inside a transaction with row-level locking (SELECT ... FOR UPDATE) to prevent concurrent bypass - Reaper re-checks expiry under row lock to avoid deleting a just-renewed service and prevent duplicate event emission - Add composite index on (source, source_peer) for efficient queries - Batch-limit and column-select the reaper query to avoid DB/GC spikes - Filter out malformed rows with empty source_peer
This commit is contained in:
@@ -2,7 +2,7 @@ package manager
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"math/rand/v2"
|
||||
"time"
|
||||
|
||||
"github.com/netbirdio/netbird/shared/management/status"
|
||||
@@ -13,108 +13,20 @@ const (
|
||||
exposeTTL = 90 * time.Second
|
||||
exposeReapInterval = 30 * time.Second
|
||||
maxExposesPerPeer = 10
|
||||
exposeReapBatch = 100
|
||||
)
|
||||
|
||||
type trackedExpose struct {
|
||||
mu sync.Mutex
|
||||
domain string
|
||||
accountID string
|
||||
peerID string
|
||||
lastRenewed time.Time
|
||||
expiring bool
|
||||
type exposeReaper struct {
|
||||
manager *Manager
|
||||
}
|
||||
|
||||
type exposeTracker struct {
|
||||
activeExposes sync.Map
|
||||
exposeCreateMu sync.Mutex
|
||||
manager *Manager
|
||||
}
|
||||
|
||||
func exposeKey(peerID, domain string) string {
|
||||
return peerID + ":" + domain
|
||||
}
|
||||
|
||||
// TrackExposeIfAllowed atomically checks the per-peer limit and registers a new
|
||||
// active expose session under the same lock. Returns (true, false) if the expose
|
||||
// was already tracked (duplicate), (false, true) if tracking succeeded, and
|
||||
// (false, false) if the peer has reached the limit.
|
||||
func (t *exposeTracker) TrackExposeIfAllowed(peerID, domain, accountID string) (alreadyTracked, ok bool) {
|
||||
t.exposeCreateMu.Lock()
|
||||
defer t.exposeCreateMu.Unlock()
|
||||
|
||||
key := exposeKey(peerID, domain)
|
||||
_, loaded := t.activeExposes.LoadOrStore(key, &trackedExpose{
|
||||
domain: domain,
|
||||
accountID: accountID,
|
||||
peerID: peerID,
|
||||
lastRenewed: time.Now(),
|
||||
})
|
||||
if loaded {
|
||||
return true, false
|
||||
}
|
||||
|
||||
if t.CountPeerExposes(peerID) > maxExposesPerPeer {
|
||||
t.activeExposes.Delete(key)
|
||||
return false, false
|
||||
}
|
||||
|
||||
return false, true
|
||||
}
|
||||
|
||||
// UntrackExpose removes an active expose session from tracking.
|
||||
func (t *exposeTracker) UntrackExpose(peerID, domain string) {
|
||||
t.activeExposes.Delete(exposeKey(peerID, domain))
|
||||
}
|
||||
|
||||
// CountPeerExposes returns the number of active expose sessions for a peer.
|
||||
func (t *exposeTracker) CountPeerExposes(peerID string) int {
|
||||
count := 0
|
||||
t.activeExposes.Range(func(_, val any) bool {
|
||||
if expose := val.(*trackedExpose); expose.peerID == peerID {
|
||||
count++
|
||||
}
|
||||
return true
|
||||
})
|
||||
return count
|
||||
}
|
||||
|
||||
// MaxExposesPerPeer returns the maximum number of concurrent exposes allowed per peer.
|
||||
func (t *exposeTracker) MaxExposesPerPeer() int {
|
||||
return maxExposesPerPeer
|
||||
}
|
||||
|
||||
// RenewTrackedExpose updates the in-memory lastRenewed timestamp for a tracked expose.
|
||||
// Returns false if the expose is not tracked or is being reaped.
|
||||
func (t *exposeTracker) RenewTrackedExpose(peerID, domain string) bool {
|
||||
key := exposeKey(peerID, domain)
|
||||
val, ok := t.activeExposes.Load(key)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
expose := val.(*trackedExpose)
|
||||
expose.mu.Lock()
|
||||
if expose.expiring {
|
||||
expose.mu.Unlock()
|
||||
return false
|
||||
}
|
||||
expose.lastRenewed = time.Now()
|
||||
expose.mu.Unlock()
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// StopTrackedExpose removes an active expose session from tracking.
|
||||
// Returns false if the expose was not tracked.
|
||||
func (t *exposeTracker) StopTrackedExpose(peerID, domain string) bool {
|
||||
key := exposeKey(peerID, domain)
|
||||
_, ok := t.activeExposes.LoadAndDelete(key)
|
||||
return ok
|
||||
}
|
||||
|
||||
// StartExposeReaper starts a background goroutine that reaps expired expose sessions.
|
||||
func (t *exposeTracker) StartExposeReaper(ctx context.Context) {
|
||||
// StartExposeReaper starts a background goroutine that reaps expired ephemeral services from the DB.
|
||||
func (r *exposeReaper) StartExposeReaper(ctx context.Context) {
|
||||
go func() {
|
||||
// start with a random delay
|
||||
rn := rand.IntN(10)
|
||||
time.Sleep(time.Duration(rn) * time.Second)
|
||||
|
||||
ticker := time.NewTicker(exposeReapInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
@@ -123,41 +35,31 @@ func (t *exposeTracker) StartExposeReaper(ctx context.Context) {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
t.reapExpiredExposes()
|
||||
r.reapExpiredExposes(ctx)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (t *exposeTracker) reapExpiredExposes() {
|
||||
t.activeExposes.Range(func(key, val any) bool {
|
||||
expose := val.(*trackedExpose)
|
||||
expose.mu.Lock()
|
||||
expired := time.Since(expose.lastRenewed) > exposeTTL
|
||||
if expired {
|
||||
expose.expiring = true
|
||||
}
|
||||
expose.mu.Unlock()
|
||||
func (r *exposeReaper) reapExpiredExposes(ctx context.Context) {
|
||||
expired, err := r.manager.store.GetExpiredEphemeralServices(ctx, exposeTTL, exposeReapBatch)
|
||||
if err != nil {
|
||||
log.Errorf("failed to get expired ephemeral services: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
if !expired {
|
||||
return true
|
||||
for _, svc := range expired {
|
||||
log.Infof("reaping expired expose session for peer %s, domain %s", svc.SourcePeer, svc.Domain)
|
||||
|
||||
err := r.manager.deleteExpiredPeerService(ctx, svc.AccountID, svc.SourcePeer, svc.ID)
|
||||
if err == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
log.Infof("reaping expired expose session for peer %s, domain %s", expose.peerID, expose.domain)
|
||||
|
||||
err := t.manager.deleteServiceFromPeer(context.Background(), expose.accountID, expose.peerID, expose.domain, true)
|
||||
|
||||
s, _ := status.FromError(err)
|
||||
|
||||
switch {
|
||||
case err == nil:
|
||||
t.activeExposes.Delete(key)
|
||||
case s.ErrorType == status.NotFound:
|
||||
log.Debugf("service %s was already deleted", expose.domain)
|
||||
default:
|
||||
log.Errorf("failed to delete expired peer-exposed service for domain %s: %v", expose.domain, err)
|
||||
if s, ok := status.FromError(err); ok && s.ErrorType == status.NotFound {
|
||||
log.Debugf("service %s was already deleted by another instance", svc.Domain)
|
||||
} else {
|
||||
log.Errorf("failed to delete expired peer-exposed service for domain %s: %v", svc.Domain, err)
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user