Files
netbird/management/internals/server/controllers.go
Maycon Santos af24fd7796 [management] Add metrics for peer status updates and ephemeral cleanup (#6196)
* [management] Add metrics for peer status updates and ephemeral cleanup

The session-fenced MarkPeerConnected / MarkPeerDisconnected path and
the ephemeral peer cleanup loop both run silently today: when fencing
rejects a stale stream, when a cleanup tick deletes peers, or when a
batch delete fails, we have no operational signal beyond log lines.

Add OpenTelemetry counters and a histogram so the same SLO-style
dashboards that already exist for the network-map controller can cover
peer connect/disconnect and ephemeral cleanup too.

All new attributes are bounded enums: operation in {connect,disconnect}
and outcome in {applied,stale,error,peer_not_found}. No account, peer,
or user ID is ever written as a metric label — total cardinality is
fixed at compile time (8 counter series, 2 histogram series, 4 unlabeled
ephemeral series).

Metric methods are nil-receiver safe so test composition that doesn't
wire telemetry (the bulk of the existing tests) works unchanged. The
ephemeral manager exposes a SetMetrics setter rather than taking the
collector through its constructor, keeping the constructor signature
stable across all test call sites.

* [management] Add OpenTelemetry metrics for ephemeral peer cleanup

Introduce counters for tracking ephemeral peer cleanup, including peers pending deletion, cleanup runs, successful deletions, and failed batches. Metrics are nil-receiver safe to ensure compatibility with test setups without telemetry.
2026-05-18 22:55:19 +02:00

148 lines
5.1 KiB
Go

package server
import (
"context"
log "github.com/sirupsen/logrus"
"github.com/netbirdio/management-integrations/integrations"
"github.com/netbirdio/netbird/management/internals/modules/reverseproxy/proxy"
proxymanager "github.com/netbirdio/netbird/management/internals/modules/reverseproxy/proxy/manager"
"github.com/netbirdio/netbird/management/internals/controllers/network_map"
nmapcontroller "github.com/netbirdio/netbird/management/internals/controllers/network_map/controller"
"github.com/netbirdio/netbird/management/internals/controllers/network_map/update_channel"
"github.com/netbirdio/netbird/management/internals/modules/peers/ephemeral"
"github.com/netbirdio/netbird/management/internals/modules/peers/ephemeral/manager"
"github.com/netbirdio/netbird/management/internals/shared/grpc"
"github.com/netbirdio/netbird/management/server"
"github.com/netbirdio/netbird/management/server/auth"
"github.com/netbirdio/netbird/management/server/integrations/integrated_validator"
"github.com/netbirdio/netbird/management/server/integrations/port_forwarding"
"github.com/netbirdio/netbird/management/server/job"
nbjwt "github.com/netbirdio/netbird/shared/auth/jwt"
)
func (s *BaseServer) PeersUpdateManager() network_map.PeersUpdateManager {
return Create(s, func() network_map.PeersUpdateManager {
return update_channel.NewPeersUpdateManager(s.Metrics())
})
}
func (s *BaseServer) JobManager() *job.Manager {
return Create(s, func() *job.Manager {
return job.NewJobManager(s.Metrics(), s.Store(), s.PeersManager())
})
}
func (s *BaseServer) IntegratedValidator() integrated_validator.IntegratedValidator {
return Create(s, func() integrated_validator.IntegratedValidator {
integratedPeerValidator, err := integrations.NewIntegratedValidator(
context.Background(),
s.PeersManager(),
s.SettingsManager(),
s.EventStore(),
s.CacheStore())
if err != nil {
log.Errorf("failed to create integrated peer validator: %v", err)
}
return integratedPeerValidator
})
}
func (s *BaseServer) ProxyController() port_forwarding.Controller {
return Create(s, func() port_forwarding.Controller {
return integrations.NewController(s.Store())
})
}
func (s *BaseServer) SecretsManager() grpc.SecretsManager {
return Create(s, func() grpc.SecretsManager {
secretsManager, err := grpc.NewTimeBasedAuthSecretsManager(s.PeersUpdateManager(), s.Config.TURNConfig, s.Config.Relay, s.SettingsManager(), s.GroupsManager())
if err != nil {
log.Fatalf("failed to create secrets manager: %v", err)
}
return secretsManager
})
}
func (s *BaseServer) SessionStore() *auth.SessionStore {
return Create(s, func() *auth.SessionStore {
return auth.NewSessionStore(s.CacheStore())
})
}
func (s *BaseServer) AuthManager() auth.Manager {
audiences := s.Config.GetAuthAudiences()
audience := s.Config.HttpConfig.AuthAudience
keysLocation := s.Config.HttpConfig.AuthKeysLocation
signingKeyRefreshEnabled := s.Config.HttpConfig.IdpSignKeyRefreshEnabled
issuer := s.Config.HttpConfig.AuthIssuer
userIDClaim := s.Config.HttpConfig.AuthUserIDClaim
var keyFetcher nbjwt.KeyFetcher
// Use embedded IdP configuration if available
if oauthProvider := s.OAuthConfigProvider(); oauthProvider != nil {
audiences = oauthProvider.GetClientIDs()
if len(audiences) > 0 {
audience = audiences[0] // Use the first client ID as the primary audience
}
keyFetcher = oauthProvider.GetKeyFetcher()
// Fall back to default keys location if direct key fetching is not available
if keyFetcher == nil {
keysLocation = oauthProvider.GetLocalKeysLocation()
}
signingKeyRefreshEnabled = true
issuer = oauthProvider.GetIssuer()
userIDClaim = oauthProvider.GetUserIDClaim()
}
return Create(s, func() auth.Manager {
return auth.NewManager(s.Store(),
issuer,
audience,
keysLocation,
userIDClaim,
audiences,
signingKeyRefreshEnabled,
keyFetcher)
})
}
func (s *BaseServer) EphemeralManager() ephemeral.Manager {
return Create(s, func() ephemeral.Manager {
em := manager.NewEphemeralManager(s.Store(), s.PeersManager())
if metrics := s.Metrics(); metrics != nil {
em.SetMetrics(metrics.EphemeralPeersMetrics())
}
return em
})
}
func (s *BaseServer) NetworkMapController() network_map.Controller {
return Create(s, func() network_map.Controller {
return nmapcontroller.NewController(context.Background(), s.Store(), s.Metrics(), s.PeersUpdateManager(), s.AccountRequestBuffer(), s.IntegratedValidator(), s.SettingsManager(), s.DNSDomain(), s.ProxyController(), s.EphemeralManager(), s.Config)
})
}
func (s *BaseServer) ServiceProxyController() proxy.Controller {
return Create(s, func() proxy.Controller {
controller, err := proxymanager.NewGRPCController(s.ReverseProxyGRPCServer(), s.Metrics().GetMeter())
if err != nil {
log.Fatalf("failed to create service proxy controller: %v", err)
}
return controller
})
}
func (s *BaseServer) AccountRequestBuffer() *server.AccountRequestBuffer {
return Create(s, func() *server.AccountRequestBuffer {
return server.NewAccountRequestBuffer(context.Background(), s.Store())
})
}
func (s *BaseServer) DNSDomain() string {
return s.dnsDomain
}