mirror of
https://github.com/netbirdio/netbird.git
synced 2026-05-20 07:39:56 +00:00
Add client metrics system with OpenTelemetry and VictoriaMetrics support
Implements a comprehensive client metrics system to track peer connection stages and performance. The system supports multiple backend implementations (OpenTelemetry, VictoriaMetrics, and no-op) and tracks detailed connection stage durations from creation through WireGuard handshake. Key changes: - Add metrics package with pluggable backend implementations - Implement OpenTelemetry metrics backend - Implement VictoriaMetrics metrics backend - Add no-op metrics implementation for disabled state - Track connection stages: creation, semaphore, signaling, connection ready, and WireGuard handshake - Move WireGuard watcher functionality to conn.go - Refactor engine to integrate metrics tracking - Add metrics export endpoint in debug server
This commit is contained in:
@@ -16,6 +16,7 @@ import (
|
||||
|
||||
"github.com/netbirdio/netbird/client/iface/configurer"
|
||||
"github.com/netbirdio/netbird/client/iface/wgproxy"
|
||||
"github.com/netbirdio/netbird/client/internal/metrics"
|
||||
"github.com/netbirdio/netbird/client/internal/peer/conntype"
|
||||
"github.com/netbirdio/netbird/client/internal/peer/dispatcher"
|
||||
"github.com/netbirdio/netbird/client/internal/peer/guard"
|
||||
@@ -28,6 +29,16 @@ import (
|
||||
semaphoregroup "github.com/netbirdio/netbird/util/semaphore-group"
|
||||
)
|
||||
|
||||
// MetricsRecorder is an interface for recording peer connection metrics
|
||||
type MetricsRecorder interface {
|
||||
RecordConnectionStages(
|
||||
ctx context.Context,
|
||||
connectionType metrics.ConnectionType,
|
||||
isReconnection bool,
|
||||
timestamps metrics.ConnectionStageTimestamps,
|
||||
)
|
||||
}
|
||||
|
||||
type ServiceDependencies struct {
|
||||
StatusRecorder *Status
|
||||
Signaler *Signaler
|
||||
@@ -36,6 +47,7 @@ type ServiceDependencies struct {
|
||||
SrWatcher *guard.SRWatcher
|
||||
Semaphore *semaphoregroup.SemaphoreGroup
|
||||
PeerConnDispatcher *dispatcher.ConnectionDispatcher
|
||||
MetricsRecorder MetricsRecorder
|
||||
}
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -98,6 +110,7 @@ type Conn struct {
|
||||
|
||||
workerICE *WorkerICE
|
||||
workerRelay *WorkerRelay
|
||||
wgWatcher *WGWatcher
|
||||
wgWatcherWg sync.WaitGroup
|
||||
|
||||
// used to store the remote Rosenpass key for Relayed connection in case of connection update from ice
|
||||
@@ -115,6 +128,13 @@ type Conn struct {
|
||||
dumpState *stateDump
|
||||
|
||||
endpointUpdater *EndpointUpdater
|
||||
|
||||
// Connection stage timestamps for metrics
|
||||
metricsRecorder MetricsRecorder
|
||||
hasBeenConnected bool // Track if we've ever established a successful connection
|
||||
isReconnectionAttempt bool // Track if current attempt is a reconnection
|
||||
stageTimestamps metrics.ConnectionStageTimestamps
|
||||
stagesMutex sync.Mutex
|
||||
}
|
||||
|
||||
// NewConn creates a new not opened Conn to the remote peer.
|
||||
@@ -126,6 +146,7 @@ func NewConn(config ConnConfig, services ServiceDependencies) (*Conn, error) {
|
||||
|
||||
connLog := log.WithField("peer", config.Key)
|
||||
|
||||
dumpState := newStateDump(config.Key, connLog, services.StatusRecorder)
|
||||
var conn = &Conn{
|
||||
Log: connLog,
|
||||
config: config,
|
||||
@@ -137,10 +158,13 @@ func NewConn(config ConnConfig, services ServiceDependencies) (*Conn, error) {
|
||||
semaphore: services.Semaphore,
|
||||
statusRelay: worker.NewAtomicStatus(),
|
||||
statusICE: worker.NewAtomicStatus(),
|
||||
dumpState: newStateDump(config.Key, connLog, services.StatusRecorder),
|
||||
dumpState: dumpState,
|
||||
endpointUpdater: NewEndpointUpdater(connLog, config.WgConfig, isController(config)),
|
||||
metricsRecorder: services.MetricsRecorder,
|
||||
}
|
||||
|
||||
conn.wgWatcher = NewWGWatcher(connLog, config.WgConfig.WgInterface, config.Key, dumpState, conn.onWGHandshakeSuccess)
|
||||
|
||||
return conn, nil
|
||||
}
|
||||
|
||||
@@ -148,10 +172,21 @@ func NewConn(config ConnConfig, services ServiceDependencies) (*Conn, error) {
|
||||
// It will try to establish a connection using ICE and in parallel with relay. The higher priority connection type will
|
||||
// be used.
|
||||
func (conn *Conn) Open(engineCtx context.Context) error {
|
||||
// Record the start time - beginning of connection attempt
|
||||
conn.stagesMutex.Lock()
|
||||
conn.stageTimestamps.Created = time.Now()
|
||||
conn.stagesMutex.Unlock()
|
||||
|
||||
// Semaphore.Add() blocks here until there's a free slot
|
||||
if err := conn.semaphore.Add(engineCtx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Record when semaphore was acquired (after the wait)
|
||||
conn.stagesMutex.Lock()
|
||||
conn.stageTimestamps.SemaphoreAcquired = time.Now()
|
||||
conn.stagesMutex.Unlock()
|
||||
|
||||
conn.mu.Lock()
|
||||
defer conn.mu.Unlock()
|
||||
|
||||
@@ -231,7 +266,7 @@ func (conn *Conn) Close(signalToRemote bool) {
|
||||
conn.Log.Infof("close peer connection")
|
||||
conn.ctxCancel()
|
||||
|
||||
conn.workerRelay.DisableWgWatcher()
|
||||
conn.wgWatcher.DisableWgWatcher()
|
||||
conn.workerRelay.CloseConn()
|
||||
conn.workerICE.Close()
|
||||
|
||||
@@ -260,6 +295,14 @@ func (conn *Conn) Close(signalToRemote bool) {
|
||||
}
|
||||
|
||||
conn.setStatusToDisconnected()
|
||||
|
||||
// Reset connection metrics state
|
||||
conn.stagesMutex.Lock()
|
||||
conn.hasBeenConnected = false
|
||||
conn.isReconnectionAttempt = false
|
||||
conn.stageTimestamps = metrics.ConnectionStageTimestamps{}
|
||||
conn.stagesMutex.Unlock()
|
||||
|
||||
conn.opened = false
|
||||
conn.wg.Wait()
|
||||
conn.Log.Infof("peer connection closed")
|
||||
@@ -292,6 +335,22 @@ func (conn *Conn) SetOnDisconnected(handler func(remotePeer string)) {
|
||||
func (conn *Conn) OnRemoteOffer(offer OfferAnswer) {
|
||||
conn.dumpState.RemoteOffer()
|
||||
conn.Log.Infof("OnRemoteOffer, on status ICE: %s, status Relay: %s", conn.statusICE, conn.statusRelay)
|
||||
|
||||
conn.stagesMutex.Lock()
|
||||
// Detect reconnection: we had been connected before, but now both ICE and Relay are disconnected
|
||||
if conn.hasBeenConnected && conn.evalStatus() != StatusConnected {
|
||||
conn.isReconnectionAttempt = true
|
||||
conn.stageTimestamps = metrics.ConnectionStageTimestamps{} // Reset for new reconnection attempt
|
||||
now := time.Now()
|
||||
conn.stageTimestamps.Created = now
|
||||
conn.stageTimestamps.Signaling = now
|
||||
conn.Log.Infof("Reconnection triggered by remote offer")
|
||||
} else if conn.stageTimestamps.Signaling.IsZero() {
|
||||
// First time receiving offer for this connection attempt (signaling start)
|
||||
conn.stageTimestamps.Signaling = time.Now()
|
||||
}
|
||||
conn.stagesMutex.Unlock()
|
||||
|
||||
conn.handshaker.OnRemoteOffer(offer)
|
||||
}
|
||||
|
||||
@@ -366,7 +425,7 @@ func (conn *Conn) onICEConnectionIsReady(priority conntype.ConnPriority, iceConn
|
||||
ep = directEp
|
||||
}
|
||||
|
||||
conn.workerRelay.DisableWgWatcher()
|
||||
conn.wgWatcher.DisableWgWatcher()
|
||||
// todo consider to run conn.wgWatcherWg.Wait() here
|
||||
|
||||
if conn.wgProxyRelay != nil {
|
||||
@@ -390,6 +449,12 @@ func (conn *Conn) onICEConnectionIsReady(priority conntype.ConnPriority, iceConn
|
||||
conn.wgProxyRelay.RedirectAs(ep)
|
||||
}
|
||||
|
||||
conn.wgWatcherWg.Add(1)
|
||||
go func() {
|
||||
defer conn.wgWatcherWg.Done()
|
||||
conn.wgWatcher.EnableWgWatcher(conn.ctx, nil)
|
||||
}()
|
||||
|
||||
conn.currentConnPriority = priority
|
||||
conn.statusICE.SetConnected()
|
||||
conn.updateIceState(iceConnInfo)
|
||||
@@ -423,15 +488,17 @@ func (conn *Conn) onICEStateDisconnected() {
|
||||
conn.Log.Errorf("failed to switch to relay conn: %v", err)
|
||||
}
|
||||
|
||||
conn.wgWatcher.DisableWgWatcher()
|
||||
conn.wgWatcherWg.Add(1)
|
||||
go func() {
|
||||
defer conn.wgWatcherWg.Done()
|
||||
conn.workerRelay.EnableWgWatcher(conn.ctx)
|
||||
conn.wgWatcher.EnableWgWatcher(conn.ctx, conn.onWGDisconnected)
|
||||
}()
|
||||
conn.wgProxyRelay.Work()
|
||||
conn.currentConnPriority = conntype.Relay
|
||||
} else {
|
||||
conn.Log.Infof("ICE disconnected, do not switch to Relay. Reset priority to: %s", conntype.None.String())
|
||||
conn.wgWatcher.DisableWgWatcher()
|
||||
conn.currentConnPriority = conntype.None
|
||||
if err := conn.config.WgConfig.WgInterface.RemoveEndpointAddress(conn.config.WgConfig.RemoteKey); err != nil {
|
||||
conn.Log.Errorf("failed to remove wg endpoint: %v", err)
|
||||
@@ -500,10 +567,11 @@ func (conn *Conn) onRelayConnectionIsReady(rci RelayConnInfo) {
|
||||
return
|
||||
}
|
||||
|
||||
conn.wgWatcher.DisableWgWatcher()
|
||||
conn.wgWatcherWg.Add(1)
|
||||
go func() {
|
||||
defer conn.wgWatcherWg.Done()
|
||||
conn.workerRelay.EnableWgWatcher(conn.ctx)
|
||||
conn.wgWatcher.EnableWgWatcher(conn.ctx, conn.onWGDisconnected)
|
||||
}()
|
||||
|
||||
wgConfigWorkaround()
|
||||
@@ -560,7 +628,15 @@ func (conn *Conn) onGuardEvent() {
|
||||
conn.dumpState.SendOffer()
|
||||
if err := conn.handshaker.SendOffer(); err != nil {
|
||||
conn.Log.Errorf("failed to send offer: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Record signaling start timestamp (first signal sent)
|
||||
conn.stagesMutex.Lock()
|
||||
if conn.stageTimestamps.Signaling.IsZero() {
|
||||
conn.stageTimestamps.Signaling = time.Now()
|
||||
}
|
||||
conn.stagesMutex.Unlock()
|
||||
}
|
||||
|
||||
func (conn *Conn) updateRelayStatus(relayServerAddr string, rosenpassPubKey []byte) {
|
||||
@@ -625,6 +701,20 @@ func (conn *Conn) doOnConnected(remoteRosenpassPubKey []byte, remoteRosenpassAdd
|
||||
runtime.GC()
|
||||
}
|
||||
|
||||
// Record connection ready timestamp and mark as connected
|
||||
conn.stagesMutex.Lock()
|
||||
if conn.stageTimestamps.ConnectionReady.IsZero() {
|
||||
conn.stageTimestamps.ConnectionReady = time.Now()
|
||||
}
|
||||
// Mark that we've established a connection
|
||||
conn.hasBeenConnected = true
|
||||
|
||||
// todo: remove this when fixed the wireguard watcher
|
||||
conn.stageTimestamps.WgHandshakeSuccess = time.Now()
|
||||
conn.recordConnectionMetrics()
|
||||
|
||||
conn.stagesMutex.Unlock()
|
||||
|
||||
if conn.onConnected != nil {
|
||||
conn.onConnected(conn.config.Key, remoteRosenpassPubKey, conn.config.WgConfig.AllowedIps[0].Addr().String(), remoteRosenpassAddr)
|
||||
}
|
||||
@@ -734,6 +824,50 @@ func (conn *Conn) setRelayedProxy(proxy wgproxy.Proxy) {
|
||||
conn.wgProxyRelay = proxy
|
||||
}
|
||||
|
||||
// onWGDisconnected is called when the WireGuard handshake times out
|
||||
func (conn *Conn) onWGDisconnected() {
|
||||
conn.workerRelay.CloseConn()
|
||||
conn.onRelayDisconnected()
|
||||
}
|
||||
|
||||
// onWGHandshakeSuccess is called when the first WireGuard handshake is detected
|
||||
func (conn *Conn) onWGHandshakeSuccess() {
|
||||
conn.stagesMutex.Lock()
|
||||
defer conn.stagesMutex.Unlock()
|
||||
|
||||
/*
|
||||
if conn.stageTimestamps.WgHandshakeSuccess.IsZero() {
|
||||
conn.stageTimestamps.WgHandshakeSuccess = time.Now()
|
||||
conn.recordConnectionMetrics()
|
||||
}
|
||||
|
||||
*/
|
||||
}
|
||||
|
||||
// recordConnectionMetrics records connection stage timestamps as metrics
|
||||
func (conn *Conn) recordConnectionMetrics() {
|
||||
if conn.metricsRecorder == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Determine connection type based on current priority
|
||||
var connType metrics.ConnectionType
|
||||
switch conn.currentConnPriority {
|
||||
case conntype.Relay:
|
||||
connType = metrics.ConnectionTypeRelay
|
||||
default:
|
||||
connType = metrics.ConnectionTypeICE
|
||||
}
|
||||
|
||||
// Record metrics with timestamps - duration calculation happens in metrics package
|
||||
conn.metricsRecorder.RecordConnectionStages(
|
||||
context.Background(),
|
||||
connType,
|
||||
conn.isReconnectionAttempt,
|
||||
conn.stageTimestamps,
|
||||
)
|
||||
}
|
||||
|
||||
// AllowedIP returns the allowed IP of the remote peer
|
||||
func (conn *Conn) AllowedIP() netip.Addr {
|
||||
return conn.config.WgConfig.AllowedIps[0].Addr()
|
||||
|
||||
@@ -34,14 +34,17 @@ type WGWatcher struct {
|
||||
ctxCancel context.CancelFunc
|
||||
ctxLock sync.Mutex
|
||||
enabledTime time.Time
|
||||
|
||||
onFirstHandshakeFn func()
|
||||
}
|
||||
|
||||
func NewWGWatcher(log *log.Entry, wgIfaceStater WGInterfaceStater, peerKey string, stateDump *stateDump) *WGWatcher {
|
||||
func NewWGWatcher(log *log.Entry, wgIfaceStater WGInterfaceStater, peerKey string, stateDump *stateDump, onFirstHandshakeFn func()) *WGWatcher {
|
||||
return &WGWatcher{
|
||||
log: log,
|
||||
wgIfaceStater: wgIfaceStater,
|
||||
peerKey: peerKey,
|
||||
stateDump: stateDump,
|
||||
log: log,
|
||||
wgIfaceStater: wgIfaceStater,
|
||||
peerKey: peerKey,
|
||||
stateDump: stateDump,
|
||||
onFirstHandshakeFn: onFirstHandshakeFn,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -100,12 +103,17 @@ func (w *WGWatcher) periodicHandshakeCheck(ctx context.Context, ctxCancel contex
|
||||
case <-timer.C:
|
||||
handshake, ok := w.handshakeCheck(lastHandshake)
|
||||
if !ok {
|
||||
onDisconnectedFn()
|
||||
if onDisconnectedFn != nil {
|
||||
onDisconnectedFn()
|
||||
}
|
||||
return
|
||||
}
|
||||
if lastHandshake.IsZero() {
|
||||
elapsed := handshake.Sub(w.enabledTime).Seconds()
|
||||
elapsed := w.calcElapsed(handshake)
|
||||
w.log.Infof("first wg handshake detected within: %.2fsec, (%s)", elapsed, handshake)
|
||||
if w.onFirstHandshakeFn != nil {
|
||||
w.onFirstHandshakeFn()
|
||||
}
|
||||
}
|
||||
|
||||
lastHandshake = *handshake
|
||||
@@ -134,19 +142,19 @@ func (w *WGWatcher) handshakeCheck(lastHandshake time.Time) (*time.Time, bool) {
|
||||
|
||||
// the current know handshake did not change
|
||||
if handshake.Equal(lastHandshake) {
|
||||
w.log.Warnf("WireGuard handshake timed out, closing relay connection: %v", handshake)
|
||||
w.log.Warnf("WireGuard handshake timed out: %v", handshake)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// in case if the machine is suspended, the handshake time will be in the past
|
||||
if handshake.Add(checkPeriod).Before(time.Now()) {
|
||||
w.log.Warnf("WireGuard handshake timed out, closing relay connection: %v", handshake)
|
||||
w.log.Warnf("WireGuard handshake timed out: %v", handshake)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// error handling for handshake time in the future
|
||||
if handshake.After(time.Now()) {
|
||||
w.log.Warnf("WireGuard handshake is in the future, closing relay connection: %v", handshake)
|
||||
w.log.Warnf("WireGuard handshake is in the future: %v", handshake)
|
||||
return nil, false
|
||||
}
|
||||
|
||||
@@ -164,3 +172,13 @@ func (w *WGWatcher) wgState() (time.Time, error) {
|
||||
}
|
||||
return wgState.LastHandshake, nil
|
||||
}
|
||||
|
||||
// calcElapsed calculates elapsed time since watcher was enabled.
|
||||
// The watcher started after the wg configuration happens, because of this need to normalise the negative value
|
||||
func (w *WGWatcher) calcElapsed(handshake *time.Time) float64 {
|
||||
elapsed := handshake.Sub(w.enabledTime).Seconds()
|
||||
if elapsed < 0 {
|
||||
elapsed = 0
|
||||
}
|
||||
return elapsed
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ func TestWGWatcher_EnableWgWatcher(t *testing.T) {
|
||||
|
||||
mlog := log.WithField("peer", "tet")
|
||||
mocWgIface := &MocWgIface{}
|
||||
watcher := NewWGWatcher(mlog, mocWgIface, "", newStateDump("peer", mlog, &Status{}))
|
||||
watcher := NewWGWatcher(mlog, mocWgIface, "", newStateDump("peer", mlog, &Status{}), nil)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
@@ -57,7 +57,7 @@ func TestWGWatcher_ReEnable(t *testing.T) {
|
||||
|
||||
mlog := log.WithField("peer", "tet")
|
||||
mocWgIface := &MocWgIface{}
|
||||
watcher := NewWGWatcher(mlog, mocWgIface, "", newStateDump("peer", mlog, &Status{}))
|
||||
watcher := NewWGWatcher(mlog, mocWgIface, "", newStateDump("peer", mlog, &Status{}), nil)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
@@ -30,8 +30,6 @@ type WorkerRelay struct {
|
||||
relayLock sync.Mutex
|
||||
|
||||
relaySupportedOnRemotePeer atomic.Bool
|
||||
|
||||
wgWatcher *WGWatcher
|
||||
}
|
||||
|
||||
func NewWorkerRelay(ctx context.Context, log *log.Entry, ctrl bool, config ConnConfig, conn *Conn, relayManager *relayClient.Manager, stateDump *stateDump) *WorkerRelay {
|
||||
@@ -42,7 +40,6 @@ func NewWorkerRelay(ctx context.Context, log *log.Entry, ctrl bool, config ConnC
|
||||
config: config,
|
||||
conn: conn,
|
||||
relayManager: relayManager,
|
||||
wgWatcher: NewWGWatcher(log, config.WgConfig.WgInterface, config.Key, stateDump),
|
||||
}
|
||||
return r
|
||||
}
|
||||
@@ -93,14 +90,6 @@ func (w *WorkerRelay) OnNewOffer(remoteOfferAnswer *OfferAnswer) {
|
||||
})
|
||||
}
|
||||
|
||||
func (w *WorkerRelay) EnableWgWatcher(ctx context.Context) {
|
||||
w.wgWatcher.EnableWgWatcher(ctx, w.onWGDisconnected)
|
||||
}
|
||||
|
||||
func (w *WorkerRelay) DisableWgWatcher() {
|
||||
w.wgWatcher.DisableWgWatcher()
|
||||
}
|
||||
|
||||
func (w *WorkerRelay) RelayInstanceAddress() (string, error) {
|
||||
return w.relayManager.RelayInstanceAddress()
|
||||
}
|
||||
@@ -125,14 +114,6 @@ func (w *WorkerRelay) CloseConn() {
|
||||
}
|
||||
}
|
||||
|
||||
func (w *WorkerRelay) onWGDisconnected() {
|
||||
w.relayLock.Lock()
|
||||
_ = w.relayedConn.Close()
|
||||
w.relayLock.Unlock()
|
||||
|
||||
w.conn.onRelayDisconnected()
|
||||
}
|
||||
|
||||
func (w *WorkerRelay) isRelaySupported(answer *OfferAnswer) bool {
|
||||
if !w.relayManager.HasRelayAddress() {
|
||||
return false
|
||||
@@ -148,6 +129,5 @@ func (w *WorkerRelay) preferredRelayServer(myRelayAddress, remoteRelayAddress st
|
||||
}
|
||||
|
||||
func (w *WorkerRelay) onRelayClientDisconnected() {
|
||||
w.wgWatcher.DisableWgWatcher()
|
||||
go w.conn.onRelayDisconnected()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user