add more metrics, improve metrics, reduce metrics impact on other packages

This commit is contained in:
Alisdair MacLeod
2026-02-12 12:36:54 +00:00
parent 23abb5743c
commit c37ebc6fb3
4 changed files with 123 additions and 41 deletions

View File

@@ -2,55 +2,143 @@ package metrics
import (
"net/http"
"strconv"
"time"
"github.com/netbirdio/netbird/proxy/internal/proxy"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
type Metrics struct {
requestsTotal prometheus.Counter
requestDuration prometheus.Histogram
activeRequests prometheus.Counter
backendDuration prometheus.Histogram
requestsTotal prometheus.Counter
activeRequests prometheus.Gauge
configuredDomains prometheus.Gauge
pathsPerDomain *prometheus.GaugeVec
requestDuration *prometheus.HistogramVec
backendDuration *prometheus.HistogramVec
}
func New(reg prometheus.Registerer) *Metrics {
promFactory := promauto.With(reg)
return &Metrics{
requestsTotal: promauto.With(reg).NewCounter(prometheus.CounterOpts{
requestsTotal: promFactory.NewCounter(prometheus.CounterOpts{
Name: "netbird_proxy_requests_total",
Help: "Total number of requests made to the netbird proxy",
}),
requestDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
Name: "netbird_proxy_request_duration_seconds",
Help: "Duration of requests made to the netbird proxy",
Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
}),
activeRequests: promauto.With(reg).NewCounter(prometheus.CounterOpts{
Name: "netbird_proxy_active_requests_total",
activeRequests: promFactory.NewGauge(prometheus.GaugeOpts{
Name: "netbird_proxy_active_requests_count",
Help: "Current in-flight requests handled by the netbird proxy",
}),
backendDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
configuredDomains: promFactory.NewGauge(prometheus.GaugeOpts{
Name: "netbird_proxy_domains_count",
Help: "Current number of domains configured on the netbird proxy",
}),
pathsPerDomain: promFactory.NewGaugeVec(
prometheus.GaugeOpts{
Name: "netbird_proxy_paths_count",
Help: "Current number of paths configured on the netbird proxy labelled by domain",
},
[]string{"domain"},
),
requestDuration: promFactory.NewHistogramVec(
prometheus.HistogramOpts{
Name: "netbird_proxy_request_duration_seconds",
Help: "Duration of requests made to the netbird proxy",
Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
},
[]string{"status", "size", "method", "host", "path"},
),
backendDuration: promFactory.NewHistogramVec(prometheus.HistogramOpts{
Name: "netbird_proxy_backend_duration_seconds",
Help: "Duration of peer round trip time from the netbird proxy",
Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
}),
},
[]string{"status", "size", "method", "host", "path"},
),
}
}
type responseInterceptor struct {
http.ResponseWriter
status int
size int
}
func (w *responseInterceptor) WriteHeader(status int) {
w.status = status
}
func (w *responseInterceptor) Write(b []byte) (int, error) {
size, err := w.ResponseWriter.Write(b)
w.size += size
return size, err
}
func (m *Metrics) Middleware(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
m.requestsTotal.Inc()
m.activeRequests.Inc()
interceptor := &responseInterceptor{ResponseWriter: w}
start := time.Now()
next.ServeHTTP(w, r)
next.ServeHTTP(interceptor, r)
duration := time.Since(start)
m.activeRequests.Desc()
m.requestDuration.Observe(time.Since(start).Seconds())
m.requestDuration.With(prometheus.Labels{
"status": strconv.Itoa(interceptor.status),
"size": strconv.Itoa(interceptor.size),
"method": r.Method,
"host": r.Host,
"path": r.URL.Path,
}).Observe(duration.Seconds())
})
}
func (m *Metrics) CompleteRoundTrip(t time.Duration) {
m.backendDuration.Observe(t.Seconds())
type roundTripperFunc func(*http.Request) (*http.Response, error)
func (f roundTripperFunc) RoundTrip(r *http.Request) (*http.Response, error) {
return f(r)
}
func (m *Metrics) RoundTripper(next http.RoundTripper) http.RoundTripper {
return roundTripperFunc(func(req *http.Request) (*http.Response, error) {
start := time.Now()
res, err := next.RoundTrip(req)
duration := time.Since(start)
labels := prometheus.Labels{
"status": strconv.Itoa(res.StatusCode),
"size": strconv.Itoa(int(res.ContentLength)),
"method": req.Method,
"host": req.Host,
"path": req.URL.Path,
}
// Not all labels will be available if there was an error.
if res != nil {
labels["status"] = strconv.Itoa(res.StatusCode)
labels["size"] = strconv.Itoa(int(res.ContentLength))
}
m.backendDuration.With(labels).Observe(duration.Seconds())
return res, err
})
}
func (m *Metrics) AddMapping(mapping proxy.Mapping) {
m.configuredDomains.Inc()
m.pathsPerDomain.With(prometheus.Labels{
"domain": mapping.Host,
}).Set(float64(len(mapping.Paths)))
}
func (m *Metrics) RemoveMapping(mapping proxy.Mapping) {
m.configuredDomains.Dec()
m.pathsPerDomain.With(prometheus.Labels{
"domain": mapping.Host,
}).Set(0)
}

View File

@@ -55,8 +55,6 @@ type managementClient interface {
CreateProxyPeer(ctx context.Context, req *proto.CreateProxyPeerRequest, opts ...grpc.CallOption) (*proto.CreateProxyPeerResponse, error)
}
type backendMetricRecorder func(duration time.Duration)
// NetBird provides an http.RoundTripper implementation
// backed by underlying NetBird connections.
// Clients are keyed by AccountID, allowing multiple domains to share the same connection.
@@ -72,8 +70,6 @@ type NetBird struct {
clients map[types.AccountID]*clientEntry
initLogOnce sync.Once
statusNotifier statusNotifier
recordBackendDuration backendMetricRecorder
}
// ClientDebugInfo contains debug information about a client.
@@ -384,10 +380,6 @@ func (n *NetBird) RoundTrip(req *http.Request) (*http.Response, error) {
resp, err := transport.RoundTrip(req)
duration := time.Since(start)
if n.recordBackendDuration != nil {
n.recordBackendDuration(duration)
}
if err != nil {
n.logger.Debugf("roundtrip: method=%s host=%s url=%s account=%s duration=%s err=%v",
req.Method, req.Host, req.URL.String(), accountID, duration.Truncate(time.Millisecond), err)
@@ -496,20 +488,19 @@ func (n *NetBird) ListClientsForStartup() map[types.AccountID]*embed.Client {
// NewNetBird creates a new NetBird transport. Set wgPort to 0 for a random
// OS-assigned port. A fixed port only works with single-account deployments;
// multiple accounts will fail to bind the same port.
func NewNetBird(mgmtAddr, proxyID, proxyAddr string, wgPort int, logger *log.Logger, notifier statusNotifier, mgmtClient managementClient, metric backendMetricRecorder) *NetBird {
func NewNetBird(mgmtAddr, proxyID, proxyAddr string, wgPort int, logger *log.Logger, notifier statusNotifier, mgmtClient managementClient) *NetBird {
if logger == nil {
logger = log.StandardLogger()
}
return &NetBird{
mgmtAddr: mgmtAddr,
proxyID: proxyID,
proxyAddr: proxyAddr,
wgPort: wgPort,
logger: logger,
clients: make(map[types.AccountID]*clientEntry),
statusNotifier: notifier,
mgmtClient: mgmtClient,
recordBackendDuration: metric,
mgmtAddr: mgmtAddr,
proxyID: proxyID,
proxyAddr: proxyAddr,
wgPort: wgPort,
logger: logger,
clients: make(map[types.AccountID]*clientEntry),
statusNotifier: notifier,
mgmtClient: mgmtClient,
}
}

View File

@@ -23,7 +23,7 @@ func (m *mockMgmtClient) CreateProxyPeer(_ context.Context, _ *proto.CreateProxy
// mockNetBird creates a NetBird instance for testing without actually connecting.
// It uses an invalid management URL to prevent real connections.
func mockNetBird() *NetBird {
return NewNetBird("http://invalid.test:9999", "test-proxy", "invalid.test", 0, nil, nil, &mockMgmtClient{}, nil)
return NewNetBird("http://invalid.test:9999", "test-proxy", "invalid.test", 0, nil, nil, &mockMgmtClient{})
}
func TestNetBird_AddPeer_CreatesClientForNewAccount(t *testing.T) {

View File

@@ -61,6 +61,7 @@ type Server struct {
debug *http.Server
healthServer *health.Server
healthChecker *health.Checker
meter *metrics.Metrics
// Mostly used for debugging on management.
startTime time.Time
@@ -151,7 +152,7 @@ func (s *Server) ListenAndServe(ctx context.Context, addr string) (err error) {
// Start up metrics gathering
reg := prometheus.NewRegistry()
meter := metrics.New(reg)
s.meter = metrics.New(reg)
// The very first thing to do should be to connect to the Management server.
// Without this connection, the Proxy cannot do anything.
@@ -199,7 +200,7 @@ func (s *Server) ListenAndServe(ctx context.Context, addr string) (err error) {
// Initialize the netbird client, this is required to build peer connections
// to proxy over.
s.netbird = roundtrip.NewNetBird(s.ManagementAddress, s.ID, s.ProxyURL, s.WireguardPort, s.Logger, s, s.mgmtClient, meter.CompleteRoundTrip)
s.netbird = roundtrip.NewNetBird(s.ManagementAddress, s.ID, s.ProxyURL, s.WireguardPort, s.Logger, s, s.mgmtClient)
// When generating ACME certificates, start a challenge server.
tlsConfig := &tls.Config{}
@@ -245,7 +246,7 @@ func (s *Server) ListenAndServe(ctx context.Context, addr string) (err error) {
}
// Configure the reverse proxy using NetBird's HTTP Client Transport for proxying.
s.proxy = proxy.NewReverseProxy(s.netbird, s.ForwardedProto, s.TrustedProxies, s.Logger)
s.proxy = proxy.NewReverseProxy(s.meter.RoundTripper(s.netbird), s.ForwardedProto, s.TrustedProxies, s.Logger)
// Configure the authentication middleware with session validator for OIDC group checks.
s.auth = auth.NewMiddleware(s.Logger, s.mgmtClient)
@@ -292,7 +293,7 @@ func (s *Server) ListenAndServe(ctx context.Context, addr string) (err error) {
// Start the reverse proxy HTTPS server.
s.https = &http.Server{
Addr: addr,
Handler: meter.Middleware(accessLog.Middleware(web.AssetHandler(s.auth.Protect(s.proxy)))),
Handler: s.meter.Middleware(accessLog.Middleware(web.AssetHandler(s.auth.Protect(s.proxy)))),
TLSConfig: tlsConfig,
}
@@ -570,6 +571,7 @@ func (s *Server) updateMapping(ctx context.Context, mapping *proto.ProxyMapping)
return fmt.Errorf("auth setup for domain %s: %w", mapping.GetDomain(), err)
}
s.proxy.AddMapping(s.protoToMapping(mapping))
s.meter.AddMapping(s.protoToMapping(mapping))
return nil
}
@@ -588,6 +590,7 @@ func (s *Server) removeMapping(ctx context.Context, mapping *proto.ProxyMapping)
}
s.auth.RemoveDomain(mapping.GetDomain())
s.proxy.RemoveMapping(s.protoToMapping(mapping))
s.meter.RemoveMapping(s.protoToMapping(mapping))
}
func (s *Server) protoToMapping(mapping *proto.ProxyMapping) proxy.Mapping {