From c37ebc6fb3c970945aed40f23f3761495670a133 Mon Sep 17 00:00:00 2001
From: Alisdair MacLeod
Date: Thu, 12 Feb 2026 12:36:54 +0000
Subject: [PATCH] add more metrics, improve metrics, reduce metrics impact on other packages

---
Review notes (not part of the commit message):
- Middleware now calls activeRequests.Dec(); Desc() only returns the metric
  descriptor, so the in-flight gauge was never decremented.
- responseInterceptor.WriteHeader now forwards the status code to the wrapped
  writer, and the interceptor starts at 200 to match the implicit status that
  net/http sends when a handler writes without calling WriteHeader.
- RoundTripper no longer dereferences a nil response when the round trip
  returns an error; status and size labels stay empty in that case.
- The commit id and index hashes are those of the original patch and do not
  reflect the changes above.

 proxy/internal/metrics/metrics.go        | 134 +++++++++++++++++++----
 proxy/internal/roundtrip/netbird.go      |  27 ++---
 proxy/internal/roundtrip/netbird_test.go |   2 +-
 proxy/server.go                          |  11 +-
 4 files changed, 132 insertions(+), 42 deletions(-)

diff --git a/proxy/internal/metrics/metrics.go b/proxy/internal/metrics/metrics.go
index 8e6d46054..5334ec149 100644
--- a/proxy/internal/metrics/metrics.go
+++ b/proxy/internal/metrics/metrics.go
@@ -2,55 +2,151 @@ package metrics
 
 import (
 	"net/http"
+	"strconv"
 	"time"
 
+	"github.com/netbirdio/netbird/proxy/internal/proxy"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 )
 
 type Metrics struct {
-	requestsTotal   prometheus.Counter
-	requestDuration prometheus.Histogram
-	activeRequests  prometheus.Counter
-	backendDuration prometheus.Histogram
+	requestsTotal     prometheus.Counter
+	activeRequests    prometheus.Gauge
+	configuredDomains prometheus.Gauge
+	pathsPerDomain    *prometheus.GaugeVec
+	requestDuration   *prometheus.HistogramVec
+	backendDuration   *prometheus.HistogramVec
 }
 
 func New(reg prometheus.Registerer) *Metrics {
+	promFactory := promauto.With(reg)
 	return &Metrics{
-		requestsTotal: promauto.With(reg).NewCounter(prometheus.CounterOpts{
+		requestsTotal: promFactory.NewCounter(prometheus.CounterOpts{
 			Name: "netbird_proxy_requests_total",
 			Help: "Total number of requests made to the netbird proxy",
 		}),
-		requestDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
-			Name:    "netbird_proxy_request_duration_seconds",
-			Help:    "Duration of requests made to the netbird proxy",
-			Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
-		}),
-		activeRequests: promauto.With(reg).NewCounter(prometheus.CounterOpts{
-			Name: "netbird_proxy_active_requests_total",
+		activeRequests: promFactory.NewGauge(prometheus.GaugeOpts{
+			Name: "netbird_proxy_active_requests_count",
 			Help: "Current in-flight requests handled by the netbird proxy",
 		}),
-		backendDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
+		configuredDomains: promFactory.NewGauge(prometheus.GaugeOpts{
+			Name: "netbird_proxy_domains_count",
+			Help: "Current number of domains configured on the netbird proxy",
+		}),
+		pathsPerDomain: promFactory.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Name: "netbird_proxy_paths_count",
+				Help: "Current number of paths configured on the netbird proxy labelled by domain",
+			},
+			[]string{"domain"},
+		),
+		requestDuration: promFactory.NewHistogramVec(
+			prometheus.HistogramOpts{
+				Name:    "netbird_proxy_request_duration_seconds",
+				Help:    "Duration of requests made to the netbird proxy",
+				Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
+			},
+			[]string{"status", "size", "method", "host", "path"},
+		),
+		backendDuration: promFactory.NewHistogramVec(prometheus.HistogramOpts{
 			Name:    "netbird_proxy_backend_duration_seconds",
 			Help:    "Duration of peer round trip time from the netbird proxy",
 			Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
-		}),
+		},
+			[]string{"status", "size", "method", "host", "path"},
+		),
 	}
 }
 
+// responseInterceptor wraps an http.ResponseWriter to capture the status code and bytes written.
+type responseInterceptor struct {
+	http.ResponseWriter
+	status int
+	size   int
+}
+
+// WriteHeader records the status code before forwarding it to the wrapped writer.
+func (w *responseInterceptor) WriteHeader(status int) {
+	w.status = status
+	w.ResponseWriter.WriteHeader(status)
+}
+
+// Write forwards the body to the wrapped writer and adds the bytes written to the running size.
+func (w *responseInterceptor) Write(b []byte) (int, error) {
+	size, err := w.ResponseWriter.Write(b)
+	w.size += size
+	return size, err
+}
+
+// Middleware counts requests and observes their duration, labelled by response and request details.
 func (m *Metrics) Middleware(next http.Handler) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		m.requestsTotal.Inc()
 		m.activeRequests.Inc()
 
+		interceptor := &responseInterceptor{ResponseWriter: w, status: http.StatusOK}
+
 		start := time.Now()
-		next.ServeHTTP(w, r)
+		next.ServeHTTP(interceptor, r)
+		duration := time.Since(start)
 
-		m.activeRequests.Desc()
+		m.activeRequests.Dec()
-		m.requestDuration.Observe(time.Since(start).Seconds())
+		m.requestDuration.With(prometheus.Labels{
+			"status": strconv.Itoa(interceptor.status),
+			"size":   strconv.Itoa(interceptor.size),
+			"method": r.Method,
+			"host":   r.Host,
+			"path":   r.URL.Path,
+		}).Observe(duration.Seconds())
 	})
 }
 
-func (m *Metrics) CompleteRoundTrip(t time.Duration) {
-	m.backendDuration.Observe(t.Seconds())
+type roundTripperFunc func(*http.Request) (*http.Response, error)
+
+func (f roundTripperFunc) RoundTrip(r *http.Request) (*http.Response, error) {
+	return f(r)
+}
+
+// RoundTripper wraps next so that the duration of each backend round trip is observed.
+func (m *Metrics) RoundTripper(next http.RoundTripper) http.RoundTripper {
+	return roundTripperFunc(func(req *http.Request) (*http.Response, error) {
+		start := time.Now()
+		res, err := next.RoundTrip(req)
+		duration := time.Since(start)
+
+		labels := prometheus.Labels{
+			"status": "",
+			"size":   "",
+			"method": req.Method,
+			"host":   req.Host,
+			"path":   req.URL.Path,
+		}
+
+		// res may be nil if the round trip failed; status and size then stay empty.
+		if res != nil {
+			labels["status"] = strconv.Itoa(res.StatusCode)
+			labels["size"] = strconv.Itoa(int(res.ContentLength))
+		}
+
+		m.backendDuration.With(labels).Observe(duration.Seconds())
+
+		return res, err
+	})
+}
+
+// AddMapping increments the configured domain count and sets the path count for the mapping's host.
+func (m *Metrics) AddMapping(mapping proxy.Mapping) {
+	m.configuredDomains.Inc()
+	m.pathsPerDomain.With(prometheus.Labels{
+		"domain": mapping.Host,
+	}).Set(float64(len(mapping.Paths)))
+}
+
+// RemoveMapping decrements the configured domain count and resets the host's path count to zero.
+func (m *Metrics) RemoveMapping(mapping proxy.Mapping) {
+	m.configuredDomains.Dec()
+	m.pathsPerDomain.With(prometheus.Labels{
+		"domain": mapping.Host,
+	}).Set(0)
 }
diff --git a/proxy/internal/roundtrip/netbird.go b/proxy/internal/roundtrip/netbird.go
index fd47af7db..c32e6ee0c 100644
--- a/proxy/internal/roundtrip/netbird.go
+++ b/proxy/internal/roundtrip/netbird.go
@@ -55,8 +55,6 @@ type managementClient interface {
 	CreateProxyPeer(ctx context.Context, req *proto.CreateProxyPeerRequest, opts ...grpc.CallOption) (*proto.CreateProxyPeerResponse, error)
 }
 
-type backendMetricRecorder func(duration time.Duration)
-
 // NetBird provides an http.RoundTripper implementation
 // backed by underlying NetBird connections.
 // Clients are keyed by AccountID, allowing multiple domains to share the same connection.
@@ -72,8 +70,6 @@ type NetBird struct {
 	clients        map[types.AccountID]*clientEntry
 	initLogOnce    sync.Once
 	statusNotifier statusNotifier
-
-	recordBackendDuration backendMetricRecorder
 }
 
 // ClientDebugInfo contains debug information about a client.
@@ -384,10 +380,6 @@ func (n *NetBird) RoundTrip(req *http.Request) (*http.Response, error) {
 	resp, err := transport.RoundTrip(req)
 	duration := time.Since(start)
 
-	if n.recordBackendDuration != nil {
-		n.recordBackendDuration(duration)
-	}
-
 	if err != nil {
 		n.logger.Debugf("roundtrip: method=%s host=%s url=%s account=%s duration=%s err=%v",
 			req.Method, req.Host, req.URL.String(), accountID, duration.Truncate(time.Millisecond), err)
@@ -496,20 +488,19 @@ func (n *NetBird) ListClientsForStartup() map[types.AccountID]*embed.Client {
 // NewNetBird creates a new NetBird transport. Set wgPort to 0 for a random
 // OS-assigned port. A fixed port only works with single-account deployments;
 // multiple accounts will fail to bind the same port.
-func NewNetBird(mgmtAddr, proxyID, proxyAddr string, wgPort int, logger *log.Logger, notifier statusNotifier, mgmtClient managementClient, metric backendMetricRecorder) *NetBird {
+func NewNetBird(mgmtAddr, proxyID, proxyAddr string, wgPort int, logger *log.Logger, notifier statusNotifier, mgmtClient managementClient) *NetBird {
 	if logger == nil {
 		logger = log.StandardLogger()
 	}
 	return &NetBird{
-		mgmtAddr:              mgmtAddr,
-		proxyID:               proxyID,
-		proxyAddr:             proxyAddr,
-		wgPort:                wgPort,
-		logger:                logger,
-		clients:               make(map[types.AccountID]*clientEntry),
-		statusNotifier:        notifier,
-		mgmtClient:            mgmtClient,
-		recordBackendDuration: metric,
+		mgmtAddr:       mgmtAddr,
+		proxyID:        proxyID,
+		proxyAddr:      proxyAddr,
+		wgPort:         wgPort,
+		logger:         logger,
+		clients:        make(map[types.AccountID]*clientEntry),
+		statusNotifier: notifier,
+		mgmtClient:     mgmtClient,
 	}
 }
 
diff --git a/proxy/internal/roundtrip/netbird_test.go b/proxy/internal/roundtrip/netbird_test.go
index fd4c68fe8..fb7e7fa01 100644
--- a/proxy/internal/roundtrip/netbird_test.go
+++ b/proxy/internal/roundtrip/netbird_test.go
@@ -23,7 +23,7 @@ func (m *mockMgmtClient) CreateProxyPeer(_ context.Context, _ *proto.CreateProxy
 // mockNetBird creates a NetBird instance for testing without actually connecting.
 // It uses an invalid management URL to prevent real connections.
 func mockNetBird() *NetBird {
-	return NewNetBird("http://invalid.test:9999", "test-proxy", "invalid.test", 0, nil, nil, &mockMgmtClient{}, nil)
+	return NewNetBird("http://invalid.test:9999", "test-proxy", "invalid.test", 0, nil, nil, &mockMgmtClient{})
 }
 
 func TestNetBird_AddPeer_CreatesClientForNewAccount(t *testing.T) {
diff --git a/proxy/server.go b/proxy/server.go
index e63eb4fff..c5455d0b0 100644
--- a/proxy/server.go
+++ b/proxy/server.go
@@ -61,6 +61,7 @@ type Server struct {
 	debug         *http.Server
 	healthServer  *health.Server
 	healthChecker *health.Checker
+	meter         *metrics.Metrics
 
 	// Mostly used for debugging on management.
 	startTime time.Time
@@ -151,7 +152,7 @@ func (s *Server) ListenAndServe(ctx context.Context, addr string) (err error) {
 
 	// Start up metrics gathering
 	reg := prometheus.NewRegistry()
-	meter := metrics.New(reg)
+	s.meter = metrics.New(reg)
 
 	// The very first thing to do should be to connect to the Management server.
 	// Without this connection, the Proxy cannot do anything.
@@ -199,7 +200,7 @@ func (s *Server) ListenAndServe(ctx context.Context, addr string) (err error) {
 
 	// Initialize the netbird client, this is required to build peer connections
 	// to proxy over.
-	s.netbird = roundtrip.NewNetBird(s.ManagementAddress, s.ID, s.ProxyURL, s.WireguardPort, s.Logger, s, s.mgmtClient, meter.CompleteRoundTrip)
+	s.netbird = roundtrip.NewNetBird(s.ManagementAddress, s.ID, s.ProxyURL, s.WireguardPort, s.Logger, s, s.mgmtClient)
 
 	// When generating ACME certificates, start a challenge server.
 	tlsConfig := &tls.Config{}
@@ -245,7 +246,7 @@ func (s *Server) ListenAndServe(ctx context.Context, addr string) (err error) {
 	}
 
 	// Configure the reverse proxy using NetBird's HTTP Client Transport for proxying.
-	s.proxy = proxy.NewReverseProxy(s.netbird, s.ForwardedProto, s.TrustedProxies, s.Logger)
+	s.proxy = proxy.NewReverseProxy(s.meter.RoundTripper(s.netbird), s.ForwardedProto, s.TrustedProxies, s.Logger)
 
 	// Configure the authentication middleware with session validator for OIDC group checks.
 	s.auth = auth.NewMiddleware(s.Logger, s.mgmtClient)
@@ -292,7 +293,7 @@ func (s *Server) ListenAndServe(ctx context.Context, addr string) (err error) {
 	// Start the reverse proxy HTTPS server.
 	s.https = &http.Server{
 		Addr:      addr,
-		Handler:   meter.Middleware(accessLog.Middleware(web.AssetHandler(s.auth.Protect(s.proxy)))),
+		Handler:   s.meter.Middleware(accessLog.Middleware(web.AssetHandler(s.auth.Protect(s.proxy)))),
 		TLSConfig: tlsConfig,
 	}
 
@@ -570,6 +571,7 @@ func (s *Server) updateMapping(ctx context.Context, mapping *proto.ProxyMapping)
 		return fmt.Errorf("auth setup for domain %s: %w", mapping.GetDomain(), err)
 	}
 	s.proxy.AddMapping(s.protoToMapping(mapping))
+	s.meter.AddMapping(s.protoToMapping(mapping))
 	return nil
 }
 
@@ -588,6 +590,7 @@ func (s *Server) removeMapping(ctx context.Context, mapping *proto.ProxyMapping)
 	}
 	s.auth.RemoveDomain(mapping.GetDomain())
 	s.proxy.RemoveMapping(s.protoToMapping(mapping))
+	s.meter.RemoveMapping(s.protoToMapping(mapping))
 }
 
 func (s *Server) protoToMapping(mapping *proto.ProxyMapping) proxy.Mapping {