From 4a322a436bd0502231cbce1dbe26ba5ed4425018 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Mon, 4 May 2026 00:12:16 +0200 Subject: [PATCH 1/8] chore(docs): add OTLP timeout docs and minor fixes --- .github/CODEOWNERS | 1 + README.md | 2 +- docs/observability.md | 12 ++++++++---- examples/otel-collector-config.yaml | 3 ++- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c5f1403..7d8c330 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1,2 @@ * @oschwartz10612 @miloschwartz +internal/observability/** @marcschaeferger diff --git a/README.md b/README.md index e403d88..6c324c1 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ make ### Binary -Make sure to have Go 1.23.1 installed. +Make sure to have Go 1.26 installed. ```bash make local diff --git a/docs/observability.md b/docs/observability.md index 13cb038..64184d5 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -83,6 +83,7 @@ type OTelConfig struct { Endpoint string // default: "localhost:4317" Insecure bool // default: true ExportInterval time.Duration // default: 60s + Timeout time.Duration // default: 10s } ``` @@ -97,6 +98,7 @@ type OTelConfig struct { | `OTEL_METRICS_ENDPOINT` | `localhost:4317` | OTLP collector address | | `OTEL_METRICS_INSECURE` | `true` | Disable TLS for OTLP | | `OTEL_METRICS_EXPORT_INTERVAL` | `60s` | Push interval (e.g. `10s`, `1m`) | +| `OTEL_METRICS_TIMEOUT` | `10s` | Timeout for OTLP exporter connection setup | | `DEPLOYMENT_ENVIRONMENT` | _(unset)_ | OTel deployment.environment attribute | ### CLI flags @@ -108,7 +110,8 @@ type OTelConfig struct { --otel-metrics-protocol string (default: grpc) --otel-metrics-endpoint string (default: localhost:4317) --otel-metrics-insecure bool (default: true) ---otel-metrics-export-interval duration (default: 1m0s) +--otel-metrics-export-interval duration (default: 60s) +--otel-metrics-timeout duration (default: 10s) ``` --- @@ -164,6 +167,7 @@ export OTEL_METRICS_PROTOCOL=grpc export OTEL_METRICS_ENDPOINT=otel-collector:4317 export OTEL_METRICS_INSECURE=true export OTEL_METRICS_EXPORT_INTERVAL=10s +export OTEL_METRICS_TIMEOUT=10s export DEPLOYMENT_ENVIRONMENT=production ``` @@ -176,6 +180,7 @@ export DEPLOYMENT_ENVIRONMENT=production --otel-metrics-endpoint=otel-collector:4317 \ --otel-metrics-insecure \ --otel-metrics-export-interval=10s \ + --otel-metrics-timeout=10s \ --config=/etc/gerbil/config.json ``` @@ -225,7 +230,6 @@ All metrics use the prefix `gerbil__`. | Metric | Type | Labels | |--------|------|--------| | `gerbil_proxy_mapping_active` | UpDownCounter | `ifname` | -| `gerbil_session_active` | UpDownCounter | `ifname` | | `gerbil_active_sessions` | UpDownCounter | `ifname` | | `gerbil_udp_packets_total` | Counter | `ifname`, `type`, `direction` | | `gerbil_hole_punch_events_total` | Counter | `ifname`, `result` | @@ -256,7 +260,7 @@ The `docker-compose.metrics.yml` provides a complete observability stack. **Prometheus mode:** ```bash -METRICS_BACKEND=prometheus docker-compose -f docker-compose.metrics.yml up -d +METRICS_BACKEND=prometheus docker-compose -f docker compose.metrics.yml up -d # Scrape at http://localhost:3003/metrics # Grafana at http://localhost:3000 (admin/admin) ``` @@ -265,5 +269,5 @@ METRICS_BACKEND=prometheus docker-compose -f docker-compose.metrics.yml up -d ```bash METRICS_BACKEND=otel OTEL_METRICS_ENDPOINT=otel-collector:4317 \ - docker-compose -f docker-compose.metrics.yml up -d + docker compose -f docker-compose.metrics.yml up -d ``` diff --git a/examples/otel-collector-config.yaml b/examples/otel-collector-config.yaml index 5c85356..acfa434 100644 --- a/examples/otel-collector-config.yaml +++ b/examples/otel-collector-config.yaml @@ -1,3 +1,4 @@ +file_format: '1.0' receivers: otlp: protocols: @@ -43,4 +44,4 @@ service: metrics: receivers: [otlp] processors: [batch, resource] - exporters: [prometheus, prometheusremotewrite, debug] \ No newline at end of file + exporters: [prometheus, prometheusremotewrite, debug] From f130a7cdb80ac50f1bf806f8febfa597cc4294db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Mon, 4 May 2026 00:12:22 +0200 Subject: [PATCH 2/8] chore(deps): update OpenTelemetry and related modules --- go.mod | 24 ++++++++++++------------ go.sum | 52 ++++++++++++++++++++++++++-------------------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/go.mod b/go.mod index ace40d4..bd2fe30 100644 --- a/go.mod +++ b/go.mod @@ -6,12 +6,12 @@ require ( github.com/patrickmn/go-cache v2.1.0+incompatible github.com/prometheus/client_golang v1.20.5 github.com/vishvananda/netlink v1.3.1 - go.opentelemetry.io/otel v1.42.0 - go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0 - go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0 - go.opentelemetry.io/otel/metric v1.42.0 - go.opentelemetry.io/otel/sdk v1.42.0 - go.opentelemetry.io/otel/sdk/metric v1.42.0 + go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 golang.org/x/crypto v0.49.0 golang.org/x/sync v0.20.0 golang.zx2c4.com/wireguard/wgctrl v0.0.0-20230429144221-925a1e7659e6 @@ -37,14 +37,14 @@ require ( github.com/prometheus/procfs v0.15.1 // indirect github.com/vishvananda/netns v0.0.5 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/otel/trace v1.42.0 // indirect - go.opentelemetry.io/proto/otlp v1.9.0 // indirect - golang.org/x/net v0.51.0 // indirect + go.opentelemetry.io/otel/trace v1.43.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect + golang.org/x/net v0.52.0 // indirect golang.org/x/sys v0.42.0 // indirect golang.org/x/text v0.35.0 // indirect golang.zx2c4.com/wireguard v0.0.0-20230325221338-052af4a8072b // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 // indirect - google.golang.org/grpc v1.79.3 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/grpc v1.80.0 // indirect google.golang.org/protobuf v1.36.11 // indirect ) diff --git a/go.sum b/go.sum index 20a033f..91c4b4f 100644 --- a/go.sum +++ b/go.sum @@ -55,26 +55,26 @@ github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zd github.com/vishvananda/netns v0.0.5/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= -go.opentelemetry.io/otel v1.42.0 h1:lSQGzTgVR3+sgJDAU/7/ZMjN9Z+vUip7leaqBKy4sho= -go.opentelemetry.io/otel v1.42.0/go.mod h1:lJNsdRMxCUIWuMlVJWzecSMuNjE7dOYyWlqOXWkdqCc= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0 h1:MdKucPl/HbzckWWEisiNqMPhRrAOQX8r4jTuGr636gk= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0/go.mod h1:RolT8tWtfHcjajEH5wFIZ4Dgh5jpPdFXYV9pTAk/qjc= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0 h1:H7O6RlGOMTizyl3R08Kn5pdM06bnH8oscSj7o11tmLA= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0/go.mod h1:mBFWu/WOVDkWWsR7Tx7h6EpQB8wsv7P0Yrh0Pb7othc= -go.opentelemetry.io/otel/metric v1.42.0 h1:2jXG+3oZLNXEPfNmnpxKDeZsFI5o4J+nz6xUlaFdF/4= -go.opentelemetry.io/otel/metric v1.42.0/go.mod h1:RlUN/7vTU7Ao/diDkEpQpnz3/92J9ko05BIwxYa2SSI= -go.opentelemetry.io/otel/sdk v1.42.0 h1:LyC8+jqk6UJwdrI/8VydAq/hvkFKNHZVIWuslJXYsDo= -go.opentelemetry.io/otel/sdk v1.42.0/go.mod h1:rGHCAxd9DAph0joO4W6OPwxjNTYWghRWmkHuGbayMts= -go.opentelemetry.io/otel/sdk/metric v1.42.0 h1:D/1QR46Clz6ajyZ3G8SgNlTJKBdGp84q9RKCAZ3YGuA= -go.opentelemetry.io/otel/sdk/metric v1.42.0/go.mod h1:Ua6AAlDKdZ7tdvaQKfSmnFTdHx37+J4ba8MwVCYM5hc= -go.opentelemetry.io/otel/trace v1.42.0 h1:OUCgIPt+mzOnaUTpOQcBiM/PLQ/Op7oq6g4LenLmOYY= -go.opentelemetry.io/otel/trace v1.42.0/go.mod h1:f3K9S+IFqnumBkKhRJMeaZeNk9epyhnCmQh/EysQCdc= -go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= -go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 h1:8UQVDcZxOJLtX6gxtDt3vY2WTgvZqMQRzjsqiIHQdkc= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0/go.mod h1:2lmweYCiHYpEjQ/lSJBYhj9jP1zvCvQW4BqL9dnT7FQ= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 h1:w1K+pCJoPpQifuVpsKamUdn9U0zM3xUziVOqsGksUrY= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0/go.mod h1:HBy4BjzgVE8139ieRI75oXm3EcDN+6GhD88JT1Kjvxg= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= -golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo= -golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -87,14 +87,14 @@ golang.zx2c4.com/wireguard v0.0.0-20230325221338-052af4a8072b h1:J1CaxgLerRR5lgx golang.zx2c4.com/wireguard v0.0.0-20230325221338-052af4a8072b/go.mod h1:tqur9LnfstdR9ep2LaJT4lFUl0EjlHtge+gAjmsHUG4= golang.zx2c4.com/wireguard/wgctrl v0.0.0-20230429144221-925a1e7659e6 h1:CawjfCvYQH2OU3/TnxLx97WDSUDRABfT18pCOYwc2GE= golang.zx2c4.com/wireguard/wgctrl v0.0.0-20230429144221-925a1e7659e6/go.mod h1:3rxYc4HtVcSG9gVaTs2GEBdehh+sYPOwKtyUWEOTb80= -gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= -gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= -google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0= -google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= -google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= -google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= From bcb5cc4746b64e1d8969ef92a9076b898a32672c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Mon, 4 May 2026 00:12:23 +0200 Subject: [PATCH 3/8] refactor(metrics): safe initialization and instrument factory --- internal/metrics/metrics.go | 546 +++++++++++++++++++++++++++---- internal/metrics/metrics_test.go | 12 +- 2 files changed, 482 insertions(+), 76 deletions(-) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 4a92b9f..e38aa90 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -7,7 +7,9 @@ package metrics import ( "context" + "fmt" "net/http" + "sync" "github.com/fosrl/gerbil/internal/observability" ) @@ -24,6 +26,7 @@ type OTelConfig = observability.OTelConfig var ( backend observability.Backend + initMu sync.Mutex // Interface and peer metrics wgInterfaceUp observability.Int64Gauge @@ -55,7 +58,6 @@ var ( udpPacketSizeBytes observability.Histogram holePunchEventsTotal observability.Counter proxyMappingActive observability.UpDownCounter - sessionActive observability.UpDownCounter sessionRebuiltTotal observability.Counter commPatternActive observability.UpDownCounter proxyCleanupRemovedTotal observability.Counter @@ -107,6 +109,13 @@ func DefaultConfig() Config { // Initialize sets up the metrics system using the selected backend. // It returns the /metrics HTTP handler (non-nil only for Prometheus backend). func Initialize(cfg Config) (http.Handler, error) { + initMu.Lock() + defer initMu.Unlock() + + if backend != nil { + return backend.HTTPHandler(), nil + } + b, err := observability.New(cfg) if err != nil { return nil, err @@ -114,6 +123,7 @@ func Initialize(cfg Config) (http.Handler, error) { backend = b if err := createInstruments(); err != nil { + backend = nil return nil, err } @@ -122,8 +132,13 @@ func Initialize(cfg Config) (http.Handler, error) { // Shutdown gracefully shuts down the metrics backend. func Shutdown(ctx context.Context) error { - if backend != nil { - return backend.Shutdown(ctx) + initMu.Lock() + b := backend + backend = nil + initMu.Unlock() + + if b != nil { + return b.Shutdown(ctx) } return nil } @@ -135,129 +150,346 @@ func createInstruments() error { b := backend - wgInterfaceUp = b.NewInt64Gauge("gerbil_wg_interface_up", + newCounter := func(name, desc string, labelNames ...string) (observability.Counter, error) { + c, err := b.NewCounter(name, desc, labelNames...) + if err != nil { + return nil, fmt.Errorf("create counter %q: %w", name, err) + } + return c, nil + } + + newUpDownCounter := func(name, desc string, labelNames ...string) (observability.UpDownCounter, error) { + c, err := b.NewUpDownCounter(name, desc, labelNames...) + if err != nil { + return nil, fmt.Errorf("create updown counter %q: %w", name, err) + } + return c, nil + } + + newInt64Gauge := func(name, desc string, labelNames ...string) (observability.Int64Gauge, error) { + g, err := b.NewInt64Gauge(name, desc, labelNames...) + if err != nil { + return nil, fmt.Errorf("create int64 gauge %q: %w", name, err) + } + return g, nil + } + + newFloat64Gauge := func(name, desc string, labelNames ...string) (observability.Float64Gauge, error) { + g, err := b.NewFloat64Gauge(name, desc, labelNames...) + if err != nil { + return nil, fmt.Errorf("create float64 gauge %q: %w", name, err) + } + return g, nil + } + + newHistogram := func(name, desc string, buckets []float64, labelNames ...string) (observability.Histogram, error) { + h, err := b.NewHistogram(name, desc, buckets, labelNames...) + if err != nil { + return nil, fmt.Errorf("create histogram %q: %w", name, err) + } + return h, nil + } + + var err error + + wgInterfaceUp, err = newInt64Gauge("gerbil_wg_interface_up", "Operational state of a WireGuard interface (1=up, 0=down)", "ifname", "instance") - wgPeersTotal = b.NewUpDownCounter("gerbil_wg_peers_total", + if err != nil { + return err + } + wgPeersTotal, err = newUpDownCounter("gerbil_wg_peers_total", "Total number of configured peers per interface", "ifname") - wgPeerConnected = b.NewInt64Gauge("gerbil_wg_peer_connected", + if err != nil { + return err + } + wgPeerConnected, err = newInt64Gauge("gerbil_wg_peer_connected", "Whether a specific peer is connected (1=connected, 0=disconnected)", "ifname", "peer") - allowedIPsCount = b.NewUpDownCounter("gerbil_allowed_ips_count", + if err != nil { + return err + } + allowedIPsCount, err = newUpDownCounter("gerbil_allowed_ips_count", "Number of allowed IPs configured per peer", "ifname", "peer") - keyRotationTotal = b.NewCounter("gerbil_key_rotation_total", + if err != nil { + return err + } + keyRotationTotal, err = newCounter("gerbil_key_rotation_total", "Key rotation events", "ifname", "reason") - wgHandshakesTotal = b.NewCounter("gerbil_wg_handshakes_total", + if err != nil { + return err + } + wgHandshakesTotal, err = newCounter("gerbil_wg_handshakes_total", "Count of handshake attempts with their result status", "ifname", "peer", "result") - wgHandshakeLatency = b.NewHistogram("gerbil_wg_handshake_latency_seconds", + if err != nil { + return err + } + wgHandshakeLatency, err = newHistogram("gerbil_wg_handshake_latency_seconds", "Distribution of handshake latencies in seconds", durationBuckets, "ifname", "peer") - wgPeerRTT = b.NewHistogram("gerbil_wg_peer_rtt_seconds", + if err != nil { + return err + } + wgPeerRTT, err = newHistogram("gerbil_wg_peer_rtt_seconds", "Observed round-trip time to a peer in seconds", durationBuckets, "ifname", "peer") - wgBytesReceived = b.NewCounter("gerbil_wg_bytes_received_total", + if err != nil { + return err + } + wgBytesReceived, err = newCounter("gerbil_wg_bytes_received_total", "Number of bytes received from a peer", "ifname", "peer") - wgBytesTransmitted = b.NewCounter("gerbil_wg_bytes_transmitted_total", + if err != nil { + return err + } + wgBytesTransmitted, err = newCounter("gerbil_wg_bytes_transmitted_total", "Number of bytes transmitted to a peer", "ifname", "peer") - netlinkEventsTotal = b.NewCounter("gerbil_netlink_events_total", + if err != nil { + return err + } + netlinkEventsTotal, err = newCounter("gerbil_netlink_events_total", "Number of netlink events processed", "event_type") - netlinkErrorsTotal = b.NewCounter("gerbil_netlink_errors_total", + if err != nil { + return err + } + netlinkErrorsTotal, err = newCounter("gerbil_netlink_errors_total", "Count of netlink or kernel errors", "component", "error_type") - syncDuration = b.NewHistogram("gerbil_sync_duration_seconds", + if err != nil { + return err + } + syncDuration, err = newHistogram("gerbil_sync_duration_seconds", "Duration of reconciliation/sync loops in seconds", durationBuckets, "component") - workqueueDepth = b.NewUpDownCounter("gerbil_workqueue_depth", + if err != nil { + return err + } + workqueueDepth, err = newUpDownCounter("gerbil_workqueue_depth", "Current length of internal work queues", "queue") - kernelModuleLoads = b.NewCounter("gerbil_kernel_module_loads_total", + if err != nil { + return err + } + kernelModuleLoads, err = newCounter("gerbil_kernel_module_loads_total", "Count of kernel module load attempts", "result") - firewallRulesApplied = b.NewCounter("gerbil_firewall_rules_applied_total", + if err != nil { + return err + } + firewallRulesApplied, err = newCounter("gerbil_firewall_rules_applied_total", "IPTables/NFT rules applied", "result", "chain") - activeSessions = b.NewUpDownCounter("gerbil_active_sessions", + if err != nil { + return err + } + activeSessions, err = newUpDownCounter("gerbil_active_sessions", "Number of active UDP relay sessions", "ifname") - activeProxyConnections = b.NewUpDownCounter("gerbil_active_proxy_connections", + if err != nil { + return err + } + activeProxyConnections, err = newUpDownCounter("gerbil_active_proxy_connections", "Active SNI proxy connections") - proxyRouteLookups = b.NewCounter("gerbil_proxy_route_lookups_total", + if err != nil { + return err + } + proxyRouteLookups, err = newCounter("gerbil_proxy_route_lookups_total", "Number of route lookups", "result") - proxyTLSHandshake = b.NewHistogram("gerbil_proxy_tls_handshake_seconds", + if err != nil { + return err + } + proxyTLSHandshake, err = newHistogram("gerbil_proxy_tls_handshake_seconds", "TLS handshake duration for SNI proxy in seconds", durationBuckets) - proxyBytesTransmitted = b.NewCounter("gerbil_proxy_bytes_transmitted_total", + if err != nil { + return err + } + proxyBytesTransmitted, err = newCounter("gerbil_proxy_bytes_transmitted_total", "Bytes sent/received by the SNI proxy", "direction") - configReloadsTotal = b.NewCounter("gerbil_config_reloads_total", + if err != nil { + return err + } + configReloadsTotal, err = newCounter("gerbil_config_reloads_total", "Number of configuration reloads", "result") - restartTotal = b.NewCounter("gerbil_restart_total", + if err != nil { + return err + } + restartTotal, err = newCounter("gerbil_restart_total", "Process restart count") - authFailuresTotal = b.NewCounter("gerbil_auth_failures_total", + if err != nil { + return err + } + authFailuresTotal, err = newCounter("gerbil_auth_failures_total", "Count of authentication or peer validation failures", "peer", "reason") - aclDeniedTotal = b.NewCounter("gerbil_acl_denied_total", + if err != nil { + return err + } + aclDeniedTotal, err = newCounter("gerbil_acl_denied_total", "Access control denied events", "ifname", "peer", "policy") - certificateExpiryDays = b.NewFloat64Gauge("gerbil_certificate_expiry_days", + if err != nil { + return err + } + certificateExpiryDays, err = newFloat64Gauge("gerbil_certificate_expiry_days", "Days until certificate expiry", "cert_name", "ifname") - udpPacketsTotal = b.NewCounter("gerbil_udp_packets_total", + if err != nil { + return err + } + udpPacketsTotal, err = newCounter("gerbil_udp_packets_total", "Count of UDP packets processed by relay workers", "ifname", "type", "direction") - udpPacketSizeBytes = b.NewHistogram("gerbil_udp_packet_size_bytes", + if err != nil { + return err + } + udpPacketSizeBytes, err = newHistogram("gerbil_udp_packet_size_bytes", "Size distribution of packets forwarded through relay", sizeBuckets, "ifname", "type") - holePunchEventsTotal = b.NewCounter("gerbil_hole_punch_events_total", + if err != nil { + return err + } + holePunchEventsTotal, err = newCounter("gerbil_hole_punch_events_total", "Count of hole punch messages processed", "ifname", "result") - proxyMappingActive = b.NewUpDownCounter("gerbil_proxy_mapping_active", + if err != nil { + return err + } + proxyMappingActive, err = newUpDownCounter("gerbil_proxy_mapping_active", "Number of active proxy mappings", "ifname") - sessionActive = b.NewUpDownCounter("gerbil_session_active", - "Number of active WireGuard sessions", "ifname") - sessionRebuiltTotal = b.NewCounter("gerbil_session_rebuilt_total", + if err != nil { + return err + } + sessionRebuiltTotal, err = newCounter("gerbil_session_rebuilt_total", "Count of sessions rebuilt from communication patterns", "ifname") - commPatternActive = b.NewUpDownCounter("gerbil_comm_pattern_active", + if err != nil { + return err + } + commPatternActive, err = newUpDownCounter("gerbil_comm_pattern_active", "Number of active communication patterns", "ifname") - proxyCleanupRemovedTotal = b.NewCounter("gerbil_proxy_cleanup_removed_total", + if err != nil { + return err + } + proxyCleanupRemovedTotal, err = newCounter("gerbil_proxy_cleanup_removed_total", "Count of items removed during cleanup routines", "ifname", "component") - proxyConnectionErrorsTotal = b.NewCounter("gerbil_proxy_connection_errors_total", + if err != nil { + return err + } + proxyConnectionErrorsTotal, err = newCounter("gerbil_proxy_connection_errors_total", "Count of connection errors in proxy operations", "ifname", "error_type") - proxyInitialMappingsTotal = b.NewInt64Gauge("gerbil_proxy_initial_mappings", + if err != nil { + return err + } + proxyInitialMappingsTotal, err = newInt64Gauge("gerbil_proxy_initial_mappings", "Number of initial proxy mappings loaded", "ifname") - proxyMappingUpdatesTotal = b.NewCounter("gerbil_proxy_mapping_updates_total", + if err != nil { + return err + } + proxyMappingUpdatesTotal, err = newCounter("gerbil_proxy_mapping_updates_total", "Count of proxy mapping updates", "ifname") - proxyIdleCleanupDuration = b.NewHistogram("gerbil_proxy_idle_cleanup_duration_seconds", + if err != nil { + return err + } + proxyIdleCleanupDuration, err = newHistogram("gerbil_proxy_idle_cleanup_duration_seconds", "Duration of cleanup cycles", durationBuckets, "ifname", "component") - sniConnectionsTotal = b.NewCounter("gerbil_sni_connections_total", + if err != nil { + return err + } + sniConnectionsTotal, err = newCounter("gerbil_sni_connections_total", "Count of connections processed by SNI proxy", "result") - sniConnectionDuration = b.NewHistogram("gerbil_sni_connection_duration_seconds", + if err != nil { + return err + } + sniConnectionDuration, err = newHistogram("gerbil_sni_connection_duration_seconds", "Lifetime distribution of proxied TLS connections", sniDurationBuckets) - sniActiveConnections = b.NewUpDownCounter("gerbil_sni_active_connections", + if err != nil { + return err + } + sniActiveConnections, err = newUpDownCounter("gerbil_sni_active_connections", "Number of active SNI tunnels") - sniRouteCacheHitsTotal = b.NewCounter("gerbil_sni_route_cache_hits_total", + if err != nil { + return err + } + sniRouteCacheHitsTotal, err = newCounter("gerbil_sni_route_cache_hits_total", "Count of route cache hits and misses", "result") - sniRouteAPIRequestsTotal = b.NewCounter("gerbil_sni_route_api_requests_total", + if err != nil { + return err + } + sniRouteAPIRequestsTotal, err = newCounter("gerbil_sni_route_api_requests_total", "Count of route API requests", "result") - sniRouteAPILatency = b.NewHistogram("gerbil_sni_route_api_latency_seconds", + if err != nil { + return err + } + sniRouteAPILatency, err = newHistogram("gerbil_sni_route_api_latency_seconds", "Distribution of route API call latencies", durationBuckets) - sniLocalOverrideTotal = b.NewCounter("gerbil_sni_local_override_total", + if err != nil { + return err + } + sniLocalOverrideTotal, err = newCounter("gerbil_sni_local_override_total", "Count of routes using local overrides", "hit") - sniTrustedProxyEventsTotal = b.NewCounter("gerbil_sni_trusted_proxy_events_total", + if err != nil { + return err + } + sniTrustedProxyEventsTotal, err = newCounter("gerbil_sni_trusted_proxy_events_total", "Count of PROXY protocol events", "event") - sniProxyProtocolParseErrorsTotal = b.NewCounter("gerbil_sni_proxy_protocol_parse_errors_total", + if err != nil { + return err + } + sniProxyProtocolParseErrorsTotal, err = newCounter("gerbil_sni_proxy_protocol_parse_errors_total", "Count of PROXY protocol parse failures") - sniDataBytesTotal = b.NewCounter("gerbil_sni_data_bytes_total", + if err != nil { + return err + } + sniDataBytesTotal, err = newCounter("gerbil_sni_data_bytes_total", "Count of bytes proxied through SNI tunnels", "direction") - sniTunnelTerminationsTotal = b.NewCounter("gerbil_sni_tunnel_terminations_total", + if err != nil { + return err + } + sniTunnelTerminationsTotal, err = newCounter("gerbil_sni_tunnel_terminations_total", "Count of tunnel terminations by reason", "reason") - httpRequestsTotal = b.NewCounter("gerbil_http_requests_total", + if err != nil { + return err + } + httpRequestsTotal, err = newCounter("gerbil_http_requests_total", "Count of HTTP requests to management API", "endpoint", "method", "status_code") - httpRequestDuration = b.NewHistogram("gerbil_http_request_duration_seconds", + if err != nil { + return err + } + httpRequestDuration, err = newHistogram("gerbil_http_request_duration_seconds", "Distribution of HTTP request handling time", durationBuckets, "endpoint", "method") - peerOperationsTotal = b.NewCounter("gerbil_peer_operations_total", + if err != nil { + return err + } + peerOperationsTotal, err = newCounter("gerbil_peer_operations_total", "Count of peer lifecycle operations", "operation", "result") - proxyMappingUpdateRequestsTotal = b.NewCounter("gerbil_proxy_mapping_update_requests_total", + if err != nil { + return err + } + proxyMappingUpdateRequestsTotal, err = newCounter("gerbil_proxy_mapping_update_requests_total", "Count of proxy mapping update API calls", "result") - destinationsUpdateRequestsTotal = b.NewCounter("gerbil_destinations_update_requests_total", + if err != nil { + return err + } + destinationsUpdateRequestsTotal, err = newCounter("gerbil_destinations_update_requests_total", "Count of destinations update API calls", "result") - remoteConfigFetchesTotal = b.NewCounter("gerbil_remote_config_fetches_total", + if err != nil { + return err + } + remoteConfigFetchesTotal, err = newCounter("gerbil_remote_config_fetches_total", "Count of remote configuration fetch attempts", "result") - bandwidthReportsTotal = b.NewCounter("gerbil_bandwidth_reports_total", + if err != nil { + return err + } + bandwidthReportsTotal, err = newCounter("gerbil_bandwidth_reports_total", "Count of bandwidth report transmissions", "result") - peerBandwidthBytesTotal = b.NewCounter("gerbil_peer_bandwidth_bytes_total", + if err != nil { + return err + } + peerBandwidthBytesTotal, err = newCounter("gerbil_peer_bandwidth_bytes_total", "Bytes per peer tracked by bandwidth calculation", "peer", "direction") - memorySpikeTotal = b.NewCounter("gerbil_memory_spike_total", + if err != nil { + return err + } + memorySpikeTotal, err = newCounter("gerbil_memory_spike_total", "Count of memory spikes detected", "severity") - heapProfilesWrittenTotal = b.NewCounter("gerbil_heap_profiles_written_total", + if err != nil { + return err + } + heapProfilesWrittenTotal, err = newCounter("gerbil_heap_profiles_written_total", "Count of heap profile files generated") + if err != nil { + return err + } return nil } func RecordInterfaceUp(ifname, instance string, up bool) { + if wgInterfaceUp == nil { + return + } value := int64(0) if up { value = 1 @@ -266,10 +498,16 @@ func RecordInterfaceUp(ifname, instance string, up bool) { } func RecordPeersTotal(ifname string, delta int64) { + if wgPeersTotal == nil { + return + } wgPeersTotal.Add(context.Background(), delta, observability.Labels{"ifname": ifname}) } func RecordPeerConnected(ifname, peer string, connected bool) { + if wgPeerConnected == nil { + return + } value := int64(0) if connected { value = 1 @@ -278,229 +516,393 @@ func RecordPeerConnected(ifname, peer string, connected bool) { } func RecordHandshake(ifname, peer, result string) { + if wgHandshakesTotal == nil { + return + } wgHandshakesTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "peer": peer, "result": result}) } func RecordHandshakeLatency(ifname, peer string, seconds float64) { + if wgHandshakeLatency == nil { + return + } wgHandshakeLatency.Record(context.Background(), seconds, observability.Labels{"ifname": ifname, "peer": peer}) } func RecordPeerRTT(ifname, peer string, seconds float64) { + if wgPeerRTT == nil { + return + } wgPeerRTT.Record(context.Background(), seconds, observability.Labels{"ifname": ifname, "peer": peer}) } func RecordBytesReceived(ifname, peer string, bytes int64) { + if wgBytesReceived == nil { + return + } wgBytesReceived.Add(context.Background(), bytes, observability.Labels{"ifname": ifname, "peer": peer}) } func RecordBytesTransmitted(ifname, peer string, bytes int64) { + if wgBytesTransmitted == nil { + return + } wgBytesTransmitted.Add(context.Background(), bytes, observability.Labels{"ifname": ifname, "peer": peer}) } func RecordAllowedIPsCount(ifname, peer string, delta int64) { + if allowedIPsCount == nil { + return + } allowedIPsCount.Add(context.Background(), delta, observability.Labels{"ifname": ifname, "peer": peer}) } func RecordKeyRotation(ifname, reason string) { + if keyRotationTotal == nil { + return + } keyRotationTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "reason": reason}) } func RecordNetlinkEvent(eventType string) { + if netlinkEventsTotal == nil { + return + } netlinkEventsTotal.Add(context.Background(), 1, observability.Labels{"event_type": eventType}) } func RecordNetlinkError(component, errorType string) { + if netlinkErrorsTotal == nil { + return + } netlinkErrorsTotal.Add(context.Background(), 1, observability.Labels{"component": component, "error_type": errorType}) } func RecordSyncDuration(component string, seconds float64) { + if syncDuration == nil { + return + } syncDuration.Record(context.Background(), seconds, observability.Labels{"component": component}) } func RecordWorkqueueDepth(queue string, delta int64) { + if workqueueDepth == nil { + return + } workqueueDepth.Add(context.Background(), delta, observability.Labels{"queue": queue}) } func RecordKernelModuleLoad(result string) { + if kernelModuleLoads == nil { + return + } kernelModuleLoads.Add(context.Background(), 1, observability.Labels{"result": result}) } func RecordFirewallRuleApplied(result, chain string) { + if firewallRulesApplied == nil { + return + } firewallRulesApplied.Add(context.Background(), 1, observability.Labels{"result": result, "chain": chain}) } func RecordActiveSession(ifname string, delta int64) { + if activeSessions == nil { + return + } activeSessions.Add(context.Background(), delta, observability.Labels{"ifname": ifname}) } -func RecordActiveProxyConnection(hostname string, delta int64) { - _ = hostname +func RecordActiveProxyConnection(delta int64) { + if activeProxyConnections == nil { + return + } activeProxyConnections.Add(context.Background(), delta, nil) } -func RecordProxyRouteLookup(result, hostname string) { - _ = hostname +func RecordProxyRouteLookup(result string) { + if proxyRouteLookups == nil { + return + } proxyRouteLookups.Add(context.Background(), 1, observability.Labels{"result": result}) } -func RecordProxyTLSHandshake(hostname string, seconds float64) { - _ = hostname +func RecordProxyTLSHandshake(seconds float64) { + if proxyTLSHandshake == nil { + return + } proxyTLSHandshake.Record(context.Background(), seconds, nil) } -func RecordProxyBytesTransmitted(hostname, direction string, bytes int64) { - _ = hostname +func RecordProxyBytesTransmitted(direction string, bytes int64) { + if proxyBytesTransmitted == nil { + return + } proxyBytesTransmitted.Add(context.Background(), bytes, observability.Labels{"direction": direction}) } func RecordConfigReload(result string) { + if configReloadsTotal == nil { + return + } configReloadsTotal.Add(context.Background(), 1, observability.Labels{"result": result}) } func RecordRestart() { + if restartTotal == nil { + return + } restartTotal.Add(context.Background(), 1, nil) } func RecordAuthFailure(peer, reason string) { + if authFailuresTotal == nil { + return + } authFailuresTotal.Add(context.Background(), 1, observability.Labels{"peer": peer, "reason": reason}) } func RecordACLDenied(ifname, peer, policy string) { + if aclDeniedTotal == nil { + return + } aclDeniedTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "peer": peer, "policy": policy}) } func RecordCertificateExpiry(certName, ifname string, days float64) { + if certificateExpiryDays == nil { + return + } certificateExpiryDays.Record(context.Background(), days, observability.Labels{"cert_name": certName, "ifname": ifname}) } func RecordUDPPacket(ifname, packetType, direction string) { + if udpPacketsTotal == nil { + return + } udpPacketsTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "type": packetType, "direction": direction}) } func RecordUDPPacketSize(ifname, packetType string, bytes float64) { + if udpPacketSizeBytes == nil { + return + } udpPacketSizeBytes.Record(context.Background(), bytes, observability.Labels{"ifname": ifname, "type": packetType}) } func RecordHolePunchEvent(ifname, result string) { + if holePunchEventsTotal == nil { + return + } holePunchEventsTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "result": result}) } func RecordProxyMapping(ifname string, delta int64) { + if proxyMappingActive == nil { + return + } proxyMappingActive.Add(context.Background(), delta, observability.Labels{"ifname": ifname}) } func RecordSession(ifname string, delta int64) { - sessionActive.Add(context.Background(), delta, observability.Labels{"ifname": ifname}) + if activeSessions == nil { + return + } + activeSessions.Add(context.Background(), delta, observability.Labels{"ifname": ifname}) } func RecordSessionRebuilt(ifname string) { + if sessionRebuiltTotal == nil { + return + } sessionRebuiltTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname}) } func RecordCommPattern(ifname string, delta int64) { + if commPatternActive == nil { + return + } commPatternActive.Add(context.Background(), delta, observability.Labels{"ifname": ifname}) } func RecordProxyCleanupRemoved(ifname, component string, count int64) { + if proxyCleanupRemovedTotal == nil { + return + } proxyCleanupRemovedTotal.Add(context.Background(), count, observability.Labels{"ifname": ifname, "component": component}) } func RecordProxyConnectionError(ifname, errorType string) { + if proxyConnectionErrorsTotal == nil { + return + } proxyConnectionErrorsTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "error_type": errorType}) } func RecordProxyInitialMappings(ifname string, count int64) { + if proxyInitialMappingsTotal == nil { + return + } proxyInitialMappingsTotal.Record(context.Background(), count, observability.Labels{"ifname": ifname}) } func RecordProxyMappingUpdate(ifname string) { + if proxyMappingUpdatesTotal == nil { + return + } proxyMappingUpdatesTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname}) } func RecordProxyIdleCleanupDuration(ifname, component string, seconds float64) { + if proxyIdleCleanupDuration == nil { + return + } proxyIdleCleanupDuration.Record(context.Background(), seconds, observability.Labels{"ifname": ifname, "component": component}) } func RecordSNIConnection(result string) { + if sniConnectionsTotal == nil { + return + } sniConnectionsTotal.Add(context.Background(), 1, observability.Labels{"result": result}) } func RecordSNIConnectionDuration(seconds float64) { + if sniConnectionDuration == nil { + return + } sniConnectionDuration.Record(context.Background(), seconds, nil) } func RecordSNIActiveConnection(delta int64) { + if sniActiveConnections == nil { + return + } sniActiveConnections.Add(context.Background(), delta, nil) } func RecordSNIRouteCacheHit(result string) { + if sniRouteCacheHitsTotal == nil { + return + } sniRouteCacheHitsTotal.Add(context.Background(), 1, observability.Labels{"result": result}) } func RecordSNIRouteAPIRequest(result string) { + if sniRouteAPIRequestsTotal == nil { + return + } sniRouteAPIRequestsTotal.Add(context.Background(), 1, observability.Labels{"result": result}) } func RecordSNIRouteAPILatency(seconds float64) { + if sniRouteAPILatency == nil { + return + } sniRouteAPILatency.Record(context.Background(), seconds, nil) } func RecordSNILocalOverride(hit string) { + if sniLocalOverrideTotal == nil { + return + } sniLocalOverrideTotal.Add(context.Background(), 1, observability.Labels{"hit": hit}) } func RecordSNITrustedProxyEvent(event string) { + if sniTrustedProxyEventsTotal == nil { + return + } sniTrustedProxyEventsTotal.Add(context.Background(), 1, observability.Labels{"event": event}) } func RecordSNIProxyProtocolParseError() { + if sniProxyProtocolParseErrorsTotal == nil { + return + } sniProxyProtocolParseErrorsTotal.Add(context.Background(), 1, nil) } func RecordSNIDataBytes(direction string, bytes int64) { + if sniDataBytesTotal == nil { + return + } sniDataBytesTotal.Add(context.Background(), bytes, observability.Labels{"direction": direction}) } func RecordSNITunnelTermination(reason string) { + if sniTunnelTerminationsTotal == nil { + return + } sniTunnelTerminationsTotal.Add(context.Background(), 1, observability.Labels{"reason": reason}) } func RecordHTTPRequest(endpoint, method, statusCode string) { + if httpRequestsTotal == nil { + return + } httpRequestsTotal.Add(context.Background(), 1, observability.Labels{"endpoint": endpoint, "method": method, "status_code": statusCode}) } func RecordHTTPRequestDuration(endpoint, method string, seconds float64) { + if httpRequestDuration == nil { + return + } httpRequestDuration.Record(context.Background(), seconds, observability.Labels{"endpoint": endpoint, "method": method}) } func RecordPeerOperation(operation, result string) { + if peerOperationsTotal == nil { + return + } peerOperationsTotal.Add(context.Background(), 1, observability.Labels{"operation": operation, "result": result}) } func RecordProxyMappingUpdateRequest(result string) { + if proxyMappingUpdateRequestsTotal == nil { + return + } proxyMappingUpdateRequestsTotal.Add(context.Background(), 1, observability.Labels{"result": result}) } func RecordDestinationsUpdateRequest(result string) { + if destinationsUpdateRequestsTotal == nil { + return + } destinationsUpdateRequestsTotal.Add(context.Background(), 1, observability.Labels{"result": result}) } func RecordRemoteConfigFetch(result string) { + if remoteConfigFetchesTotal == nil { + return + } remoteConfigFetchesTotal.Add(context.Background(), 1, observability.Labels{"result": result}) } func RecordBandwidthReport(result string) { + if bandwidthReportsTotal == nil { + return + } bandwidthReportsTotal.Add(context.Background(), 1, observability.Labels{"result": result}) } func RecordPeerBandwidthBytes(peer, direction string, bytes int64) { + if peerBandwidthBytesTotal == nil { + return + } peerBandwidthBytesTotal.Add(context.Background(), bytes, observability.Labels{"peer": peer, "direction": direction}) } func RecordMemorySpike(severity string) { + if memorySpikeTotal == nil { + return + } memorySpikeTotal.Add(context.Background(), 1, observability.Labels{"severity": severity}) } func RecordHeapProfileWritten() { + if heapProfilesWrittenTotal == nil { + return + } heapProfilesWrittenTotal.Add(context.Background(), 1, nil) } diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go index 132c3fe..8c01c68 100644 --- a/internal/metrics/metrics_test.go +++ b/internal/metrics/metrics_test.go @@ -89,6 +89,9 @@ func TestDefaultConfig(t *testing.T) { } func TestShutdownNoInit(t *testing.T) { + // Ensure a known clean global state before testing no-init shutdown behavior. + _ = metrics.Shutdown(context.Background()) + // Shutdown without Initialize should not panic or error. if err := metrics.Shutdown(context.Background()); err != nil { t.Errorf("unexpected error: %v", err) @@ -168,6 +171,7 @@ func TestRecordRelay(t *testing.T) { body := scrape(t, h) assertContains(t, body, "gerbil_udp_packets_total") assertContains(t, body, "gerbil_proxy_mapping_active") + assertContains(t, body, "gerbil_active_sessions") } func TestRecordWireGuard(t *testing.T) { @@ -216,10 +220,10 @@ func TestRecordNetlink(t *testing.T) { metrics.RecordKernelModuleLoad("success") metrics.RecordFirewallRuleApplied("success", "INPUT") metrics.RecordActiveSession("wg0", 1) - metrics.RecordActiveProxyConnection(exampleHostname, 1) - metrics.RecordProxyRouteLookup("hit", exampleHostname) - metrics.RecordProxyTLSHandshake(exampleHostname, 0.05) - metrics.RecordProxyBytesTransmitted(exampleHostname, "tx", 1024) + metrics.RecordActiveProxyConnection(1) + metrics.RecordProxyRouteLookup("hit") + metrics.RecordProxyTLSHandshake(0.05) + metrics.RecordProxyBytesTransmitted("tx", 1024) body := scrape(t, h) assertContains(t, body, "gerbil_netlink_events_total") assertContains(t, body, "gerbil_active_sessions") From 73d4d4d37cbc8a108dc66a10cc99f158d1b33ca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Mon, 4 May 2026 00:12:23 +0200 Subject: [PATCH 4/8] feat(observability): unify backend APIs and harden OTel handling --- internal/observability/config.go | 12 +- internal/observability/metrics.go | 31 ++-- internal/observability/metrics_test.go | 99 ++++++++++-- internal/observability/noop.go | 27 ++-- internal/observability/noop_test.go | 67 ++++++-- internal/observability/otel/backend.go | 163 ++++++++++++++++---- internal/observability/otel/backend_test.go | 44 +++++- internal/observability/otel/exporter.go | 22 ++- 8 files changed, 360 insertions(+), 105 deletions(-) diff --git a/internal/observability/config.go b/internal/observability/config.go index 9643727..7ac9ba2 100644 --- a/internal/observability/config.go +++ b/internal/observability/config.go @@ -60,6 +60,10 @@ type OTelConfig struct { // ExportInterval is how often metrics are pushed to the collector. // Defaults to 60 s. ExportInterval time.Duration + + // Timeout bounds OTLP exporter construction calls. + // Defaults to 10 s. + Timeout time.Duration } // DefaultMetricsConfig returns a MetricsConfig with sensible defaults. @@ -75,6 +79,7 @@ func DefaultMetricsConfig() MetricsConfig { Endpoint: "localhost:4317", Insecure: true, ExportInterval: 60 * time.Second, + Timeout: 10 * time.Second, }, ServiceName: "gerbil", ServiceVersion: "1.0.0", @@ -88,8 +93,10 @@ func (c *MetricsConfig) Validate() error { } switch c.Backend { - case "prometheus", "none", "": + case "prometheus", "none": // valid + case "": + return fmt.Errorf("metrics: enabled requires a non-empty backend") case "otel": if c.OTel.Endpoint == "" { return fmt.Errorf("metrics: backend=otel requires a non-empty OTel endpoint") @@ -100,6 +107,9 @@ func (c *MetricsConfig) Validate() error { if c.OTel.ExportInterval <= 0 { return fmt.Errorf("metrics: otel export interval must be positive") } + if c.OTel.Timeout <= 0 { + return fmt.Errorf("metrics: otel timeout must be positive") + } default: return fmt.Errorf("metrics: unknown backend %q (must be \"prometheus\", \"otel\", or \"none\")", c.Backend) } diff --git a/internal/observability/metrics.go b/internal/observability/metrics.go index ff2e2ae..9ea0013 100644 --- a/internal/observability/metrics.go +++ b/internal/observability/metrics.go @@ -43,20 +43,20 @@ type Histogram interface { type Backend interface { // NewCounter creates a counter metric. // labelNames declares the set of label keys that will be passed at observation time. - NewCounter(name, desc string, labelNames ...string) Counter + NewCounter(name, desc string, labelNames ...string) (Counter, error) // NewUpDownCounter creates an up-down counter metric. - NewUpDownCounter(name, desc string, labelNames ...string) UpDownCounter + NewUpDownCounter(name, desc string, labelNames ...string) (UpDownCounter, error) // NewInt64Gauge creates an integer gauge metric. - NewInt64Gauge(name, desc string, labelNames ...string) Int64Gauge + NewInt64Gauge(name, desc string, labelNames ...string) (Int64Gauge, error) // NewFloat64Gauge creates a float gauge metric. - NewFloat64Gauge(name, desc string, labelNames ...string) Float64Gauge + NewFloat64Gauge(name, desc string, labelNames ...string) (Float64Gauge, error) // NewHistogram creates a histogram metric. // buckets are the explicit upper-bound bucket boundaries. - NewHistogram(name, desc string, buckets []float64, labelNames ...string) Histogram + NewHistogram(name, desc string, buckets []float64, labelNames ...string) (Histogram, error) // HTTPHandler returns the /metrics HTTP handler. // Implementations that do not expose an HTTP endpoint return nil. @@ -88,6 +88,7 @@ func New(cfg MetricsConfig) (Backend, error) { Endpoint: cfg.OTel.Endpoint, Insecure: cfg.OTel.Insecure, ExportInterval: cfg.OTel.ExportInterval, + Timeout: cfg.OTel.Timeout, ServiceName: cfg.ServiceName, ServiceVersion: cfg.ServiceVersion, DeploymentEnvironment: cfg.DeploymentEnvironment, @@ -110,19 +111,19 @@ type promAdapter struct { b *obsprom.Backend } -func (a *promAdapter) NewCounter(name, desc string, labelNames ...string) Counter { +func (a *promAdapter) NewCounter(name, desc string, labelNames ...string) (Counter, error) { return a.b.NewCounter(name, desc, labelNames...) } -func (a *promAdapter) NewUpDownCounter(name, desc string, labelNames ...string) UpDownCounter { +func (a *promAdapter) NewUpDownCounter(name, desc string, labelNames ...string) (UpDownCounter, error) { return a.b.NewUpDownCounter(name, desc, labelNames...) } -func (a *promAdapter) NewInt64Gauge(name, desc string, labelNames ...string) Int64Gauge { +func (a *promAdapter) NewInt64Gauge(name, desc string, labelNames ...string) (Int64Gauge, error) { return a.b.NewInt64Gauge(name, desc, labelNames...) } -func (a *promAdapter) NewFloat64Gauge(name, desc string, labelNames ...string) Float64Gauge { +func (a *promAdapter) NewFloat64Gauge(name, desc string, labelNames ...string) (Float64Gauge, error) { return a.b.NewFloat64Gauge(name, desc, labelNames...) } -func (a *promAdapter) NewHistogram(name, desc string, buckets []float64, labelNames ...string) Histogram { +func (a *promAdapter) NewHistogram(name, desc string, buckets []float64, labelNames ...string) (Histogram, error) { return a.b.NewHistogram(name, desc, buckets, labelNames...) } func (a *promAdapter) HTTPHandler() http.Handler { return a.b.HTTPHandler() } @@ -133,19 +134,19 @@ type otelAdapter struct { b *obsotel.Backend } -func (a *otelAdapter) NewCounter(name, desc string, labelNames ...string) Counter { +func (a *otelAdapter) NewCounter(name, desc string, labelNames ...string) (Counter, error) { return a.b.NewCounter(name, desc, labelNames...) } -func (a *otelAdapter) NewUpDownCounter(name, desc string, labelNames ...string) UpDownCounter { +func (a *otelAdapter) NewUpDownCounter(name, desc string, labelNames ...string) (UpDownCounter, error) { return a.b.NewUpDownCounter(name, desc, labelNames...) } -func (a *otelAdapter) NewInt64Gauge(name, desc string, labelNames ...string) Int64Gauge { +func (a *otelAdapter) NewInt64Gauge(name, desc string, labelNames ...string) (Int64Gauge, error) { return a.b.NewInt64Gauge(name, desc, labelNames...) } -func (a *otelAdapter) NewFloat64Gauge(name, desc string, labelNames ...string) Float64Gauge { +func (a *otelAdapter) NewFloat64Gauge(name, desc string, labelNames ...string) (Float64Gauge, error) { return a.b.NewFloat64Gauge(name, desc, labelNames...) } -func (a *otelAdapter) NewHistogram(name, desc string, buckets []float64, labelNames ...string) Histogram { +func (a *otelAdapter) NewHistogram(name, desc string, buckets []float64, labelNames ...string) (Histogram, error) { return a.b.NewHistogram(name, desc, buckets, labelNames...) } func (a *otelAdapter) HTTPHandler() http.Handler { return a.b.HTTPHandler() } diff --git a/internal/observability/metrics_test.go b/internal/observability/metrics_test.go index 91a048c..cf3eccb 100644 --- a/internal/observability/metrics_test.go +++ b/internal/observability/metrics_test.go @@ -2,6 +2,8 @@ package observability_test import ( "context" + "net" + "os" "testing" "time" @@ -39,20 +41,19 @@ func TestValidateValidConfigs(t *testing.T) { }{ {name: "disabled", cfg: observability.MetricsConfig{Enabled: false}}, {name: "backend none", cfg: observability.MetricsConfig{Enabled: true, Backend: "none"}}, - {name: "backend empty", cfg: observability.MetricsConfig{Enabled: true, Backend: ""}}, {name: "prometheus", cfg: observability.MetricsConfig{Enabled: true, Backend: "prometheus"}}, { name: "otel grpc", cfg: observability.MetricsConfig{ Enabled: true, Backend: "otel", - OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, ExportInterval: 10 * time.Second}, + OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, ExportInterval: 10 * time.Second, Timeout: 2 * time.Second}, }, }, { name: "otel http", cfg: observability.MetricsConfig{ Enabled: true, Backend: "otel", - OTel: observability.OTelConfig{Protocol: "http", Endpoint: "localhost:4318", ExportInterval: 30 * time.Second}, + OTel: observability.OTelConfig{Protocol: "http", Endpoint: "localhost:4318", ExportInterval: 30 * time.Second, Timeout: 2 * time.Second}, }, }, } @@ -71,25 +72,36 @@ func TestValidateInvalidConfigs(t *testing.T) { cfg observability.MetricsConfig }{ {name: "unknown backend", cfg: observability.MetricsConfig{Enabled: true, Backend: "datadog"}}, + { + name: "backend empty while enabled", + cfg: observability.MetricsConfig{Enabled: true, Backend: ""}, + }, { name: "otel missing endpoint", cfg: observability.MetricsConfig{ Enabled: true, Backend: "otel", - OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: "", ExportInterval: 10 * time.Second}, + OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: "", ExportInterval: 10 * time.Second, Timeout: 2 * time.Second}, }, }, { name: "otel invalid protocol", cfg: observability.MetricsConfig{ Enabled: true, Backend: "otel", - OTel: observability.OTelConfig{Protocol: "tcp", Endpoint: otelGRPCEndpoint, ExportInterval: 10 * time.Second}, + OTel: observability.OTelConfig{Protocol: "tcp", Endpoint: otelGRPCEndpoint, ExportInterval: 10 * time.Second, Timeout: 2 * time.Second}, }, }, { name: "otel zero interval", cfg: observability.MetricsConfig{ Enabled: true, Backend: "otel", - OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, ExportInterval: 0}, + OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, ExportInterval: 0, Timeout: 2 * time.Second}, + }, + }, + { + name: "otel zero timeout", + cfg: observability.MetricsConfig{ + Enabled: true, Backend: "otel", + OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, ExportInterval: 10 * time.Second, Timeout: 0}, }, }, } @@ -157,11 +169,32 @@ func TestPrometheusAdapterAllInstruments(t *testing.T) { ctx := context.Background() labels := observability.Labels{"k": "v"} - b.NewCounter("prom_adapter_counter_total", "desc", "k").Add(ctx, 1, labels) - b.NewUpDownCounter("prom_adapter_updown", "desc", "k").Add(ctx, 2, labels) - b.NewInt64Gauge("prom_adapter_int_gauge", "desc", "k").Record(ctx, 99, labels) - b.NewFloat64Gauge("prom_adapter_float_gauge", "desc", "k").Record(ctx, 1.23, labels) - b.NewHistogram("prom_adapter_histogram", "desc", []float64{0.1, 1.0}, "k").Record(ctx, 0.5, labels) + c, err := b.NewCounter("prom_adapter_counter_total", "desc", "k") + if err != nil { + t.Fatalf("NewCounter error: %v", err) + } + u, err := b.NewUpDownCounter("prom_adapter_updown", "desc", "k") + if err != nil { + t.Fatalf("NewUpDownCounter error: %v", err) + } + ig, err := b.NewInt64Gauge("prom_adapter_int_gauge", "desc", "k") + if err != nil { + t.Fatalf("NewInt64Gauge error: %v", err) + } + fg, err := b.NewFloat64Gauge("prom_adapter_float_gauge", "desc", "k") + if err != nil { + t.Fatalf("NewFloat64Gauge error: %v", err) + } + h, err := b.NewHistogram("prom_adapter_histogram", "desc", []float64{0.1, 1.0}, "k") + if err != nil { + t.Fatalf("NewHistogram error: %v", err) + } + + c.Add(ctx, 1, labels) + u.Add(ctx, 2, labels) + ig.Record(ctx, 99, labels) + fg.Record(ctx, 1.23, labels) + h.Record(ctx, 0.5, labels) if b.HTTPHandler() == nil { t.Error("prometheus adapter HTTPHandler should not be nil") @@ -172,9 +205,20 @@ func TestPrometheusAdapterAllInstruments(t *testing.T) { } func TestOtelAdapterAllInstruments(t *testing.T) { + if os.Getenv("SKIP_OTEL_INTEGRATION") != "" { + t.Skip("skipping OTel integration test because SKIP_OTEL_INTEGRATION is set") + } + + dialTimeout := 300 * time.Millisecond + conn, err := net.DialTimeout("tcp", otelGRPCEndpoint, dialTimeout) + if err != nil { + t.Skipf("skipping OTel integration test; collector %s not reachable: %v", otelGRPCEndpoint, err) + } + _ = conn.Close() + b, err := observability.New(observability.MetricsConfig{ Enabled: true, Backend: "otel", - OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, Insecure: true, ExportInterval: 100 * time.Millisecond}, + OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, Insecure: true, ExportInterval: 100 * time.Millisecond, Timeout: 2 * time.Second}, }) if err != nil { t.Fatalf("failed to create otel backend: %v", err) @@ -182,11 +226,32 @@ func TestOtelAdapterAllInstruments(t *testing.T) { ctx := context.Background() labels := observability.Labels{"k": "v"} - b.NewCounter("otel_adapter_counter_total", "desc", "k").Add(ctx, 1, labels) - b.NewUpDownCounter("otel_adapter_updown", "desc", "k").Add(ctx, 2, labels) - b.NewInt64Gauge("otel_adapter_int_gauge", "desc", "k").Record(ctx, 99, labels) - b.NewFloat64Gauge("otel_adapter_float_gauge", "desc", "k").Record(ctx, 1.23, labels) - b.NewHistogram("otel_adapter_histogram", "desc", []float64{0.1, 1.0}, "k").Record(ctx, 0.5, labels) + c, err := b.NewCounter("otel_adapter_counter_total", "desc", "k") + if err != nil { + t.Fatalf("NewCounter error: %v", err) + } + u, err := b.NewUpDownCounter("otel_adapter_updown", "desc", "k") + if err != nil { + t.Fatalf("NewUpDownCounter error: %v", err) + } + ig, err := b.NewInt64Gauge("otel_adapter_int_gauge", "desc", "k") + if err != nil { + t.Fatalf("NewInt64Gauge error: %v", err) + } + fg, err := b.NewFloat64Gauge("otel_adapter_float_gauge", "desc", "k") + if err != nil { + t.Fatalf("NewFloat64Gauge error: %v", err) + } + h, err := b.NewHistogram("otel_adapter_histogram", "desc", []float64{0.1, 1.0}, "k") + if err != nil { + t.Fatalf("NewHistogram error: %v", err) + } + + c.Add(ctx, 1, labels) + u.Add(ctx, 2, labels) + ig.Record(ctx, 99, labels) + fg.Record(ctx, 1.23, labels) + h.Record(ctx, 0.5, labels) if b.HTTPHandler() != nil { t.Error("OTel adapter HTTPHandler should be nil") diff --git a/internal/observability/noop.go b/internal/observability/noop.go index 47acc07..cfd8280 100644 --- a/internal/observability/noop.go +++ b/internal/observability/noop.go @@ -13,38 +13,31 @@ type NoopBackend struct{} // Compile-time interface check. var _ Backend = (*NoopBackend)(nil) -func (n *NoopBackend) NewCounter(_ string, _ string, _ ...string) Counter { - _ = n - return noopCounter{} +func (n *NoopBackend) NewCounter(_ string, _ string, _ ...string) (Counter, error) { + return noopCounter{}, nil } -func (n *NoopBackend) NewUpDownCounter(_ string, _ string, _ ...string) UpDownCounter { - _ = n - return noopUpDownCounter{} +func (n *NoopBackend) NewUpDownCounter(_ string, _ string, _ ...string) (UpDownCounter, error) { + return noopUpDownCounter{}, nil } -func (n *NoopBackend) NewInt64Gauge(_ string, _ string, _ ...string) Int64Gauge { - _ = n - return noopInt64Gauge{} +func (n *NoopBackend) NewInt64Gauge(_ string, _ string, _ ...string) (Int64Gauge, error) { + return noopInt64Gauge{}, nil } -func (n *NoopBackend) NewFloat64Gauge(_ string, _ string, _ ...string) Float64Gauge { - _ = n - return noopFloat64Gauge{} +func (n *NoopBackend) NewFloat64Gauge(_ string, _ string, _ ...string) (Float64Gauge, error) { + return noopFloat64Gauge{}, nil } -func (n *NoopBackend) NewHistogram(_ string, _ string, _ []float64, _ ...string) Histogram { - _ = n - return noopHistogram{} +func (n *NoopBackend) NewHistogram(_ string, _ string, _ []float64, _ ...string) (Histogram, error) { + return noopHistogram{}, nil } func (n *NoopBackend) HTTPHandler() http.Handler { - _ = n return nil } func (n *NoopBackend) Shutdown(_ context.Context) error { - _ = n return nil } diff --git a/internal/observability/noop_test.go b/internal/observability/noop_test.go index 9496a0a..037ceb2 100644 --- a/internal/observability/noop_test.go +++ b/internal/observability/noop_test.go @@ -13,32 +13,32 @@ func TestNoopBackendAllInstruments(t *testing.T) { ctx := context.Background() labels := observability.Labels{"k": "v"} - t.Run("Counter", func(_ *testing.T) { - c := n.NewCounter("test_counter", "desc") + t.Run("Counter", func(t *testing.T) { + c, _ := n.NewCounter("test_counter", "desc") c.Add(ctx, 1, labels) c.Add(ctx, 0, nil) }) - t.Run("UpDownCounter", func(_ *testing.T) { - u := n.NewUpDownCounter("test_updown", "desc") + t.Run("UpDownCounter", func(t *testing.T) { + u, _ := n.NewUpDownCounter("test_updown", "desc") u.Add(ctx, 1, labels) u.Add(ctx, -1, nil) }) - t.Run("Int64Gauge", func(_ *testing.T) { - g := n.NewInt64Gauge("test_int64gauge", "desc") + t.Run("Int64Gauge", func(t *testing.T) { + g, _ := n.NewInt64Gauge("test_int64gauge", "desc") g.Record(ctx, 42, labels) g.Record(ctx, 0, nil) }) - t.Run("Float64Gauge", func(_ *testing.T) { - g := n.NewFloat64Gauge("test_float64gauge", "desc") + t.Run("Float64Gauge", func(t *testing.T) { + g, _ := n.NewFloat64Gauge("test_float64gauge", "desc") g.Record(ctx, 3.14, labels) g.Record(ctx, 0, nil) }) - t.Run("Histogram", func(_ *testing.T) { - h := n.NewHistogram("test_histogram", "desc", []float64{1, 5, 10}) + t.Run("Histogram", func(t *testing.T) { + h, _ := n.NewHistogram("test_histogram", "desc", []float64{1, 5, 10}) h.Record(ctx, 2.5, labels) h.Record(ctx, 0, nil) }) @@ -56,12 +56,47 @@ func TestNoopBackendAllInstruments(t *testing.T) { }) } -func TestNoopBackendLabelNames(_ *testing.T) { +func TestNoopBackendLabelNames(t *testing.T) { // Verify that label names passed at creation time are accepted without panic. n := &observability.NoopBackend{} - n.NewCounter("c", "d", "label1", "label2") - n.NewUpDownCounter("u", "d", "l1") - n.NewInt64Gauge("g1", "d", "l1", "l2", "l3") - n.NewFloat64Gauge("g2", "d") - n.NewHistogram("h", "d", []float64{0.1, 1.0}, "l1") + + assertNoPanic := func(t *testing.T, constructor string, fn func()) { + t.Helper() + defer func() { + if r := recover(); r != nil { + t.Fatalf("%s panicked: %v", constructor, r) + } + }() + fn() + } + + t.Run("NewCounter", func(t *testing.T) { + assertNoPanic(t, "NewCounter", func() { + _, _ = n.NewCounter("c", "d", "label1", "label2") + }) + }) + + t.Run("NewUpDownCounter", func(t *testing.T) { + assertNoPanic(t, "NewUpDownCounter", func() { + _, _ = n.NewUpDownCounter("u", "d", "l1") + }) + }) + + t.Run("NewInt64Gauge", func(t *testing.T) { + assertNoPanic(t, "NewInt64Gauge", func() { + _, _ = n.NewInt64Gauge("g1", "d", "l1", "l2", "l3") + }) + }) + + t.Run("NewFloat64Gauge", func(t *testing.T) { + assertNoPanic(t, "NewFloat64Gauge", func() { + _, _ = n.NewFloat64Gauge("g2", "d") + }) + }) + + t.Run("NewHistogram", func(t *testing.T) { + assertNoPanic(t, "NewHistogram", func() { + _, _ = n.NewHistogram("h", "d", []float64{0.1, 1.0}, "l1") + }) + }) } diff --git a/internal/observability/otel/backend.go b/internal/observability/otel/backend.go index d3e3a23..7f49579 100644 --- a/internal/observability/otel/backend.go +++ b/internal/observability/otel/backend.go @@ -9,7 +9,10 @@ package otel import ( "context" "fmt" + "log" "net/http" + "regexp" + "strings" "time" "go.opentelemetry.io/otel/attribute" @@ -17,6 +20,8 @@ import ( sdkmetric "go.opentelemetry.io/otel/sdk/metric" ) +var metricLabelNameRE = regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*$`) + // Config holds OTel backend configuration. type Config struct { // Protocol is "grpc" (default) or "http". @@ -31,6 +36,9 @@ type Config struct { // ExportInterval is the period between pushes to the collector. ExportInterval time.Duration + // Timeout bounds exporter construction calls. + Timeout time.Duration + ServiceName string ServiceVersion string DeploymentEnvironment string @@ -57,9 +65,15 @@ func New(cfg Config) (*Backend, error) { if cfg.Protocol == "" { cfg.Protocol = "grpc" } + if strings.TrimSpace(cfg.Endpoint) == "" { + return nil, fmt.Errorf("otel backend: empty cfg.Endpoint") + } if cfg.ExportInterval <= 0 { cfg.ExportInterval = 60 * time.Second } + if cfg.Timeout <= 0 { + cfg.Timeout = 10 * time.Second + } if cfg.ServiceName == "" { cfg.ServiceName = "gerbil" } @@ -100,111 +114,196 @@ func (b *Backend) Shutdown(ctx context.Context) error { } // NewCounter creates an OTel Int64Counter. -func (b *Backend) NewCounter(name, desc string, _ ...string) *Counter { +func (b *Backend) NewCounter(name, desc string, labelNames ...string) (*Counter, error) { + normalizedLabelNames, err := validateLabelNames(labelNames) + if err != nil { + return nil, fmt.Errorf("otel: create counter %q: %w", name, err) + } c, err := b.meter.Int64Counter(name, metric.WithDescription(desc)) if err != nil { - panic(fmt.Sprintf("otel: create counter %q: %v", name, err)) + return nil, fmt.Errorf("otel: create counter %q: %w", name, err) } - return &Counter{c: c} + return &Counter{c: c, labelNames: normalizedLabelNames}, nil } // NewUpDownCounter creates an OTel Int64UpDownCounter. -func (b *Backend) NewUpDownCounter(name, desc string, _ ...string) *UpDownCounter { +func (b *Backend) NewUpDownCounter(name, desc string, labelNames ...string) (*UpDownCounter, error) { + normalizedLabelNames, err := validateLabelNames(labelNames) + if err != nil { + return nil, fmt.Errorf("otel: create up-down counter %q: %w", name, err) + } c, err := b.meter.Int64UpDownCounter(name, metric.WithDescription(desc)) if err != nil { - panic(fmt.Sprintf("otel: create up-down counter %q: %v", name, err)) + return nil, fmt.Errorf("otel: create up-down counter %q: %w", name, err) } - return &UpDownCounter{c: c} + return &UpDownCounter{c: c, labelNames: normalizedLabelNames}, nil } // NewInt64Gauge creates an OTel Int64Gauge. -func (b *Backend) NewInt64Gauge(name, desc string, _ ...string) *Int64Gauge { +func (b *Backend) NewInt64Gauge(name, desc string, labelNames ...string) (*Int64Gauge, error) { + normalizedLabelNames, err := validateLabelNames(labelNames) + if err != nil { + return nil, fmt.Errorf("otel: create int64 gauge %q: %w", name, err) + } g, err := b.meter.Int64Gauge(name, metric.WithDescription(desc)) if err != nil { - panic(fmt.Sprintf("otel: create int64 gauge %q: %v", name, err)) + return nil, fmt.Errorf("otel: create int64 gauge %q: %w", name, err) } - return &Int64Gauge{g: g} + return &Int64Gauge{g: g, labelNames: normalizedLabelNames}, nil } // NewFloat64Gauge creates an OTel Float64Gauge. -func (b *Backend) NewFloat64Gauge(name, desc string, _ ...string) *Float64Gauge { +func (b *Backend) NewFloat64Gauge(name, desc string, labelNames ...string) (*Float64Gauge, error) { + normalizedLabelNames, err := validateLabelNames(labelNames) + if err != nil { + return nil, fmt.Errorf("otel: create float64 gauge %q: %w", name, err) + } g, err := b.meter.Float64Gauge(name, metric.WithDescription(desc)) if err != nil { - panic(fmt.Sprintf("otel: create float64 gauge %q: %v", name, err)) + return nil, fmt.Errorf("otel: create float64 gauge %q: %w", name, err) } - return &Float64Gauge{g: g} + return &Float64Gauge{g: g, labelNames: normalizedLabelNames}, nil } // NewHistogram creates an OTel Float64Histogram with explicit bucket boundaries. -func (b *Backend) NewHistogram(name, desc string, buckets []float64, _ ...string) *Histogram { +func (b *Backend) NewHistogram(name, desc string, buckets []float64, labelNames ...string) (*Histogram, error) { + normalizedLabelNames, err := validateLabelNames(labelNames) + if err != nil { + return nil, fmt.Errorf("otel: create histogram %q: %w", name, err) + } h, err := b.meter.Float64Histogram(name, metric.WithDescription(desc), metric.WithExplicitBucketBoundaries(buckets...), ) if err != nil { - panic(fmt.Sprintf("otel: create histogram %q: %v", name, err)) + return nil, fmt.Errorf("otel: create histogram %q: %w", name, err) } - return &Histogram{h: h} + return &Histogram{h: h, labelNames: normalizedLabelNames}, nil } -// labelsToAttrs converts a Labels map to OTel attribute key-value pairs. -func labelsToAttrs(labels map[string]string) []attribute.KeyValue { - if len(labels) == 0 { - return nil +func validateLabelNames(labelNames []string) ([]string, error) { + if len(labelNames) == 0 { + return nil, nil } - attrs := make([]attribute.KeyValue, 0, len(labels)) - for k, v := range labels { - attrs = append(attrs, attribute.String(k, v)) + + normalized := make([]string, len(labelNames)) + seen := make(map[string]struct{}, len(labelNames)) + for i, name := range labelNames { + if !metricLabelNameRE.MatchString(name) { + return nil, fmt.Errorf("invalid label name %q", name) + } + if _, exists := seen[name]; exists { + return nil, fmt.Errorf("duplicate label name %q", name) + } + seen[name] = struct{}{} + normalized[i] = name } + + return normalized, nil +} + +func labelsToAttrs(labelNames []string, labels map[string]string) []attribute.KeyValue { + if len(labelNames) == 0 { + if len(labels) > 0 { + log.Printf("WARN: dropping otel metric sample due to unexpected labels: got=%v expected=none", labels) + return nil + } + return []attribute.KeyValue{} + } + + attrs := make([]attribute.KeyValue, 0, len(labelNames)) + for _, labelName := range labelNames { + attrs = append(attrs, attribute.String(labelName, labels[labelName])) + } + + for got := range labels { + found := false + for _, expected := range labelNames { + if got == expected { + found = true + break + } + } + if !found { + log.Printf("WARN: dropping otel metric sample due to unexpected label key %q (expected=%v)", got, labelNames) + return nil + } + } + return attrs } // Counter wraps an OTel Int64Counter. type Counter struct { - c metric.Int64Counter + c metric.Int64Counter + labelNames []string } // Add increments the counter by value. func (c *Counter) Add(ctx context.Context, value int64, labels map[string]string) { - c.c.Add(ctx, value, metric.WithAttributes(labelsToAttrs(labels)...)) + attrs := labelsToAttrs(c.labelNames, labels) + if attrs == nil { + return + } + c.c.Add(ctx, value, metric.WithAttributes(attrs...)) } // UpDownCounter wraps an OTel Int64UpDownCounter. type UpDownCounter struct { - c metric.Int64UpDownCounter + c metric.Int64UpDownCounter + labelNames []string } // Add adjusts the up-down counter by value. func (u *UpDownCounter) Add(ctx context.Context, value int64, labels map[string]string) { - u.c.Add(ctx, value, metric.WithAttributes(labelsToAttrs(labels)...)) + attrs := labelsToAttrs(u.labelNames, labels) + if attrs == nil { + return + } + u.c.Add(ctx, value, metric.WithAttributes(attrs...)) } // Int64Gauge wraps an OTel Int64Gauge. type Int64Gauge struct { - g metric.Int64Gauge + g metric.Int64Gauge + labelNames []string } // Record sets the gauge to value. func (g *Int64Gauge) Record(ctx context.Context, value int64, labels map[string]string) { - g.g.Record(ctx, value, metric.WithAttributes(labelsToAttrs(labels)...)) + attrs := labelsToAttrs(g.labelNames, labels) + if attrs == nil { + return + } + g.g.Record(ctx, value, metric.WithAttributes(attrs...)) } // Float64Gauge wraps an OTel Float64Gauge. type Float64Gauge struct { - g metric.Float64Gauge + g metric.Float64Gauge + labelNames []string } // Record sets the gauge to value. func (g *Float64Gauge) Record(ctx context.Context, value float64, labels map[string]string) { - g.g.Record(ctx, value, metric.WithAttributes(labelsToAttrs(labels)...)) + attrs := labelsToAttrs(g.labelNames, labels) + if attrs == nil { + return + } + g.g.Record(ctx, value, metric.WithAttributes(attrs...)) } // Histogram wraps an OTel Float64Histogram. type Histogram struct { - h metric.Float64Histogram + h metric.Float64Histogram + labelNames []string } // Record observes value in the histogram. func (h *Histogram) Record(ctx context.Context, value float64, labels map[string]string) { - h.h.Record(ctx, value, metric.WithAttributes(labelsToAttrs(labels)...)) + attrs := labelsToAttrs(h.labelNames, labels) + if attrs == nil { + return + } + h.h.Record(ctx, value, metric.WithAttributes(attrs...)) } diff --git a/internal/observability/otel/backend_test.go b/internal/observability/otel/backend_test.go index e527678..ef753e8 100644 --- a/internal/observability/otel/backend_test.go +++ b/internal/observability/otel/backend_test.go @@ -55,7 +55,10 @@ func TestOtelBackendCounter(t *testing.T) { b := newInMemoryBackend(t) defer b.Shutdown(context.Background()) //nolint:errcheck - c := b.NewCounter("gerbil_test_counter_total", "test counter", "result") + c, err := b.NewCounter("gerbil_test_counter_total", "test counter", "result") + if err != nil { + t.Fatalf("NewCounter returned error: %v", err) + } // Should not panic c.Add(context.Background(), 1, map[string]string{"result": "ok"}) c.Add(context.Background(), 5, nil) @@ -65,7 +68,10 @@ func TestOtelBackendUpDownCounter(t *testing.T) { b := newInMemoryBackend(t) defer b.Shutdown(context.Background()) //nolint:errcheck - u := b.NewUpDownCounter("gerbil_test_updown", "test updown", "state") + u, err := b.NewUpDownCounter("gerbil_test_updown", "test updown", "state") + if err != nil { + t.Fatalf("NewUpDownCounter returned error: %v", err) + } u.Add(context.Background(), 3, map[string]string{"state": "active"}) u.Add(context.Background(), -1, map[string]string{"state": "active"}) } @@ -74,7 +80,10 @@ func TestOtelBackendInt64Gauge(t *testing.T) { b := newInMemoryBackend(t) defer b.Shutdown(context.Background()) //nolint:errcheck - g := b.NewInt64Gauge("gerbil_test_int_gauge", "test gauge") + g, err := b.NewInt64Gauge("gerbil_test_int_gauge", "test gauge") + if err != nil { + t.Fatalf("NewInt64Gauge returned error: %v", err) + } g.Record(context.Background(), 42, nil) } @@ -82,7 +91,10 @@ func TestOtelBackendFloat64Gauge(t *testing.T) { b := newInMemoryBackend(t) defer b.Shutdown(context.Background()) //nolint:errcheck - g := b.NewFloat64Gauge("gerbil_test_float_gauge", "test float gauge") + g, err := b.NewFloat64Gauge("gerbil_test_float_gauge", "test float gauge") + if err != nil { + t.Fatalf("NewFloat64Gauge returned error: %v", err) + } g.Record(context.Background(), 3.14, nil) } @@ -90,8 +102,11 @@ func TestOtelBackendHistogram(t *testing.T) { b := newInMemoryBackend(t) defer b.Shutdown(context.Background()) //nolint:errcheck - h := b.NewHistogram("gerbil_test_duration_seconds", "test histogram", + h, err := b.NewHistogram("gerbil_test_duration_seconds", "test histogram", []float64{0.1, 0.5, 1.0}, "method") + if err != nil { + t.Fatalf("NewHistogram returned error: %v", err) + } h.Record(context.Background(), 0.3, map[string]string{"method": "GET"}) } @@ -139,3 +154,22 @@ func TestOtelBackendDeploymentEnvironment(t *testing.T) { } defer b.Shutdown(context.Background()) //nolint:errcheck } + +func TestOtelBackendRejectsInvalidLabelNames(t *testing.T) { + b := newInMemoryBackend(t) + defer b.Shutdown(context.Background()) //nolint:errcheck + + t.Run("duplicate labels", func(t *testing.T) { + _, err := b.NewCounter("gerbil_test_invalid_labels_total", "test counter", "result", "result") + if err == nil { + t.Fatal("expected error for duplicate label names") + } + }) + + t.Run("invalid label name", func(t *testing.T) { + _, err := b.NewHistogram("gerbil_test_invalid_histogram", "test histogram", []float64{0.1, 1.0}, "status-code") + if err == nil { + t.Fatal("expected error for invalid label name") + } + }) +} diff --git a/internal/observability/otel/exporter.go b/internal/observability/otel/exporter.go index 44fe1e2..89950fa 100644 --- a/internal/observability/otel/exporter.go +++ b/internal/observability/otel/exporter.go @@ -3,6 +3,8 @@ package otel import ( "context" "fmt" + "net/url" + "strings" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp" @@ -11,6 +13,10 @@ import ( // newExporter creates the appropriate OTLP exporter based on cfg.Protocol. func newExporter(ctx context.Context, cfg Config) (sdkmetric.Exporter, error) { + if strings.TrimSpace(cfg.Endpoint) == "" { + return nil, fmt.Errorf("otel: cfg.Endpoint is empty") + } + switch cfg.Protocol { case "grpc", "": return newGRPCExporter(ctx, cfg) @@ -36,8 +42,20 @@ func newGRPCExporter(ctx context.Context, cfg Config) (sdkmetric.Exporter, error } func newHTTPExporter(ctx context.Context, cfg Config) (sdkmetric.Exporter, error) { - opts := []otlpmetrichttp.Option{ - otlpmetrichttp.WithEndpoint(cfg.Endpoint), + endpoint := strings.TrimSpace(cfg.Endpoint) + + opts := make([]otlpmetrichttp.Option, 0, 3) + if strings.Contains(endpoint, "://") { + parsed, err := url.Parse(endpoint) + if err != nil { + return nil, fmt.Errorf("otlp http exporter: parse endpoint URL %q: %w", endpoint, err) + } + opts = append(opts, otlpmetrichttp.WithEndpointURL(parsed.String())) + } else { + opts = append(opts, + otlpmetrichttp.WithEndpoint(endpoint), + otlpmetrichttp.WithURLPath("/v1/metrics"), + ) } if cfg.Insecure { opts = append(opts, otlpmetrichttp.WithInsecure()) From 191b4fa26a390b3d5651ed8fe7eb9b5dce76174b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Mon, 4 May 2026 00:12:24 +0200 Subject: [PATCH 5/8] feat(prometheus): robust label validation and registration handling --- internal/observability/prometheus/backend.go | 183 +++++++++++++++--- .../observability/prometheus/backend_test.go | 78 +++++++- 2 files changed, 222 insertions(+), 39 deletions(-) diff --git a/internal/observability/prometheus/backend.go b/internal/observability/prometheus/backend.go index f2744c1..a513b88 100644 --- a/internal/observability/prometheus/backend.go +++ b/internal/observability/prometheus/backend.go @@ -7,6 +7,7 @@ package prometheus import ( "context" + "log" "net/http" "github.com/prometheus/client_golang/prometheus" @@ -30,9 +31,10 @@ type Config struct { // in the backend-specific instrument types that implement the observability // instrument interfaces. type Backend struct { - cfg Config - registry *prometheus.Registry - handler http.Handler + cfg Config + registry *prometheus.Registry + handler http.Handler + droppedSamplesCounter prometheus.Counter } // New creates and initialises a Prometheus backend. @@ -48,6 +50,11 @@ func New(cfg Config) (*Backend, error) { } registry := prometheus.NewRegistry() + droppedSamplesCounter := prometheus.NewCounter(prometheus.CounterOpts{ + Name: "gerbil_dropped_metric_samples_total", + Help: "Total number of metric samples dropped due to invalid labels or unsupported label sets", + }) + registry.MustRegister(droppedSamplesCounter) // Include Go and process metrics by default. includeGo := cfg.IncludeGoMetrics == nil || *cfg.IncludeGoMetrics @@ -62,7 +69,7 @@ func New(cfg Config) (*Backend, error) { EnableOpenMetrics: false, }) - return &Backend{cfg: cfg, registry: registry, handler: handler}, nil + return &Backend{cfg: cfg, registry: registry, handler: handler, droppedSamplesCounter: droppedSamplesCounter}, nil } // HTTPHandler returns the Prometheus /metrics HTTP handler. @@ -78,60 +85,107 @@ func (b *Backend) Shutdown(_ context.Context) error { } // NewCounter creates a Prometheus CounterVec registered on the backend's registry. -func (b *Backend) NewCounter(name, desc string, labelNames ...string) *Counter { +func (b *Backend) NewCounter(name, desc string, labelNames ...string) (*Counter, error) { vec := prometheus.NewCounterVec(prometheus.CounterOpts{ Name: name, Help: desc, }, labelNames) - b.registry.MustRegister(vec) - return &Counter{vec: vec} + if err := b.registry.Register(vec); err != nil { + if are, ok := err.(prometheus.AlreadyRegisteredError); ok { + existing, ok := are.ExistingCollector.(*prometheus.CounterVec) + if !ok { + return nil, err + } + return &Counter{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil + } + return nil, err + } + return &Counter{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil } // NewUpDownCounter creates a Prometheus GaugeVec (Prometheus gauges are // bidirectional) registered on the backend's registry. -func (b *Backend) NewUpDownCounter(name, desc string, labelNames ...string) *UpDownCounter { +func (b *Backend) NewUpDownCounter(name, desc string, labelNames ...string) (*UpDownCounter, error) { vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: name, Help: desc, }, labelNames) - b.registry.MustRegister(vec) - return &UpDownCounter{vec: vec} + if err := b.registry.Register(vec); err != nil { + if are, ok := err.(prometheus.AlreadyRegisteredError); ok { + existing, ok := are.ExistingCollector.(*prometheus.GaugeVec) + if !ok { + return nil, err + } + return &UpDownCounter{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil + } + return nil, err + } + return &UpDownCounter{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil } // NewInt64Gauge creates a Prometheus GaugeVec registered on the backend's registry. -func (b *Backend) NewInt64Gauge(name, desc string, labelNames ...string) *Int64Gauge { +func (b *Backend) NewInt64Gauge(name, desc string, labelNames ...string) (*Int64Gauge, error) { vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: name, Help: desc, }, labelNames) - b.registry.MustRegister(vec) - return &Int64Gauge{vec: vec} + if err := b.registry.Register(vec); err != nil { + if are, ok := err.(prometheus.AlreadyRegisteredError); ok { + existing, ok := are.ExistingCollector.(*prometheus.GaugeVec) + if !ok { + return nil, err + } + return &Int64Gauge{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil + } + return nil, err + } + return &Int64Gauge{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil } // NewFloat64Gauge creates a Prometheus GaugeVec registered on the backend's registry. -func (b *Backend) NewFloat64Gauge(name, desc string, labelNames ...string) *Float64Gauge { +func (b *Backend) NewFloat64Gauge(name, desc string, labelNames ...string) (*Float64Gauge, error) { vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: name, Help: desc, }, labelNames) - b.registry.MustRegister(vec) - return &Float64Gauge{vec: vec} + if err := b.registry.Register(vec); err != nil { + if are, ok := err.(prometheus.AlreadyRegisteredError); ok { + existing, ok := are.ExistingCollector.(*prometheus.GaugeVec) + if !ok { + return nil, err + } + return &Float64Gauge{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil + } + return nil, err + } + return &Float64Gauge{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil } // NewHistogram creates a Prometheus HistogramVec registered on the backend's registry. -func (b *Backend) NewHistogram(name, desc string, buckets []float64, labelNames ...string) *Histogram { +func (b *Backend) NewHistogram(name, desc string, buckets []float64, labelNames ...string) (*Histogram, error) { vec := prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: name, Help: desc, Buckets: buckets, }, labelNames) - b.registry.MustRegister(vec) - return &Histogram{vec: vec} + if err := b.registry.Register(vec); err != nil { + if are, ok := err.(prometheus.AlreadyRegisteredError); ok { + existing, ok := are.ExistingCollector.(*prometheus.HistogramVec) + if !ok { + return nil, err + } + return &Histogram{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil + } + return nil, err + } + return &Histogram{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil } // Counter is a native Prometheus counter instrument. type Counter struct { - vec *prometheus.CounterVec + vec *prometheus.CounterVec + labelNames []string + droppedSamplesCounter prometheus.Counter } // Add increments the counter by value for the given labels. @@ -139,47 +193,118 @@ type Counter struct { // value must be non-negative. Negative values are ignored. func (c *Counter) Add(_ context.Context, value int64, labels map[string]string) { if value < 0 { + log.Printf("WARN: counter add called with negative value=%d labels=%v expected_labels=%v", value, labels, c.labelNames) return } - c.vec.With(prometheus.Labels(labels)).Add(float64(value)) + normalized, ok := normalizeLabels(c.labelNames, labels, c.droppedSamplesCounter) + if !ok { + return + } + defer guardMetricPanic("counter", c.labelNames, labels) + c.vec.With(normalized).Add(float64(value)) } // UpDownCounter is a native Prometheus gauge used as a bidirectional counter. type UpDownCounter struct { - vec *prometheus.GaugeVec + vec *prometheus.GaugeVec + labelNames []string + droppedSamplesCounter prometheus.Counter } // Add adjusts the gauge by value for the given labels. func (u *UpDownCounter) Add(_ context.Context, value int64, labels map[string]string) { - u.vec.With(prometheus.Labels(labels)).Add(float64(value)) + normalized, ok := normalizeLabels(u.labelNames, labels, u.droppedSamplesCounter) + if !ok { + return + } + defer guardMetricPanic("updown", u.labelNames, labels) + u.vec.With(normalized).Add(float64(value)) } // Int64Gauge is a native Prometheus gauge recording integer snapshot values. type Int64Gauge struct { - vec *prometheus.GaugeVec + vec *prometheus.GaugeVec + labelNames []string + droppedSamplesCounter prometheus.Counter } // Record sets the gauge to value for the given labels. func (g *Int64Gauge) Record(_ context.Context, value int64, labels map[string]string) { - g.vec.With(prometheus.Labels(labels)).Set(float64(value)) + normalized, ok := normalizeLabels(g.labelNames, labels, g.droppedSamplesCounter) + if !ok { + return + } + defer guardMetricPanic("int64-gauge", g.labelNames, labels) + g.vec.With(normalized).Set(float64(value)) } // Float64Gauge is a native Prometheus gauge recording float snapshot values. type Float64Gauge struct { - vec *prometheus.GaugeVec + vec *prometheus.GaugeVec + labelNames []string + droppedSamplesCounter prometheus.Counter } // Record sets the gauge to value for the given labels. func (g *Float64Gauge) Record(_ context.Context, value float64, labels map[string]string) { - g.vec.With(prometheus.Labels(labels)).Set(value) + normalized, ok := normalizeLabels(g.labelNames, labels, g.droppedSamplesCounter) + if !ok { + return + } + defer guardMetricPanic("float64-gauge", g.labelNames, labels) + g.vec.With(normalized).Set(value) } // Histogram is a native Prometheus histogram instrument. type Histogram struct { - vec *prometheus.HistogramVec + vec *prometheus.HistogramVec + labelNames []string + droppedSamplesCounter prometheus.Counter } // Record observes value for the given labels. func (h *Histogram) Record(_ context.Context, value float64, labels map[string]string) { - h.vec.With(prometheus.Labels(labels)).Observe(value) + normalized, ok := normalizeLabels(h.labelNames, labels, h.droppedSamplesCounter) + if !ok { + return + } + defer guardMetricPanic("histogram", h.labelNames, labels) + h.vec.With(normalized).Observe(value) +} + +func normalizeLabels(labelNames []string, labels map[string]string, droppedSamplesCounter prometheus.Counter) (prometheus.Labels, bool) { + if len(labelNames) == 0 { + if len(labels) > 0 { + if droppedSamplesCounter != nil { + droppedSamplesCounter.Inc() + } + log.Printf("WARN: dropping metric sample due to unexpected labels: got=%v expected=none", labels) + return nil, false + } + return nil, true + } + + normalized := make(prometheus.Labels, len(labelNames)) + for _, name := range labelNames { + normalized[name] = "" + } + + for k, v := range labels { + if _, ok := normalized[k]; !ok { + if droppedSamplesCounter != nil { + droppedSamplesCounter.Inc() + } + log.Printf("WARN: dropping metric sample due to unexpected label key %q (expected=%v)", k, labelNames) + return nil, false + } + normalized[k] = v + } + + return normalized, true +} + +func guardMetricPanic(kind string, expected []string, labels map[string]string) { + if recovered := recover(); recovered != nil { + log.Printf("WARN: dropped %s metric sample due to label panic: expected=%v got=%v err=%v", kind, expected, labels, recovered) + } } diff --git a/internal/observability/prometheus/backend_test.go b/internal/observability/prometheus/backend_test.go index d60821f..23f4b90 100644 --- a/internal/observability/prometheus/backend_test.go +++ b/internal/observability/prometheus/backend_test.go @@ -36,7 +36,10 @@ func TestPrometheusBackendShutdown(t *testing.T) { func TestPrometheusBackendCounter(t *testing.T) { b := newTestBackend(t) - c := b.NewCounter("test_counter_total", "A test counter", "result") + c, err := b.NewCounter("test_counter_total", "A test counter", "result") + if err != nil { + t.Fatalf("NewCounter returned error: %v", err) + } c.Add(context.Background(), 3, map[string]string{"result": "ok"}) body := scrapeMetrics(t, b) @@ -45,7 +48,10 @@ func TestPrometheusBackendCounter(t *testing.T) { func TestPrometheusBackendUpDownCounter(t *testing.T) { b := newTestBackend(t) - u := b.NewUpDownCounter("test_gauge_total", "A test up-down counter", "state") + u, err := b.NewUpDownCounter("test_gauge_total", "A test up-down counter", "state") + if err != nil { + t.Fatalf("NewUpDownCounter returned error: %v", err) + } u.Add(context.Background(), 5, map[string]string{"state": "active"}) u.Add(context.Background(), -2, map[string]string{"state": "active"}) @@ -55,7 +61,10 @@ func TestPrometheusBackendUpDownCounter(t *testing.T) { func TestPrometheusBackendInt64Gauge(t *testing.T) { b := newTestBackend(t) - g := b.NewInt64Gauge("test_int_gauge", "An integer gauge", "ifname") + g, err := b.NewInt64Gauge("test_int_gauge", "An integer gauge", "ifname") + if err != nil { + t.Fatalf("NewInt64Gauge returned error: %v", err) + } g.Record(context.Background(), 42, map[string]string{"ifname": "wg0"}) body := scrapeMetrics(t, b) @@ -64,7 +73,10 @@ func TestPrometheusBackendInt64Gauge(t *testing.T) { func TestPrometheusBackendFloat64Gauge(t *testing.T) { b := newTestBackend(t) - g := b.NewFloat64Gauge("test_float_gauge", "A float gauge", "cert") + g, err := b.NewFloat64Gauge("test_float_gauge", "A float gauge", "cert") + if err != nil { + t.Fatalf("NewFloat64Gauge returned error: %v", err) + } g.Record(context.Background(), 7.5, map[string]string{"cert": "example.com"}) body := scrapeMetrics(t, b) @@ -74,7 +86,10 @@ func TestPrometheusBackendFloat64Gauge(t *testing.T) { func TestPrometheusBackendHistogram(t *testing.T) { b := newTestBackend(t) buckets := []float64{0.1, 0.5, 1.0, 5.0} - h := b.NewHistogram("test_duration_seconds", "A test histogram", buckets, "method") + h, err := b.NewHistogram("test_duration_seconds", "A test histogram", buckets, "method") + if err != nil { + t.Fatalf("NewHistogram returned error: %v", err) + } h.Record(context.Background(), 0.3, map[string]string{"method": "GET"}) body := scrapeMetrics(t, b) @@ -85,7 +100,10 @@ func TestPrometheusBackendHistogram(t *testing.T) { func TestPrometheusBackendMultipleLabels(t *testing.T) { b := newTestBackend(t) - c := b.NewCounter("multi_label_total", "Multi-label counter", "method", "route", "status_code") + c, err := b.NewCounter("multi_label_total", "Multi-label counter", "method", "route", "status_code") + if err != nil { + t.Fatalf("NewCounter returned error: %v", err) + } c.Add(context.Background(), 1, map[string]string{ "method": "POST", "route": "/api/peers", @@ -122,23 +140,29 @@ func TestPrometheusBackendNoGoMetrics(t *testing.T) { func TestPrometheusBackendNilLabels(t *testing.T) { // Adding with nil labels should not panic (treated as empty map). b := newTestBackend(t) - c := b.NewCounter("nil_labels_total", "counter with no labels") + c, err := b.NewCounter("nil_labels_total", "counter with no labels") + if err != nil { + t.Fatalf("NewCounter returned error: %v", err) + } // nil labels with no label names declared should be safe c.Add(context.Background(), 1, nil) } func TestPrometheusBackendConcurrentAdd(t *testing.T) { b := newTestBackend(t) - c := b.NewCounter("concurrent_total", "concurrent counter", "worker") + c, err := b.NewCounter("concurrent_total", "concurrent counter", "worker") + if err != nil { + t.Fatalf("NewCounter returned error: %v", err) + } done := make(chan struct{}) for i := 0; i < 10; i++ { - go func(_ int) { + go func() { for j := 0; j < 100; j++ { c.Add(context.Background(), 1, map[string]string{"worker": "w"}) } done <- struct{}{} - }(i) + }() } for i := 0; i < 10; i++ { <-done @@ -148,6 +172,40 @@ func TestPrometheusBackendConcurrentAdd(t *testing.T) { assertMetricPresent(t, body, `concurrent_total{worker="w"} 1000`) } +func TestPrometheusBackendAlreadyRegisteredCounter(t *testing.T) { + b := newTestBackend(t) + c1, err := b.NewCounter("dupe_counter_total", "duplicate counter", "result") + if err != nil { + t.Fatalf("first NewCounter returned error: %v", err) + } + c2, err := b.NewCounter("dupe_counter_total", "duplicate counter", "result") + if err != nil { + t.Fatalf("second NewCounter returned error: %v", err) + } + + c1.Add(context.Background(), 1, map[string]string{"result": "ok"}) + c2.Add(context.Background(), 2, map[string]string{"result": "ok"}) + + body := scrapeMetrics(t, b) + assertMetricPresent(t, body, `dupe_counter_total{result="ok"} 3`) +} + +func TestPrometheusBackendInvalidLabelsNoPanic(t *testing.T) { + b := newTestBackend(t) + c, err := b.NewCounter("invalid_labels_total", "invalid labels test", "result") + if err != nil { + t.Fatalf("NewCounter returned error: %v", err) + } + + // Extra label key should be dropped and must not panic. + c.Add(context.Background(), 5, map[string]string{"result": "ok", "unexpected": "x"}) + + body := scrapeMetrics(t, b) + if strings.Contains(body, `invalid_labels_total{result="ok"}`) { + t.Error("invalid label sample should have been dropped") + } +} + // --- helpers --- func scrapeMetrics(t *testing.T, b *obsprom.Backend) string { From cda6fa677295aa48ce1781092675b78c35b793b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Mon, 4 May 2026 00:12:24 +0200 Subject: [PATCH 6/8] feat(cli/proxy): add OTLP timeout flag and make proxy metrics resilient --- main.go | 17 ++++++++++++++++- proxy/proxy.go | 20 ++++++++++---------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/main.go b/main.go index 62bfe7c..2a31c2b 100644 --- a/main.go +++ b/main.go @@ -175,6 +175,7 @@ func main() { otelMetricsEndpoint string otelMetricsInsecure bool otelMetricsExportInterval time.Duration + otelMetricsTimeout time.Duration ) interfaceName = os.Getenv("INTERFACE") @@ -229,6 +230,14 @@ func main() { log.Printf("WARN: invalid OTEL_METRICS_EXPORT_INTERVAL=%q: %v", v, err2) } } + otelMetricsTimeout = 10 * time.Second // default + if v := os.Getenv("OTEL_METRICS_TIMEOUT"); v != "" { + if d, err2 := time.ParseDuration(v); err2 == nil { + otelMetricsTimeout = d + } else { + log.Printf("WARN: invalid OTEL_METRICS_TIMEOUT=%q: %v", v, err2) + } + } if interfaceName == "" { flag.StringVar(&interfaceName, "interface", "wg0", "Name of the WireGuard interface") @@ -322,6 +331,7 @@ func main() { flag.StringVar(&otelMetricsEndpoint, "otel-metrics-endpoint", otelMetricsEndpoint, "OTLP collector endpoint (e.g. localhost:4317)") flag.BoolVar(&otelMetricsInsecure, "otel-metrics-insecure", otelMetricsInsecure, "Disable TLS for OTLP connection") flag.DurationVar(&otelMetricsExportInterval, "otel-metrics-export-interval", otelMetricsExportInterval, "Interval between OTLP metric pushes") + flag.DurationVar(&otelMetricsTimeout, "otel-metrics-timeout", otelMetricsTimeout, "Timeout for OTLP exporter setup") flag.Parse() @@ -347,6 +357,7 @@ func main() { Endpoint: otelMetricsEndpoint, Insecure: otelMetricsInsecure, ExportInterval: otelMetricsExportInterval, + Timeout: otelMetricsTimeout, }, ServiceName: "gerbil", ServiceVersion: "1.0.0", @@ -543,6 +554,8 @@ func main() { // Register metrics endpoint only for Prometheus backend. // OTel backend pushes to a collector; no /metrics endpoint needed. + // Note: metricsPath is registered directly without httpMetricsMiddleware to prevent infinite recursion. + // The metricsHandler must not be wrapped by the middleware, as it would observe its own observation calls. if metricsHandler != nil { http.Handle(metricsPath, metricsHandler) logger.Info("Metrics endpoint enabled at %s", metricsPath) @@ -1162,10 +1175,12 @@ func removePeerInternal(publicKey string) error { // Get current peer info before removing to clear relay connections and bandwidth limits var wgIPs []string + allowedIPsCount := 0 device, err := wgClient.Device(interfaceName) if err == nil { for _, peer := range device.Peers { if peer.PublicKey.String() == publicKey { + allowedIPsCount = len(peer.AllowedIPs) // Extract WireGuard IPs from this peer's allowed IPs for _, allowedIP := range peer.AllowedIPs { wgIPs = append(wgIPs, allowedIP.IP.String()) @@ -1208,7 +1223,7 @@ func removePeerInternal(publicKey string) error { // Record metrics metrics.RecordPeersTotal(interfaceName, -1) - metrics.RecordAllowedIPsCount(interfaceName, publicKey, -int64(len(wgIPs))) + metrics.RecordAllowedIPsCount(interfaceName, publicKey, -int64(allowedIPsCount)) return nil } diff --git a/proxy/proxy.go b/proxy/proxy.go index 71cf4ed..9b46e10 100644 --- a/proxy/proxy.go +++ b/proxy/proxy.go @@ -548,7 +548,7 @@ func (p *SNIProxy) handleConnection(clientConn net.Conn) { logger.Debug("SNI extraction failed: %v", err) return } - metrics.RecordProxyTLSHandshake(hostname, time.Since(clientHelloStart).Seconds()) + metrics.RecordProxyTLSHandshake(time.Since(clientHelloStart).Seconds()) if hostname == "" { log.Println("No SNI hostname found") @@ -596,8 +596,8 @@ func (p *SNIProxy) handleConnection(clientConn net.Conn) { defer targetConn.Close() logger.Debug("Connected to target: %s:%d", route.TargetHost, route.TargetPort) - metrics.RecordActiveProxyConnection(hostname, 1) - defer metrics.RecordActiveProxyConnection(hostname, -1) + metrics.RecordActiveProxyConnection(1) + defer metrics.RecordActiveProxyConnection(-1) // Send PROXY protocol header if enabled if p.proxyProtocol { @@ -655,7 +655,7 @@ func (p *SNIProxy) getRoute(hostname, clientAddr string) (*RouteRecord, error) { // Check local overrides first if _, isOverride := p.localOverrides[hostname]; isOverride { logger.Debug("Local override matched for hostname: %s", hostname) - metrics.RecordProxyRouteLookup("local_override", hostname) + metrics.RecordProxyRouteLookup("local_override") return &RouteRecord{ Hostname: hostname, TargetHost: p.localProxyAddr, @@ -668,7 +668,7 @@ func (p *SNIProxy) getRoute(hostname, clientAddr string) (*RouteRecord, error) { _, isLocal := p.localSNIs[hostname] p.localSNIsLock.RUnlock() if isLocal { - metrics.RecordProxyRouteLookup("local", hostname) + metrics.RecordProxyRouteLookup("local") return &RouteRecord{ Hostname: hostname, TargetHost: p.localProxyAddr, @@ -679,16 +679,16 @@ func (p *SNIProxy) getRoute(hostname, clientAddr string) (*RouteRecord, error) { // Check cache first if cached, found := p.cache.Get(hostname); found { if cached == nil { - metrics.RecordProxyRouteLookup("cached_not_found", hostname) + metrics.RecordProxyRouteLookup("cached_not_found") return nil, nil // Cached negative result } logger.Debug("Cache hit for hostname: %s", hostname) - metrics.RecordProxyRouteLookup("cache_hit", hostname) + metrics.RecordProxyRouteLookup("cache_hit") return cached.(*RouteRecord), nil } logger.Debug("Cache miss for hostname: %s, querying API", hostname) - metrics.RecordProxyRouteLookup("cache_miss", hostname) + metrics.RecordProxyRouteLookup("cache_miss") // Query API with timeout ctx, cancel := context.WithTimeout(p.ctx, 5*time.Second) @@ -822,7 +822,7 @@ func (p *SNIProxy) pipe(hostname string, clientConn, targetConn net.Conn, client }() bytesCopied, err := io.CopyBuffer(targetConn, clientReader, *bufPtr) - metrics.RecordProxyBytesTransmitted(hostname, "client_to_target", bytesCopied) + metrics.RecordProxyBytesTransmitted("client_to_target", bytesCopied) if err != nil && err != io.EOF { logger.Debug("Copy client->target error: %v", err) } @@ -842,7 +842,7 @@ func (p *SNIProxy) pipe(hostname string, clientConn, targetConn net.Conn, client }() bytesCopied, err := io.CopyBuffer(clientConn, targetConn, *bufPtr) - metrics.RecordProxyBytesTransmitted(hostname, "target_to_client", bytesCopied) + metrics.RecordProxyBytesTransmitted("target_to_client", bytesCopied) if err != nil && err != io.EOF { logger.Debug("Copy target->client error: %v", err) } From 3f95e2da255c4592a932b3297ed4e256a7bb7aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Mon, 4 May 2026 00:21:10 +0200 Subject: [PATCH 7/8] fix(otel): revert semconv version and correct deployment environment attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marc Schäfer --- internal/observability/otel/resource.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/observability/otel/resource.go b/internal/observability/otel/resource.go index 47a14ff..b0f3b11 100644 --- a/internal/observability/otel/resource.go +++ b/internal/observability/otel/resource.go @@ -3,7 +3,7 @@ package otel import ( "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/sdk/resource" - semconv "go.opentelemetry.io/otel/semconv/v1.40.0" + semconv "go.opentelemetry.io/otel/semconv/v1.18.0" ) // newResource builds an OTel resource for the Gerbil service. @@ -15,11 +15,11 @@ func newResource(serviceName, serviceVersion, deploymentEnv string) (*resource.R attrs = append(attrs, semconv.ServiceVersion(serviceVersion)) } if deploymentEnv != "" { - attrs = append(attrs, semconv.DeploymentEnvironmentName(deploymentEnv)) + attrs = append(attrs, semconv.DeploymentEnvironment(deploymentEnv)) } return resource.Merge( resource.Default(), - resource.NewWithAttributes(semconv.SchemaURL, attrs...), + resource.NewSchemaless(attrs...), ) } From 375cb8b0bae6c0a8e2bf50ec7355cce4e1ab8c6a Mon Sep 17 00:00:00 2001 From: Owen Date: Sun, 3 May 2026 15:26:52 -0700 Subject: [PATCH 8/8] Update CODEOWNERS --- .github/CODEOWNERS | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7d8c330..c5f1403 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,2 +1 @@ * @oschwartz10612 @miloschwartz -internal/observability/** @marcschaeferger