Merge pull request #79 from fosrl/observability-update

Enhance observability with OTLP timeout and API unification
This commit is contained in:
Owen Schwartz
2026-05-03 15:27:44 -07:00
committed by GitHub
20 changed files with 1142 additions and 278 deletions

View File

@@ -138,7 +138,7 @@ make
### Binary
Make sure to have Go 1.23.1 installed.
Make sure to have Go 1.26 installed.
```bash
make local

View File

@@ -83,6 +83,7 @@ type OTelConfig struct {
Endpoint string // default: "localhost:4317"
Insecure bool // default: true
ExportInterval time.Duration // default: 60s
Timeout time.Duration // default: 10s
}
```
@@ -97,6 +98,7 @@ type OTelConfig struct {
| `OTEL_METRICS_ENDPOINT` | `localhost:4317` | OTLP collector address |
| `OTEL_METRICS_INSECURE` | `true` | Disable TLS for OTLP |
| `OTEL_METRICS_EXPORT_INTERVAL` | `60s` | Push interval (e.g. `10s`, `1m`) |
| `OTEL_METRICS_TIMEOUT` | `10s` | Timeout for OTLP exporter connection setup |
| `DEPLOYMENT_ENVIRONMENT` | _(unset)_ | OTel deployment.environment attribute |
### CLI flags
@@ -108,7 +110,8 @@ type OTelConfig struct {
--otel-metrics-protocol string (default: grpc)
--otel-metrics-endpoint string (default: localhost:4317)
--otel-metrics-insecure bool (default: true)
--otel-metrics-export-interval duration (default: 1m0s)
--otel-metrics-export-interval duration (default: 60s)
--otel-metrics-timeout duration (default: 10s)
```
---
@@ -164,6 +167,7 @@ export OTEL_METRICS_PROTOCOL=grpc
export OTEL_METRICS_ENDPOINT=otel-collector:4317
export OTEL_METRICS_INSECURE=true
export OTEL_METRICS_EXPORT_INTERVAL=10s
export OTEL_METRICS_TIMEOUT=10s
export DEPLOYMENT_ENVIRONMENT=production
```
@@ -176,6 +180,7 @@ export DEPLOYMENT_ENVIRONMENT=production
--otel-metrics-endpoint=otel-collector:4317 \
--otel-metrics-insecure \
--otel-metrics-export-interval=10s \
--otel-metrics-timeout=10s \
--config=/etc/gerbil/config.json
```
@@ -225,7 +230,6 @@ All metrics use the prefix `gerbil_<component>_<name>`.
| Metric | Type | Labels |
|--------|------|--------|
| `gerbil_proxy_mapping_active` | UpDownCounter | `ifname` |
| `gerbil_session_active` | UpDownCounter | `ifname` |
| `gerbil_active_sessions` | UpDownCounter | `ifname` |
| `gerbil_udp_packets_total` | Counter | `ifname`, `type`, `direction` |
| `gerbil_hole_punch_events_total` | Counter | `ifname`, `result` |
@@ -256,7 +260,7 @@ The `docker-compose.metrics.yml` provides a complete observability stack.
**Prometheus mode:**
```bash
METRICS_BACKEND=prometheus docker-compose -f docker-compose.metrics.yml up -d
METRICS_BACKEND=prometheus docker-compose -f docker compose.metrics.yml up -d
# Scrape at http://localhost:3003/metrics
# Grafana at http://localhost:3000 (admin/admin)
```
@@ -265,5 +269,5 @@ METRICS_BACKEND=prometheus docker-compose -f docker-compose.metrics.yml up -d
```bash
METRICS_BACKEND=otel OTEL_METRICS_ENDPOINT=otel-collector:4317 \
docker-compose -f docker-compose.metrics.yml up -d
docker compose -f docker-compose.metrics.yml up -d
```

View File

@@ -1,3 +1,4 @@
file_format: '1.0'
receivers:
otlp:
protocols:
@@ -43,4 +44,4 @@ service:
metrics:
receivers: [otlp]
processors: [batch, resource]
exporters: [prometheus, prometheusremotewrite, debug]
exporters: [prometheus, prometheusremotewrite, debug]

24
go.mod
View File

@@ -6,12 +6,12 @@ require (
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/prometheus/client_golang v1.20.5
github.com/vishvananda/netlink v1.3.1
go.opentelemetry.io/otel v1.42.0
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0
go.opentelemetry.io/otel/metric v1.42.0
go.opentelemetry.io/otel/sdk v1.42.0
go.opentelemetry.io/otel/sdk/metric v1.42.0
go.opentelemetry.io/otel v1.43.0
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0
go.opentelemetry.io/otel/metric v1.43.0
go.opentelemetry.io/otel/sdk v1.43.0
go.opentelemetry.io/otel/sdk/metric v1.43.0
golang.org/x/crypto v0.49.0
golang.org/x/sync v0.20.0
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20230429144221-925a1e7659e6
@@ -37,14 +37,14 @@ require (
github.com/prometheus/procfs v0.15.1 // indirect
github.com/vishvananda/netns v0.0.5 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/otel/trace v1.42.0 // indirect
go.opentelemetry.io/proto/otlp v1.9.0 // indirect
golang.org/x/net v0.51.0 // indirect
go.opentelemetry.io/otel/trace v1.43.0 // indirect
go.opentelemetry.io/proto/otlp v1.10.0 // indirect
golang.org/x/net v0.52.0 // indirect
golang.org/x/sys v0.42.0 // indirect
golang.org/x/text v0.35.0 // indirect
golang.zx2c4.com/wireguard v0.0.0-20230325221338-052af4a8072b // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 // indirect
google.golang.org/grpc v1.79.3 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
google.golang.org/grpc v1.80.0 // indirect
google.golang.org/protobuf v1.36.11 // indirect
)

52
go.sum
View File

@@ -55,26 +55,26 @@ github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zd
github.com/vishvananda/netns v0.0.5/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM=
go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
go.opentelemetry.io/otel v1.42.0 h1:lSQGzTgVR3+sgJDAU/7/ZMjN9Z+vUip7leaqBKy4sho=
go.opentelemetry.io/otel v1.42.0/go.mod h1:lJNsdRMxCUIWuMlVJWzecSMuNjE7dOYyWlqOXWkdqCc=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0 h1:MdKucPl/HbzckWWEisiNqMPhRrAOQX8r4jTuGr636gk=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0/go.mod h1:RolT8tWtfHcjajEH5wFIZ4Dgh5jpPdFXYV9pTAk/qjc=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0 h1:H7O6RlGOMTizyl3R08Kn5pdM06bnH8oscSj7o11tmLA=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0/go.mod h1:mBFWu/WOVDkWWsR7Tx7h6EpQB8wsv7P0Yrh0Pb7othc=
go.opentelemetry.io/otel/metric v1.42.0 h1:2jXG+3oZLNXEPfNmnpxKDeZsFI5o4J+nz6xUlaFdF/4=
go.opentelemetry.io/otel/metric v1.42.0/go.mod h1:RlUN/7vTU7Ao/diDkEpQpnz3/92J9ko05BIwxYa2SSI=
go.opentelemetry.io/otel/sdk v1.42.0 h1:LyC8+jqk6UJwdrI/8VydAq/hvkFKNHZVIWuslJXYsDo=
go.opentelemetry.io/otel/sdk v1.42.0/go.mod h1:rGHCAxd9DAph0joO4W6OPwxjNTYWghRWmkHuGbayMts=
go.opentelemetry.io/otel/sdk/metric v1.42.0 h1:D/1QR46Clz6ajyZ3G8SgNlTJKBdGp84q9RKCAZ3YGuA=
go.opentelemetry.io/otel/sdk/metric v1.42.0/go.mod h1:Ua6AAlDKdZ7tdvaQKfSmnFTdHx37+J4ba8MwVCYM5hc=
go.opentelemetry.io/otel/trace v1.42.0 h1:OUCgIPt+mzOnaUTpOQcBiM/PLQ/Op7oq6g4LenLmOYY=
go.opentelemetry.io/otel/trace v1.42.0/go.mod h1:f3K9S+IFqnumBkKhRJMeaZeNk9epyhnCmQh/EysQCdc=
go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A=
go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4=
go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 h1:8UQVDcZxOJLtX6gxtDt3vY2WTgvZqMQRzjsqiIHQdkc=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0/go.mod h1:2lmweYCiHYpEjQ/lSJBYhj9jP1zvCvQW4BqL9dnT7FQ=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 h1:w1K+pCJoPpQifuVpsKamUdn9U0zM3xUziVOqsGksUrY=
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0/go.mod h1:HBy4BjzgVE8139ieRI75oXm3EcDN+6GhD88JT1Kjvxg=
go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg=
go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg=
go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw=
go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A=
go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g=
go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk=
golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4=
golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA=
golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo=
golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y=
golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0=
golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
@@ -87,14 +87,14 @@ golang.zx2c4.com/wireguard v0.0.0-20230325221338-052af4a8072b h1:J1CaxgLerRR5lgx
golang.zx2c4.com/wireguard v0.0.0-20230325221338-052af4a8072b/go.mod h1:tqur9LnfstdR9ep2LaJT4lFUl0EjlHtge+gAjmsHUG4=
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20230429144221-925a1e7659e6 h1:CawjfCvYQH2OU3/TnxLx97WDSUDRABfT18pCOYwc2GE=
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20230429144221-925a1e7659e6/go.mod h1:3rxYc4HtVcSG9gVaTs2GEBdehh+sYPOwKtyUWEOTb80=
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0=
google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY=
google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac=
google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ=
google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE=
google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ=
gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=

View File

@@ -7,7 +7,9 @@ package metrics
import (
"context"
"fmt"
"net/http"
"sync"
"github.com/fosrl/gerbil/internal/observability"
)
@@ -24,6 +26,7 @@ type OTelConfig = observability.OTelConfig
var (
backend observability.Backend
initMu sync.Mutex
// Interface and peer metrics
wgInterfaceUp observability.Int64Gauge
@@ -55,7 +58,6 @@ var (
udpPacketSizeBytes observability.Histogram
holePunchEventsTotal observability.Counter
proxyMappingActive observability.UpDownCounter
sessionActive observability.UpDownCounter
sessionRebuiltTotal observability.Counter
commPatternActive observability.UpDownCounter
proxyCleanupRemovedTotal observability.Counter
@@ -107,6 +109,13 @@ func DefaultConfig() Config {
// Initialize sets up the metrics system using the selected backend.
// It returns the /metrics HTTP handler (non-nil only for Prometheus backend).
func Initialize(cfg Config) (http.Handler, error) {
initMu.Lock()
defer initMu.Unlock()
if backend != nil {
return backend.HTTPHandler(), nil
}
b, err := observability.New(cfg)
if err != nil {
return nil, err
@@ -114,6 +123,7 @@ func Initialize(cfg Config) (http.Handler, error) {
backend = b
if err := createInstruments(); err != nil {
backend = nil
return nil, err
}
@@ -122,8 +132,13 @@ func Initialize(cfg Config) (http.Handler, error) {
// Shutdown gracefully shuts down the metrics backend.
func Shutdown(ctx context.Context) error {
if backend != nil {
return backend.Shutdown(ctx)
initMu.Lock()
b := backend
backend = nil
initMu.Unlock()
if b != nil {
return b.Shutdown(ctx)
}
return nil
}
@@ -135,129 +150,346 @@ func createInstruments() error {
b := backend
wgInterfaceUp = b.NewInt64Gauge("gerbil_wg_interface_up",
newCounter := func(name, desc string, labelNames ...string) (observability.Counter, error) {
c, err := b.NewCounter(name, desc, labelNames...)
if err != nil {
return nil, fmt.Errorf("create counter %q: %w", name, err)
}
return c, nil
}
newUpDownCounter := func(name, desc string, labelNames ...string) (observability.UpDownCounter, error) {
c, err := b.NewUpDownCounter(name, desc, labelNames...)
if err != nil {
return nil, fmt.Errorf("create updown counter %q: %w", name, err)
}
return c, nil
}
newInt64Gauge := func(name, desc string, labelNames ...string) (observability.Int64Gauge, error) {
g, err := b.NewInt64Gauge(name, desc, labelNames...)
if err != nil {
return nil, fmt.Errorf("create int64 gauge %q: %w", name, err)
}
return g, nil
}
newFloat64Gauge := func(name, desc string, labelNames ...string) (observability.Float64Gauge, error) {
g, err := b.NewFloat64Gauge(name, desc, labelNames...)
if err != nil {
return nil, fmt.Errorf("create float64 gauge %q: %w", name, err)
}
return g, nil
}
newHistogram := func(name, desc string, buckets []float64, labelNames ...string) (observability.Histogram, error) {
h, err := b.NewHistogram(name, desc, buckets, labelNames...)
if err != nil {
return nil, fmt.Errorf("create histogram %q: %w", name, err)
}
return h, nil
}
var err error
wgInterfaceUp, err = newInt64Gauge("gerbil_wg_interface_up",
"Operational state of a WireGuard interface (1=up, 0=down)", "ifname", "instance")
wgPeersTotal = b.NewUpDownCounter("gerbil_wg_peers_total",
if err != nil {
return err
}
wgPeersTotal, err = newUpDownCounter("gerbil_wg_peers_total",
"Total number of configured peers per interface", "ifname")
wgPeerConnected = b.NewInt64Gauge("gerbil_wg_peer_connected",
if err != nil {
return err
}
wgPeerConnected, err = newInt64Gauge("gerbil_wg_peer_connected",
"Whether a specific peer is connected (1=connected, 0=disconnected)", "ifname", "peer")
allowedIPsCount = b.NewUpDownCounter("gerbil_allowed_ips_count",
if err != nil {
return err
}
allowedIPsCount, err = newUpDownCounter("gerbil_allowed_ips_count",
"Number of allowed IPs configured per peer", "ifname", "peer")
keyRotationTotal = b.NewCounter("gerbil_key_rotation_total",
if err != nil {
return err
}
keyRotationTotal, err = newCounter("gerbil_key_rotation_total",
"Key rotation events", "ifname", "reason")
wgHandshakesTotal = b.NewCounter("gerbil_wg_handshakes_total",
if err != nil {
return err
}
wgHandshakesTotal, err = newCounter("gerbil_wg_handshakes_total",
"Count of handshake attempts with their result status", "ifname", "peer", "result")
wgHandshakeLatency = b.NewHistogram("gerbil_wg_handshake_latency_seconds",
if err != nil {
return err
}
wgHandshakeLatency, err = newHistogram("gerbil_wg_handshake_latency_seconds",
"Distribution of handshake latencies in seconds", durationBuckets, "ifname", "peer")
wgPeerRTT = b.NewHistogram("gerbil_wg_peer_rtt_seconds",
if err != nil {
return err
}
wgPeerRTT, err = newHistogram("gerbil_wg_peer_rtt_seconds",
"Observed round-trip time to a peer in seconds", durationBuckets, "ifname", "peer")
wgBytesReceived = b.NewCounter("gerbil_wg_bytes_received_total",
if err != nil {
return err
}
wgBytesReceived, err = newCounter("gerbil_wg_bytes_received_total",
"Number of bytes received from a peer", "ifname", "peer")
wgBytesTransmitted = b.NewCounter("gerbil_wg_bytes_transmitted_total",
if err != nil {
return err
}
wgBytesTransmitted, err = newCounter("gerbil_wg_bytes_transmitted_total",
"Number of bytes transmitted to a peer", "ifname", "peer")
netlinkEventsTotal = b.NewCounter("gerbil_netlink_events_total",
if err != nil {
return err
}
netlinkEventsTotal, err = newCounter("gerbil_netlink_events_total",
"Number of netlink events processed", "event_type")
netlinkErrorsTotal = b.NewCounter("gerbil_netlink_errors_total",
if err != nil {
return err
}
netlinkErrorsTotal, err = newCounter("gerbil_netlink_errors_total",
"Count of netlink or kernel errors", "component", "error_type")
syncDuration = b.NewHistogram("gerbil_sync_duration_seconds",
if err != nil {
return err
}
syncDuration, err = newHistogram("gerbil_sync_duration_seconds",
"Duration of reconciliation/sync loops in seconds", durationBuckets, "component")
workqueueDepth = b.NewUpDownCounter("gerbil_workqueue_depth",
if err != nil {
return err
}
workqueueDepth, err = newUpDownCounter("gerbil_workqueue_depth",
"Current length of internal work queues", "queue")
kernelModuleLoads = b.NewCounter("gerbil_kernel_module_loads_total",
if err != nil {
return err
}
kernelModuleLoads, err = newCounter("gerbil_kernel_module_loads_total",
"Count of kernel module load attempts", "result")
firewallRulesApplied = b.NewCounter("gerbil_firewall_rules_applied_total",
if err != nil {
return err
}
firewallRulesApplied, err = newCounter("gerbil_firewall_rules_applied_total",
"IPTables/NFT rules applied", "result", "chain")
activeSessions = b.NewUpDownCounter("gerbil_active_sessions",
if err != nil {
return err
}
activeSessions, err = newUpDownCounter("gerbil_active_sessions",
"Number of active UDP relay sessions", "ifname")
activeProxyConnections = b.NewUpDownCounter("gerbil_active_proxy_connections",
if err != nil {
return err
}
activeProxyConnections, err = newUpDownCounter("gerbil_active_proxy_connections",
"Active SNI proxy connections")
proxyRouteLookups = b.NewCounter("gerbil_proxy_route_lookups_total",
if err != nil {
return err
}
proxyRouteLookups, err = newCounter("gerbil_proxy_route_lookups_total",
"Number of route lookups", "result")
proxyTLSHandshake = b.NewHistogram("gerbil_proxy_tls_handshake_seconds",
if err != nil {
return err
}
proxyTLSHandshake, err = newHistogram("gerbil_proxy_tls_handshake_seconds",
"TLS handshake duration for SNI proxy in seconds", durationBuckets)
proxyBytesTransmitted = b.NewCounter("gerbil_proxy_bytes_transmitted_total",
if err != nil {
return err
}
proxyBytesTransmitted, err = newCounter("gerbil_proxy_bytes_transmitted_total",
"Bytes sent/received by the SNI proxy", "direction")
configReloadsTotal = b.NewCounter("gerbil_config_reloads_total",
if err != nil {
return err
}
configReloadsTotal, err = newCounter("gerbil_config_reloads_total",
"Number of configuration reloads", "result")
restartTotal = b.NewCounter("gerbil_restart_total",
if err != nil {
return err
}
restartTotal, err = newCounter("gerbil_restart_total",
"Process restart count")
authFailuresTotal = b.NewCounter("gerbil_auth_failures_total",
if err != nil {
return err
}
authFailuresTotal, err = newCounter("gerbil_auth_failures_total",
"Count of authentication or peer validation failures", "peer", "reason")
aclDeniedTotal = b.NewCounter("gerbil_acl_denied_total",
if err != nil {
return err
}
aclDeniedTotal, err = newCounter("gerbil_acl_denied_total",
"Access control denied events", "ifname", "peer", "policy")
certificateExpiryDays = b.NewFloat64Gauge("gerbil_certificate_expiry_days",
if err != nil {
return err
}
certificateExpiryDays, err = newFloat64Gauge("gerbil_certificate_expiry_days",
"Days until certificate expiry", "cert_name", "ifname")
udpPacketsTotal = b.NewCounter("gerbil_udp_packets_total",
if err != nil {
return err
}
udpPacketsTotal, err = newCounter("gerbil_udp_packets_total",
"Count of UDP packets processed by relay workers", "ifname", "type", "direction")
udpPacketSizeBytes = b.NewHistogram("gerbil_udp_packet_size_bytes",
if err != nil {
return err
}
udpPacketSizeBytes, err = newHistogram("gerbil_udp_packet_size_bytes",
"Size distribution of packets forwarded through relay", sizeBuckets, "ifname", "type")
holePunchEventsTotal = b.NewCounter("gerbil_hole_punch_events_total",
if err != nil {
return err
}
holePunchEventsTotal, err = newCounter("gerbil_hole_punch_events_total",
"Count of hole punch messages processed", "ifname", "result")
proxyMappingActive = b.NewUpDownCounter("gerbil_proxy_mapping_active",
if err != nil {
return err
}
proxyMappingActive, err = newUpDownCounter("gerbil_proxy_mapping_active",
"Number of active proxy mappings", "ifname")
sessionActive = b.NewUpDownCounter("gerbil_session_active",
"Number of active WireGuard sessions", "ifname")
sessionRebuiltTotal = b.NewCounter("gerbil_session_rebuilt_total",
if err != nil {
return err
}
sessionRebuiltTotal, err = newCounter("gerbil_session_rebuilt_total",
"Count of sessions rebuilt from communication patterns", "ifname")
commPatternActive = b.NewUpDownCounter("gerbil_comm_pattern_active",
if err != nil {
return err
}
commPatternActive, err = newUpDownCounter("gerbil_comm_pattern_active",
"Number of active communication patterns", "ifname")
proxyCleanupRemovedTotal = b.NewCounter("gerbil_proxy_cleanup_removed_total",
if err != nil {
return err
}
proxyCleanupRemovedTotal, err = newCounter("gerbil_proxy_cleanup_removed_total",
"Count of items removed during cleanup routines", "ifname", "component")
proxyConnectionErrorsTotal = b.NewCounter("gerbil_proxy_connection_errors_total",
if err != nil {
return err
}
proxyConnectionErrorsTotal, err = newCounter("gerbil_proxy_connection_errors_total",
"Count of connection errors in proxy operations", "ifname", "error_type")
proxyInitialMappingsTotal = b.NewInt64Gauge("gerbil_proxy_initial_mappings",
if err != nil {
return err
}
proxyInitialMappingsTotal, err = newInt64Gauge("gerbil_proxy_initial_mappings",
"Number of initial proxy mappings loaded", "ifname")
proxyMappingUpdatesTotal = b.NewCounter("gerbil_proxy_mapping_updates_total",
if err != nil {
return err
}
proxyMappingUpdatesTotal, err = newCounter("gerbil_proxy_mapping_updates_total",
"Count of proxy mapping updates", "ifname")
proxyIdleCleanupDuration = b.NewHistogram("gerbil_proxy_idle_cleanup_duration_seconds",
if err != nil {
return err
}
proxyIdleCleanupDuration, err = newHistogram("gerbil_proxy_idle_cleanup_duration_seconds",
"Duration of cleanup cycles", durationBuckets, "ifname", "component")
sniConnectionsTotal = b.NewCounter("gerbil_sni_connections_total",
if err != nil {
return err
}
sniConnectionsTotal, err = newCounter("gerbil_sni_connections_total",
"Count of connections processed by SNI proxy", "result")
sniConnectionDuration = b.NewHistogram("gerbil_sni_connection_duration_seconds",
if err != nil {
return err
}
sniConnectionDuration, err = newHistogram("gerbil_sni_connection_duration_seconds",
"Lifetime distribution of proxied TLS connections", sniDurationBuckets)
sniActiveConnections = b.NewUpDownCounter("gerbil_sni_active_connections",
if err != nil {
return err
}
sniActiveConnections, err = newUpDownCounter("gerbil_sni_active_connections",
"Number of active SNI tunnels")
sniRouteCacheHitsTotal = b.NewCounter("gerbil_sni_route_cache_hits_total",
if err != nil {
return err
}
sniRouteCacheHitsTotal, err = newCounter("gerbil_sni_route_cache_hits_total",
"Count of route cache hits and misses", "result")
sniRouteAPIRequestsTotal = b.NewCounter("gerbil_sni_route_api_requests_total",
if err != nil {
return err
}
sniRouteAPIRequestsTotal, err = newCounter("gerbil_sni_route_api_requests_total",
"Count of route API requests", "result")
sniRouteAPILatency = b.NewHistogram("gerbil_sni_route_api_latency_seconds",
if err != nil {
return err
}
sniRouteAPILatency, err = newHistogram("gerbil_sni_route_api_latency_seconds",
"Distribution of route API call latencies", durationBuckets)
sniLocalOverrideTotal = b.NewCounter("gerbil_sni_local_override_total",
if err != nil {
return err
}
sniLocalOverrideTotal, err = newCounter("gerbil_sni_local_override_total",
"Count of routes using local overrides", "hit")
sniTrustedProxyEventsTotal = b.NewCounter("gerbil_sni_trusted_proxy_events_total",
if err != nil {
return err
}
sniTrustedProxyEventsTotal, err = newCounter("gerbil_sni_trusted_proxy_events_total",
"Count of PROXY protocol events", "event")
sniProxyProtocolParseErrorsTotal = b.NewCounter("gerbil_sni_proxy_protocol_parse_errors_total",
if err != nil {
return err
}
sniProxyProtocolParseErrorsTotal, err = newCounter("gerbil_sni_proxy_protocol_parse_errors_total",
"Count of PROXY protocol parse failures")
sniDataBytesTotal = b.NewCounter("gerbil_sni_data_bytes_total",
if err != nil {
return err
}
sniDataBytesTotal, err = newCounter("gerbil_sni_data_bytes_total",
"Count of bytes proxied through SNI tunnels", "direction")
sniTunnelTerminationsTotal = b.NewCounter("gerbil_sni_tunnel_terminations_total",
if err != nil {
return err
}
sniTunnelTerminationsTotal, err = newCounter("gerbil_sni_tunnel_terminations_total",
"Count of tunnel terminations by reason", "reason")
httpRequestsTotal = b.NewCounter("gerbil_http_requests_total",
if err != nil {
return err
}
httpRequestsTotal, err = newCounter("gerbil_http_requests_total",
"Count of HTTP requests to management API", "endpoint", "method", "status_code")
httpRequestDuration = b.NewHistogram("gerbil_http_request_duration_seconds",
if err != nil {
return err
}
httpRequestDuration, err = newHistogram("gerbil_http_request_duration_seconds",
"Distribution of HTTP request handling time", durationBuckets, "endpoint", "method")
peerOperationsTotal = b.NewCounter("gerbil_peer_operations_total",
if err != nil {
return err
}
peerOperationsTotal, err = newCounter("gerbil_peer_operations_total",
"Count of peer lifecycle operations", "operation", "result")
proxyMappingUpdateRequestsTotal = b.NewCounter("gerbil_proxy_mapping_update_requests_total",
if err != nil {
return err
}
proxyMappingUpdateRequestsTotal, err = newCounter("gerbil_proxy_mapping_update_requests_total",
"Count of proxy mapping update API calls", "result")
destinationsUpdateRequestsTotal = b.NewCounter("gerbil_destinations_update_requests_total",
if err != nil {
return err
}
destinationsUpdateRequestsTotal, err = newCounter("gerbil_destinations_update_requests_total",
"Count of destinations update API calls", "result")
remoteConfigFetchesTotal = b.NewCounter("gerbil_remote_config_fetches_total",
if err != nil {
return err
}
remoteConfigFetchesTotal, err = newCounter("gerbil_remote_config_fetches_total",
"Count of remote configuration fetch attempts", "result")
bandwidthReportsTotal = b.NewCounter("gerbil_bandwidth_reports_total",
if err != nil {
return err
}
bandwidthReportsTotal, err = newCounter("gerbil_bandwidth_reports_total",
"Count of bandwidth report transmissions", "result")
peerBandwidthBytesTotal = b.NewCounter("gerbil_peer_bandwidth_bytes_total",
if err != nil {
return err
}
peerBandwidthBytesTotal, err = newCounter("gerbil_peer_bandwidth_bytes_total",
"Bytes per peer tracked by bandwidth calculation", "peer", "direction")
memorySpikeTotal = b.NewCounter("gerbil_memory_spike_total",
if err != nil {
return err
}
memorySpikeTotal, err = newCounter("gerbil_memory_spike_total",
"Count of memory spikes detected", "severity")
heapProfilesWrittenTotal = b.NewCounter("gerbil_heap_profiles_written_total",
if err != nil {
return err
}
heapProfilesWrittenTotal, err = newCounter("gerbil_heap_profiles_written_total",
"Count of heap profile files generated")
if err != nil {
return err
}
return nil
}
func RecordInterfaceUp(ifname, instance string, up bool) {
if wgInterfaceUp == nil {
return
}
value := int64(0)
if up {
value = 1
@@ -266,10 +498,16 @@ func RecordInterfaceUp(ifname, instance string, up bool) {
}
func RecordPeersTotal(ifname string, delta int64) {
if wgPeersTotal == nil {
return
}
wgPeersTotal.Add(context.Background(), delta, observability.Labels{"ifname": ifname})
}
func RecordPeerConnected(ifname, peer string, connected bool) {
if wgPeerConnected == nil {
return
}
value := int64(0)
if connected {
value = 1
@@ -278,229 +516,393 @@ func RecordPeerConnected(ifname, peer string, connected bool) {
}
func RecordHandshake(ifname, peer, result string) {
if wgHandshakesTotal == nil {
return
}
wgHandshakesTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "peer": peer, "result": result})
}
func RecordHandshakeLatency(ifname, peer string, seconds float64) {
if wgHandshakeLatency == nil {
return
}
wgHandshakeLatency.Record(context.Background(), seconds, observability.Labels{"ifname": ifname, "peer": peer})
}
func RecordPeerRTT(ifname, peer string, seconds float64) {
if wgPeerRTT == nil {
return
}
wgPeerRTT.Record(context.Background(), seconds, observability.Labels{"ifname": ifname, "peer": peer})
}
func RecordBytesReceived(ifname, peer string, bytes int64) {
if wgBytesReceived == nil {
return
}
wgBytesReceived.Add(context.Background(), bytes, observability.Labels{"ifname": ifname, "peer": peer})
}
func RecordBytesTransmitted(ifname, peer string, bytes int64) {
if wgBytesTransmitted == nil {
return
}
wgBytesTransmitted.Add(context.Background(), bytes, observability.Labels{"ifname": ifname, "peer": peer})
}
func RecordAllowedIPsCount(ifname, peer string, delta int64) {
if allowedIPsCount == nil {
return
}
allowedIPsCount.Add(context.Background(), delta, observability.Labels{"ifname": ifname, "peer": peer})
}
func RecordKeyRotation(ifname, reason string) {
if keyRotationTotal == nil {
return
}
keyRotationTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "reason": reason})
}
func RecordNetlinkEvent(eventType string) {
if netlinkEventsTotal == nil {
return
}
netlinkEventsTotal.Add(context.Background(), 1, observability.Labels{"event_type": eventType})
}
func RecordNetlinkError(component, errorType string) {
if netlinkErrorsTotal == nil {
return
}
netlinkErrorsTotal.Add(context.Background(), 1, observability.Labels{"component": component, "error_type": errorType})
}
func RecordSyncDuration(component string, seconds float64) {
if syncDuration == nil {
return
}
syncDuration.Record(context.Background(), seconds, observability.Labels{"component": component})
}
func RecordWorkqueueDepth(queue string, delta int64) {
if workqueueDepth == nil {
return
}
workqueueDepth.Add(context.Background(), delta, observability.Labels{"queue": queue})
}
func RecordKernelModuleLoad(result string) {
if kernelModuleLoads == nil {
return
}
kernelModuleLoads.Add(context.Background(), 1, observability.Labels{"result": result})
}
func RecordFirewallRuleApplied(result, chain string) {
if firewallRulesApplied == nil {
return
}
firewallRulesApplied.Add(context.Background(), 1, observability.Labels{"result": result, "chain": chain})
}
func RecordActiveSession(ifname string, delta int64) {
if activeSessions == nil {
return
}
activeSessions.Add(context.Background(), delta, observability.Labels{"ifname": ifname})
}
func RecordActiveProxyConnection(hostname string, delta int64) {
_ = hostname
func RecordActiveProxyConnection(delta int64) {
if activeProxyConnections == nil {
return
}
activeProxyConnections.Add(context.Background(), delta, nil)
}
func RecordProxyRouteLookup(result, hostname string) {
_ = hostname
func RecordProxyRouteLookup(result string) {
if proxyRouteLookups == nil {
return
}
proxyRouteLookups.Add(context.Background(), 1, observability.Labels{"result": result})
}
func RecordProxyTLSHandshake(hostname string, seconds float64) {
_ = hostname
func RecordProxyTLSHandshake(seconds float64) {
if proxyTLSHandshake == nil {
return
}
proxyTLSHandshake.Record(context.Background(), seconds, nil)
}
func RecordProxyBytesTransmitted(hostname, direction string, bytes int64) {
_ = hostname
func RecordProxyBytesTransmitted(direction string, bytes int64) {
if proxyBytesTransmitted == nil {
return
}
proxyBytesTransmitted.Add(context.Background(), bytes, observability.Labels{"direction": direction})
}
func RecordConfigReload(result string) {
if configReloadsTotal == nil {
return
}
configReloadsTotal.Add(context.Background(), 1, observability.Labels{"result": result})
}
func RecordRestart() {
if restartTotal == nil {
return
}
restartTotal.Add(context.Background(), 1, nil)
}
func RecordAuthFailure(peer, reason string) {
if authFailuresTotal == nil {
return
}
authFailuresTotal.Add(context.Background(), 1, observability.Labels{"peer": peer, "reason": reason})
}
func RecordACLDenied(ifname, peer, policy string) {
if aclDeniedTotal == nil {
return
}
aclDeniedTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "peer": peer, "policy": policy})
}
func RecordCertificateExpiry(certName, ifname string, days float64) {
if certificateExpiryDays == nil {
return
}
certificateExpiryDays.Record(context.Background(), days, observability.Labels{"cert_name": certName, "ifname": ifname})
}
func RecordUDPPacket(ifname, packetType, direction string) {
if udpPacketsTotal == nil {
return
}
udpPacketsTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "type": packetType, "direction": direction})
}
func RecordUDPPacketSize(ifname, packetType string, bytes float64) {
if udpPacketSizeBytes == nil {
return
}
udpPacketSizeBytes.Record(context.Background(), bytes, observability.Labels{"ifname": ifname, "type": packetType})
}
func RecordHolePunchEvent(ifname, result string) {
if holePunchEventsTotal == nil {
return
}
holePunchEventsTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "result": result})
}
func RecordProxyMapping(ifname string, delta int64) {
if proxyMappingActive == nil {
return
}
proxyMappingActive.Add(context.Background(), delta, observability.Labels{"ifname": ifname})
}
func RecordSession(ifname string, delta int64) {
sessionActive.Add(context.Background(), delta, observability.Labels{"ifname": ifname})
if activeSessions == nil {
return
}
activeSessions.Add(context.Background(), delta, observability.Labels{"ifname": ifname})
}
func RecordSessionRebuilt(ifname string) {
if sessionRebuiltTotal == nil {
return
}
sessionRebuiltTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname})
}
func RecordCommPattern(ifname string, delta int64) {
if commPatternActive == nil {
return
}
commPatternActive.Add(context.Background(), delta, observability.Labels{"ifname": ifname})
}
func RecordProxyCleanupRemoved(ifname, component string, count int64) {
if proxyCleanupRemovedTotal == nil {
return
}
proxyCleanupRemovedTotal.Add(context.Background(), count, observability.Labels{"ifname": ifname, "component": component})
}
func RecordProxyConnectionError(ifname, errorType string) {
if proxyConnectionErrorsTotal == nil {
return
}
proxyConnectionErrorsTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname, "error_type": errorType})
}
func RecordProxyInitialMappings(ifname string, count int64) {
if proxyInitialMappingsTotal == nil {
return
}
proxyInitialMappingsTotal.Record(context.Background(), count, observability.Labels{"ifname": ifname})
}
func RecordProxyMappingUpdate(ifname string) {
if proxyMappingUpdatesTotal == nil {
return
}
proxyMappingUpdatesTotal.Add(context.Background(), 1, observability.Labels{"ifname": ifname})
}
func RecordProxyIdleCleanupDuration(ifname, component string, seconds float64) {
if proxyIdleCleanupDuration == nil {
return
}
proxyIdleCleanupDuration.Record(context.Background(), seconds, observability.Labels{"ifname": ifname, "component": component})
}
func RecordSNIConnection(result string) {
if sniConnectionsTotal == nil {
return
}
sniConnectionsTotal.Add(context.Background(), 1, observability.Labels{"result": result})
}
func RecordSNIConnectionDuration(seconds float64) {
if sniConnectionDuration == nil {
return
}
sniConnectionDuration.Record(context.Background(), seconds, nil)
}
func RecordSNIActiveConnection(delta int64) {
if sniActiveConnections == nil {
return
}
sniActiveConnections.Add(context.Background(), delta, nil)
}
func RecordSNIRouteCacheHit(result string) {
if sniRouteCacheHitsTotal == nil {
return
}
sniRouteCacheHitsTotal.Add(context.Background(), 1, observability.Labels{"result": result})
}
func RecordSNIRouteAPIRequest(result string) {
if sniRouteAPIRequestsTotal == nil {
return
}
sniRouteAPIRequestsTotal.Add(context.Background(), 1, observability.Labels{"result": result})
}
func RecordSNIRouteAPILatency(seconds float64) {
if sniRouteAPILatency == nil {
return
}
sniRouteAPILatency.Record(context.Background(), seconds, nil)
}
func RecordSNILocalOverride(hit string) {
if sniLocalOverrideTotal == nil {
return
}
sniLocalOverrideTotal.Add(context.Background(), 1, observability.Labels{"hit": hit})
}
func RecordSNITrustedProxyEvent(event string) {
if sniTrustedProxyEventsTotal == nil {
return
}
sniTrustedProxyEventsTotal.Add(context.Background(), 1, observability.Labels{"event": event})
}
func RecordSNIProxyProtocolParseError() {
if sniProxyProtocolParseErrorsTotal == nil {
return
}
sniProxyProtocolParseErrorsTotal.Add(context.Background(), 1, nil)
}
func RecordSNIDataBytes(direction string, bytes int64) {
if sniDataBytesTotal == nil {
return
}
sniDataBytesTotal.Add(context.Background(), bytes, observability.Labels{"direction": direction})
}
func RecordSNITunnelTermination(reason string) {
if sniTunnelTerminationsTotal == nil {
return
}
sniTunnelTerminationsTotal.Add(context.Background(), 1, observability.Labels{"reason": reason})
}
func RecordHTTPRequest(endpoint, method, statusCode string) {
if httpRequestsTotal == nil {
return
}
httpRequestsTotal.Add(context.Background(), 1, observability.Labels{"endpoint": endpoint, "method": method, "status_code": statusCode})
}
func RecordHTTPRequestDuration(endpoint, method string, seconds float64) {
if httpRequestDuration == nil {
return
}
httpRequestDuration.Record(context.Background(), seconds, observability.Labels{"endpoint": endpoint, "method": method})
}
func RecordPeerOperation(operation, result string) {
if peerOperationsTotal == nil {
return
}
peerOperationsTotal.Add(context.Background(), 1, observability.Labels{"operation": operation, "result": result})
}
func RecordProxyMappingUpdateRequest(result string) {
if proxyMappingUpdateRequestsTotal == nil {
return
}
proxyMappingUpdateRequestsTotal.Add(context.Background(), 1, observability.Labels{"result": result})
}
func RecordDestinationsUpdateRequest(result string) {
if destinationsUpdateRequestsTotal == nil {
return
}
destinationsUpdateRequestsTotal.Add(context.Background(), 1, observability.Labels{"result": result})
}
func RecordRemoteConfigFetch(result string) {
if remoteConfigFetchesTotal == nil {
return
}
remoteConfigFetchesTotal.Add(context.Background(), 1, observability.Labels{"result": result})
}
func RecordBandwidthReport(result string) {
if bandwidthReportsTotal == nil {
return
}
bandwidthReportsTotal.Add(context.Background(), 1, observability.Labels{"result": result})
}
func RecordPeerBandwidthBytes(peer, direction string, bytes int64) {
if peerBandwidthBytesTotal == nil {
return
}
peerBandwidthBytesTotal.Add(context.Background(), bytes, observability.Labels{"peer": peer, "direction": direction})
}
func RecordMemorySpike(severity string) {
if memorySpikeTotal == nil {
return
}
memorySpikeTotal.Add(context.Background(), 1, observability.Labels{"severity": severity})
}
func RecordHeapProfileWritten() {
if heapProfilesWrittenTotal == nil {
return
}
heapProfilesWrittenTotal.Add(context.Background(), 1, nil)
}

View File

@@ -89,6 +89,9 @@ func TestDefaultConfig(t *testing.T) {
}
func TestShutdownNoInit(t *testing.T) {
// Ensure a known clean global state before testing no-init shutdown behavior.
_ = metrics.Shutdown(context.Background())
// Shutdown without Initialize should not panic or error.
if err := metrics.Shutdown(context.Background()); err != nil {
t.Errorf("unexpected error: %v", err)
@@ -168,6 +171,7 @@ func TestRecordRelay(t *testing.T) {
body := scrape(t, h)
assertContains(t, body, "gerbil_udp_packets_total")
assertContains(t, body, "gerbil_proxy_mapping_active")
assertContains(t, body, "gerbil_active_sessions")
}
func TestRecordWireGuard(t *testing.T) {
@@ -216,10 +220,10 @@ func TestRecordNetlink(t *testing.T) {
metrics.RecordKernelModuleLoad("success")
metrics.RecordFirewallRuleApplied("success", "INPUT")
metrics.RecordActiveSession("wg0", 1)
metrics.RecordActiveProxyConnection(exampleHostname, 1)
metrics.RecordProxyRouteLookup("hit", exampleHostname)
metrics.RecordProxyTLSHandshake(exampleHostname, 0.05)
metrics.RecordProxyBytesTransmitted(exampleHostname, "tx", 1024)
metrics.RecordActiveProxyConnection(1)
metrics.RecordProxyRouteLookup("hit")
metrics.RecordProxyTLSHandshake(0.05)
metrics.RecordProxyBytesTransmitted("tx", 1024)
body := scrape(t, h)
assertContains(t, body, "gerbil_netlink_events_total")
assertContains(t, body, "gerbil_active_sessions")

View File

@@ -60,6 +60,10 @@ type OTelConfig struct {
// ExportInterval is how often metrics are pushed to the collector.
// Defaults to 60 s.
ExportInterval time.Duration
// Timeout bounds OTLP exporter construction calls.
// Defaults to 10 s.
Timeout time.Duration
}
// DefaultMetricsConfig returns a MetricsConfig with sensible defaults.
@@ -75,6 +79,7 @@ func DefaultMetricsConfig() MetricsConfig {
Endpoint: "localhost:4317",
Insecure: true,
ExportInterval: 60 * time.Second,
Timeout: 10 * time.Second,
},
ServiceName: "gerbil",
ServiceVersion: "1.0.0",
@@ -88,8 +93,10 @@ func (c *MetricsConfig) Validate() error {
}
switch c.Backend {
case "prometheus", "none", "":
case "prometheus", "none":
// valid
case "":
return fmt.Errorf("metrics: enabled requires a non-empty backend")
case "otel":
if c.OTel.Endpoint == "" {
return fmt.Errorf("metrics: backend=otel requires a non-empty OTel endpoint")
@@ -100,6 +107,9 @@ func (c *MetricsConfig) Validate() error {
if c.OTel.ExportInterval <= 0 {
return fmt.Errorf("metrics: otel export interval must be positive")
}
if c.OTel.Timeout <= 0 {
return fmt.Errorf("metrics: otel timeout must be positive")
}
default:
return fmt.Errorf("metrics: unknown backend %q (must be \"prometheus\", \"otel\", or \"none\")", c.Backend)
}

View File

@@ -43,20 +43,20 @@ type Histogram interface {
type Backend interface {
// NewCounter creates a counter metric.
// labelNames declares the set of label keys that will be passed at observation time.
NewCounter(name, desc string, labelNames ...string) Counter
NewCounter(name, desc string, labelNames ...string) (Counter, error)
// NewUpDownCounter creates an up-down counter metric.
NewUpDownCounter(name, desc string, labelNames ...string) UpDownCounter
NewUpDownCounter(name, desc string, labelNames ...string) (UpDownCounter, error)
// NewInt64Gauge creates an integer gauge metric.
NewInt64Gauge(name, desc string, labelNames ...string) Int64Gauge
NewInt64Gauge(name, desc string, labelNames ...string) (Int64Gauge, error)
// NewFloat64Gauge creates a float gauge metric.
NewFloat64Gauge(name, desc string, labelNames ...string) Float64Gauge
NewFloat64Gauge(name, desc string, labelNames ...string) (Float64Gauge, error)
// NewHistogram creates a histogram metric.
// buckets are the explicit upper-bound bucket boundaries.
NewHistogram(name, desc string, buckets []float64, labelNames ...string) Histogram
NewHistogram(name, desc string, buckets []float64, labelNames ...string) (Histogram, error)
// HTTPHandler returns the /metrics HTTP handler.
// Implementations that do not expose an HTTP endpoint return nil.
@@ -88,6 +88,7 @@ func New(cfg MetricsConfig) (Backend, error) {
Endpoint: cfg.OTel.Endpoint,
Insecure: cfg.OTel.Insecure,
ExportInterval: cfg.OTel.ExportInterval,
Timeout: cfg.OTel.Timeout,
ServiceName: cfg.ServiceName,
ServiceVersion: cfg.ServiceVersion,
DeploymentEnvironment: cfg.DeploymentEnvironment,
@@ -110,19 +111,19 @@ type promAdapter struct {
b *obsprom.Backend
}
func (a *promAdapter) NewCounter(name, desc string, labelNames ...string) Counter {
func (a *promAdapter) NewCounter(name, desc string, labelNames ...string) (Counter, error) {
return a.b.NewCounter(name, desc, labelNames...)
}
func (a *promAdapter) NewUpDownCounter(name, desc string, labelNames ...string) UpDownCounter {
func (a *promAdapter) NewUpDownCounter(name, desc string, labelNames ...string) (UpDownCounter, error) {
return a.b.NewUpDownCounter(name, desc, labelNames...)
}
func (a *promAdapter) NewInt64Gauge(name, desc string, labelNames ...string) Int64Gauge {
func (a *promAdapter) NewInt64Gauge(name, desc string, labelNames ...string) (Int64Gauge, error) {
return a.b.NewInt64Gauge(name, desc, labelNames...)
}
func (a *promAdapter) NewFloat64Gauge(name, desc string, labelNames ...string) Float64Gauge {
func (a *promAdapter) NewFloat64Gauge(name, desc string, labelNames ...string) (Float64Gauge, error) {
return a.b.NewFloat64Gauge(name, desc, labelNames...)
}
func (a *promAdapter) NewHistogram(name, desc string, buckets []float64, labelNames ...string) Histogram {
func (a *promAdapter) NewHistogram(name, desc string, buckets []float64, labelNames ...string) (Histogram, error) {
return a.b.NewHistogram(name, desc, buckets, labelNames...)
}
func (a *promAdapter) HTTPHandler() http.Handler { return a.b.HTTPHandler() }
@@ -133,19 +134,19 @@ type otelAdapter struct {
b *obsotel.Backend
}
func (a *otelAdapter) NewCounter(name, desc string, labelNames ...string) Counter {
func (a *otelAdapter) NewCounter(name, desc string, labelNames ...string) (Counter, error) {
return a.b.NewCounter(name, desc, labelNames...)
}
func (a *otelAdapter) NewUpDownCounter(name, desc string, labelNames ...string) UpDownCounter {
func (a *otelAdapter) NewUpDownCounter(name, desc string, labelNames ...string) (UpDownCounter, error) {
return a.b.NewUpDownCounter(name, desc, labelNames...)
}
func (a *otelAdapter) NewInt64Gauge(name, desc string, labelNames ...string) Int64Gauge {
func (a *otelAdapter) NewInt64Gauge(name, desc string, labelNames ...string) (Int64Gauge, error) {
return a.b.NewInt64Gauge(name, desc, labelNames...)
}
func (a *otelAdapter) NewFloat64Gauge(name, desc string, labelNames ...string) Float64Gauge {
func (a *otelAdapter) NewFloat64Gauge(name, desc string, labelNames ...string) (Float64Gauge, error) {
return a.b.NewFloat64Gauge(name, desc, labelNames...)
}
func (a *otelAdapter) NewHistogram(name, desc string, buckets []float64, labelNames ...string) Histogram {
func (a *otelAdapter) NewHistogram(name, desc string, buckets []float64, labelNames ...string) (Histogram, error) {
return a.b.NewHistogram(name, desc, buckets, labelNames...)
}
func (a *otelAdapter) HTTPHandler() http.Handler { return a.b.HTTPHandler() }

View File

@@ -2,6 +2,8 @@ package observability_test
import (
"context"
"net"
"os"
"testing"
"time"
@@ -39,20 +41,19 @@ func TestValidateValidConfigs(t *testing.T) {
}{
{name: "disabled", cfg: observability.MetricsConfig{Enabled: false}},
{name: "backend none", cfg: observability.MetricsConfig{Enabled: true, Backend: "none"}},
{name: "backend empty", cfg: observability.MetricsConfig{Enabled: true, Backend: ""}},
{name: "prometheus", cfg: observability.MetricsConfig{Enabled: true, Backend: "prometheus"}},
{
name: "otel grpc",
cfg: observability.MetricsConfig{
Enabled: true, Backend: "otel",
OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, ExportInterval: 10 * time.Second},
OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, ExportInterval: 10 * time.Second, Timeout: 2 * time.Second},
},
},
{
name: "otel http",
cfg: observability.MetricsConfig{
Enabled: true, Backend: "otel",
OTel: observability.OTelConfig{Protocol: "http", Endpoint: "localhost:4318", ExportInterval: 30 * time.Second},
OTel: observability.OTelConfig{Protocol: "http", Endpoint: "localhost:4318", ExportInterval: 30 * time.Second, Timeout: 2 * time.Second},
},
},
}
@@ -71,25 +72,36 @@ func TestValidateInvalidConfigs(t *testing.T) {
cfg observability.MetricsConfig
}{
{name: "unknown backend", cfg: observability.MetricsConfig{Enabled: true, Backend: "datadog"}},
{
name: "backend empty while enabled",
cfg: observability.MetricsConfig{Enabled: true, Backend: ""},
},
{
name: "otel missing endpoint",
cfg: observability.MetricsConfig{
Enabled: true, Backend: "otel",
OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: "", ExportInterval: 10 * time.Second},
OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: "", ExportInterval: 10 * time.Second, Timeout: 2 * time.Second},
},
},
{
name: "otel invalid protocol",
cfg: observability.MetricsConfig{
Enabled: true, Backend: "otel",
OTel: observability.OTelConfig{Protocol: "tcp", Endpoint: otelGRPCEndpoint, ExportInterval: 10 * time.Second},
OTel: observability.OTelConfig{Protocol: "tcp", Endpoint: otelGRPCEndpoint, ExportInterval: 10 * time.Second, Timeout: 2 * time.Second},
},
},
{
name: "otel zero interval",
cfg: observability.MetricsConfig{
Enabled: true, Backend: "otel",
OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, ExportInterval: 0},
OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, ExportInterval: 0, Timeout: 2 * time.Second},
},
},
{
name: "otel zero timeout",
cfg: observability.MetricsConfig{
Enabled: true, Backend: "otel",
OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, ExportInterval: 10 * time.Second, Timeout: 0},
},
},
}
@@ -157,11 +169,32 @@ func TestPrometheusAdapterAllInstruments(t *testing.T) {
ctx := context.Background()
labels := observability.Labels{"k": "v"}
b.NewCounter("prom_adapter_counter_total", "desc", "k").Add(ctx, 1, labels)
b.NewUpDownCounter("prom_adapter_updown", "desc", "k").Add(ctx, 2, labels)
b.NewInt64Gauge("prom_adapter_int_gauge", "desc", "k").Record(ctx, 99, labels)
b.NewFloat64Gauge("prom_adapter_float_gauge", "desc", "k").Record(ctx, 1.23, labels)
b.NewHistogram("prom_adapter_histogram", "desc", []float64{0.1, 1.0}, "k").Record(ctx, 0.5, labels)
c, err := b.NewCounter("prom_adapter_counter_total", "desc", "k")
if err != nil {
t.Fatalf("NewCounter error: %v", err)
}
u, err := b.NewUpDownCounter("prom_adapter_updown", "desc", "k")
if err != nil {
t.Fatalf("NewUpDownCounter error: %v", err)
}
ig, err := b.NewInt64Gauge("prom_adapter_int_gauge", "desc", "k")
if err != nil {
t.Fatalf("NewInt64Gauge error: %v", err)
}
fg, err := b.NewFloat64Gauge("prom_adapter_float_gauge", "desc", "k")
if err != nil {
t.Fatalf("NewFloat64Gauge error: %v", err)
}
h, err := b.NewHistogram("prom_adapter_histogram", "desc", []float64{0.1, 1.0}, "k")
if err != nil {
t.Fatalf("NewHistogram error: %v", err)
}
c.Add(ctx, 1, labels)
u.Add(ctx, 2, labels)
ig.Record(ctx, 99, labels)
fg.Record(ctx, 1.23, labels)
h.Record(ctx, 0.5, labels)
if b.HTTPHandler() == nil {
t.Error("prometheus adapter HTTPHandler should not be nil")
@@ -172,9 +205,20 @@ func TestPrometheusAdapterAllInstruments(t *testing.T) {
}
func TestOtelAdapterAllInstruments(t *testing.T) {
if os.Getenv("SKIP_OTEL_INTEGRATION") != "" {
t.Skip("skipping OTel integration test because SKIP_OTEL_INTEGRATION is set")
}
dialTimeout := 300 * time.Millisecond
conn, err := net.DialTimeout("tcp", otelGRPCEndpoint, dialTimeout)
if err != nil {
t.Skipf("skipping OTel integration test; collector %s not reachable: %v", otelGRPCEndpoint, err)
}
_ = conn.Close()
b, err := observability.New(observability.MetricsConfig{
Enabled: true, Backend: "otel",
OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, Insecure: true, ExportInterval: 100 * time.Millisecond},
OTel: observability.OTelConfig{Protocol: "grpc", Endpoint: otelGRPCEndpoint, Insecure: true, ExportInterval: 100 * time.Millisecond, Timeout: 2 * time.Second},
})
if err != nil {
t.Fatalf("failed to create otel backend: %v", err)
@@ -182,11 +226,32 @@ func TestOtelAdapterAllInstruments(t *testing.T) {
ctx := context.Background()
labels := observability.Labels{"k": "v"}
b.NewCounter("otel_adapter_counter_total", "desc", "k").Add(ctx, 1, labels)
b.NewUpDownCounter("otel_adapter_updown", "desc", "k").Add(ctx, 2, labels)
b.NewInt64Gauge("otel_adapter_int_gauge", "desc", "k").Record(ctx, 99, labels)
b.NewFloat64Gauge("otel_adapter_float_gauge", "desc", "k").Record(ctx, 1.23, labels)
b.NewHistogram("otel_adapter_histogram", "desc", []float64{0.1, 1.0}, "k").Record(ctx, 0.5, labels)
c, err := b.NewCounter("otel_adapter_counter_total", "desc", "k")
if err != nil {
t.Fatalf("NewCounter error: %v", err)
}
u, err := b.NewUpDownCounter("otel_adapter_updown", "desc", "k")
if err != nil {
t.Fatalf("NewUpDownCounter error: %v", err)
}
ig, err := b.NewInt64Gauge("otel_adapter_int_gauge", "desc", "k")
if err != nil {
t.Fatalf("NewInt64Gauge error: %v", err)
}
fg, err := b.NewFloat64Gauge("otel_adapter_float_gauge", "desc", "k")
if err != nil {
t.Fatalf("NewFloat64Gauge error: %v", err)
}
h, err := b.NewHistogram("otel_adapter_histogram", "desc", []float64{0.1, 1.0}, "k")
if err != nil {
t.Fatalf("NewHistogram error: %v", err)
}
c.Add(ctx, 1, labels)
u.Add(ctx, 2, labels)
ig.Record(ctx, 99, labels)
fg.Record(ctx, 1.23, labels)
h.Record(ctx, 0.5, labels)
if b.HTTPHandler() != nil {
t.Error("OTel adapter HTTPHandler should be nil")

View File

@@ -13,38 +13,31 @@ type NoopBackend struct{}
// Compile-time interface check.
var _ Backend = (*NoopBackend)(nil)
func (n *NoopBackend) NewCounter(_ string, _ string, _ ...string) Counter {
_ = n
return noopCounter{}
func (n *NoopBackend) NewCounter(_ string, _ string, _ ...string) (Counter, error) {
return noopCounter{}, nil
}
func (n *NoopBackend) NewUpDownCounter(_ string, _ string, _ ...string) UpDownCounter {
_ = n
return noopUpDownCounter{}
func (n *NoopBackend) NewUpDownCounter(_ string, _ string, _ ...string) (UpDownCounter, error) {
return noopUpDownCounter{}, nil
}
func (n *NoopBackend) NewInt64Gauge(_ string, _ string, _ ...string) Int64Gauge {
_ = n
return noopInt64Gauge{}
func (n *NoopBackend) NewInt64Gauge(_ string, _ string, _ ...string) (Int64Gauge, error) {
return noopInt64Gauge{}, nil
}
func (n *NoopBackend) NewFloat64Gauge(_ string, _ string, _ ...string) Float64Gauge {
_ = n
return noopFloat64Gauge{}
func (n *NoopBackend) NewFloat64Gauge(_ string, _ string, _ ...string) (Float64Gauge, error) {
return noopFloat64Gauge{}, nil
}
func (n *NoopBackend) NewHistogram(_ string, _ string, _ []float64, _ ...string) Histogram {
_ = n
return noopHistogram{}
func (n *NoopBackend) NewHistogram(_ string, _ string, _ []float64, _ ...string) (Histogram, error) {
return noopHistogram{}, nil
}
func (n *NoopBackend) HTTPHandler() http.Handler {
_ = n
return nil
}
func (n *NoopBackend) Shutdown(_ context.Context) error {
_ = n
return nil
}

View File

@@ -13,32 +13,32 @@ func TestNoopBackendAllInstruments(t *testing.T) {
ctx := context.Background()
labels := observability.Labels{"k": "v"}
t.Run("Counter", func(_ *testing.T) {
c := n.NewCounter("test_counter", "desc")
t.Run("Counter", func(t *testing.T) {
c, _ := n.NewCounter("test_counter", "desc")
c.Add(ctx, 1, labels)
c.Add(ctx, 0, nil)
})
t.Run("UpDownCounter", func(_ *testing.T) {
u := n.NewUpDownCounter("test_updown", "desc")
t.Run("UpDownCounter", func(t *testing.T) {
u, _ := n.NewUpDownCounter("test_updown", "desc")
u.Add(ctx, 1, labels)
u.Add(ctx, -1, nil)
})
t.Run("Int64Gauge", func(_ *testing.T) {
g := n.NewInt64Gauge("test_int64gauge", "desc")
t.Run("Int64Gauge", func(t *testing.T) {
g, _ := n.NewInt64Gauge("test_int64gauge", "desc")
g.Record(ctx, 42, labels)
g.Record(ctx, 0, nil)
})
t.Run("Float64Gauge", func(_ *testing.T) {
g := n.NewFloat64Gauge("test_float64gauge", "desc")
t.Run("Float64Gauge", func(t *testing.T) {
g, _ := n.NewFloat64Gauge("test_float64gauge", "desc")
g.Record(ctx, 3.14, labels)
g.Record(ctx, 0, nil)
})
t.Run("Histogram", func(_ *testing.T) {
h := n.NewHistogram("test_histogram", "desc", []float64{1, 5, 10})
t.Run("Histogram", func(t *testing.T) {
h, _ := n.NewHistogram("test_histogram", "desc", []float64{1, 5, 10})
h.Record(ctx, 2.5, labels)
h.Record(ctx, 0, nil)
})
@@ -56,12 +56,47 @@ func TestNoopBackendAllInstruments(t *testing.T) {
})
}
func TestNoopBackendLabelNames(_ *testing.T) {
func TestNoopBackendLabelNames(t *testing.T) {
// Verify that label names passed at creation time are accepted without panic.
n := &observability.NoopBackend{}
n.NewCounter("c", "d", "label1", "label2")
n.NewUpDownCounter("u", "d", "l1")
n.NewInt64Gauge("g1", "d", "l1", "l2", "l3")
n.NewFloat64Gauge("g2", "d")
n.NewHistogram("h", "d", []float64{0.1, 1.0}, "l1")
assertNoPanic := func(t *testing.T, constructor string, fn func()) {
t.Helper()
defer func() {
if r := recover(); r != nil {
t.Fatalf("%s panicked: %v", constructor, r)
}
}()
fn()
}
t.Run("NewCounter", func(t *testing.T) {
assertNoPanic(t, "NewCounter", func() {
_, _ = n.NewCounter("c", "d", "label1", "label2")
})
})
t.Run("NewUpDownCounter", func(t *testing.T) {
assertNoPanic(t, "NewUpDownCounter", func() {
_, _ = n.NewUpDownCounter("u", "d", "l1")
})
})
t.Run("NewInt64Gauge", func(t *testing.T) {
assertNoPanic(t, "NewInt64Gauge", func() {
_, _ = n.NewInt64Gauge("g1", "d", "l1", "l2", "l3")
})
})
t.Run("NewFloat64Gauge", func(t *testing.T) {
assertNoPanic(t, "NewFloat64Gauge", func() {
_, _ = n.NewFloat64Gauge("g2", "d")
})
})
t.Run("NewHistogram", func(t *testing.T) {
assertNoPanic(t, "NewHistogram", func() {
_, _ = n.NewHistogram("h", "d", []float64{0.1, 1.0}, "l1")
})
})
}

View File

@@ -9,7 +9,10 @@ package otel
import (
"context"
"fmt"
"log"
"net/http"
"regexp"
"strings"
"time"
"go.opentelemetry.io/otel/attribute"
@@ -17,6 +20,8 @@ import (
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
)
var metricLabelNameRE = regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*$`)
// Config holds OTel backend configuration.
type Config struct {
// Protocol is "grpc" (default) or "http".
@@ -31,6 +36,9 @@ type Config struct {
// ExportInterval is the period between pushes to the collector.
ExportInterval time.Duration
// Timeout bounds exporter construction calls.
Timeout time.Duration
ServiceName string
ServiceVersion string
DeploymentEnvironment string
@@ -57,9 +65,15 @@ func New(cfg Config) (*Backend, error) {
if cfg.Protocol == "" {
cfg.Protocol = "grpc"
}
if strings.TrimSpace(cfg.Endpoint) == "" {
return nil, fmt.Errorf("otel backend: empty cfg.Endpoint")
}
if cfg.ExportInterval <= 0 {
cfg.ExportInterval = 60 * time.Second
}
if cfg.Timeout <= 0 {
cfg.Timeout = 10 * time.Second
}
if cfg.ServiceName == "" {
cfg.ServiceName = "gerbil"
}
@@ -100,111 +114,196 @@ func (b *Backend) Shutdown(ctx context.Context) error {
}
// NewCounter creates an OTel Int64Counter.
func (b *Backend) NewCounter(name, desc string, _ ...string) *Counter {
func (b *Backend) NewCounter(name, desc string, labelNames ...string) (*Counter, error) {
normalizedLabelNames, err := validateLabelNames(labelNames)
if err != nil {
return nil, fmt.Errorf("otel: create counter %q: %w", name, err)
}
c, err := b.meter.Int64Counter(name, metric.WithDescription(desc))
if err != nil {
panic(fmt.Sprintf("otel: create counter %q: %v", name, err))
return nil, fmt.Errorf("otel: create counter %q: %w", name, err)
}
return &Counter{c: c}
return &Counter{c: c, labelNames: normalizedLabelNames}, nil
}
// NewUpDownCounter creates an OTel Int64UpDownCounter.
func (b *Backend) NewUpDownCounter(name, desc string, _ ...string) *UpDownCounter {
func (b *Backend) NewUpDownCounter(name, desc string, labelNames ...string) (*UpDownCounter, error) {
normalizedLabelNames, err := validateLabelNames(labelNames)
if err != nil {
return nil, fmt.Errorf("otel: create up-down counter %q: %w", name, err)
}
c, err := b.meter.Int64UpDownCounter(name, metric.WithDescription(desc))
if err != nil {
panic(fmt.Sprintf("otel: create up-down counter %q: %v", name, err))
return nil, fmt.Errorf("otel: create up-down counter %q: %w", name, err)
}
return &UpDownCounter{c: c}
return &UpDownCounter{c: c, labelNames: normalizedLabelNames}, nil
}
// NewInt64Gauge creates an OTel Int64Gauge.
func (b *Backend) NewInt64Gauge(name, desc string, _ ...string) *Int64Gauge {
func (b *Backend) NewInt64Gauge(name, desc string, labelNames ...string) (*Int64Gauge, error) {
normalizedLabelNames, err := validateLabelNames(labelNames)
if err != nil {
return nil, fmt.Errorf("otel: create int64 gauge %q: %w", name, err)
}
g, err := b.meter.Int64Gauge(name, metric.WithDescription(desc))
if err != nil {
panic(fmt.Sprintf("otel: create int64 gauge %q: %v", name, err))
return nil, fmt.Errorf("otel: create int64 gauge %q: %w", name, err)
}
return &Int64Gauge{g: g}
return &Int64Gauge{g: g, labelNames: normalizedLabelNames}, nil
}
// NewFloat64Gauge creates an OTel Float64Gauge.
func (b *Backend) NewFloat64Gauge(name, desc string, _ ...string) *Float64Gauge {
func (b *Backend) NewFloat64Gauge(name, desc string, labelNames ...string) (*Float64Gauge, error) {
normalizedLabelNames, err := validateLabelNames(labelNames)
if err != nil {
return nil, fmt.Errorf("otel: create float64 gauge %q: %w", name, err)
}
g, err := b.meter.Float64Gauge(name, metric.WithDescription(desc))
if err != nil {
panic(fmt.Sprintf("otel: create float64 gauge %q: %v", name, err))
return nil, fmt.Errorf("otel: create float64 gauge %q: %w", name, err)
}
return &Float64Gauge{g: g}
return &Float64Gauge{g: g, labelNames: normalizedLabelNames}, nil
}
// NewHistogram creates an OTel Float64Histogram with explicit bucket boundaries.
func (b *Backend) NewHistogram(name, desc string, buckets []float64, _ ...string) *Histogram {
func (b *Backend) NewHistogram(name, desc string, buckets []float64, labelNames ...string) (*Histogram, error) {
normalizedLabelNames, err := validateLabelNames(labelNames)
if err != nil {
return nil, fmt.Errorf("otel: create histogram %q: %w", name, err)
}
h, err := b.meter.Float64Histogram(name,
metric.WithDescription(desc),
metric.WithExplicitBucketBoundaries(buckets...),
)
if err != nil {
panic(fmt.Sprintf("otel: create histogram %q: %v", name, err))
return nil, fmt.Errorf("otel: create histogram %q: %w", name, err)
}
return &Histogram{h: h}
return &Histogram{h: h, labelNames: normalizedLabelNames}, nil
}
// labelsToAttrs converts a Labels map to OTel attribute key-value pairs.
func labelsToAttrs(labels map[string]string) []attribute.KeyValue {
if len(labels) == 0 {
return nil
func validateLabelNames(labelNames []string) ([]string, error) {
if len(labelNames) == 0 {
return nil, nil
}
attrs := make([]attribute.KeyValue, 0, len(labels))
for k, v := range labels {
attrs = append(attrs, attribute.String(k, v))
normalized := make([]string, len(labelNames))
seen := make(map[string]struct{}, len(labelNames))
for i, name := range labelNames {
if !metricLabelNameRE.MatchString(name) {
return nil, fmt.Errorf("invalid label name %q", name)
}
if _, exists := seen[name]; exists {
return nil, fmt.Errorf("duplicate label name %q", name)
}
seen[name] = struct{}{}
normalized[i] = name
}
return normalized, nil
}
func labelsToAttrs(labelNames []string, labels map[string]string) []attribute.KeyValue {
if len(labelNames) == 0 {
if len(labels) > 0 {
log.Printf("WARN: dropping otel metric sample due to unexpected labels: got=%v expected=none", labels)
return nil
}
return []attribute.KeyValue{}
}
attrs := make([]attribute.KeyValue, 0, len(labelNames))
for _, labelName := range labelNames {
attrs = append(attrs, attribute.String(labelName, labels[labelName]))
}
for got := range labels {
found := false
for _, expected := range labelNames {
if got == expected {
found = true
break
}
}
if !found {
log.Printf("WARN: dropping otel metric sample due to unexpected label key %q (expected=%v)", got, labelNames)
return nil
}
}
return attrs
}
// Counter wraps an OTel Int64Counter.
type Counter struct {
c metric.Int64Counter
c metric.Int64Counter
labelNames []string
}
// Add increments the counter by value.
func (c *Counter) Add(ctx context.Context, value int64, labels map[string]string) {
c.c.Add(ctx, value, metric.WithAttributes(labelsToAttrs(labels)...))
attrs := labelsToAttrs(c.labelNames, labels)
if attrs == nil {
return
}
c.c.Add(ctx, value, metric.WithAttributes(attrs...))
}
// UpDownCounter wraps an OTel Int64UpDownCounter.
type UpDownCounter struct {
c metric.Int64UpDownCounter
c metric.Int64UpDownCounter
labelNames []string
}
// Add adjusts the up-down counter by value.
func (u *UpDownCounter) Add(ctx context.Context, value int64, labels map[string]string) {
u.c.Add(ctx, value, metric.WithAttributes(labelsToAttrs(labels)...))
attrs := labelsToAttrs(u.labelNames, labels)
if attrs == nil {
return
}
u.c.Add(ctx, value, metric.WithAttributes(attrs...))
}
// Int64Gauge wraps an OTel Int64Gauge.
type Int64Gauge struct {
g metric.Int64Gauge
g metric.Int64Gauge
labelNames []string
}
// Record sets the gauge to value.
func (g *Int64Gauge) Record(ctx context.Context, value int64, labels map[string]string) {
g.g.Record(ctx, value, metric.WithAttributes(labelsToAttrs(labels)...))
attrs := labelsToAttrs(g.labelNames, labels)
if attrs == nil {
return
}
g.g.Record(ctx, value, metric.WithAttributes(attrs...))
}
// Float64Gauge wraps an OTel Float64Gauge.
type Float64Gauge struct {
g metric.Float64Gauge
g metric.Float64Gauge
labelNames []string
}
// Record sets the gauge to value.
func (g *Float64Gauge) Record(ctx context.Context, value float64, labels map[string]string) {
g.g.Record(ctx, value, metric.WithAttributes(labelsToAttrs(labels)...))
attrs := labelsToAttrs(g.labelNames, labels)
if attrs == nil {
return
}
g.g.Record(ctx, value, metric.WithAttributes(attrs...))
}
// Histogram wraps an OTel Float64Histogram.
type Histogram struct {
h metric.Float64Histogram
h metric.Float64Histogram
labelNames []string
}
// Record observes value in the histogram.
func (h *Histogram) Record(ctx context.Context, value float64, labels map[string]string) {
h.h.Record(ctx, value, metric.WithAttributes(labelsToAttrs(labels)...))
attrs := labelsToAttrs(h.labelNames, labels)
if attrs == nil {
return
}
h.h.Record(ctx, value, metric.WithAttributes(attrs...))
}

View File

@@ -55,7 +55,10 @@ func TestOtelBackendCounter(t *testing.T) {
b := newInMemoryBackend(t)
defer b.Shutdown(context.Background()) //nolint:errcheck
c := b.NewCounter("gerbil_test_counter_total", "test counter", "result")
c, err := b.NewCounter("gerbil_test_counter_total", "test counter", "result")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
// Should not panic
c.Add(context.Background(), 1, map[string]string{"result": "ok"})
c.Add(context.Background(), 5, nil)
@@ -65,7 +68,10 @@ func TestOtelBackendUpDownCounter(t *testing.T) {
b := newInMemoryBackend(t)
defer b.Shutdown(context.Background()) //nolint:errcheck
u := b.NewUpDownCounter("gerbil_test_updown", "test updown", "state")
u, err := b.NewUpDownCounter("gerbil_test_updown", "test updown", "state")
if err != nil {
t.Fatalf("NewUpDownCounter returned error: %v", err)
}
u.Add(context.Background(), 3, map[string]string{"state": "active"})
u.Add(context.Background(), -1, map[string]string{"state": "active"})
}
@@ -74,7 +80,10 @@ func TestOtelBackendInt64Gauge(t *testing.T) {
b := newInMemoryBackend(t)
defer b.Shutdown(context.Background()) //nolint:errcheck
g := b.NewInt64Gauge("gerbil_test_int_gauge", "test gauge")
g, err := b.NewInt64Gauge("gerbil_test_int_gauge", "test gauge")
if err != nil {
t.Fatalf("NewInt64Gauge returned error: %v", err)
}
g.Record(context.Background(), 42, nil)
}
@@ -82,7 +91,10 @@ func TestOtelBackendFloat64Gauge(t *testing.T) {
b := newInMemoryBackend(t)
defer b.Shutdown(context.Background()) //nolint:errcheck
g := b.NewFloat64Gauge("gerbil_test_float_gauge", "test float gauge")
g, err := b.NewFloat64Gauge("gerbil_test_float_gauge", "test float gauge")
if err != nil {
t.Fatalf("NewFloat64Gauge returned error: %v", err)
}
g.Record(context.Background(), 3.14, nil)
}
@@ -90,8 +102,11 @@ func TestOtelBackendHistogram(t *testing.T) {
b := newInMemoryBackend(t)
defer b.Shutdown(context.Background()) //nolint:errcheck
h := b.NewHistogram("gerbil_test_duration_seconds", "test histogram",
h, err := b.NewHistogram("gerbil_test_duration_seconds", "test histogram",
[]float64{0.1, 0.5, 1.0}, "method")
if err != nil {
t.Fatalf("NewHistogram returned error: %v", err)
}
h.Record(context.Background(), 0.3, map[string]string{"method": "GET"})
}
@@ -139,3 +154,22 @@ func TestOtelBackendDeploymentEnvironment(t *testing.T) {
}
defer b.Shutdown(context.Background()) //nolint:errcheck
}
func TestOtelBackendRejectsInvalidLabelNames(t *testing.T) {
b := newInMemoryBackend(t)
defer b.Shutdown(context.Background()) //nolint:errcheck
t.Run("duplicate labels", func(t *testing.T) {
_, err := b.NewCounter("gerbil_test_invalid_labels_total", "test counter", "result", "result")
if err == nil {
t.Fatal("expected error for duplicate label names")
}
})
t.Run("invalid label name", func(t *testing.T) {
_, err := b.NewHistogram("gerbil_test_invalid_histogram", "test histogram", []float64{0.1, 1.0}, "status-code")
if err == nil {
t.Fatal("expected error for invalid label name")
}
})
}

View File

@@ -3,6 +3,8 @@ package otel
import (
"context"
"fmt"
"net/url"
"strings"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp"
@@ -11,6 +13,10 @@ import (
// newExporter creates the appropriate OTLP exporter based on cfg.Protocol.
func newExporter(ctx context.Context, cfg Config) (sdkmetric.Exporter, error) {
if strings.TrimSpace(cfg.Endpoint) == "" {
return nil, fmt.Errorf("otel: cfg.Endpoint is empty")
}
switch cfg.Protocol {
case "grpc", "":
return newGRPCExporter(ctx, cfg)
@@ -36,8 +42,20 @@ func newGRPCExporter(ctx context.Context, cfg Config) (sdkmetric.Exporter, error
}
func newHTTPExporter(ctx context.Context, cfg Config) (sdkmetric.Exporter, error) {
opts := []otlpmetrichttp.Option{
otlpmetrichttp.WithEndpoint(cfg.Endpoint),
endpoint := strings.TrimSpace(cfg.Endpoint)
opts := make([]otlpmetrichttp.Option, 0, 3)
if strings.Contains(endpoint, "://") {
parsed, err := url.Parse(endpoint)
if err != nil {
return nil, fmt.Errorf("otlp http exporter: parse endpoint URL %q: %w", endpoint, err)
}
opts = append(opts, otlpmetrichttp.WithEndpointURL(parsed.String()))
} else {
opts = append(opts,
otlpmetrichttp.WithEndpoint(endpoint),
otlpmetrichttp.WithURLPath("/v1/metrics"),
)
}
if cfg.Insecure {
opts = append(opts, otlpmetrichttp.WithInsecure())

View File

@@ -3,7 +3,7 @@ package otel
import (
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/sdk/resource"
semconv "go.opentelemetry.io/otel/semconv/v1.40.0"
semconv "go.opentelemetry.io/otel/semconv/v1.18.0"
)
// newResource builds an OTel resource for the Gerbil service.
@@ -15,11 +15,11 @@ func newResource(serviceName, serviceVersion, deploymentEnv string) (*resource.R
attrs = append(attrs, semconv.ServiceVersion(serviceVersion))
}
if deploymentEnv != "" {
attrs = append(attrs, semconv.DeploymentEnvironmentName(deploymentEnv))
attrs = append(attrs, semconv.DeploymentEnvironment(deploymentEnv))
}
return resource.Merge(
resource.Default(),
resource.NewWithAttributes(semconv.SchemaURL, attrs...),
resource.NewSchemaless(attrs...),
)
}

View File

@@ -7,6 +7,7 @@ package prometheus
import (
"context"
"log"
"net/http"
"github.com/prometheus/client_golang/prometheus"
@@ -30,9 +31,10 @@ type Config struct {
// in the backend-specific instrument types that implement the observability
// instrument interfaces.
type Backend struct {
cfg Config
registry *prometheus.Registry
handler http.Handler
cfg Config
registry *prometheus.Registry
handler http.Handler
droppedSamplesCounter prometheus.Counter
}
// New creates and initialises a Prometheus backend.
@@ -48,6 +50,11 @@ func New(cfg Config) (*Backend, error) {
}
registry := prometheus.NewRegistry()
droppedSamplesCounter := prometheus.NewCounter(prometheus.CounterOpts{
Name: "gerbil_dropped_metric_samples_total",
Help: "Total number of metric samples dropped due to invalid labels or unsupported label sets",
})
registry.MustRegister(droppedSamplesCounter)
// Include Go and process metrics by default.
includeGo := cfg.IncludeGoMetrics == nil || *cfg.IncludeGoMetrics
@@ -62,7 +69,7 @@ func New(cfg Config) (*Backend, error) {
EnableOpenMetrics: false,
})
return &Backend{cfg: cfg, registry: registry, handler: handler}, nil
return &Backend{cfg: cfg, registry: registry, handler: handler, droppedSamplesCounter: droppedSamplesCounter}, nil
}
// HTTPHandler returns the Prometheus /metrics HTTP handler.
@@ -78,60 +85,107 @@ func (b *Backend) Shutdown(_ context.Context) error {
}
// NewCounter creates a Prometheus CounterVec registered on the backend's registry.
func (b *Backend) NewCounter(name, desc string, labelNames ...string) *Counter {
func (b *Backend) NewCounter(name, desc string, labelNames ...string) (*Counter, error) {
vec := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: name,
Help: desc,
}, labelNames)
b.registry.MustRegister(vec)
return &Counter{vec: vec}
if err := b.registry.Register(vec); err != nil {
if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
existing, ok := are.ExistingCollector.(*prometheus.CounterVec)
if !ok {
return nil, err
}
return &Counter{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
return nil, err
}
return &Counter{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
// NewUpDownCounter creates a Prometheus GaugeVec (Prometheus gauges are
// bidirectional) registered on the backend's registry.
func (b *Backend) NewUpDownCounter(name, desc string, labelNames ...string) *UpDownCounter {
func (b *Backend) NewUpDownCounter(name, desc string, labelNames ...string) (*UpDownCounter, error) {
vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: name,
Help: desc,
}, labelNames)
b.registry.MustRegister(vec)
return &UpDownCounter{vec: vec}
if err := b.registry.Register(vec); err != nil {
if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
existing, ok := are.ExistingCollector.(*prometheus.GaugeVec)
if !ok {
return nil, err
}
return &UpDownCounter{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
return nil, err
}
return &UpDownCounter{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
// NewInt64Gauge creates a Prometheus GaugeVec registered on the backend's registry.
func (b *Backend) NewInt64Gauge(name, desc string, labelNames ...string) *Int64Gauge {
func (b *Backend) NewInt64Gauge(name, desc string, labelNames ...string) (*Int64Gauge, error) {
vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: name,
Help: desc,
}, labelNames)
b.registry.MustRegister(vec)
return &Int64Gauge{vec: vec}
if err := b.registry.Register(vec); err != nil {
if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
existing, ok := are.ExistingCollector.(*prometheus.GaugeVec)
if !ok {
return nil, err
}
return &Int64Gauge{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
return nil, err
}
return &Int64Gauge{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
// NewFloat64Gauge creates a Prometheus GaugeVec registered on the backend's registry.
func (b *Backend) NewFloat64Gauge(name, desc string, labelNames ...string) *Float64Gauge {
func (b *Backend) NewFloat64Gauge(name, desc string, labelNames ...string) (*Float64Gauge, error) {
vec := prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: name,
Help: desc,
}, labelNames)
b.registry.MustRegister(vec)
return &Float64Gauge{vec: vec}
if err := b.registry.Register(vec); err != nil {
if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
existing, ok := are.ExistingCollector.(*prometheus.GaugeVec)
if !ok {
return nil, err
}
return &Float64Gauge{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
return nil, err
}
return &Float64Gauge{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
// NewHistogram creates a Prometheus HistogramVec registered on the backend's registry.
func (b *Backend) NewHistogram(name, desc string, buckets []float64, labelNames ...string) *Histogram {
func (b *Backend) NewHistogram(name, desc string, buckets []float64, labelNames ...string) (*Histogram, error) {
vec := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: name,
Help: desc,
Buckets: buckets,
}, labelNames)
b.registry.MustRegister(vec)
return &Histogram{vec: vec}
if err := b.registry.Register(vec); err != nil {
if are, ok := err.(prometheus.AlreadyRegisteredError); ok {
existing, ok := are.ExistingCollector.(*prometheus.HistogramVec)
if !ok {
return nil, err
}
return &Histogram{vec: existing, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
return nil, err
}
return &Histogram{vec: vec, labelNames: append([]string(nil), labelNames...), droppedSamplesCounter: b.droppedSamplesCounter}, nil
}
// Counter is a native Prometheus counter instrument.
type Counter struct {
vec *prometheus.CounterVec
vec *prometheus.CounterVec
labelNames []string
droppedSamplesCounter prometheus.Counter
}
// Add increments the counter by value for the given labels.
@@ -139,47 +193,118 @@ type Counter struct {
// value must be non-negative. Negative values are ignored.
func (c *Counter) Add(_ context.Context, value int64, labels map[string]string) {
if value < 0 {
log.Printf("WARN: counter add called with negative value=%d labels=%v expected_labels=%v", value, labels, c.labelNames)
return
}
c.vec.With(prometheus.Labels(labels)).Add(float64(value))
normalized, ok := normalizeLabels(c.labelNames, labels, c.droppedSamplesCounter)
if !ok {
return
}
defer guardMetricPanic("counter", c.labelNames, labels)
c.vec.With(normalized).Add(float64(value))
}
// UpDownCounter is a native Prometheus gauge used as a bidirectional counter.
type UpDownCounter struct {
vec *prometheus.GaugeVec
vec *prometheus.GaugeVec
labelNames []string
droppedSamplesCounter prometheus.Counter
}
// Add adjusts the gauge by value for the given labels.
func (u *UpDownCounter) Add(_ context.Context, value int64, labels map[string]string) {
u.vec.With(prometheus.Labels(labels)).Add(float64(value))
normalized, ok := normalizeLabels(u.labelNames, labels, u.droppedSamplesCounter)
if !ok {
return
}
defer guardMetricPanic("updown", u.labelNames, labels)
u.vec.With(normalized).Add(float64(value))
}
// Int64Gauge is a native Prometheus gauge recording integer snapshot values.
type Int64Gauge struct {
vec *prometheus.GaugeVec
vec *prometheus.GaugeVec
labelNames []string
droppedSamplesCounter prometheus.Counter
}
// Record sets the gauge to value for the given labels.
func (g *Int64Gauge) Record(_ context.Context, value int64, labels map[string]string) {
g.vec.With(prometheus.Labels(labels)).Set(float64(value))
normalized, ok := normalizeLabels(g.labelNames, labels, g.droppedSamplesCounter)
if !ok {
return
}
defer guardMetricPanic("int64-gauge", g.labelNames, labels)
g.vec.With(normalized).Set(float64(value))
}
// Float64Gauge is a native Prometheus gauge recording float snapshot values.
type Float64Gauge struct {
vec *prometheus.GaugeVec
vec *prometheus.GaugeVec
labelNames []string
droppedSamplesCounter prometheus.Counter
}
// Record sets the gauge to value for the given labels.
func (g *Float64Gauge) Record(_ context.Context, value float64, labels map[string]string) {
g.vec.With(prometheus.Labels(labels)).Set(value)
normalized, ok := normalizeLabels(g.labelNames, labels, g.droppedSamplesCounter)
if !ok {
return
}
defer guardMetricPanic("float64-gauge", g.labelNames, labels)
g.vec.With(normalized).Set(value)
}
// Histogram is a native Prometheus histogram instrument.
type Histogram struct {
vec *prometheus.HistogramVec
vec *prometheus.HistogramVec
labelNames []string
droppedSamplesCounter prometheus.Counter
}
// Record observes value for the given labels.
func (h *Histogram) Record(_ context.Context, value float64, labels map[string]string) {
h.vec.With(prometheus.Labels(labels)).Observe(value)
normalized, ok := normalizeLabels(h.labelNames, labels, h.droppedSamplesCounter)
if !ok {
return
}
defer guardMetricPanic("histogram", h.labelNames, labels)
h.vec.With(normalized).Observe(value)
}
func normalizeLabels(labelNames []string, labels map[string]string, droppedSamplesCounter prometheus.Counter) (prometheus.Labels, bool) {
if len(labelNames) == 0 {
if len(labels) > 0 {
if droppedSamplesCounter != nil {
droppedSamplesCounter.Inc()
}
log.Printf("WARN: dropping metric sample due to unexpected labels: got=%v expected=none", labels)
return nil, false
}
return nil, true
}
normalized := make(prometheus.Labels, len(labelNames))
for _, name := range labelNames {
normalized[name] = ""
}
for k, v := range labels {
if _, ok := normalized[k]; !ok {
if droppedSamplesCounter != nil {
droppedSamplesCounter.Inc()
}
log.Printf("WARN: dropping metric sample due to unexpected label key %q (expected=%v)", k, labelNames)
return nil, false
}
normalized[k] = v
}
return normalized, true
}
func guardMetricPanic(kind string, expected []string, labels map[string]string) {
if recovered := recover(); recovered != nil {
log.Printf("WARN: dropped %s metric sample due to label panic: expected=%v got=%v err=%v", kind, expected, labels, recovered)
}
}

View File

@@ -36,7 +36,10 @@ func TestPrometheusBackendShutdown(t *testing.T) {
func TestPrometheusBackendCounter(t *testing.T) {
b := newTestBackend(t)
c := b.NewCounter("test_counter_total", "A test counter", "result")
c, err := b.NewCounter("test_counter_total", "A test counter", "result")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
c.Add(context.Background(), 3, map[string]string{"result": "ok"})
body := scrapeMetrics(t, b)
@@ -45,7 +48,10 @@ func TestPrometheusBackendCounter(t *testing.T) {
func TestPrometheusBackendUpDownCounter(t *testing.T) {
b := newTestBackend(t)
u := b.NewUpDownCounter("test_gauge_total", "A test up-down counter", "state")
u, err := b.NewUpDownCounter("test_gauge_total", "A test up-down counter", "state")
if err != nil {
t.Fatalf("NewUpDownCounter returned error: %v", err)
}
u.Add(context.Background(), 5, map[string]string{"state": "active"})
u.Add(context.Background(), -2, map[string]string{"state": "active"})
@@ -55,7 +61,10 @@ func TestPrometheusBackendUpDownCounter(t *testing.T) {
func TestPrometheusBackendInt64Gauge(t *testing.T) {
b := newTestBackend(t)
g := b.NewInt64Gauge("test_int_gauge", "An integer gauge", "ifname")
g, err := b.NewInt64Gauge("test_int_gauge", "An integer gauge", "ifname")
if err != nil {
t.Fatalf("NewInt64Gauge returned error: %v", err)
}
g.Record(context.Background(), 42, map[string]string{"ifname": "wg0"})
body := scrapeMetrics(t, b)
@@ -64,7 +73,10 @@ func TestPrometheusBackendInt64Gauge(t *testing.T) {
func TestPrometheusBackendFloat64Gauge(t *testing.T) {
b := newTestBackend(t)
g := b.NewFloat64Gauge("test_float_gauge", "A float gauge", "cert")
g, err := b.NewFloat64Gauge("test_float_gauge", "A float gauge", "cert")
if err != nil {
t.Fatalf("NewFloat64Gauge returned error: %v", err)
}
g.Record(context.Background(), 7.5, map[string]string{"cert": "example.com"})
body := scrapeMetrics(t, b)
@@ -74,7 +86,10 @@ func TestPrometheusBackendFloat64Gauge(t *testing.T) {
func TestPrometheusBackendHistogram(t *testing.T) {
b := newTestBackend(t)
buckets := []float64{0.1, 0.5, 1.0, 5.0}
h := b.NewHistogram("test_duration_seconds", "A test histogram", buckets, "method")
h, err := b.NewHistogram("test_duration_seconds", "A test histogram", buckets, "method")
if err != nil {
t.Fatalf("NewHistogram returned error: %v", err)
}
h.Record(context.Background(), 0.3, map[string]string{"method": "GET"})
body := scrapeMetrics(t, b)
@@ -85,7 +100,10 @@ func TestPrometheusBackendHistogram(t *testing.T) {
func TestPrometheusBackendMultipleLabels(t *testing.T) {
b := newTestBackend(t)
c := b.NewCounter("multi_label_total", "Multi-label counter", "method", "route", "status_code")
c, err := b.NewCounter("multi_label_total", "Multi-label counter", "method", "route", "status_code")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
c.Add(context.Background(), 1, map[string]string{
"method": "POST",
"route": "/api/peers",
@@ -122,23 +140,29 @@ func TestPrometheusBackendNoGoMetrics(t *testing.T) {
func TestPrometheusBackendNilLabels(t *testing.T) {
// Adding with nil labels should not panic (treated as empty map).
b := newTestBackend(t)
c := b.NewCounter("nil_labels_total", "counter with no labels")
c, err := b.NewCounter("nil_labels_total", "counter with no labels")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
// nil labels with no label names declared should be safe
c.Add(context.Background(), 1, nil)
}
func TestPrometheusBackendConcurrentAdd(t *testing.T) {
b := newTestBackend(t)
c := b.NewCounter("concurrent_total", "concurrent counter", "worker")
c, err := b.NewCounter("concurrent_total", "concurrent counter", "worker")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
done := make(chan struct{})
for i := 0; i < 10; i++ {
go func(_ int) {
go func() {
for j := 0; j < 100; j++ {
c.Add(context.Background(), 1, map[string]string{"worker": "w"})
}
done <- struct{}{}
}(i)
}()
}
for i := 0; i < 10; i++ {
<-done
@@ -148,6 +172,40 @@ func TestPrometheusBackendConcurrentAdd(t *testing.T) {
assertMetricPresent(t, body, `concurrent_total{worker="w"} 1000`)
}
func TestPrometheusBackendAlreadyRegisteredCounter(t *testing.T) {
b := newTestBackend(t)
c1, err := b.NewCounter("dupe_counter_total", "duplicate counter", "result")
if err != nil {
t.Fatalf("first NewCounter returned error: %v", err)
}
c2, err := b.NewCounter("dupe_counter_total", "duplicate counter", "result")
if err != nil {
t.Fatalf("second NewCounter returned error: %v", err)
}
c1.Add(context.Background(), 1, map[string]string{"result": "ok"})
c2.Add(context.Background(), 2, map[string]string{"result": "ok"})
body := scrapeMetrics(t, b)
assertMetricPresent(t, body, `dupe_counter_total{result="ok"} 3`)
}
func TestPrometheusBackendInvalidLabelsNoPanic(t *testing.T) {
b := newTestBackend(t)
c, err := b.NewCounter("invalid_labels_total", "invalid labels test", "result")
if err != nil {
t.Fatalf("NewCounter returned error: %v", err)
}
// Extra label key should be dropped and must not panic.
c.Add(context.Background(), 5, map[string]string{"result": "ok", "unexpected": "x"})
body := scrapeMetrics(t, b)
if strings.Contains(body, `invalid_labels_total{result="ok"}`) {
t.Error("invalid label sample should have been dropped")
}
}
// --- helpers ---
func scrapeMetrics(t *testing.T, b *obsprom.Backend) string {

17
main.go
View File

@@ -175,6 +175,7 @@ func main() {
otelMetricsEndpoint string
otelMetricsInsecure bool
otelMetricsExportInterval time.Duration
otelMetricsTimeout time.Duration
)
interfaceName = os.Getenv("INTERFACE")
@@ -229,6 +230,14 @@ func main() {
log.Printf("WARN: invalid OTEL_METRICS_EXPORT_INTERVAL=%q: %v", v, err2)
}
}
otelMetricsTimeout = 10 * time.Second // default
if v := os.Getenv("OTEL_METRICS_TIMEOUT"); v != "" {
if d, err2 := time.ParseDuration(v); err2 == nil {
otelMetricsTimeout = d
} else {
log.Printf("WARN: invalid OTEL_METRICS_TIMEOUT=%q: %v", v, err2)
}
}
if interfaceName == "" {
flag.StringVar(&interfaceName, "interface", "wg0", "Name of the WireGuard interface")
@@ -322,6 +331,7 @@ func main() {
flag.StringVar(&otelMetricsEndpoint, "otel-metrics-endpoint", otelMetricsEndpoint, "OTLP collector endpoint (e.g. localhost:4317)")
flag.BoolVar(&otelMetricsInsecure, "otel-metrics-insecure", otelMetricsInsecure, "Disable TLS for OTLP connection")
flag.DurationVar(&otelMetricsExportInterval, "otel-metrics-export-interval", otelMetricsExportInterval, "Interval between OTLP metric pushes")
flag.DurationVar(&otelMetricsTimeout, "otel-metrics-timeout", otelMetricsTimeout, "Timeout for OTLP exporter setup")
flag.Parse()
@@ -347,6 +357,7 @@ func main() {
Endpoint: otelMetricsEndpoint,
Insecure: otelMetricsInsecure,
ExportInterval: otelMetricsExportInterval,
Timeout: otelMetricsTimeout,
},
ServiceName: "gerbil",
ServiceVersion: "1.0.0",
@@ -543,6 +554,8 @@ func main() {
// Register metrics endpoint only for Prometheus backend.
// OTel backend pushes to a collector; no /metrics endpoint needed.
// Note: metricsPath is registered directly without httpMetricsMiddleware to prevent infinite recursion.
// The metricsHandler must not be wrapped by the middleware, as it would observe its own observation calls.
if metricsHandler != nil {
http.Handle(metricsPath, metricsHandler)
logger.Info("Metrics endpoint enabled at %s", metricsPath)
@@ -1162,10 +1175,12 @@ func removePeerInternal(publicKey string) error {
// Get current peer info before removing to clear relay connections and bandwidth limits
var wgIPs []string
allowedIPsCount := 0
device, err := wgClient.Device(interfaceName)
if err == nil {
for _, peer := range device.Peers {
if peer.PublicKey.String() == publicKey {
allowedIPsCount = len(peer.AllowedIPs)
// Extract WireGuard IPs from this peer's allowed IPs
for _, allowedIP := range peer.AllowedIPs {
wgIPs = append(wgIPs, allowedIP.IP.String())
@@ -1208,7 +1223,7 @@ func removePeerInternal(publicKey string) error {
// Record metrics
metrics.RecordPeersTotal(interfaceName, -1)
metrics.RecordAllowedIPsCount(interfaceName, publicKey, -int64(len(wgIPs)))
metrics.RecordAllowedIPsCount(interfaceName, publicKey, -int64(allowedIPsCount))
return nil
}

View File

@@ -548,7 +548,7 @@ func (p *SNIProxy) handleConnection(clientConn net.Conn) {
logger.Debug("SNI extraction failed: %v", err)
return
}
metrics.RecordProxyTLSHandshake(hostname, time.Since(clientHelloStart).Seconds())
metrics.RecordProxyTLSHandshake(time.Since(clientHelloStart).Seconds())
if hostname == "" {
log.Println("No SNI hostname found")
@@ -596,8 +596,8 @@ func (p *SNIProxy) handleConnection(clientConn net.Conn) {
defer targetConn.Close()
logger.Debug("Connected to target: %s:%d", route.TargetHost, route.TargetPort)
metrics.RecordActiveProxyConnection(hostname, 1)
defer metrics.RecordActiveProxyConnection(hostname, -1)
metrics.RecordActiveProxyConnection(1)
defer metrics.RecordActiveProxyConnection(-1)
// Send PROXY protocol header if enabled
if p.proxyProtocol {
@@ -655,7 +655,7 @@ func (p *SNIProxy) getRoute(hostname, clientAddr string) (*RouteRecord, error) {
// Check local overrides first
if _, isOverride := p.localOverrides[hostname]; isOverride {
logger.Debug("Local override matched for hostname: %s", hostname)
metrics.RecordProxyRouteLookup("local_override", hostname)
metrics.RecordProxyRouteLookup("local_override")
return &RouteRecord{
Hostname: hostname,
TargetHost: p.localProxyAddr,
@@ -668,7 +668,7 @@ func (p *SNIProxy) getRoute(hostname, clientAddr string) (*RouteRecord, error) {
_, isLocal := p.localSNIs[hostname]
p.localSNIsLock.RUnlock()
if isLocal {
metrics.RecordProxyRouteLookup("local", hostname)
metrics.RecordProxyRouteLookup("local")
return &RouteRecord{
Hostname: hostname,
TargetHost: p.localProxyAddr,
@@ -679,16 +679,16 @@ func (p *SNIProxy) getRoute(hostname, clientAddr string) (*RouteRecord, error) {
// Check cache first
if cached, found := p.cache.Get(hostname); found {
if cached == nil {
metrics.RecordProxyRouteLookup("cached_not_found", hostname)
metrics.RecordProxyRouteLookup("cached_not_found")
return nil, nil // Cached negative result
}
logger.Debug("Cache hit for hostname: %s", hostname)
metrics.RecordProxyRouteLookup("cache_hit", hostname)
metrics.RecordProxyRouteLookup("cache_hit")
return cached.(*RouteRecord), nil
}
logger.Debug("Cache miss for hostname: %s, querying API", hostname)
metrics.RecordProxyRouteLookup("cache_miss", hostname)
metrics.RecordProxyRouteLookup("cache_miss")
// Query API with timeout
ctx, cancel := context.WithTimeout(p.ctx, 5*time.Second)
@@ -822,7 +822,7 @@ func (p *SNIProxy) pipe(hostname string, clientConn, targetConn net.Conn, client
}()
bytesCopied, err := io.CopyBuffer(targetConn, clientReader, *bufPtr)
metrics.RecordProxyBytesTransmitted(hostname, "client_to_target", bytesCopied)
metrics.RecordProxyBytesTransmitted("client_to_target", bytesCopied)
if err != nil && err != io.EOF {
logger.Debug("Copy client->target error: %v", err)
}
@@ -842,7 +842,7 @@ func (p *SNIProxy) pipe(hostname string, clientConn, targetConn net.Conn, client
}()
bytesCopied, err := io.CopyBuffer(clientConn, targetConn, *bufPtr)
metrics.RecordProxyBytesTransmitted(hostname, "target_to_client", bytesCopied)
metrics.RecordProxyBytesTransmitted("target_to_client", bytesCopied)
if err != nil && err != io.EOF {
logger.Debug("Copy target->client error: %v", err)
}