diff --git a/docker-compose.metrics.yml b/docker-compose.metrics.yml index e3eb21d..1dcb633 100644 --- a/docker-compose.metrics.yml +++ b/docker-compose.metrics.yml @@ -1,25 +1,17 @@ services: - collector: - image: otel/opentelemetry-collector-contrib:0.136.0 - command: ["--config=/etc/otelcol/config.yaml"] - volumes: - - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro - ports: - - "4317:4317" # OTLP gRPC in - - "8889:8889" # Prometheus scrape out + # Recommended Variant A: Direct Prometheus scrape of Newt (/metrics) + # Optional: You may add the Collector service and enable OTLP export, but do NOT + # scrape both Newt and the Collector for the same process. newt: build: . image: newt:dev - env_file: - - .env + env_file: + - .env environment: OTEL_SERVICE_NAME: newt NEWT_METRICS_PROMETHEUS_ENABLED: "true" - NEWT_METRICS_OTLP_ENABLED: "true" - OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317" - OTEL_EXPORTER_OTLP_INSECURE: "true" - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative" + NEWT_METRICS_OTLP_ENABLED: "false" # avoid double-scrape by default NEWT_ADMIN_ADDR: ":2112" # Base NEWT configuration PANGOLIN_ENDPOINT: ${PANGOLIN_ENDPOINT} @@ -28,8 +20,16 @@ services: LOG_LEVEL: "DEBUG" ports: - "2112:2112" - depends_on: - - collector + + # Optional Variant B: Enable the Collector and switch Prometheus scrape to it. + # collector: + # image: otel/opentelemetry-collector-contrib:0.136.0 + # command: ["--config=/etc/otelcol/config.yaml"] + # volumes: + # - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro + # ports: + # - "4317:4317" # OTLP gRPC in + # - "8889:8889" # Prometheus scrape out prometheus: image: prom/prometheus:v3.6.0 diff --git a/docs/observability.md b/docs/observability.md index cf8de79..bae5fb7 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -49,8 +49,8 @@ Metric catalog (initial) Conventions -- Durations in seconds, names end with _seconds -- Sizes in bytes, names end with _bytes +- Durations in seconds (unit: s), names end with _seconds +- Sizes in bytes (unit: By), names end with _bytes - Counters end with _total - Labels must be low-cardinality and stable @@ -163,6 +163,7 @@ Compatibility notes - Gauges do not use the _total suffix (e.g., newt_tunnel_sessions). - site_id is emitted as both resource attribute and metric label on all newt_* series; region is included as a metric label only when set. tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels. +- NEWT_METRICS_INCLUDE_TUNNEL_ID (default: true) toggles whether tunnel_id is included as a label on bytes/sessions/proxy/reconnect metrics. Disable in high-cardinality environments. - Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both. - Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write. - No free text in labels; use only the enumerated constants for reason, protocol (tcp|udp), and transport (e.g., websocket|wireguard). diff --git a/examples/prometheus.yml b/examples/prometheus.yml index c3018a5..89e82b4 100644 --- a/examples/prometheus.yml +++ b/examples/prometheus.yml @@ -2,9 +2,20 @@ global: scrape_interval: 15s scrape_configs: - - job_name: newt + - job_name: 'newt' + scrape_interval: 15s static_configs: - - targets: ["newt:2112"] - - job_name: otel-collector - static_configs: - - targets: ["collector:8889"] + - targets: ['newt:2112'] # /metrics + relabel_configs: + # optional: tunnel_id droppen + - action: labeldrop + regex: 'tunnel_id' + # optional: nur bestimmte sites zulassen + - action: keep + source_labels: [site_id] + regex: '(site-a|site-b)' + + # WARNING: Do not enable this together with the 'newt' job above or you will double-count. + # - job_name: 'otel-collector' + # static_configs: + # - targets: ['collector:8889'] diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index 84c0f27..2571379 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -248,10 +248,13 @@ func IncSiteRegistration(ctx context.Context, result string) { } func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) { - mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite( - attribute.String("tunnel_id", tunnelID), + attrs := []attribute.KeyValue{ attribute.String("direction", direction), - )...)) + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(attrs...)...)) } // AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations. @@ -316,18 +319,24 @@ func IncCertRotation(ctx context.Context, result string) { } func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) { - mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite( - attribute.String("tunnel_id", tunnelID), + attrs := []attribute.KeyValue{ attribute.String("transport", transport), - )...)) + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...)) } func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) { - mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite( - attribute.String("tunnel_id", tunnelID), + attrs := []attribute.KeyValue{ attribute.String("initiator", initiator), attribute.String("reason", reason), - )...)) + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) } func IncConnAttempt(ctx context.Context, transport, result string) { diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 7e3b819..8eb0927 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -112,6 +112,12 @@ type Setup struct { // installs recommended histogram views for *_latency_seconds, and returns a Setup with // a Shutdown method to flush exporters. func Init(ctx context.Context, cfg Config) (*Setup, error) { + // Configure tunnel_id label inclusion from env (default true) + if getenv("NEWT_METRICS_INCLUDE_TUNNEL_ID", "true") == "true" { + includeTunnelIDVal.Store(true) + } else { + includeTunnelIDVal.Store(false) + } // Build resource with required attributes and only include optional ones when non-empty attrs := []attribute.KeyValue{ semconv.ServiceName(cfg.ServiceName), @@ -294,6 +300,7 @@ func parseResourceAttributes(s string) map[string]string { // Global site/region used to enrich metric labels. var siteIDVal atomic.Value var regionVal atomic.Value +var includeTunnelIDVal atomic.Value // bool; default true // UpdateSiteInfo updates the global site_id and region used for metric labels. // Thread-safe via atomic.Value: subsequent metric emissions will include @@ -336,6 +343,14 @@ func siteAttrs() []attribute.KeyValue { // SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager). func SiteLabelKVs() []attribute.KeyValue { return siteAttrs() } +// ShouldIncludeTunnelID returns whether tunnel_id labels should be emitted. +func ShouldIncludeTunnelID() bool { + if v, ok := includeTunnelIDVal.Load().(bool); ok { + return v + } + return true +} + func getenv(k, d string) string { if v := os.Getenv(k); v != "" { return v diff --git a/proxy/manager.go b/proxy/manager.go index cf15c66..3052f56 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -117,26 +117,29 @@ func (pm *ProxyManager) SetTunnelID(id string) { e := pm.tunnels[id] // include site labels if available site := telemetry.SiteLabelKVs() - e.attrInTCP = attribute.NewSet(append(site, - attribute.String("tunnel_id", id), + build := func(base []attribute.KeyValue) attribute.Set { + if telemetry.ShouldIncludeTunnelID() { + base = append([]attribute.KeyValue{attribute.String("tunnel_id", id)}, base...) + } + base = append(site, base...) + return attribute.NewSet(base...) + } + e.attrInTCP = build([]attribute.KeyValue{ attribute.String("direction", "ingress"), attribute.String("protocol", "tcp"), - )...) - e.attrOutTCP = attribute.NewSet(append(site, - attribute.String("tunnel_id", id), + }) + e.attrOutTCP = build([]attribute.KeyValue{ attribute.String("direction", "egress"), attribute.String("protocol", "tcp"), - )...) - e.attrInUDP = attribute.NewSet(append(site, - attribute.String("tunnel_id", id), + }) + e.attrInUDP = build([]attribute.KeyValue{ attribute.String("direction", "ingress"), attribute.String("protocol", "udp"), - )...) - e.attrOutUDP = attribute.NewSet(append(site, - attribute.String("tunnel_id", id), + }) + e.attrOutUDP = build([]attribute.KeyValue{ attribute.String("direction", "egress"), attribute.String("protocol", "udp"), - )...) + }) } // ClearTunnelID clears cached attribute sets for the current tunnel. diff --git a/scripts/smoke-metrics.sh b/scripts/smoke-metrics.sh index e0eac32..d2eb11f 100644 --- a/scripts/smoke-metrics.sh +++ b/scripts/smoke-metrics.sh @@ -26,6 +26,16 @@ probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_seconds\{.*site_i probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true +# Optional: verify absence/presence of tunnel_id based on EXPECT_TUNNEL_ID (default true) +EXPECT_TUNNEL_ID=${EXPECT_TUNNEL_ID:-true} +if [ "$EXPECT_TUNNEL_ID" = "false" ]; then + echo "[probe] ensure tunnel_id label is absent when NEWT_METRICS_INCLUDE_TUNNEL_ID=false" + ! curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[fail] tunnel_id present but EXPECT_TUNNEL_ID=false"; exit 1; } +else + echo "[probe] ensure tunnel_id label is present (default)" + curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[warn] tunnel_id not found (may be expected if no tunnel is active)"; } +fi + # WebSocket metrics (when OTLP/WS used) probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true