feat(metrics): NEWT_METRICS_INCLUDE_TUNNEL_ID toggle; conditionally drop tunnel_id across bytes/sessions/proxy/reconnect; docs and smoke test updated; examples/prometheus.yml with relabels; docker-compose defaults avoid double-scrape

This commit is contained in:
Marc Schäfer
2025-10-08 00:53:40 +02:00
parent f28d90595b
commit b20f7a02b2
7 changed files with 93 additions and 44 deletions

View File

@@ -1,25 +1,17 @@
services:
collector:
image: otel/opentelemetry-collector-contrib:0.136.0
command: ["--config=/etc/otelcol/config.yaml"]
volumes:
- ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
ports:
- "4317:4317" # OTLP gRPC in
- "8889:8889" # Prometheus scrape out
# Recommended Variant A: Direct Prometheus scrape of Newt (/metrics)
# Optional: You may add the Collector service and enable OTLP export, but do NOT
# scrape both Newt and the Collector for the same process.
newt:
build: .
image: newt:dev
env_file:
- .env
env_file:
- .env
environment:
OTEL_SERVICE_NAME: newt
NEWT_METRICS_PROMETHEUS_ENABLED: "true"
NEWT_METRICS_OTLP_ENABLED: "true"
OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317"
OTEL_EXPORTER_OTLP_INSECURE: "true"
OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative"
NEWT_METRICS_OTLP_ENABLED: "false" # avoid double-scrape by default
NEWT_ADMIN_ADDR: ":2112"
# Base NEWT configuration
PANGOLIN_ENDPOINT: ${PANGOLIN_ENDPOINT}
@@ -28,8 +20,16 @@ services:
LOG_LEVEL: "DEBUG"
ports:
- "2112:2112"
depends_on:
- collector
# Optional Variant B: Enable the Collector and switch Prometheus scrape to it.
# collector:
# image: otel/opentelemetry-collector-contrib:0.136.0
# command: ["--config=/etc/otelcol/config.yaml"]
# volumes:
# - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
# ports:
# - "4317:4317" # OTLP gRPC in
# - "8889:8889" # Prometheus scrape out
prometheus:
image: prom/prometheus:v3.6.0

View File

@@ -49,8 +49,8 @@ Metric catalog (initial)
Conventions
- Durations in seconds, names end with _seconds
- Sizes in bytes, names end with _bytes
- Durations in seconds (unit: s), names end with _seconds
- Sizes in bytes (unit: By), names end with _bytes
- Counters end with _total
- Labels must be low-cardinality and stable
@@ -163,6 +163,7 @@ Compatibility notes
- Gauges do not use the _total suffix (e.g., newt_tunnel_sessions).
- site_id is emitted as both resource attribute and metric label on all newt_* series; region is included as a metric label only when set. tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels.
- NEWT_METRICS_INCLUDE_TUNNEL_ID (default: true) toggles whether tunnel_id is included as a label on bytes/sessions/proxy/reconnect metrics. Disable in high-cardinality environments.
- Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both.
- Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write.
- No free text in labels; use only the enumerated constants for reason, protocol (tcp|udp), and transport (e.g., websocket|wireguard).

View File

@@ -2,9 +2,20 @@ global:
scrape_interval: 15s
scrape_configs:
- job_name: newt
- job_name: 'newt'
scrape_interval: 15s
static_configs:
- targets: ["newt:2112"]
- job_name: otel-collector
static_configs:
- targets: ["collector:8889"]
- targets: ['newt:2112'] # /metrics
relabel_configs:
# optional: tunnel_id droppen
- action: labeldrop
regex: 'tunnel_id'
# optional: nur bestimmte sites zulassen
- action: keep
source_labels: [site_id]
regex: '(site-a|site-b)'
# WARNING: Do not enable this together with the 'newt' job above or you will double-count.
# - job_name: 'otel-collector'
# static_configs:
# - targets: ['collector:8889']

View File

@@ -248,10 +248,13 @@ func IncSiteRegistration(ctx context.Context, result string) {
}
func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) {
mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(
attribute.String("tunnel_id", tunnelID),
attrs := []attribute.KeyValue{
attribute.String("direction", direction),
)...))
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(attrs...)...))
}
// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations.
@@ -316,18 +319,24 @@ func IncCertRotation(ctx context.Context, result string) {
}
func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) {
mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(
attribute.String("tunnel_id", tunnelID),
attrs := []attribute.KeyValue{
attribute.String("transport", transport),
)...))
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
}
func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) {
mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
attribute.String("tunnel_id", tunnelID),
attrs := []attribute.KeyValue{
attribute.String("initiator", initiator),
attribute.String("reason", reason),
)...))
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}
func IncConnAttempt(ctx context.Context, transport, result string) {

View File

@@ -112,6 +112,12 @@ type Setup struct {
// installs recommended histogram views for *_latency_seconds, and returns a Setup with
// a Shutdown method to flush exporters.
func Init(ctx context.Context, cfg Config) (*Setup, error) {
// Configure tunnel_id label inclusion from env (default true)
if getenv("NEWT_METRICS_INCLUDE_TUNNEL_ID", "true") == "true" {
includeTunnelIDVal.Store(true)
} else {
includeTunnelIDVal.Store(false)
}
// Build resource with required attributes and only include optional ones when non-empty
attrs := []attribute.KeyValue{
semconv.ServiceName(cfg.ServiceName),
@@ -294,6 +300,7 @@ func parseResourceAttributes(s string) map[string]string {
// Global site/region used to enrich metric labels.
var siteIDVal atomic.Value
var regionVal atomic.Value
var includeTunnelIDVal atomic.Value // bool; default true
// UpdateSiteInfo updates the global site_id and region used for metric labels.
// Thread-safe via atomic.Value: subsequent metric emissions will include
@@ -336,6 +343,14 @@ func siteAttrs() []attribute.KeyValue {
// SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager).
func SiteLabelKVs() []attribute.KeyValue { return siteAttrs() }
// ShouldIncludeTunnelID returns whether tunnel_id labels should be emitted.
func ShouldIncludeTunnelID() bool {
if v, ok := includeTunnelIDVal.Load().(bool); ok {
return v
}
return true
}
func getenv(k, d string) string {
if v := os.Getenv(k); v != "" {
return v

View File

@@ -117,26 +117,29 @@ func (pm *ProxyManager) SetTunnelID(id string) {
e := pm.tunnels[id]
// include site labels if available
site := telemetry.SiteLabelKVs()
e.attrInTCP = attribute.NewSet(append(site,
attribute.String("tunnel_id", id),
build := func(base []attribute.KeyValue) attribute.Set {
if telemetry.ShouldIncludeTunnelID() {
base = append([]attribute.KeyValue{attribute.String("tunnel_id", id)}, base...)
}
base = append(site, base...)
return attribute.NewSet(base...)
}
e.attrInTCP = build([]attribute.KeyValue{
attribute.String("direction", "ingress"),
attribute.String("protocol", "tcp"),
)...)
e.attrOutTCP = attribute.NewSet(append(site,
attribute.String("tunnel_id", id),
})
e.attrOutTCP = build([]attribute.KeyValue{
attribute.String("direction", "egress"),
attribute.String("protocol", "tcp"),
)...)
e.attrInUDP = attribute.NewSet(append(site,
attribute.String("tunnel_id", id),
})
e.attrInUDP = build([]attribute.KeyValue{
attribute.String("direction", "ingress"),
attribute.String("protocol", "udp"),
)...)
e.attrOutUDP = attribute.NewSet(append(site,
attribute.String("tunnel_id", id),
})
e.attrOutUDP = build([]attribute.KeyValue{
attribute.String("direction", "egress"),
attribute.String("protocol", "udp"),
)...)
})
}
// ClearTunnelID clears cached attribute sets for the current tunnel.

View File

@@ -26,6 +26,16 @@ probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_seconds\{.*site_i
probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true
probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true
# Optional: verify absence/presence of tunnel_id based on EXPECT_TUNNEL_ID (default true)
EXPECT_TUNNEL_ID=${EXPECT_TUNNEL_ID:-true}
if [ "$EXPECT_TUNNEL_ID" = "false" ]; then
echo "[probe] ensure tunnel_id label is absent when NEWT_METRICS_INCLUDE_TUNNEL_ID=false"
! curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[fail] tunnel_id present but EXPECT_TUNNEL_ID=false"; exit 1; }
else
echo "[probe] ensure tunnel_id label is present (default)"
curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[warn] tunnel_id not found (may be expected if no tunnel is active)"; }
fi
# WebSocket metrics (when OTLP/WS used)
probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true
probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true