mirror of
https://github.com/fosrl/newt.git
synced 2026-03-26 12:36:45 +00:00
feat(metrics): NEWT_METRICS_INCLUDE_TUNNEL_ID toggle; conditionally drop tunnel_id across bytes/sessions/proxy/reconnect; docs and smoke test updated; examples/prometheus.yml with relabels; docker-compose defaults avoid double-scrape
This commit is contained in:
@@ -1,25 +1,17 @@
|
||||
services:
|
||||
collector:
|
||||
image: otel/opentelemetry-collector-contrib:0.136.0
|
||||
command: ["--config=/etc/otelcol/config.yaml"]
|
||||
volumes:
|
||||
- ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
|
||||
ports:
|
||||
- "4317:4317" # OTLP gRPC in
|
||||
- "8889:8889" # Prometheus scrape out
|
||||
# Recommended Variant A: Direct Prometheus scrape of Newt (/metrics)
|
||||
# Optional: You may add the Collector service and enable OTLP export, but do NOT
|
||||
# scrape both Newt and the Collector for the same process.
|
||||
|
||||
newt:
|
||||
build: .
|
||||
image: newt:dev
|
||||
env_file:
|
||||
- .env
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
OTEL_SERVICE_NAME: newt
|
||||
NEWT_METRICS_PROMETHEUS_ENABLED: "true"
|
||||
NEWT_METRICS_OTLP_ENABLED: "true"
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317"
|
||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||
OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative"
|
||||
NEWT_METRICS_OTLP_ENABLED: "false" # avoid double-scrape by default
|
||||
NEWT_ADMIN_ADDR: ":2112"
|
||||
# Base NEWT configuration
|
||||
PANGOLIN_ENDPOINT: ${PANGOLIN_ENDPOINT}
|
||||
@@ -28,8 +20,16 @@ services:
|
||||
LOG_LEVEL: "DEBUG"
|
||||
ports:
|
||||
- "2112:2112"
|
||||
depends_on:
|
||||
- collector
|
||||
|
||||
# Optional Variant B: Enable the Collector and switch Prometheus scrape to it.
|
||||
# collector:
|
||||
# image: otel/opentelemetry-collector-contrib:0.136.0
|
||||
# command: ["--config=/etc/otelcol/config.yaml"]
|
||||
# volumes:
|
||||
# - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
|
||||
# ports:
|
||||
# - "4317:4317" # OTLP gRPC in
|
||||
# - "8889:8889" # Prometheus scrape out
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:v3.6.0
|
||||
|
||||
@@ -49,8 +49,8 @@ Metric catalog (initial)
|
||||
|
||||
Conventions
|
||||
|
||||
- Durations in seconds, names end with _seconds
|
||||
- Sizes in bytes, names end with _bytes
|
||||
- Durations in seconds (unit: s), names end with _seconds
|
||||
- Sizes in bytes (unit: By), names end with _bytes
|
||||
- Counters end with _total
|
||||
- Labels must be low-cardinality and stable
|
||||
|
||||
@@ -163,6 +163,7 @@ Compatibility notes
|
||||
|
||||
- Gauges do not use the _total suffix (e.g., newt_tunnel_sessions).
|
||||
- site_id is emitted as both resource attribute and metric label on all newt_* series; region is included as a metric label only when set. tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels.
|
||||
- NEWT_METRICS_INCLUDE_TUNNEL_ID (default: true) toggles whether tunnel_id is included as a label on bytes/sessions/proxy/reconnect metrics. Disable in high-cardinality environments.
|
||||
- Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both.
|
||||
- Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write.
|
||||
- No free text in labels; use only the enumerated constants for reason, protocol (tcp|udp), and transport (e.g., websocket|wireguard).
|
||||
|
||||
@@ -2,9 +2,20 @@ global:
|
||||
scrape_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: newt
|
||||
- job_name: 'newt'
|
||||
scrape_interval: 15s
|
||||
static_configs:
|
||||
- targets: ["newt:2112"]
|
||||
- job_name: otel-collector
|
||||
static_configs:
|
||||
- targets: ["collector:8889"]
|
||||
- targets: ['newt:2112'] # /metrics
|
||||
relabel_configs:
|
||||
# optional: tunnel_id droppen
|
||||
- action: labeldrop
|
||||
regex: 'tunnel_id'
|
||||
# optional: nur bestimmte sites zulassen
|
||||
- action: keep
|
||||
source_labels: [site_id]
|
||||
regex: '(site-a|site-b)'
|
||||
|
||||
# WARNING: Do not enable this together with the 'newt' job above or you will double-count.
|
||||
# - job_name: 'otel-collector'
|
||||
# static_configs:
|
||||
# - targets: ['collector:8889']
|
||||
|
||||
@@ -248,10 +248,13 @@ func IncSiteRegistration(ctx context.Context, result string) {
|
||||
}
|
||||
|
||||
func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) {
|
||||
mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("tunnel_id", tunnelID),
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("direction", direction),
|
||||
)...))
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations.
|
||||
@@ -316,18 +319,24 @@ func IncCertRotation(ctx context.Context, result string) {
|
||||
}
|
||||
|
||||
func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) {
|
||||
mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("tunnel_id", tunnelID),
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("transport", transport),
|
||||
)...))
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) {
|
||||
mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("tunnel_id", tunnelID),
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("initiator", initiator),
|
||||
attribute.String("reason", reason),
|
||||
)...))
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
func IncConnAttempt(ctx context.Context, transport, result string) {
|
||||
|
||||
@@ -112,6 +112,12 @@ type Setup struct {
|
||||
// installs recommended histogram views for *_latency_seconds, and returns a Setup with
|
||||
// a Shutdown method to flush exporters.
|
||||
func Init(ctx context.Context, cfg Config) (*Setup, error) {
|
||||
// Configure tunnel_id label inclusion from env (default true)
|
||||
if getenv("NEWT_METRICS_INCLUDE_TUNNEL_ID", "true") == "true" {
|
||||
includeTunnelIDVal.Store(true)
|
||||
} else {
|
||||
includeTunnelIDVal.Store(false)
|
||||
}
|
||||
// Build resource with required attributes and only include optional ones when non-empty
|
||||
attrs := []attribute.KeyValue{
|
||||
semconv.ServiceName(cfg.ServiceName),
|
||||
@@ -294,6 +300,7 @@ func parseResourceAttributes(s string) map[string]string {
|
||||
// Global site/region used to enrich metric labels.
|
||||
var siteIDVal atomic.Value
|
||||
var regionVal atomic.Value
|
||||
var includeTunnelIDVal atomic.Value // bool; default true
|
||||
|
||||
// UpdateSiteInfo updates the global site_id and region used for metric labels.
|
||||
// Thread-safe via atomic.Value: subsequent metric emissions will include
|
||||
@@ -336,6 +343,14 @@ func siteAttrs() []attribute.KeyValue {
|
||||
// SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager).
|
||||
func SiteLabelKVs() []attribute.KeyValue { return siteAttrs() }
|
||||
|
||||
// ShouldIncludeTunnelID returns whether tunnel_id labels should be emitted.
|
||||
func ShouldIncludeTunnelID() bool {
|
||||
if v, ok := includeTunnelIDVal.Load().(bool); ok {
|
||||
return v
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func getenv(k, d string) string {
|
||||
if v := os.Getenv(k); v != "" {
|
||||
return v
|
||||
|
||||
@@ -117,26 +117,29 @@ func (pm *ProxyManager) SetTunnelID(id string) {
|
||||
e := pm.tunnels[id]
|
||||
// include site labels if available
|
||||
site := telemetry.SiteLabelKVs()
|
||||
e.attrInTCP = attribute.NewSet(append(site,
|
||||
attribute.String("tunnel_id", id),
|
||||
build := func(base []attribute.KeyValue) attribute.Set {
|
||||
if telemetry.ShouldIncludeTunnelID() {
|
||||
base = append([]attribute.KeyValue{attribute.String("tunnel_id", id)}, base...)
|
||||
}
|
||||
base = append(site, base...)
|
||||
return attribute.NewSet(base...)
|
||||
}
|
||||
e.attrInTCP = build([]attribute.KeyValue{
|
||||
attribute.String("direction", "ingress"),
|
||||
attribute.String("protocol", "tcp"),
|
||||
)...)
|
||||
e.attrOutTCP = attribute.NewSet(append(site,
|
||||
attribute.String("tunnel_id", id),
|
||||
})
|
||||
e.attrOutTCP = build([]attribute.KeyValue{
|
||||
attribute.String("direction", "egress"),
|
||||
attribute.String("protocol", "tcp"),
|
||||
)...)
|
||||
e.attrInUDP = attribute.NewSet(append(site,
|
||||
attribute.String("tunnel_id", id),
|
||||
})
|
||||
e.attrInUDP = build([]attribute.KeyValue{
|
||||
attribute.String("direction", "ingress"),
|
||||
attribute.String("protocol", "udp"),
|
||||
)...)
|
||||
e.attrOutUDP = attribute.NewSet(append(site,
|
||||
attribute.String("tunnel_id", id),
|
||||
})
|
||||
e.attrOutUDP = build([]attribute.KeyValue{
|
||||
attribute.String("direction", "egress"),
|
||||
attribute.String("protocol", "udp"),
|
||||
)...)
|
||||
})
|
||||
}
|
||||
|
||||
// ClearTunnelID clears cached attribute sets for the current tunnel.
|
||||
|
||||
@@ -26,6 +26,16 @@ probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_seconds\{.*site_i
|
||||
probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true
|
||||
probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true
|
||||
|
||||
# Optional: verify absence/presence of tunnel_id based on EXPECT_TUNNEL_ID (default true)
|
||||
EXPECT_TUNNEL_ID=${EXPECT_TUNNEL_ID:-true}
|
||||
if [ "$EXPECT_TUNNEL_ID" = "false" ]; then
|
||||
echo "[probe] ensure tunnel_id label is absent when NEWT_METRICS_INCLUDE_TUNNEL_ID=false"
|
||||
! curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[fail] tunnel_id present but EXPECT_TUNNEL_ID=false"; exit 1; }
|
||||
else
|
||||
echo "[probe] ensure tunnel_id label is present (default)"
|
||||
curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[warn] tunnel_id not found (may be expected if no tunnel is active)"; }
|
||||
fi
|
||||
|
||||
# WebSocket metrics (when OTLP/WS used)
|
||||
probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true
|
||||
probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true
|
||||
|
||||
Reference in New Issue
Block a user