mirror of
https://github.com/fosrl/newt.git
synced 2026-03-26 20:46:41 +00:00
feat(metrics): NEWT_METRICS_INCLUDE_TUNNEL_ID toggle; conditionally drop tunnel_id across bytes/sessions/proxy/reconnect; docs and smoke test updated; examples/prometheus.yml with relabels; docker-compose defaults avoid double-scrape
This commit is contained in:
@@ -1,25 +1,17 @@
|
|||||||
services:
|
services:
|
||||||
collector:
|
# Recommended Variant A: Direct Prometheus scrape of Newt (/metrics)
|
||||||
image: otel/opentelemetry-collector-contrib:0.136.0
|
# Optional: You may add the Collector service and enable OTLP export, but do NOT
|
||||||
command: ["--config=/etc/otelcol/config.yaml"]
|
# scrape both Newt and the Collector for the same process.
|
||||||
volumes:
|
|
||||||
- ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
|
|
||||||
ports:
|
|
||||||
- "4317:4317" # OTLP gRPC in
|
|
||||||
- "8889:8889" # Prometheus scrape out
|
|
||||||
|
|
||||||
newt:
|
newt:
|
||||||
build: .
|
build: .
|
||||||
image: newt:dev
|
image: newt:dev
|
||||||
env_file:
|
env_file:
|
||||||
- .env
|
- .env
|
||||||
environment:
|
environment:
|
||||||
OTEL_SERVICE_NAME: newt
|
OTEL_SERVICE_NAME: newt
|
||||||
NEWT_METRICS_PROMETHEUS_ENABLED: "true"
|
NEWT_METRICS_PROMETHEUS_ENABLED: "true"
|
||||||
NEWT_METRICS_OTLP_ENABLED: "true"
|
NEWT_METRICS_OTLP_ENABLED: "false" # avoid double-scrape by default
|
||||||
OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317"
|
|
||||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
|
||||||
OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative"
|
|
||||||
NEWT_ADMIN_ADDR: ":2112"
|
NEWT_ADMIN_ADDR: ":2112"
|
||||||
# Base NEWT configuration
|
# Base NEWT configuration
|
||||||
PANGOLIN_ENDPOINT: ${PANGOLIN_ENDPOINT}
|
PANGOLIN_ENDPOINT: ${PANGOLIN_ENDPOINT}
|
||||||
@@ -28,8 +20,16 @@ services:
|
|||||||
LOG_LEVEL: "DEBUG"
|
LOG_LEVEL: "DEBUG"
|
||||||
ports:
|
ports:
|
||||||
- "2112:2112"
|
- "2112:2112"
|
||||||
depends_on:
|
|
||||||
- collector
|
# Optional Variant B: Enable the Collector and switch Prometheus scrape to it.
|
||||||
|
# collector:
|
||||||
|
# image: otel/opentelemetry-collector-contrib:0.136.0
|
||||||
|
# command: ["--config=/etc/otelcol/config.yaml"]
|
||||||
|
# volumes:
|
||||||
|
# - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
|
||||||
|
# ports:
|
||||||
|
# - "4317:4317" # OTLP gRPC in
|
||||||
|
# - "8889:8889" # Prometheus scrape out
|
||||||
|
|
||||||
prometheus:
|
prometheus:
|
||||||
image: prom/prometheus:v3.6.0
|
image: prom/prometheus:v3.6.0
|
||||||
|
|||||||
@@ -49,8 +49,8 @@ Metric catalog (initial)
|
|||||||
|
|
||||||
Conventions
|
Conventions
|
||||||
|
|
||||||
- Durations in seconds, names end with _seconds
|
- Durations in seconds (unit: s), names end with _seconds
|
||||||
- Sizes in bytes, names end with _bytes
|
- Sizes in bytes (unit: By), names end with _bytes
|
||||||
- Counters end with _total
|
- Counters end with _total
|
||||||
- Labels must be low-cardinality and stable
|
- Labels must be low-cardinality and stable
|
||||||
|
|
||||||
@@ -163,6 +163,7 @@ Compatibility notes
|
|||||||
|
|
||||||
- Gauges do not use the _total suffix (e.g., newt_tunnel_sessions).
|
- Gauges do not use the _total suffix (e.g., newt_tunnel_sessions).
|
||||||
- site_id is emitted as both resource attribute and metric label on all newt_* series; region is included as a metric label only when set. tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels.
|
- site_id is emitted as both resource attribute and metric label on all newt_* series; region is included as a metric label only when set. tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels.
|
||||||
|
- NEWT_METRICS_INCLUDE_TUNNEL_ID (default: true) toggles whether tunnel_id is included as a label on bytes/sessions/proxy/reconnect metrics. Disable in high-cardinality environments.
|
||||||
- Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both.
|
- Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both.
|
||||||
- Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write.
|
- Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write.
|
||||||
- No free text in labels; use only the enumerated constants for reason, protocol (tcp|udp), and transport (e.g., websocket|wireguard).
|
- No free text in labels; use only the enumerated constants for reason, protocol (tcp|udp), and transport (e.g., websocket|wireguard).
|
||||||
|
|||||||
@@ -2,9 +2,20 @@ global:
|
|||||||
scrape_interval: 15s
|
scrape_interval: 15s
|
||||||
|
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
- job_name: newt
|
- job_name: 'newt'
|
||||||
|
scrape_interval: 15s
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ["newt:2112"]
|
- targets: ['newt:2112'] # /metrics
|
||||||
- job_name: otel-collector
|
relabel_configs:
|
||||||
static_configs:
|
# optional: tunnel_id droppen
|
||||||
- targets: ["collector:8889"]
|
- action: labeldrop
|
||||||
|
regex: 'tunnel_id'
|
||||||
|
# optional: nur bestimmte sites zulassen
|
||||||
|
- action: keep
|
||||||
|
source_labels: [site_id]
|
||||||
|
regex: '(site-a|site-b)'
|
||||||
|
|
||||||
|
# WARNING: Do not enable this together with the 'newt' job above or you will double-count.
|
||||||
|
# - job_name: 'otel-collector'
|
||||||
|
# static_configs:
|
||||||
|
# - targets: ['collector:8889']
|
||||||
|
|||||||
@@ -248,10 +248,13 @@ func IncSiteRegistration(ctx context.Context, result string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) {
|
func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) {
|
||||||
mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(
|
attrs := []attribute.KeyValue{
|
||||||
attribute.String("tunnel_id", tunnelID),
|
|
||||||
attribute.String("direction", direction),
|
attribute.String("direction", direction),
|
||||||
)...))
|
}
|
||||||
|
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||||
|
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||||
|
}
|
||||||
|
mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||||
}
|
}
|
||||||
|
|
||||||
// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations.
|
// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations.
|
||||||
@@ -316,18 +319,24 @@ func IncCertRotation(ctx context.Context, result string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) {
|
func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) {
|
||||||
mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(
|
attrs := []attribute.KeyValue{
|
||||||
attribute.String("tunnel_id", tunnelID),
|
|
||||||
attribute.String("transport", transport),
|
attribute.String("transport", transport),
|
||||||
)...))
|
}
|
||||||
|
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||||
|
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||||
|
}
|
||||||
|
mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||||
}
|
}
|
||||||
|
|
||||||
func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) {
|
func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) {
|
||||||
mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
attrs := []attribute.KeyValue{
|
||||||
attribute.String("tunnel_id", tunnelID),
|
|
||||||
attribute.String("initiator", initiator),
|
attribute.String("initiator", initiator),
|
||||||
attribute.String("reason", reason),
|
attribute.String("reason", reason),
|
||||||
)...))
|
}
|
||||||
|
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||||
|
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||||
|
}
|
||||||
|
mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||||
}
|
}
|
||||||
|
|
||||||
func IncConnAttempt(ctx context.Context, transport, result string) {
|
func IncConnAttempt(ctx context.Context, transport, result string) {
|
||||||
|
|||||||
@@ -112,6 +112,12 @@ type Setup struct {
|
|||||||
// installs recommended histogram views for *_latency_seconds, and returns a Setup with
|
// installs recommended histogram views for *_latency_seconds, and returns a Setup with
|
||||||
// a Shutdown method to flush exporters.
|
// a Shutdown method to flush exporters.
|
||||||
func Init(ctx context.Context, cfg Config) (*Setup, error) {
|
func Init(ctx context.Context, cfg Config) (*Setup, error) {
|
||||||
|
// Configure tunnel_id label inclusion from env (default true)
|
||||||
|
if getenv("NEWT_METRICS_INCLUDE_TUNNEL_ID", "true") == "true" {
|
||||||
|
includeTunnelIDVal.Store(true)
|
||||||
|
} else {
|
||||||
|
includeTunnelIDVal.Store(false)
|
||||||
|
}
|
||||||
// Build resource with required attributes and only include optional ones when non-empty
|
// Build resource with required attributes and only include optional ones when non-empty
|
||||||
attrs := []attribute.KeyValue{
|
attrs := []attribute.KeyValue{
|
||||||
semconv.ServiceName(cfg.ServiceName),
|
semconv.ServiceName(cfg.ServiceName),
|
||||||
@@ -294,6 +300,7 @@ func parseResourceAttributes(s string) map[string]string {
|
|||||||
// Global site/region used to enrich metric labels.
|
// Global site/region used to enrich metric labels.
|
||||||
var siteIDVal atomic.Value
|
var siteIDVal atomic.Value
|
||||||
var regionVal atomic.Value
|
var regionVal atomic.Value
|
||||||
|
var includeTunnelIDVal atomic.Value // bool; default true
|
||||||
|
|
||||||
// UpdateSiteInfo updates the global site_id and region used for metric labels.
|
// UpdateSiteInfo updates the global site_id and region used for metric labels.
|
||||||
// Thread-safe via atomic.Value: subsequent metric emissions will include
|
// Thread-safe via atomic.Value: subsequent metric emissions will include
|
||||||
@@ -336,6 +343,14 @@ func siteAttrs() []attribute.KeyValue {
|
|||||||
// SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager).
|
// SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager).
|
||||||
func SiteLabelKVs() []attribute.KeyValue { return siteAttrs() }
|
func SiteLabelKVs() []attribute.KeyValue { return siteAttrs() }
|
||||||
|
|
||||||
|
// ShouldIncludeTunnelID returns whether tunnel_id labels should be emitted.
|
||||||
|
func ShouldIncludeTunnelID() bool {
|
||||||
|
if v, ok := includeTunnelIDVal.Load().(bool); ok {
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
func getenv(k, d string) string {
|
func getenv(k, d string) string {
|
||||||
if v := os.Getenv(k); v != "" {
|
if v := os.Getenv(k); v != "" {
|
||||||
return v
|
return v
|
||||||
|
|||||||
@@ -117,26 +117,29 @@ func (pm *ProxyManager) SetTunnelID(id string) {
|
|||||||
e := pm.tunnels[id]
|
e := pm.tunnels[id]
|
||||||
// include site labels if available
|
// include site labels if available
|
||||||
site := telemetry.SiteLabelKVs()
|
site := telemetry.SiteLabelKVs()
|
||||||
e.attrInTCP = attribute.NewSet(append(site,
|
build := func(base []attribute.KeyValue) attribute.Set {
|
||||||
attribute.String("tunnel_id", id),
|
if telemetry.ShouldIncludeTunnelID() {
|
||||||
|
base = append([]attribute.KeyValue{attribute.String("tunnel_id", id)}, base...)
|
||||||
|
}
|
||||||
|
base = append(site, base...)
|
||||||
|
return attribute.NewSet(base...)
|
||||||
|
}
|
||||||
|
e.attrInTCP = build([]attribute.KeyValue{
|
||||||
attribute.String("direction", "ingress"),
|
attribute.String("direction", "ingress"),
|
||||||
attribute.String("protocol", "tcp"),
|
attribute.String("protocol", "tcp"),
|
||||||
)...)
|
})
|
||||||
e.attrOutTCP = attribute.NewSet(append(site,
|
e.attrOutTCP = build([]attribute.KeyValue{
|
||||||
attribute.String("tunnel_id", id),
|
|
||||||
attribute.String("direction", "egress"),
|
attribute.String("direction", "egress"),
|
||||||
attribute.String("protocol", "tcp"),
|
attribute.String("protocol", "tcp"),
|
||||||
)...)
|
})
|
||||||
e.attrInUDP = attribute.NewSet(append(site,
|
e.attrInUDP = build([]attribute.KeyValue{
|
||||||
attribute.String("tunnel_id", id),
|
|
||||||
attribute.String("direction", "ingress"),
|
attribute.String("direction", "ingress"),
|
||||||
attribute.String("protocol", "udp"),
|
attribute.String("protocol", "udp"),
|
||||||
)...)
|
})
|
||||||
e.attrOutUDP = attribute.NewSet(append(site,
|
e.attrOutUDP = build([]attribute.KeyValue{
|
||||||
attribute.String("tunnel_id", id),
|
|
||||||
attribute.String("direction", "egress"),
|
attribute.String("direction", "egress"),
|
||||||
attribute.String("protocol", "udp"),
|
attribute.String("protocol", "udp"),
|
||||||
)...)
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ClearTunnelID clears cached attribute sets for the current tunnel.
|
// ClearTunnelID clears cached attribute sets for the current tunnel.
|
||||||
|
|||||||
@@ -26,6 +26,16 @@ probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_seconds\{.*site_i
|
|||||||
probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true
|
probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true
|
||||||
probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true
|
probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true
|
||||||
|
|
||||||
|
# Optional: verify absence/presence of tunnel_id based on EXPECT_TUNNEL_ID (default true)
|
||||||
|
EXPECT_TUNNEL_ID=${EXPECT_TUNNEL_ID:-true}
|
||||||
|
if [ "$EXPECT_TUNNEL_ID" = "false" ]; then
|
||||||
|
echo "[probe] ensure tunnel_id label is absent when NEWT_METRICS_INCLUDE_TUNNEL_ID=false"
|
||||||
|
! curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[fail] tunnel_id present but EXPECT_TUNNEL_ID=false"; exit 1; }
|
||||||
|
else
|
||||||
|
echo "[probe] ensure tunnel_id label is present (default)"
|
||||||
|
curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[warn] tunnel_id not found (may be expected if no tunnel is active)"; }
|
||||||
|
fi
|
||||||
|
|
||||||
# WebSocket metrics (when OTLP/WS used)
|
# WebSocket metrics (when OTLP/WS used)
|
||||||
probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true
|
probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true
|
||||||
probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true
|
probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true
|
||||||
|
|||||||
Reference in New Issue
Block a user