docs+examples: document direction=ingress|egress, initiator and error_type enums; add cardinality relabel tips; provide Collector variants; add scripts/smoke-metrics.sh

This commit is contained in:
Marc Schäfer
2025-10-08 00:46:01 +02:00
parent 9ace45e71f
commit 4a90e36a44
3 changed files with 92 additions and 15 deletions

View File

@@ -40,12 +40,12 @@ Metric catalog (initial)
- newt_site_registrations_total (counter) labels: result, site_id[, region]
- newt_site_online (observable gauge) labels: site_id (0/1)
- newt_site_last_heartbeat_seconds (observable gauge) labels: site_id
- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id, transport (transport e.g. wireguard)
- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (in|out)
- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id[, transport] (transport is optional; set only when known)
- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (ingress|egress)
- newt_tunnel_latency_seconds (histogram) labels: site_id, tunnel_id, transport (e.g., wireguard)
- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, reason
- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, initiator (client|server), reason
- newt_connection_attempts_total (counter) labels: site_id, transport, result
- newt_connection_errors_total (counter) labels: site_id, transport, error_type
- newt_connection_errors_total (counter) labels: site_id, transport, error_type (dial_timeout|tls_handshake|auth_failed|io_error)
Conventions
@@ -171,6 +171,24 @@ Further reading
- See docs/METRICS_RECOMMENDATIONS.md for roadmap, label guidance (transport vs protocol), and example alerts.
Cardinality tips
- tunnel_id can grow in larger fleets. Use relabeling to drop or retain a subset, for example:
```
# Example metric_relabel_configs entries (pick one of the two rules).
#
# Option 1: remove tunnel_id only from newt_tunnel_bytes_total to reduce series.
# Setting a label to the empty string removes it. Do NOT use "action: keep" on
# __name__ for this: keep discards every metric that does not match, and a
# following "labeldrop" would strip tunnel_id from all metrics, not just this one.
# NOTE: series that differ only by tunnel_id collapse into one label set after
# this rule; verify the result is acceptable for your setup.
- source_labels: [__name__]
  regex: newt_tunnel_bytes_total
  target_label: tunnel_id
  replacement: ""
  action: replace
# Option 2: drop only the series that belong to high-churn tunnels.
# Use a pattern that matches just those tunnel IDs. A catch-all such as ".*"
# also matches an empty tunnel_id and would drop every scraped series.
- source_labels: [tunnel_id]
  regex: "tmp-.+"  # example pattern; adjust to your tunnel naming
  action: drop
```
Troubleshooting
- curl :2112/metrics — ensure the endpoint is reachable and includes newt_* metrics

View File

@@ -1,3 +1,20 @@
# Variant A: Direct scrape of Newt (/metrics) via Prometheus (no Collector needed)
# Note: Newt already exposes labels like site_id, protocol, direction. Do not promote
# resource attributes into labels when scraping Newt directly.
#
# Example Prometheus scrape config:
# global:
# scrape_interval: 15s
# scrape_configs:
# - job_name: newt
# static_configs:
# - targets: ["newt:2112"]
#
# Variant B: Use OTEL Collector (Newt -> OTLP -> Collector -> Prometheus)
# This pipeline scrapes metrics from the Collector's Prometheus exporter.
# Labels are already on datapoints; promotion from resource is OPTIONAL and typically NOT required.
# If you enable transform/promote below, ensure you do not duplicate labels.
receivers:
otlp:
protocols:
@@ -13,20 +30,20 @@ processors:
detectors: [env, system]
timeout: 5s
batch: {}
transform/promote:
# optional, so that missing keys do not abort the pipeline:
error_mode: ignore
metric_statements:
- context: datapoint
statements:
- set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil
- set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil
# OPTIONAL: Only enable if you need to promote resource attributes to labels.
# WARNING: Newt already provides site_id as a label; avoid double-promotion.
# transform/promote:
# error_mode: ignore
# metric_statements:
# - context: datapoint
# statements:
# - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil
# - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil
exporters:
prometheus:
endpoint: ":8889"
send_timestamps: true
# If you do not have a remote-write target, comment it out:
# prometheusremotewrite:
# endpoint: http://mimir:9009/api/v1/push
debug:
@@ -36,8 +53,8 @@ service:
pipelines:
metrics:
receivers: [otlp]
processors: [memory_limiter, resourcedetection, transform/promote, batch]
exporters: [prometheus] # , prometheusremotewrite
processors: [memory_limiter, resourcedetection, batch] # add transform/promote if you really need it
exporters: [prometheus]
traces:
receivers: [otlp]
processors: [memory_limiter, resourcedetection, batch]

42
scripts/smoke-metrics.sh Normal file
View File

@@ -0,0 +1,42 @@
#!/usr/bin/env bash
# Smoke-test helper: greps the Newt /metrics endpoint for expected metric families.

# Abort on errors, on use of unset variables, and on failures inside pipelines.
set -o errexit -o nounset -o pipefail

# Target endpoint; override NEWTHOST / NEWTPORT from the environment if needed.
: "${NEWTHOST:=localhost}"
: "${NEWTPORT:=2112}"
METRICS_URL="http://${NEWTHOST}:${NEWTPORT}/metrics"
# probe NAME PATTERN
#
# Fetches ${METRICS_URL} and prints every line matching the extended regex
# PATTERN. NAME is a human-readable label used only in the log output.
#
# Returns 0 when at least one line matches; returns 1 (after printing a
# "[warn]" line) when the endpoint cannot be fetched or nothing matches.
probe() {
  local name=$1
  local pattern=$2
  local body
  echo "[probe] ${name}"
  # Fetch first so that an unreachable endpoint (or an HTTP error, via -f) is
  # reported as such instead of being mistaken for a missing metric.
  if ! body=$(curl -sf "${METRICS_URL}"); then
    echo "[warn] ${name}: could not fetch ${METRICS_URL}"
    return 1
  fi
  grep -E "${pattern}" <<<"${body}" || {
    echo "[warn] ${name} not found"
    return 1
  }
}
# Table of checks as flat (label, extended-regex) pairs; each pair is passed to
# probe in order. A failing probe only warns, it never aborts the script.
checks=(
  # Basic presence
  "newt_* presence" '^newt_'
  # Site gauges with site_id
  "site_online with site_id" '^newt_site_online\{.*site_id="[^"]+"'
  "last_heartbeat with site_id" '^newt_site_last_heartbeat_seconds\{.*site_id="[^"]+"'
  # Bytes with direction ingress/egress and protocol
  "tunnel bytes ingress" '^newt_tunnel_bytes_total\{.*direction="ingress".*protocol="(tcp|udp)"'
  "tunnel bytes egress" '^newt_tunnel_bytes_total\{.*direction="egress".*protocol="(tcp|udp)"'
  # WebSocket metrics (when OTLP/WS used)
  "websocket connect latency buckets" '^newt_websocket_connect_latency_seconds_bucket'
  "websocket messages total" '^newt_websocket_messages_total\{.*(direction|msg_type)='
  # Proxy metrics (when proxy active)
  "proxy active connections" '^newt_proxy_active_connections\{'
  "proxy buffer bytes" '^newt_proxy_buffer_bytes\{'
  "proxy drops total" '^newt_proxy_drops_total\{'
  # Config apply
  "config apply seconds buckets" '^newt_config_apply_seconds_bucket\{'
)

for ((i = 0; i < ${#checks[@]}; i += 2)); do
  probe "${checks[i]}" "${checks[i + 1]}" || true
done

echo "Smoke checks completed (warnings above are acceptable if the feature isn't exercised yet)."