mirror of
https://github.com/fosrl/newt.git
synced 2026-03-26 12:36:45 +00:00
docs+examples: document direction=ingress|egress, initiator and error_type enums; add cardinality relabel tips; provide Collector variants; add scripts/smoke-metrics.sh
This commit is contained in:
@@ -40,12 +40,12 @@ Metric catalog (initial)
|
|||||||
- newt_site_registrations_total (counter) labels: result, site_id[, region]
|
- newt_site_registrations_total (counter) labels: result, site_id[, region]
|
||||||
- newt_site_online (observable gauge) labels: site_id (0/1)
|
- newt_site_online (observable gauge) labels: site_id (0/1)
|
||||||
- newt_site_last_heartbeat_seconds (observable gauge) labels: site_id
|
- newt_site_last_heartbeat_seconds (observable gauge) labels: site_id
|
||||||
- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id, transport (transport e.g. wireguard)
|
- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id[, transport] (transport is optional and only set when known, e.g. wireguard)
|
||||||
- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (in|out)
|
- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (ingress|egress)
|
||||||
- newt_tunnel_latency_seconds (histogram) labels: site_id, tunnel_id, transport (e.g., wireguard)
|
- newt_tunnel_latency_seconds (histogram) labels: site_id, tunnel_id, transport (e.g., wireguard)
|
||||||
- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, reason
|
- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, initiator (client|server), reason
|
||||||
- newt_connection_attempts_total (counter) labels: site_id, transport, result
|
- newt_connection_attempts_total (counter) labels: site_id, transport, result
|
||||||
- newt_connection_errors_total (counter) labels: site_id, transport, error_type
|
- newt_connection_errors_total (counter) labels: site_id, transport, error_type (dial_timeout|tls_handshake|auth_failed|io_error)
|
||||||
|
|
||||||
Conventions
|
Conventions
|
||||||
|
|
||||||
@@ -171,6 +171,24 @@ Further reading
|
|||||||
|
|
||||||
- See docs/METRICS_RECOMMENDATIONS.md for roadmap, label guidance (transport vs protocol), and example alerts.
|
- See docs/METRICS_RECOMMENDATIONS.md for roadmap, label guidance (transport vs protocol), and example alerts.
|
||||||
|
|
||||||
|
Cardinality tips
|
||||||
|
|
||||||
|
- The tunnel_id label can reach high cardinality in larger fleets. Use metric relabeling (Prometheus metric_relabel_configs) to drop the label or to discard a subset of tunnels, for example:
|
||||||
|
|
||||||
|
```
|
||||||
|
# Drop the tunnel_id label from newt_tunnel_bytes_total only (all other metrics are left untouched)
|
||||||
|
- source_labels: [__name__]
|
||||||
|
regex: newt_tunnel_bytes_total
|
||||||
|
target_label: tunnel_id
|
||||||
|
replacement: ""
|
||||||
|
action: replace
|
||||||
|
|
||||||
|
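# Or drop only high-churn tunnels: set the regex to match their tunnel IDs (a bare .* also matches an empty tunnel_id and would drop every series)
|
||||||
|
- source_labels: [tunnel_id]
|
||||||
|
regex: "ephemeral-.*"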
|
||||||
|
action: drop
|
||||||
|
```
|
||||||
|
|
||||||
Troubleshooting
|
Troubleshooting
|
||||||
|
|
||||||
- curl :2112/metrics – ensure endpoint is reachable and includes newt_* metrics
|
- curl :2112/metrics – ensure endpoint is reachable and includes newt_* metrics
|
||||||
|
|||||||
@@ -1,3 +1,20 @@
|
|||||||
|
# Variant A: Direct scrape of Newt (/metrics) via Prometheus (no Collector needed)
|
||||||
|
# Note: Newt already exposes labels like site_id, protocol, direction. Do not promote
|
||||||
|
# resource attributes into labels when scraping Newt directly.
|
||||||
|
#
|
||||||
|
# Example Prometheus scrape config:
|
||||||
|
# global:
|
||||||
|
# scrape_interval: 15s
|
||||||
|
# scrape_configs:
|
||||||
|
# - job_name: newt
|
||||||
|
# static_configs:
|
||||||
|
# - targets: ["newt:2112"]
|
||||||
|
#
|
||||||
|
# Variant B: Use OTEL Collector (Newt -> OTLP -> Collector -> Prometheus)
|
||||||
|
# This pipeline scrapes metrics from the Collector's Prometheus exporter.
|
||||||
|
# Labels are already on datapoints; promotion from resource is OPTIONAL and typically NOT required.
|
||||||
|
# If you enable transform/promote below, ensure you do not duplicate labels.
|
||||||
|
|
||||||
receivers:
|
receivers:
|
||||||
otlp:
|
otlp:
|
||||||
protocols:
|
protocols:
|
||||||
@@ -13,20 +30,20 @@ processors:
|
|||||||
detectors: [env, system]
|
detectors: [env, system]
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
batch: {}
|
batch: {}
|
||||||
transform/promote:
|
# OPTIONAL: Only enable if you need to promote resource attributes to labels.
|
||||||
# optional, damit fehlende Keys nicht die Pipeline abbrechen:
|
# WARNING: Newt already provides site_id as a label; avoid double-promotion.
|
||||||
error_mode: ignore
|
# transform/promote:
|
||||||
metric_statements:
|
# error_mode: ignore
|
||||||
- context: datapoint
|
# metric_statements:
|
||||||
statements:
|
# - context: datapoint
|
||||||
- set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil
|
# statements:
|
||||||
- set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil
|
# - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil
|
||||||
|
# - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil
|
||||||
|
|
||||||
exporters:
|
exporters:
|
||||||
prometheus:
|
prometheus:
|
||||||
endpoint: ":8889"
|
endpoint: ":8889"
|
||||||
send_timestamps: true
|
send_timestamps: true
|
||||||
# Falls du kein Remote-Write-Ziel hast, kommentiere es aus:
|
|
||||||
# prometheusremotewrite:
|
# prometheusremotewrite:
|
||||||
# endpoint: http://mimir:9009/api/v1/push
|
# endpoint: http://mimir:9009/api/v1/push
|
||||||
debug:
|
debug:
|
||||||
@@ -36,8 +53,8 @@ service:
|
|||||||
pipelines:
|
pipelines:
|
||||||
metrics:
|
metrics:
|
||||||
receivers: [otlp]
|
receivers: [otlp]
|
||||||
processors: [memory_limiter, resourcedetection, transform/promote, batch]
|
processors: [memory_limiter, resourcedetection, batch] # add transform/promote if you really need it
|
||||||
exporters: [prometheus] # , prometheusremotewrite
|
exporters: [prometheus]
|
||||||
traces:
|
traces:
|
||||||
receivers: [otlp]
|
receivers: [otlp]
|
||||||
processors: [memory_limiter, resourcedetection, batch]
|
processors: [memory_limiter, resourcedetection, batch]
|
||||||
|
|||||||
42
scripts/smoke-metrics.sh
Normal file
42
scripts/smoke-metrics.sh
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
NEWTHOST=${NEWTHOST:-localhost}
|
||||||
|
NEWTPORT=${NEWTPORT:-2112}
|
||||||
|
METRICS_URL="http://${NEWTHOST}:${NEWTPORT}/metrics"
|
||||||
|
|
||||||
|
probe() {
|
||||||
|
local name=$1
|
||||||
|
local pattern=$2
|
||||||
|
echo "[probe] ${name}"
|
||||||
|
curl -sf "${METRICS_URL}" | grep -E "${pattern}" || {
|
||||||
|
echo "[warn] ${name} not found"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Basic presence
|
||||||
|
probe "newt_* presence" "^newt_" || true
|
||||||
|
|
||||||
|
# Site gauges with site_id
|
||||||
|
probe "site_online with site_id" "^newt_site_online\{.*site_id=\"[^\"]+\"" || true
|
||||||
|
probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_seconds\{.*site_id=\"[^\"]+\"" || true
|
||||||
|
|
||||||
|
# Bytes with direction ingress/egress and protocol
|
||||||
|
probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true
|
||||||
|
probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true
|
||||||
|
|
||||||
|
# WebSocket metrics (when OTLP/WS used)
|
||||||
|
probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true
|
||||||
|
probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true
|
||||||
|
|
||||||
|
# Proxy metrics (when proxy active)
|
||||||
|
probe "proxy active connections" "^newt_proxy_active_connections\{" || true
|
||||||
|
probe "proxy buffer bytes" "^newt_proxy_buffer_bytes\{" || true
|
||||||
|
probe "proxy drops total" "^newt_proxy_drops_total\{" || true
|
||||||
|
|
||||||
|
# Config apply
|
||||||
|
probe "config apply seconds buckets" "^newt_config_apply_seconds_bucket\{" || true
|
||||||
|
|
||||||
|
echo "Smoke checks completed (warnings above are acceptable if the feature isn't exercised yet)."
|
||||||
|
|
||||||
Reference in New Issue
Block a user