mirror of
https://github.com/fosrl/newt.git
synced 2026-03-26 12:36:45 +00:00
docs+examples: document direction=ingress|egress, initiator and error_type enums; add cardinality relabel tips; provide Collector variants; add scripts/smoke-metrics.sh
This commit is contained in:
@@ -40,12 +40,12 @@ Metric catalog (initial)
|
||||
- newt_site_registrations_total (counter) labels: result, site_id[, region]
|
||||
- newt_site_online (observable gauge) labels: site_id (0/1)
|
||||
- newt_site_last_heartbeat_seconds (observable gauge) labels: site_id
|
||||
- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id, transport (transport e.g. wireguard)
|
||||
- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (in|out)
|
||||
- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id [transport optional when known]
|
||||
- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (ingress|egress)
|
||||
- newt_tunnel_latency_seconds (histogram) labels: site_id, tunnel_id, transport (e.g., wireguard)
|
||||
- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, reason
|
||||
- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, initiator (client|server), reason
|
||||
- newt_connection_attempts_total (counter) labels: site_id, transport, result
|
||||
- newt_connection_errors_total (counter) labels: site_id, transport, error_type
|
||||
- newt_connection_errors_total (counter) labels: site_id, transport, error_type (dial_timeout|tls_handshake|auth_failed|io_error)
|
||||
|
||||
Conventions
|
||||
|
||||
@@ -171,6 +171,24 @@ Further reading
|
||||
|
||||
- See docs/METRICS_RECOMMENDATIONS.md for roadmap, label guidance (transport vs protocol), and example alerts.
|
||||
|
||||
Cardinality tips
|
||||
|
||||
- The number of distinct tunnel_id values (label cardinality) can grow quickly in larger fleets. Use metric relabeling to drop the label or to drop a subset of series, for example:
|
||||
|
||||
```
|
||||
# Remove tunnel_id from newt_tunnel_bytes_total only; all other metrics and
# their labels are left untouched. Intended for metric_relabel_configs.
# Setting a label to the empty string removes it from the series.
# Note: series that differed only by tunnel_id end up with the same label set.
- source_labels: [__name__]
  regex: newt_tunnel_bytes_total
  target_label: tunnel_id
  replacement: ""
  action: replace
|
||||
|
||||
# Or drop only high-churn tunnels. Replace the example pattern with one that
# matches your own high-churn tunnel IDs. Do not use a bare ".*": the regex is
# fully anchored and also matches an empty value, so it would drop every series.
- source_labels: [tunnel_id]
  regex: "ephemeral-.*"
  action: drop
|
||||
```
|
||||
|
||||
Troubleshooting
|
||||
|
||||
- curl http://localhost:2112/metrics – ensure endpoint is reachable and includes newt_* metrics
|
||||
|
||||
@@ -1,3 +1,20 @@
|
||||
# Variant A: Direct scrape of Newt (/metrics) via Prometheus (no Collector needed)
|
||||
# Note: Newt already exposes labels like site_id, protocol, direction. Do not promote
|
||||
# resource attributes into labels when scraping Newt directly.
|
||||
#
|
||||
# Example Prometheus scrape config:
|
||||
# global:
|
||||
# scrape_interval: 15s
|
||||
# scrape_configs:
|
||||
# - job_name: newt
|
||||
# static_configs:
|
||||
# - targets: ["newt:2112"]
|
||||
#
|
||||
# Variant B: Use OTEL Collector (Newt -> OTLP -> Collector -> Prometheus)
|
||||
# Prometheus then scrapes the metrics from the Collector's Prometheus exporter.
|
||||
# Labels are already on datapoints; promotion from resource is OPTIONAL and typically NOT required.
|
||||
# If you enable transform/promote below, ensure you do not duplicate labels.
|
||||
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
@@ -13,20 +30,20 @@ processors:
|
||||
detectors: [env, system]
|
||||
timeout: 5s
|
||||
batch: {}
|
||||
transform/promote:
|
||||
# optional, so that missing keys do not abort the pipeline:
|
||||
error_mode: ignore
|
||||
metric_statements:
|
||||
- context: datapoint
|
||||
statements:
|
||||
- set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil
|
||||
- set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil
|
||||
# OPTIONAL: Only enable if you need to promote resource attributes to labels.
|
||||
# WARNING: Newt already provides site_id as a label; avoid double-promotion.
|
||||
# transform/promote:
|
||||
# error_mode: ignore
|
||||
# metric_statements:
|
||||
# - context: datapoint
|
||||
# statements:
|
||||
# - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil
|
||||
# - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil
|
||||
|
||||
exporters:
|
||||
prometheus:
|
||||
endpoint: ":8889"
|
||||
send_timestamps: true
|
||||
# If you have no remote-write target, keep the following commented out:
|
||||
# prometheusremotewrite:
|
||||
# endpoint: http://mimir:9009/api/v1/push
|
||||
debug:
|
||||
@@ -36,8 +53,8 @@ service:
|
||||
pipelines:
|
||||
metrics:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, resourcedetection, transform/promote, batch]
|
||||
exporters: [prometheus] # , prometheusremotewrite
|
||||
processors: [memory_limiter, resourcedetection, batch] # add transform/promote if you really need it
|
||||
exporters: [prometheus]
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, resourcedetection, batch]
|
||||
|
||||
42
scripts/smoke-metrics.sh
Normal file
42
scripts/smoke-metrics.sh
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
NEWTHOST=${NEWTHOST:-localhost}
|
||||
NEWTPORT=${NEWTPORT:-2112}
|
||||
METRICS_URL="http://${NEWTHOST}:${NEWTPORT}/metrics"
|
||||
|
||||
# probe NAME PATTERN
#
# Fetches ${METRICS_URL} and prints every line that matches the extended
# regular expression PATTERN.
#
#   NAME     human-readable label used in the log lines
#   PATTERN  grep -E pattern applied to the fetched metrics text
#
# Returns 0 when at least one line matches. Returns 1 when the endpoint could
# not be fetched or when nothing matched; a "[warn]" line says which.
probe() {
  local name=$1
  local pattern=$2
  local body

  echo "[probe] ${name}"

  # Fetch separately from the match so that an unreachable endpoint is not
  # misreported as a missing metric. --max-time keeps a hung endpoint from
  # blocking the whole smoke run.
  if ! body=$(curl -sf --max-time 10 "${METRICS_URL}"); then
    echo "[warn] ${name}: could not fetch ${METRICS_URL}"
    return 1
  fi

  # "--" protects against patterns that begin with a dash.
  grep -E -- "${pattern}" <<<"${body}" || {
    echo "[warn] ${name} not found"
    return 1
  }
}
|
||||
|
||||
# Basic presence
|
||||
probe "newt_* presence" "^newt_" || true
|
||||
|
||||
# Site gauges with site_id
|
||||
probe "site_online with site_id" "^newt_site_online\{.*site_id=\"[^\"]+\"" || true
|
||||
probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_seconds\{.*site_id=\"[^\"]+\"" || true
|
||||
|
||||
# Bytes with direction ingress/egress and protocol
|
||||
probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true
|
||||
probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true
|
||||
|
||||
# WebSocket metrics (when OTLP/WS used)
|
||||
probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true
|
||||
probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true
|
||||
|
||||
# Proxy metrics (when proxy active)
|
||||
probe "proxy active connections" "^newt_proxy_active_connections\{" || true
|
||||
probe "proxy buffer bytes" "^newt_proxy_buffer_bytes\{" || true
|
||||
probe "proxy drops total" "^newt_proxy_drops_total\{" || true
|
||||
|
||||
# Config apply
|
||||
probe "config apply seconds buckets" "^newt_config_apply_seconds_bucket\{" || true
|
||||
|
||||
echo "Smoke checks completed (warnings above are acceptable if the feature isn't exercised yet)."
|
||||
|
||||
Reference in New Issue
Block a user