From 4a90e36a442afbb46fe3c027eae205d11bf02d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:46:01 +0200 Subject: [PATCH] docs+examples: document direction=ingress|egress, initiator and error_type enums; add cardinality relabel tips; provide Collector variants; add scripts/smoke-metrics.sh --- docs/observability.md | 26 ++++++++++++++++++---- examples/otel-collector.yaml | 39 +++++++++++++++++++++++---------- scripts/smoke-metrics.sh | 42 ++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 15 deletions(-) create mode 100644 scripts/smoke-metrics.sh diff --git a/docs/observability.md b/docs/observability.md index 1aa7a77..cf8de79 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -40,12 +40,12 @@ Metric catalog (initial) - newt_site_registrations_total (counter) labels: result, site_id[, region] - newt_site_online (observable gauge) labels: site_id (0/1) - newt_site_last_heartbeat_seconds (observable gauge) labels: site_id -- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id, transport (transport e.g. 
wireguard) -- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (in|out) +- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id [transport optional when known] +- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (ingress|egress) - newt_tunnel_latency_seconds (histogram) labels: site_id, tunnel_id, transport (e.g., wireguard) -- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, reason +- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, initiator (client|server), reason - newt_connection_attempts_total (counter) labels: site_id, transport, result -- newt_connection_errors_total (counter) labels: site_id, transport, error_type +- newt_connection_errors_total (counter) labels: site_id, transport, error_type (dial_timeout|tls_handshake|auth_failed|io_error) Conventions @@ -171,6 +171,24 @@ Further reading - See docs/METRICS_RECOMMENDATIONS.md for roadmap, label guidance (transport vs protocol), and example alerts. +Cardinality tips + +- tunnel_id cardinality can grow in larger fleets. Use Prometheus metric_relabel_configs to drop the label or whole series, for example: + +``` +# Drop tunnel_id on bytes only (an empty value removes the label); only safe if the remaining labels stay unique +- source_labels: [__name__] + regex: newt_tunnel_bytes_total + action: replace + target_label: tunnel_id + replacement: "" + +# Or drop all series of high-churn tunnels (adjust the pattern to your naming; ".*" would drop every series) +- source_labels: [tunnel_id] + regex: "tmp-.+" + action: drop +``` + Troubleshooting - curl :2112/metrics – ensure endpoint is reachable and includes newt_* metrics diff --git a/examples/otel-collector.yaml b/examples/otel-collector.yaml index b00cb67..408c6a6 100644 --- a/examples/otel-collector.yaml +++ b/examples/otel-collector.yaml @@ -1,3 +1,20 @@ +# Variant A: Direct scrape of Newt (/metrics) via Prometheus (no Collector needed) +# Note: Newt already exposes labels like site_id, protocol, direction. Do not promote +# resource attributes into labels when scraping Newt directly. 
+# +# Example Prometheus scrape config: +# global: +# scrape_interval: 15s +# scrape_configs: +# - job_name: newt +# static_configs: +# - targets: ["newt:2112"] +# +# Variant B: Use OTEL Collector (Newt -> OTLP -> Collector -> Prometheus) +# Prometheus then scrapes the Collector's Prometheus exporter endpoint (:8889). +# Labels are already on datapoints; promotion from resource is OPTIONAL and typically NOT required. +# If you enable transform/promote below, ensure you do not duplicate labels. + receivers: otlp: protocols: @@ -13,20 +30,20 @@ processors: detectors: [env, system] timeout: 5s batch: {} - transform/promote: - # optional, damit fehlende Keys nicht die Pipeline abbrechen: - error_mode: ignore - metric_statements: - - context: datapoint - statements: - - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil - - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil + # OPTIONAL: Only enable if you need to promote resource attributes to labels. + # WARNING: Newt already provides site_id as a label; avoid double-promotion. 
+ # transform/promote: + # error_mode: ignore + # metric_statements: + # - context: datapoint + # statements: + # - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil + # - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil exporters: prometheus: endpoint: ":8889" send_timestamps: true - # Falls du kein Remote-Write-Ziel hast, kommentiere es aus: # prometheusremotewrite: # endpoint: http://mimir:9009/api/v1/push debug: @@ -36,8 +53,8 @@ service: pipelines: metrics: receivers: [otlp] - processors: [memory_limiter, resourcedetection, transform/promote, batch] - exporters: [prometheus] # , prometheusremotewrite + processors: [memory_limiter, resourcedetection, batch] # add transform/promote if you really need it + exporters: [prometheus] traces: receivers: [otlp] processors: [memory_limiter, resourcedetection, batch] diff --git a/scripts/smoke-metrics.sh b/scripts/smoke-metrics.sh new file mode 100644 index 0000000..e0eac32 --- /dev/null +++ b/scripts/smoke-metrics.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -euo pipefail + +NEWTHOST=${NEWTHOST:-localhost} +NEWTPORT=${NEWTPORT:-2112} +METRICS_URL="http://${NEWTHOST}:${NEWTPORT}/metrics" + +probe() { + local name=$1 + local pattern=$2 + echo "[probe] ${name}" + curl -sf "${METRICS_URL}" | grep -E "${pattern}" || { + echo "[warn] ${name} not found" + return 1 + } +} + +# Basic presence +probe "newt_* presence" "^newt_" || true + +# Site gauges with site_id +probe "site_online with site_id" "^newt_site_online\{.*site_id=\"[^\"]+\"" || true +probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_seconds\{.*site_id=\"[^\"]+\"" || true + +# Bytes with direction ingress/egress and protocol +probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true +probe "tunnel bytes egress" 
"^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true + +# WebSocket metrics (when OTLP/WS used) +probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true +probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true + +# Proxy metrics (when proxy active) +probe "proxy active connections" "^newt_proxy_active_connections\{" || true +probe "proxy buffer bytes" "^newt_proxy_buffer_bytes\{" || true +probe "proxy drops total" "^newt_proxy_drops_total\{" || true + +# Config apply +probe "config apply seconds buckets" "^newt_config_apply_seconds_bucket\{" || true + +echo "Smoke checks completed (warnings above are acceptable if the feature isn't exercised yet)." +