From 0f83489f11c74f04561ce996f43652d24dd59f9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:16:44 +0200 Subject: [PATCH] Add OpenTelemetry configuration and observability documentation --- docker-compose.metrics.yml | 33 +++++++ docs/observability.md | 169 +++++++++++++++++++++++++++++++++++ examples/otel-collector.yaml | 41 +++++++++ examples/prometheus.yml | 11 +++ 4 files changed, 254 insertions(+) create mode 100644 docker-compose.metrics.yml create mode 100644 docs/observability.md create mode 100644 examples/otel-collector.yaml create mode 100644 examples/prometheus.yml diff --git a/docker-compose.metrics.yml b/docker-compose.metrics.yml new file mode 100644 index 0000000..76b92a8 --- /dev/null +++ b/docker-compose.metrics.yml @@ -0,0 +1,33 @@ +services: + collector: + image: otel/opentelemetry-collector-contrib:0.111.0 # contrib build: the mounted config uses the transform and resourcedetection processors, which the core image does not ship + command: ["--config=/etc/otelcol/config.yaml"] # explicit path, so the image's default config location is irrelevant + volumes: + - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC in + - "8889:8889" # Prometheus scrape out + + newt: + build: . 
+ image: newt:dev + environment: + OTEL_SERVICE_NAME: newt + NEWT_METRICS_PROMETHEUS_ENABLED: "true" + NEWT_METRICS_OTLP_ENABLED: "true" + OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317" + OTEL_EXPORTER_OTLP_INSECURE: "true" + OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative" + NEWT_ADMIN_ADDR: "0.0.0.0:2112" + ports: + - "2112:2112" + depends_on: + - collector + + prometheus: + image: prom/prometheus:v2.55.0 + volumes: + - ./examples/prometheus.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "9090:9090" + diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 0000000..3e9e890 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,169 @@ +# OpenTelemetry Observability for Newt + +This document describes how Newt exposes metrics using the OpenTelemetry (OTel) Go SDK, how to enable Prometheus scraping, and how to send data to an OpenTelemetry Collector for further export. + +Goals + +- Provide a /metrics endpoint in Prometheus exposition format (via OTel Prometheus exporter) +- Keep metrics backend-agnostic; optional OTLP export to a Collector +- Use OTel semantic conventions where applicable and enforce SI units +- Low-cardinality, stable labels only + +Enable via flags (ENV mirrors) + +- --metrics (default: true) ↔ NEWT_METRICS_PROMETHEUS_ENABLED +- --metrics-admin-addr (default: 127.0.0.1:2112) ↔ NEWT_ADMIN_ADDR +- --otlp (default: false) ↔ NEWT_METRICS_OTLP_ENABLED + +Enable exporters via environment variables (no code changes required) + +- NEWT_METRICS_PROMETHEUS_ENABLED=true|false (default: true) +- NEWT_METRICS_OTLP_ENABLED=true|false (default: false) +- OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 +- OTEL_EXPORTER_OTLP_INSECURE=true|false (default: true for dev) +- OTEL_SERVICE_NAME=newt (default) +- `OTEL_SERVICE_VERSION=<version>` +- `OTEL_RESOURCE_ATTRIBUTES=service.instance.id=<instance-id>,site_id=<site-id>` +- OTEL_METRIC_EXPORT_INTERVAL=15s (default) +- NEWT_ADMIN_ADDR=127.0.0.1:2112 (default admin HTTP with /metrics) + +Runtime behavior + +- 
When Prometheus exporter is enabled, Newt serves /metrics on NEWT_ADMIN_ADDR (default 127.0.0.1:2112) +- When OTLP is enabled, metrics and traces are exported to the configured OTLP gRPC endpoint +- Go runtime metrics (goroutines, GC, memory) are exported automatically + +Metric catalog (initial) + +- newt_site_registrations_total (counter) labels: result, region (optional); site_id is a resource attribute +- newt_site_online (observable gauge) no labels (0/1) +- newt_site_last_heartbeat_seconds (observable gauge) no labels +- newt_tunnel_sessions (observable gauge) labels: tunnel_id, transport +- newt_tunnel_bytes_total (counter) labels: tunnel_id, direction (in|out) +- newt_tunnel_latency_seconds (histogram) labels: tunnel_id, transport +- newt_tunnel_reconnects_total (counter) labels: tunnel_id, reason +- newt_connection_attempts_total (counter) labels: transport, result +- newt_connection_errors_total (counter) labels: transport, error_type + +Conventions + +- Durations in seconds, names end with _seconds +- Sizes in bytes, names end with _bytes +- Counters end with _total +- Labels must be low-cardinality and stable + +Histogram buckets + +- Latency (seconds): 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30 + +Local quickstart + +1) Direct Prometheus scrape (do not also scrape the Collector) + NEWT_METRICS_PROMETHEUS_ENABLED=true \ + NEWT_METRICS_OTLP_ENABLED=false \ + NEWT_ADMIN_ADDR="127.0.0.1:2112" \ + ./newt + + curl -s http://127.0.0.1:2112/metrics | grep ^newt_ + +2) Using the Collector (compose-style) + NEWT_METRICS_PROMETHEUS_ENABLED=true \ + NEWT_METRICS_OTLP_ENABLED=true \ + OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \ + OTEL_EXPORTER_OTLP_INSECURE=true \ + OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=cumulative \ + ./newt + + Collector config example: examples/otel-collector.yaml + Prometheus scrape config: examples/prometheus.yml + +Adding new metrics + +- Use helpers in internal/telemetry/metrics.go for counters/histograms +- Keep labels low-cardinality +- Add observable gauges 
through SetObservableCallback + +Optional tracing + +- When --otlp is enabled, you can wrap outbound HTTP clients with otelhttp.NewTransport to create spans for HTTP requests to Pangolin. This affects traces only and does not add metric labels. + +OTLP TLS example + +- Enable TLS to Collector with a custom CA and headers: + +``` +NEWT_METRICS_OTLP_ENABLED=true \ +OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \ +OTEL_EXPORTER_OTLP_INSECURE=false \ +OTEL_EXPORTER_OTLP_CERTIFICATE=/etc/otel/custom-ca.pem \ +OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer abc123,tenant=acme" \ +./newt +``` + +Prometheus scrape strategy (choose one) +A) Scrape Newt directly: + +``` +global: + scrape_interval: 15s +scrape_configs: + - job_name: newt + static_configs: + - targets: ["newt:2112"] +``` + +B) Scrape the Collector’s Prometheus exporter: + +``` +global: + scrape_interval: 15s +scrape_configs: + - job_name: otel-collector + static_configs: + - targets: ["collector:8889"] +``` + +Reason mapping (source → reason) + +- Server instructs reconnect/terminate → server_request +- Heartbeat/Ping threshold exceeded → timeout +- Peer closed connection gracefully → peer_close +- Route/Interface change detected → network_change +- Auth/token failure (HTTP 401/403) → auth_error +- TLS/WG handshake error → handshake_error +- Config reloaded/applied (causing reconnection) → config_change +- Other/unclassified errors → error + +PromQL snippets + +- Throughput in (5m): + +``` +sum(rate(newt_tunnel_bytes_total{direction="in"}[5m])) +``` + +- P95 latency (seconds): + +``` +histogram_quantile(0.95, sum(rate(newt_tunnel_latency_seconds_bucket[5m])) by (le)) +``` + +- Active sessions: + +``` +sum(newt_tunnel_sessions) +``` + +Compatibility notes + +- Gauges do not use the _total suffix (e.g., newt_tunnel_sessions). +- site_id is a resource attribute (one process = one site). tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels. 
+- Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both. +- Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write. +- No free text in labels; use only the enumerated constants for reason and protocol. + +Troubleshooting + +- curl :2112/metrics – ensure endpoint is reachable and includes newt_* metrics +- Check Collector logs for OTLP connection issues +- Verify Prometheus Targets are UP and scraping Newt or Collector diff --git a/examples/otel-collector.yaml b/examples/otel-collector.yaml new file mode 100644 index 0000000..c2b6854 --- /dev/null +++ b/examples/otel-collector.yaml @@ -0,0 +1,41 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + +processors: + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + batch: {} + transform/promote: # copies selected resource attributes onto every datapoint + metric_statements: + - context: datapoint + statements: + - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil + - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil + resourcedetection: + detectors: [env, host] + timeout: 5s + +exporters: + prometheus: + endpoint: 0.0.0.0:8889 + send_timestamps: true + prometheusremotewrite: + # Replace with your remote_write endpoint (Mimir/Cortex/VictoriaMetrics/Thanos Receive) + endpoint: http://mimir:9009/api/v1/push + debug: {} # sink for the traces pipeline; swap for a real trace backend exporter + +service: + pipelines: + metrics: + receivers: [otlp] + processors: [memory_limiter, resourcedetection, batch, transform/promote] + exporters: [prometheus, prometheusremotewrite] + traces: + receivers: [otlp] + processors: [memory_limiter, resourcedetection, batch] + exporters: [debug] # a pipeline with no exporters fails collector config validation diff --git a/examples/prometheus.yml b/examples/prometheus.yml new file mode 100644 index 0000000..5323b20 --- /dev/null +++ b/examples/prometheus.yml @@ -0,0 +1,11 @@ +global: + scrape_interval: 15s + 
+scrape_configs: + - job_name: newt # strategy A: scrape Newt's /metrics endpoint directly + static_configs: + - targets: ["newt:2112"] + # - job_name: otel-collector # strategy B: enable INSTEAD of the newt job, never both (double-scraping duplicates every series) + # static_configs: + # - targets: ["collector:8889"] +