diff --git a/docker-compose.metrics.collector.yml b/docker-compose.metrics.collector.yml
new file mode 100644
index 0000000..e06c1eb
--- /dev/null
+++ b/docker-compose.metrics.collector.yml
@@ -0,0 +1,27 @@
+services:
+  newt:
+    image: your/newt:latest
+    environment:
+      - NEWT_METRICS_PROMETHEUS_ENABLED=false  # important: direct /metrics scraping is off
+      - NEWT_METRICS_OTLP_ENABLED=true  # OTLP export to the collector
+      # optional:
+      # - NEWT_METRICS_INCLUDE_TUNNEL_ID=false
+    # If Newt itself exposes ports, do NOT map 2112 here
+    # ports: []
+
+  otel-collector:
+    image: otel/opentelemetry-collector-contrib:latest
+    command: ["--config=/etc/otelcol/config.yaml"]
+    volumes:
+      - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
+    ports:
+      - "4317:4317"  # OTLP gRPC
+      - "8889:8889"  # Prometheus exporter (scraped by Prometheus)
+
+  prometheus:
+    image: prom/prometheus:latest
+    volumes:
+      - ./examples/prometheus.with-collector.yml:/etc/prometheus/prometheus.yml:ro
+    ports:
+      - "9090:9090"
+
diff --git a/docs/observability.md b/docs/observability.md
index bae5fb7..e77e2fd 100644
--- a/docs/observability.md
+++ b/docs/observability.md
@@ -190,6 +190,38 @@ Cardinality tips
   action: drop
 ```
 
+Quickstart: direct Prometheus scraping (recommended)
+
+```
+# Start (direct /metrics scrape, no double collection)
+docker compose -f docker-compose.metrics.yml up -d
+
+# Smoke checks
+./scripts/smoke-metrics.sh
+# Hide tunnel IDs (optional):
+# EXPECT_TUNNEL_ID=false NEWT_METRICS_INCLUDE_TUNNEL_ID=false ./scripts/smoke-metrics.sh
+```
+
+- Prometheus UI: http://localhost:9090
+- Default scrape interval: 15s
+- OTLP is not enabled (NEWT_METRICS_OTLP_ENABLED=false in docker-compose.metrics.yml)
+
+Common PromQL quick checks
+
+```
+# Online status of a site over the last 5 minutes
+max_over_time(newt_site_online{site_id="$site"}[5m])
+
+# TCP egress bytes per site/tunnel (10m)
+sum by (site_id, tunnel_id) (increase(newt_tunnel_bytes_total{protocol="tcp",direction="egress"}[10m]))
+
+# WebSocket connect P95
+histogram_quantile(0.95, sum by (le, site_id) (rate(newt_websocket_connect_latency_seconds_bucket[5m])))
+
+# Reconnects by initiator and reason
+sum by (initiator, reason) (increase(newt_tunnel_reconnects_total{site_id="$site"}[30m]))
+```
+
 Troubleshooting
 
 - curl :2112/metrics – ensure endpoint is reachable and includes newt_* metrics
diff --git a/examples/prometheus.with-collector.yml b/examples/prometheus.with-collector.yml
new file mode 100644
index 0000000..829730d
--- /dev/null
+++ b/examples/prometheus.with-collector.yml
@@ -0,0 +1,17 @@
+global:
+  scrape_interval: 15s
+
+scrape_configs:
+  # IMPORTANT: do NOT scrape Newt directly, only the collector!
+  - job_name: 'otel-collector'
+    static_configs:
+      - targets: ['otel-collector:8889']
+
+    # optional: limit cardinality (metric labels need metric_relabel_configs)
+    metric_relabel_configs:
+      - action: labeldrop
+        regex: 'tunnel_id'
+      # - action: keep
+      #   source_labels: [site_id]
+      #   regex: '(site-a|site-b)'
+