diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..0697458 --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +# Copy this file to .env and fill in your values +# Required for connecting to Pangolin service +PANGOLIN_ENDPOINT=https://example.com +NEWT_ID=changeme-id +NEWT_SECRET=changeme-secret \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index b9c4d29..2d69143 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,8 @@ FROM golang:1.25-alpine AS builder +# Install git and ca-certificates +RUN apk --no-cache add ca-certificates git tzdata + # Set the working directory inside the container WORKDIR /app @@ -13,7 +16,7 @@ RUN go mod download COPY . . # Build the application -RUN CGO_ENABLED=0 GOOS=linux go build -o /newt +RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /newt FROM alpine:3.22 AS runner @@ -22,6 +25,9 @@ RUN apk --no-cache add ca-certificates tzdata COPY --from=builder /newt /usr/local/bin/ COPY entrypoint.sh / +# Admin/metrics endpoint (Prometheus scrape) +EXPOSE 2112 + RUN chmod +x /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] -CMD ["newt"] \ No newline at end of file +CMD ["newt"] diff --git a/README.md b/README.md index 413d353..5edac89 100644 --- a/README.md +++ b/README.md @@ -33,61 +33,108 @@ When Newt receives WireGuard control messages, it will use the information encod ## CLI Args +### Core Configuration + - `id`: Newt ID generated by Pangolin to identify the client. - `secret`: A unique secret (not shared and kept private) used to authenticate the client ID with the websocket in order to receive commands. - `endpoint`: The endpoint where both Gerbil and Pangolin reside in order to connect to the websocket. - -- `mtu` (optional): MTU for the internal WG interface. Default: 1280 -- `dns` (optional): DNS server to use to resolve the endpoint. Default: 9.9.9.9 +- `blueprint-file` (optional): Path to blueprint file to define Pangolin resources and configurations. 
+- `no-cloud` (optional): Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false - `log-level` (optional): The log level to use (DEBUG, INFO, WARN, ERROR, FATAL). Default: INFO -- `enforce-hc-cert` (optional): Enforce certificate validation for health checks. Default: false (accepts any cert) + +### Docker Integration + - `docker-socket` (optional): Set the Docker socket to use the container discovery integration -- `ping-interval` (optional): Interval for pinging the server. Default: 3s -- `ping-timeout` (optional): Timeout for each ping. Default: 5s -- `updown` (optional): A script to be called when targets are added or removed. -- `tls-client-cert` (optional): Client certificate (p12 or pfx) for mTLS. See [mTLS](#mtls) -- `tls-client-cert` (optional): Path to client certificate (PEM format, optional if using PKCS12). See [mTLS](#mtls) -- `tls-client-key` (optional): Path to private key for mTLS (PEM format, optional if using PKCS12) -- `tls-ca-cert` (optional): Path to CA certificate to verify server (PEM format, optional if using PKCS12) - `docker-enforce-network-validation` (optional): Validate the container target is on the same network as the newt process. Default: false -- `health-file` (optional): Check if connection to WG server (pangolin) is ok. creates a file if ok, removes it if not ok. Can be used with docker healtcheck to restart newt + +### Accept Client Connections + - `accept-clients` (optional): Enable WireGuard server mode to accept incoming newt client connections. Default: false - `generateAndSaveKeyTo` (optional): Path to save generated private key - `native` (optional): Use native WireGuard interface when accepting clients (requires WireGuard kernel module and Linux, must run as root). Default: false (uses userspace netstack) - `interface` (optional): Name of the WireGuard interface. Default: newt - `keep-interface` (optional): Keep the WireGuard interface. 
Default: false -- `blueprint-file` (optional): Path to blueprint file to define Pangolin resources and configurations. -- `no-cloud` (optional): Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false + +### Metrics & Observability + +- `metrics` (optional): Enable Prometheus /metrics exporter. Default: true +- `otlp` (optional): Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT. Default: false +- `metrics-admin-addr` (optional): Admin/metrics bind address. Default: 127.0.0.1:2112 +- `metrics-async-bytes` (optional): Enable async bytes counting (background flush; lower hot path overhead). Default: false +- `region` (optional): Optional region resource attribute for telemetry and metrics. + +### Network Configuration + +- `mtu` (optional): MTU for the internal WG interface. Default: 1280 +- `dns` (optional): DNS server to use to resolve the endpoint. Default: 9.9.9.9 +- `ping-interval` (optional): Interval for pinging the server. Default: 3s +- `ping-timeout` (optional): Timeout for each ping. Default: 5s + +### Security & TLS + +- `enforce-hc-cert` (optional): Enforce certificate validation for health checks. Default: false (accepts any cert) +- `tls-client-cert` (optional): Client certificate (p12 or pfx) for mTLS or path to client certificate (PEM format). See [mTLS](#mtls) +- `tls-client-key` (optional): Path to private key for mTLS (PEM format, optional if using PKCS12) +- `tls-ca-cert` (optional): Path to CA certificate to verify server (PEM format, optional if using PKCS12) + +### Monitoring & Health + +- `health-file` (optional): Check if connection to WG server (pangolin) is ok. Creates a file if ok, removes it if not ok. Can be used with docker healthcheck to restart newt +- `updown` (optional): A script to be called when targets are added or removed. ## Environment Variables All CLI arguments can be set using environment variables as an alternative to command line flags. 
Environment variables are particularly useful when running Newt in containerized environments. +### Core Configuration + - `PANGOLIN_ENDPOINT`: Endpoint of your pangolin server (equivalent to `--endpoint`) - `NEWT_ID`: Newt ID generated by Pangolin (equivalent to `--id`) - `NEWT_SECRET`: Newt secret for authentication (equivalent to `--secret`) -- `MTU`: MTU for the internal WG interface. Default: 1280 (equivalent to `--mtu`) -- `DNS`: DNS server to use to resolve the endpoint. Default: 9.9.9.9 (equivalent to `--dns`) +- `CONFIG_FILE`: Load the config json from this file instead of in the home folder. +- `BLUEPRINT_FILE`: Path to blueprint file to define Pangolin resources and configurations. (equivalent to `--blueprint-file`) +- `NO_CLOUD`: Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false (equivalent to `--no-cloud`) - `LOG_LEVEL`: Log level (DEBUG, INFO, WARN, ERROR, FATAL). Default: INFO (equivalent to `--log-level`) + +### Docker Integration + - `DOCKER_SOCKET`: Path to Docker socket for container discovery (equivalent to `--docker-socket`) -- `PING_INTERVAL`: Interval for pinging the server. Default: 3s (equivalent to `--ping-interval`) -- `PING_TIMEOUT`: Timeout for each ping. Default: 5s (equivalent to `--ping-timeout`) -- `UPDOWN_SCRIPT`: Path to updown script for target add/remove events (equivalent to `--updown`) -- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`) -- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`) -- `TLS_CLIENT_KEY`: Path to private key for mTLS (equivalent to `--tls-client-key`) -- `TLS_CA_CERT`: Path to CA certificate to verify server (equivalent to `--tls-ca-cert`) - `DOCKER_ENFORCE_NETWORK_VALIDATION`: Validate container targets are on same network. Default: false (equivalent to `--docker-enforce-network-validation`) -- `ENFORCE_HC_CERT`: Enforce certificate validation for health checks. 
Default: false (equivalent to `--enforce-hc-cert`) -- `HEALTH_FILE`: Path to health file for connection monitoring (equivalent to `--health-file`) + +### Accept Client Connections + - `ACCEPT_CLIENTS`: Enable WireGuard server mode. Default: false (equivalent to `--accept-clients`) - `GENERATE_AND_SAVE_KEY_TO`: Path to save generated private key (equivalent to `--generateAndSaveKeyTo`) - `USE_NATIVE_INTERFACE`: Use native WireGuard interface (Linux only). Default: false (equivalent to `--native`) - `INTERFACE`: Name of the WireGuard interface. Default: newt (equivalent to `--interface`) - `KEEP_INTERFACE`: Keep the WireGuard interface after shutdown. Default: false (equivalent to `--keep-interface`) -- `CONFIG_FILE`: Load the config json from this file instead of in the home folder. -- `BLUEPRINT_FILE`: Path to blueprint file to define Pangolin resources and configurations. (equivalent to `--blueprint-file`) -- `NO_CLOUD`: Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false (equivalent to `--no-cloud`) + +### Monitoring & Health + +- `HEALTH_FILE`: Path to health file for connection monitoring (equivalent to `--health-file`) +- `UPDOWN_SCRIPT`: Path to updown script for target add/remove events (equivalent to `--updown`) + +### Metrics & Observability + +- `NEWT_METRICS_PROMETHEUS_ENABLED`: Enable Prometheus /metrics exporter. Default: true (equivalent to `--metrics`) +- `NEWT_METRICS_OTLP_ENABLED`: Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT. Default: false (equivalent to `--otlp`) +- `NEWT_ADMIN_ADDR`: Admin/metrics bind address. Default: 127.0.0.1:2112 (equivalent to `--metrics-admin-addr`) +- `NEWT_METRICS_ASYNC_BYTES`: Enable async bytes counting (background flush; lower hot path overhead). 
Default: false (equivalent to `--metrics-async-bytes`) +- `NEWT_REGION`: Optional region resource attribute for telemetry and metrics (equivalent to `--region`) + +### Network Configuration + +- `MTU`: MTU for the internal WG interface. Default: 1280 (equivalent to `--mtu`) +- `DNS`: DNS server to use to resolve the endpoint. Default: 9.9.9.9 (equivalent to `--dns`) +- `PING_INTERVAL`: Interval for pinging the server. Default: 3s (equivalent to `--ping-interval`) +- `PING_TIMEOUT`: Timeout for each ping. Default: 5s (equivalent to `--ping-timeout`) + +### Security & TLS + +- `ENFORCE_HC_CERT`: Enforce certificate validation for health checks. Default: false (equivalent to `--enforce-hc-cert`) +- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`) +- `TLS_CLIENT_KEY`: Path to private key for mTLS (equivalent to `--tls-client-key`) +- `TLS_CA_CERT`: Path to CA certificate to verify server (equivalent to `--tls-ca-cert`) ## Loading secrets from files diff --git a/docker-compose.metrics.collector.yml b/docker-compose.metrics.collector.yml new file mode 100644 index 0000000..040f410 --- /dev/null +++ b/docker-compose.metrics.collector.yml @@ -0,0 +1,41 @@ +services: + newt: + build: . + image: newt:dev + env_file: + - .env + environment: + - NEWT_METRICS_PROMETHEUS_ENABLED=false # important: disable direct /metrics scraping + - NEWT_METRICS_OTLP_ENABLED=true # OTLP to the Collector + # optional: + # - NEWT_METRICS_INCLUDE_TUNNEL_ID=false + # When using the Collector pattern, do NOT map the Newt admin/metrics port + # (2112) on the application service. Mapping 2112 here can cause port + # conflicts and may result in duplicated Prometheus scraping (app AND + # collector being scraped for the same metrics). Instead either: + # - leave ports unset on the app service (recommended), or + # - map 2112 only on a dedicated metrics/collector service that is + # responsible for exposing metrics to Prometheus. 
+ # Example: do NOT map here + # ports: [] + # Example: map 2112 only on a collector service + # collector: + # ports: + # - "2112:2112" # collector's prometheus exporter (scraped by Prometheus) + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + command: ["--config=/etc/otelcol/config.yaml"] + volumes: + - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC + - "8889:8889" # Prometheus Exporter (scraped by Prometheus) + + prometheus: + image: prom/prometheus:latest + volumes: + - ./examples/prometheus.with-collector.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "9090:9090" + diff --git a/docker-compose.metrics.yml b/docker-compose.metrics.yml new file mode 100644 index 0000000..44a886e --- /dev/null +++ b/docker-compose.metrics.yml @@ -0,0 +1,56 @@ +name: Newt-Metrics +services: + # Recommended Variant A: Direct Prometheus scrape of Newt (/metrics) + # Optional: You may add the Collector service and enable OTLP export, but do NOT + # scrape both Newt and the Collector for the same process. + + newt: + build: . + image: newt:dev + env_file: + - .env + environment: + OTEL_SERVICE_NAME: newt + NEWT_METRICS_PROMETHEUS_ENABLED: "true" + NEWT_METRICS_OTLP_ENABLED: "false" # avoid double-scrape by default + NEWT_ADMIN_ADDR: ":2112" + # Base NEWT configuration + PANGOLIN_ENDPOINT: ${PANGOLIN_ENDPOINT} + NEWT_ID: ${NEWT_ID} + NEWT_SECRET: ${NEWT_SECRET} + LOG_LEVEL: "DEBUG" + ports: + - "2112:2112" + + # Optional Variant B: Enable the Collector and switch Prometheus scrape to it. 
+ # collector: + # image: otel/opentelemetry-collector-contrib:0.136.0 + # command: ["--config=/etc/otelcol/config.yaml"] + # volumes: + # - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro + # ports: + # - "4317:4317" # OTLP gRPC in + # - "8889:8889" # Prometheus scrape out + + prometheus: + image: prom/prometheus:v3.6.0 + volumes: + - ./examples/prometheus.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "9090:9090" + + grafana: + image: grafana/grafana:12.2.0 + container_name: newt-metrics-grafana + restart: unless-stopped + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + ports: + - "3005:3000" + depends_on: + - prometheus + volumes: + - ./examples/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + - ./examples/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro + - ./examples/grafana/dashboards:/var/lib/grafana/dashboards:ro diff --git a/examples/grafana/dashboards/newt-overview.json b/examples/grafana/dashboards/newt-overview.json new file mode 100644 index 0000000..2f3a539 --- /dev/null +++ b/examples/grafana/dashboards/newt-overview.json @@ -0,0 +1,898 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 500 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, 
+ "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_goroutine_count", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Goroutines", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 256 + }, + { + "color": "red", + "value": 512 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memory_gc_goal_bytes / 1024 / 1024", + "format": "time_series", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "GC Target Heap (MiB)", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 10 + }, + { 
+ "color": "red", + "value": 25 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(http_server_request_duration_seconds_count[$__rate_interval]))", + "instant": false, + "legendFormat": "req/s", + "refId": "A" + } + ], + "title": "HTTP Requests / s", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.1 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(newt_connection_errors_total{site_id=~\"$site_id\"}[$__rate_interval]))", + "instant": false, + "legendFormat": "errors/s", + "refId": "A" + } + ], + "title": "Connection Errors / s", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(go_memory_used_bytes)", + "legendFormat": "Used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memory_gc_goal_bytes", + "legendFormat": "GC Goal", + "refId": "B" + } + ], + "title": "Go Heap Usage vs GC Goal", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(go_memory_allocations_total[$__rate_interval])", + "legendFormat": "Allocations/s", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(go_memory_allocated_bytes_total[$__rate_interval])", + "legendFormat": "Allocated bytes/s", + "refId": "B" + } + ], + "title": "Allocation Activity", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "s" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "HTTP Request Duration Quantiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(http_server_request_duration_seconds_count[$__rate_interval])) by (http_response_status_code)", + "legendFormat": "{{http_response_status_code}}", + "refId": "A" + } + ], + "title": "HTTP Requests by Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(newt_connection_attempts_total{site_id=~\"$site_id\"}[$__rate_interval])) by (transport, result)", + "legendFormat": "{{transport}} • {{result}}", + "refId": "A" + } + ], + "title": "Connection Attempts by Transport/Result", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(newt_connection_errors_total{site_id=~\"$site_id\"}[$__rate_interval])) by (transport, error_type)", + "legendFormat": "{{transport}} • {{error_type}}", + "refId": "A" + } + ], + "title": "Connection Errors by Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 3, + "mappings": [], + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + 
"mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Tunnel Latency Quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateTurbo" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 34 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "id": 12, + "legend": { + "show": false + }, + "options": { + "calculate": true, + "cellGap": 2, + "cellSize": "auto", + "color": { + "exponent": 0.5 + }, + "exemplars": { + "color": "rgba(255,255,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "tooltip": { + "mode": "single", + "show": true + }, + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 3, + "show": true + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { 
+ "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Tunnel Latency Bucket Rate", + "type": "heatmap" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "newt", + "otel" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "label": "Datasource", + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(target_info, site_id)", + "hide": 0, + "includeAll": true, + "label": "Site", + "multi": true, + "name": "site_id", + "options": [], + "query": { + "query": "label_values(target_info, site_id)", + "refId": "SiteIdVar" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\"}, tunnel_id)", + "hide": 0, + "includeAll": true, + "label": "Tunnel", + "multi": true, + "name": "tunnel_id", + "options": [], + "query": { + "query": "label_values(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\"}, tunnel_id)", + "refId": "TunnelVar" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + 
"from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Newt Overview", + "uid": "newt-overview", + "version": 1, + "weekStart": "" +} diff --git a/examples/grafana/provisioning/dashboards/dashboard.yaml b/examples/grafana/provisioning/dashboards/dashboard.yaml new file mode 100644 index 0000000..0acac20 --- /dev/null +++ b/examples/grafana/provisioning/dashboards/dashboard.yaml @@ -0,0 +1,9 @@ +apiVersion: 1 +providers: + - name: "newt" + folder: "Newt" + type: file + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards diff --git a/examples/grafana/provisioning/datasources/prometheus.yaml b/examples/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 0000000..4efb4f7 --- /dev/null +++ b/examples/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,9 @@ +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + uid: prometheus + isDefault: true + editable: true diff --git a/examples/otel-collector.yaml b/examples/otel-collector.yaml new file mode 100644 index 0000000..408c6a6 --- /dev/null +++ b/examples/otel-collector.yaml @@ -0,0 +1,61 @@ +# Variant A: Direct scrape of Newt (/metrics) via Prometheus (no Collector needed) +# Note: Newt already exposes labels like site_id, protocol, direction. Do not promote +# resource attributes into labels when scraping Newt directly. +# +# Example Prometheus scrape config: +# global: +# scrape_interval: 15s +# scrape_configs: +# - job_name: newt +# static_configs: +# - targets: ["newt:2112"] +# +# Variant B: Use OTEL Collector (Newt -> OTLP -> Collector -> Prometheus) +# This pipeline scrapes metrics from the Collector's Prometheus exporter. 
+# Labels are already on datapoints; promotion from resource is OPTIONAL and typically NOT required. +# If you enable transform/promote below, ensure you do not duplicate labels. + +receivers: + otlp: + protocols: + grpc: + endpoint: ":4317" + +processors: + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + resourcedetection: + detectors: [env, system] + timeout: 5s + batch: {} + # OPTIONAL: Only enable if you need to promote resource attributes to labels. + # WARNING: Newt already provides site_id as a label; avoid double-promotion. + # transform/promote: + # error_mode: ignore + # metric_statements: + # - context: datapoint + # statements: + # - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil + # - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil + +exporters: + prometheus: + endpoint: ":8889" + send_timestamps: true + # prometheusremotewrite: + # endpoint: http://mimir:9009/api/v1/push + debug: + verbosity: basic + +service: + pipelines: + metrics: + receivers: [otlp] + processors: [memory_limiter, resourcedetection, batch] # add transform/promote if you really need it + exporters: [prometheus] + traces: + receivers: [otlp] + processors: [memory_limiter, resourcedetection, batch] + exporters: [debug] diff --git a/examples/prometheus.with-collector.yml b/examples/prometheus.with-collector.yml new file mode 100644 index 0000000..ca465e3 --- /dev/null +++ b/examples/prometheus.with-collector.yml @@ -0,0 +1,16 @@ +global: + scrape_interval: 15s + +scrape_configs: + # IMPORTANT: Do not scrape Newt directly; scrape only the Collector! 
+ - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:8889'] + + # optional: limit metric cardinality + relabel_configs: + - action: labeldrop + regex: 'tunnel_id' + # - action: keep + # source_labels: [site_id] + # regex: '(site-a|site-b)' diff --git a/examples/prometheus.yml b/examples/prometheus.yml new file mode 100644 index 0000000..9edb661 --- /dev/null +++ b/examples/prometheus.yml @@ -0,0 +1,21 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'newt' + scrape_interval: 15s + static_configs: + - targets: ['newt:2112'] # /metrics + relabel_configs: + # optional: drop tunnel_id + - action: labeldrop + regex: 'tunnel_id' + # optional: allow only specific sites + - action: keep + source_labels: [site_id] + regex: '(site-a|site-b)' + + # WARNING: Do not enable this together with the 'newt' job above or you will double-count. + # - job_name: 'otel-collector' + # static_configs: + # - targets: ['otel-collector:8889'] diff --git a/go.mod b/go.mod index 6faa998..5a930b6 100644 --- a/go.mod +++ b/go.mod @@ -6,29 +6,46 @@ require ( github.com/docker/docker v28.5.1+incompatible github.com/google/gopacket v1.1.19 github.com/gorilla/websocket v1.5.3 + github.com/prometheus/client_golang v1.23.2 github.com/vishvananda/netlink v1.3.1 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 + go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 + go.opentelemetry.io/otel v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 + go.opentelemetry.io/otel/exporters/prometheus v0.60.0 + go.opentelemetry.io/otel/metric v1.38.0 + go.opentelemetry.io/otel/sdk v1.38.0 + go.opentelemetry.io/otel/sdk/metric v1.38.0 golang.org/x/crypto v0.43.0 golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 golang.org/x/net v0.46.0 golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb golang.zx2c4.com/wireguard/wgctrl 
v0.0.0-20241231184526-a9ab2273dd10 + google.golang.org/grpc v1.76.0 gopkg.in/yaml.v3 v3.0.1 gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c software.sslmate.com/src/go-pkcs12 v0.6.0 ) require ( - github.com/Microsoft/go-winio v0.6.2 // indirect - github.com/containerd/errdefs v1.0.0 // indirect + github.com/Microsoft/go-winio v0.6.0 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/containerd/errdefs v0.3.0 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/distribution/reference v0.6.0 // indirect - github.com/docker/go-connections v0.5.0 // indirect - github.com/docker/go-units v0.5.0 // indirect + github.com/docker/go-connections v0.6.0 // indirect + github.com/docker/go-units v0.4.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/google/btree v1.1.3 // indirect github.com/google/go-cmp v0.7.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect github.com/josharian/native v1.1.0 // indirect github.com/mdlayher/genetlink v1.3.2 // indirect github.com/mdlayher/netlink v1.7.2 // indirect @@ -37,18 +54,29 @@ require ( github.com/moby/sys/atomicwriter v0.1.0 // indirect github.com/moby/term v0.5.2 // indirect github.com/morikuni/aec v1.0.0 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect - github.com/opencontainers/image-spec v1.1.1 // indirect + github.com/opencontainers/image-spec v1.1.0 // indirect github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.66.1 // indirect + 
github.com/prometheus/otlptranslator v0.0.2 // indirect + github.com/prometheus/procfs v0.17.0 // indirect github.com/vishvananda/netns v0.0.5 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect - go.opentelemetry.io/otel v1.37.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0 // indirect - go.opentelemetry.io/otel/metric v1.37.0 // indirect - go.opentelemetry.io/otel/trace v1.37.0 // indirect - golang.org/x/sync v0.16.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect + golang.org/x/mod v0.28.0 // indirect + golang.org/x/sync v0.17.0 // indirect golang.org/x/sys v0.37.0 // indirect + golang.org/x/text v0.30.0 // indirect golang.org/x/time v0.12.0 // indirect + golang.org/x/tools v0.37.0 // indirect golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect + google.golang.org/protobuf v1.36.8 // indirect ) diff --git a/go.sum b/go.sum index 25ff90d..81cbe33 100644 --- a/go.sum +++ b/go.sum @@ -1,26 +1,23 @@ -github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= -github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= -github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= -github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= -github.com/cenkalti/backoff v2.2.1+incompatible 
h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= -github.com/cenkalti/backoff/v5 v5.0.2 h1:rIfFVxEf1QsI7E1ZHfp/B4DF/6QBAUhmgkxc0H7Zss8= -github.com/cenkalti/backoff/v5 v5.0.2/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= -github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= -github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= +github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg= +github.com/Microsoft/go-winio v0.6.0/go.mod h1:cTAf44im0RAYeL23bpB+fzCyDH2MJiz2BO69KH/soAE= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/containerd/errdefs v0.3.0 h1:FSZgGOeK4yuT/+DnF07/Olde/q4KBoMsaamhXxIMDp4= +github.com/containerd/errdefs v0.3.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= -github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= -github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= github.com/distribution/reference v0.6.0/go.mod 
h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/docker/docker v28.5.1+incompatible h1:Bm8DchhSD2J6PsFzxC35TZo4TLGR2PdW/E69rU45NhM= github.com/docker/docker v28.5.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= -github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= -github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= -github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= -github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94= +github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE= +github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= +github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= @@ -38,85 +35,90 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= +github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc h1:GN2Lv3MGO7AS6PrRoT6yV5+wkrOpcszoIsO4+4ds248= +github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc/go.mod 
h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= -github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= -github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mdlayher/genetlink v1.3.2 h1:KdrNKe+CTu+IbZnm/GVUMXSqBBLqcGpRDa0xkQy56gw= github.com/mdlayher/genetlink v1.3.2/go.mod h1:tcC3pkCrPUGIKKsCsp0B3AdaaKuHtaxoJRz3cc+528o= github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos= github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ= -github.com/mikioh/ipaddr v0.0.0-20190404000644-d465c8ab6721 h1:RlZweED6sbSArvlE924+mUcZuXKLBHA35U7LN621Bws= -github.com/mikioh/ipaddr v0.0.0-20190404000644-d465c8ab6721/go.mod h1:Ickgr2WtCLZ2MDGd4Gr0geeCH5HybhRJbonOgQpvSxc= github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= -github.com/moby/sys/atomicwriter v0.1.0 h1:kw5D/EqkBwsBFi0ss9v1VG3wIkVhzGvLklJ+w3A14Sw= github.com/moby/sys/atomicwriter v0.1.0/go.mod h1:Ul8oqv2ZMNHOceF643P6FKPXeCmYtlQMvpizfsSoaWs= -github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= -github.com/moby/sys/sequential v0.6.0/go.mod 
h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= -github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= -github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= -github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= -github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= +github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= +github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= -github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify 
v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/otlptranslator v0.0.2 h1:+1CdeLVrRQ6Psmhnobldo0kTp96Rj80DRXRd5OSnMEQ= +github.com/prometheus/otlptranslator v0.0.2/go.mod h1:P8AwMgdD7XEr6QRUJ2QWLpiAZTgTE2UYgjlu3svompI= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0= github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4= github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY= github.com/vishvananda/netns v0.0.5/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0/go.mod h1:NfchwuyNoMcZ5MLHwPrODwUF1HWCXWrL31s8gSAdIKY= -go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= -go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0 
h1:dNzwXjZKpMpE2JhmO+9HsPl42NIXFIFSUSSs0fiqra0= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0/go.mod h1:90PoxvaEB5n6AOdZvi+yWJQoE95U8Dhhw2bSyRqnTD0= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0 h1:nRVXXvf78e00EwY6Wp0YII8ww2JVWshZ20HfTlE11AM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0/go.mod h1:r49hO7CgrxY9Voaj3Xe8pANWtr0Oq916d0XAmOoCZAQ= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= -go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= -go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= -go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= -go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= -go.opentelemetry.io/proto/otlp v1.6.0 h1:jQjP+AQyTf+Fe7OKj/MfkDrmK4MNVtw2NpXsf9fefDI= -go.opentelemetry.io/proto/otlp v1.6.0/go.mod h1:cicgGehlFuNdgZkcALOCh3VE6K/u2tAjzlRhDwmVpZc= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg= +go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 h1:PeBoRj6af6xMI7qCupwFvTbbnd49V7n5YpG6pg8iDYQ= +go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0/go.mod h1:ingqBCtMCe8I4vpz/UVzCW6sxoqgZB37nao91mLQ3Bw= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= 
+go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0/go.mod h1:kldtb7jDTeol0l3ewcmd8SDvx3EmIE7lyvqbasU3QC4= +go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo= +go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= 
+go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0= -golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.28.0 h1:gQBtGhjxykdjY9YhZpSlZIsbnaE2+PgjfLWUQTnoZ1U= +golang.org/x/mod v0.28.0/go.mod h1:yfB/L0NOf/kmEbXjzCPOx1iK1fRutOydrCMsqRhEBxI= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= +golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -129,6 +131,8 @@ golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE= +golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 h1:B82qJJgjvYKsXS9jeunTOisW56dUokqW/FOteYJJ/yg= golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2/go.mod h1:deeaetjYA+DHMHg+sMSMI58GrEteJUUzzw7en6TJQcI= @@ -136,22 +140,17 @@ golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb h1:whnFRlWMcXI9d+Z golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb/go.mod h1:rpwXGsirqLqN2L0JDJQlwOboGHmptD5ZD6T2VmcqhTw= golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10 h1:3GDAcqdIg1ozBNLgPy4SLT84nfcBjr6rhGtXYtrkWLU= golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10/go.mod h1:T97yPqesLiNrOYxkwmhMI0ZIlJDm+p0PMR8eRVeR5tQ= -google.golang.org/genproto v0.0.0-20230920204549-e6e6cdab5c13 h1:vlzZttNJGVqTsRFU9AmdnrcO1Znh8Ew9kCD//yjigk0= -google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237 h1:Kog3KlB4xevJlAcbbbzPfRG0+X9fdoGM+UBRKVz6Wr0= -google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237/go.mod h1:ezi0AVyMKDWy5xAncvjLWH7UcLBB5n7y2fQ8MzjJcto= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250519155744-55703ea1f237 
h1:cJfm9zPbe1e873mHJzmQ1nwVEeRDU/T1wXDK2kUSU34= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250519155744-55703ea1f237/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA= -google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1:BIRfGDEjiHRrk0QKZe3Xv2ieMhtgRGeLcZQ0mIVn4EY= +google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc= +google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A= +google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= -gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= -gotest.tools/v3 v3.4.0/go.mod 
h1:CtbdzLSsqVhDgMtKsx03ird5YTGB3ar27v0u/yKBW5g= gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c h1:m/r7OM+Y2Ty1sgBQ7Qb27VgIMBW8ZZhT4gLnUyDIhzI= gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c/go.mod h1:3r5CMtNQMKIvBlrmM9xWUNamjKBYPOWyXOjmg5Kts3g= software.sslmate.com/src/go-pkcs12 v0.6.0 h1:f3sQittAeF+pao32Vb+mkli+ZyT+VwKaD014qFGq6oU= diff --git a/internal/state/telemetry_view.go b/internal/state/telemetry_view.go new file mode 100644 index 0000000..fb1d44a --- /dev/null +++ b/internal/state/telemetry_view.go @@ -0,0 +1,80 @@ +package state + +import ( + "sync" + "sync/atomic" + "time" + + "github.com/fosrl/newt/internal/telemetry" +) + +// TelemetryView is a minimal, thread-safe implementation to feed observables. +// Since one Newt process represents one site, we expose a single logical site. +// site_id is a resource attribute, so we do not emit per-site labels here. +type TelemetryView struct { + online atomic.Bool + lastHBUnix atomic.Int64 // unix seconds + // per-tunnel sessions + sessMu sync.RWMutex + sessions map[string]*atomic.Int64 +} + +var ( + globalView atomic.Pointer[TelemetryView] +) + +// Global returns a singleton TelemetryView. 
+func Global() *TelemetryView { + if v := globalView.Load(); v != nil { return v } + v := &TelemetryView{ sessions: make(map[string]*atomic.Int64) } + globalView.Store(v) + telemetry.RegisterStateView(v) + return v +} + +// Instrumentation helpers +func (v *TelemetryView) IncSessions(tunnelID string) { + v.sessMu.Lock(); defer v.sessMu.Unlock() + c := v.sessions[tunnelID] + if c == nil { c = &atomic.Int64{}; v.sessions[tunnelID] = c } + c.Add(1) +} +func (v *TelemetryView) DecSessions(tunnelID string) { + v.sessMu.Lock(); defer v.sessMu.Unlock() + if c := v.sessions[tunnelID]; c != nil { + c.Add(-1) + if c.Load() <= 0 { delete(v.sessions, tunnelID) } + } +} +func (v *TelemetryView) ClearTunnel(tunnelID string) { + v.sessMu.Lock(); defer v.sessMu.Unlock() + delete(v.sessions, tunnelID) +} +func (v *TelemetryView) SetOnline(b bool) { v.online.Store(b) } +func (v *TelemetryView) TouchHeartbeat() { v.lastHBUnix.Store(time.Now().Unix()) } + +// --- telemetry.StateView interface --- + +func (v *TelemetryView) ListSites() []string { return []string{"self"} } +func (v *TelemetryView) Online(_ string) (bool, bool) { return v.online.Load(), true } +func (v *TelemetryView) LastHeartbeat(_ string) (time.Time, bool) { + sec := v.lastHBUnix.Load() + if sec == 0 { return time.Time{}, false } + return time.Unix(sec, 0), true +} +func (v *TelemetryView) ActiveSessions(_ string) (int64, bool) { + // aggregated sessions (not used for per-tunnel gauge) + v.sessMu.RLock(); defer v.sessMu.RUnlock() + var sum int64 + for _, c := range v.sessions { if c != nil { sum += c.Load() } } + return sum, true +} + +// Extended accessor used by telemetry callback to publish per-tunnel samples. 
+func (v *TelemetryView) SessionsByTunnel() map[string]int64 { + v.sessMu.RLock(); defer v.sessMu.RUnlock() + out := make(map[string]int64, len(v.sessions)) + for id, c := range v.sessions { if c != nil && c.Load() > 0 { out[id] = c.Load() } } + return out +} + diff --git a/internal/telemetry/constants.go b/internal/telemetry/constants.go new file mode 100644 index 0000000..bc117bf --- /dev/null +++ b/internal/telemetry/constants.go @@ -0,0 +1,19 @@ +package telemetry + +// Protocol labels (low-cardinality) +const ( + ProtocolTCP = "tcp" + ProtocolUDP = "udp" +) + +// Reconnect reason bins (fixed, low-cardinality) +const ( + ReasonServerRequest = "server_request" + ReasonTimeout = "timeout" + ReasonPeerClose = "peer_close" + ReasonNetworkChange = "network_change" + ReasonAuthError = "auth_error" + ReasonHandshakeError = "handshake_error" + ReasonConfigChange = "config_change" + ReasonError = "error" +) diff --git a/internal/telemetry/constants_test.go b/internal/telemetry/constants_test.go new file mode 100644 index 0000000..e95fb52 --- /dev/null +++ b/internal/telemetry/constants_test.go @@ -0,0 +1,32 @@ +package telemetry + +import "testing" + +func TestAllowedConstants(t *testing.T) { + allowedReasons := map[string]struct{}{ + ReasonServerRequest: {}, + ReasonTimeout: {}, + ReasonPeerClose: {}, + ReasonNetworkChange: {}, + ReasonAuthError: {}, + ReasonHandshakeError: {}, + ReasonConfigChange: {}, + ReasonError: {}, + } + for k := range allowedReasons { + if k == "" { + t.Fatalf("empty reason constant") + } + } + + allowedProtocols := map[string]struct{}{ + ProtocolTCP: {}, + ProtocolUDP: {}, + } + for k := range allowedProtocols { + if k == "" { + t.Fatalf("empty protocol constant") + } + } +} + diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go new file mode 100644 index 0000000..6c34724 --- /dev/null +++ b/internal/telemetry/metrics.go @@ -0,0 +1,542 @@ +package telemetry + +import ( + "context" + "sync" + "sync/atomic" + "time" + + 
"go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Instruments and helpers for Newt metrics following the naming, units, and +// low-cardinality label guidance from the issue description. +// +// Counters end with _total, durations are in seconds, sizes in bytes. +// Only low-cardinality stable labels are supported: tunnel_id, +// transport, direction, result, reason, error_type. +var ( + initOnce sync.Once + + meter metric.Meter + + // Site / Registration + mSiteRegistrations metric.Int64Counter + mSiteOnline metric.Int64ObservableGauge + mSiteLastHeartbeat metric.Float64ObservableGauge + + // Tunnel / Sessions + mTunnelSessions metric.Int64ObservableGauge + mTunnelBytes metric.Int64Counter + mTunnelLatency metric.Float64Histogram + mReconnects metric.Int64Counter + + // Connection / NAT + mConnAttempts metric.Int64Counter + mConnErrors metric.Int64Counter + + // Config/Restart + mConfigReloads metric.Int64Counter + mConfigApply metric.Float64Histogram + mCertRotationTotal metric.Int64Counter + mProcessStartTime metric.Float64ObservableGauge + + // Build info + mBuildInfo metric.Int64ObservableGauge + + // WebSocket + mWSConnectLatency metric.Float64Histogram + mWSMessages metric.Int64Counter + mWSDisconnects metric.Int64Counter + mWSKeepaliveFailure metric.Int64Counter + mWSSessionDuration metric.Float64Histogram + mWSConnected metric.Int64ObservableGauge + mWSReconnects metric.Int64Counter + + // Proxy + mProxyActiveConns metric.Int64ObservableGauge + mProxyBufferBytes metric.Int64ObservableGauge + mProxyAsyncBacklogByte metric.Int64ObservableGauge + mProxyDropsTotal metric.Int64Counter + mProxyAcceptsTotal metric.Int64Counter + mProxyConnDuration metric.Float64Histogram + mProxyConnectionsTotal metric.Int64Counter + + buildVersion string + buildCommit string + processStartUnix = float64(time.Now().UnixNano()) / 1e9 + wsConnectedState atomic.Int64 +) + +// Proxy connection lifecycle events. 
+const ( + ProxyConnectionOpened = "opened" + ProxyConnectionClosed = "closed" +) + +// attrsWithSite appends site/region labels only when explicitly enabled to keep +// label cardinality low by default. +func attrsWithSite(extra ...attribute.KeyValue) []attribute.KeyValue { + attrs := make([]attribute.KeyValue, len(extra)) + copy(attrs, extra) + if ShouldIncludeSiteLabels() { + attrs = append(attrs, siteAttrs()...) + } + return attrs +} + +func registerInstruments() error { + var err error + initOnce.Do(func() { + meter = otel.Meter("newt") + if e := registerSiteInstruments(); e != nil { + err = e + return + } + if e := registerTunnelInstruments(); e != nil { + err = e + return + } + if e := registerConnInstruments(); e != nil { + err = e + return + } + if e := registerConfigInstruments(); e != nil { + err = e + return + } + if e := registerBuildWSProxyInstruments(); e != nil { + err = e + return + } + }) + return err +} + +func registerSiteInstruments() error { + var err error + mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total", + metric.WithDescription("Total site registration attempts")) + if err != nil { + return err + } + mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online", + metric.WithDescription("Site online (0/1)")) + if err != nil { + return err + } + mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_timestamp_seconds", + metric.WithDescription("Unix timestamp of the last site heartbeat"), + metric.WithUnit("s")) + if err != nil { + return err + } + return nil +} + +func registerTunnelInstruments() error { + var err error + mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions", + metric.WithDescription("Active tunnel sessions")) + if err != nil { + return err + } + mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total", + metric.WithDescription("Tunnel bytes ingress/egress"), + metric.WithUnit("By")) + if err != nil { + return err + } + mTunnelLatency, err 
= meter.Float64Histogram("newt_tunnel_latency_seconds", + metric.WithDescription("Per-tunnel latency in seconds"), + metric.WithUnit("s")) + if err != nil { + return err + } + mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total", + metric.WithDescription("Tunnel reconnect events")) + if err != nil { + return err + } + return nil +} + +func registerConnInstruments() error { + var err error + mConnAttempts, err = meter.Int64Counter("newt_connection_attempts_total", + metric.WithDescription("Connection attempts")) + if err != nil { + return err + } + mConnErrors, err = meter.Int64Counter("newt_connection_errors_total", + metric.WithDescription("Connection errors by type")) + if err != nil { + return err + } + return nil +} + +func registerConfigInstruments() error { + mConfigReloads, _ = meter.Int64Counter("newt_config_reloads_total", + metric.WithDescription("Configuration reloads")) + mConfigApply, _ = meter.Float64Histogram("newt_config_apply_seconds", + metric.WithDescription("Configuration apply duration in seconds"), + metric.WithUnit("s")) + mCertRotationTotal, _ = meter.Int64Counter("newt_cert_rotation_total", + metric.WithDescription("Certificate rotation events (success/failure)")) + mProcessStartTime, _ = meter.Float64ObservableGauge("process_start_time_seconds", + metric.WithDescription("Unix timestamp of the process start time"), + metric.WithUnit("s")) + if mProcessStartTime != nil { + if _, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + o.ObserveFloat64(mProcessStartTime, processStartUnix) + return nil + }, mProcessStartTime); err != nil { + otel.Handle(err) + } + } + return nil +} + +func registerBuildWSProxyInstruments() error { + // Build info gauge (value 1 with version/commit attributes) + mBuildInfo, _ = meter.Int64ObservableGauge("newt_build_info", + metric.WithDescription("Newt build information (value is always 1)")) + // WebSocket + mWSConnectLatency, _ = 
meter.Float64Histogram("newt_websocket_connect_latency_seconds", + metric.WithDescription("WebSocket connect latency in seconds"), + metric.WithUnit("s")) + mWSMessages, _ = meter.Int64Counter("newt_websocket_messages_total", + metric.WithDescription("WebSocket messages by direction and type")) + mWSDisconnects, _ = meter.Int64Counter("newt_websocket_disconnects_total", + metric.WithDescription("WebSocket disconnects by reason/result")) + mWSKeepaliveFailure, _ = meter.Int64Counter("newt_websocket_keepalive_failures_total", + metric.WithDescription("WebSocket keepalive (ping/pong) failures")) + mWSSessionDuration, _ = meter.Float64Histogram("newt_websocket_session_duration_seconds", + metric.WithDescription("Duration of established WebSocket sessions"), + metric.WithUnit("s")) + mWSConnected, _ = meter.Int64ObservableGauge("newt_websocket_connected", + metric.WithDescription("WebSocket connection state (1=connected, 0=disconnected)")) + mWSReconnects, _ = meter.Int64Counter("newt_websocket_reconnects_total", + metric.WithDescription("WebSocket reconnect attempts by reason")) + // Proxy + mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections", + metric.WithDescription("Proxy active connections per tunnel and protocol")) + mProxyBufferBytes, _ = meter.Int64ObservableGauge("newt_proxy_buffer_bytes", + metric.WithDescription("Proxy buffer bytes (may approximate async backlog)"), + metric.WithUnit("By")) + mProxyAsyncBacklogByte, _ = meter.Int64ObservableGauge("newt_proxy_async_backlog_bytes", + metric.WithDescription("Unflushed async byte backlog per tunnel and protocol"), + metric.WithUnit("By")) + mProxyDropsTotal, _ = meter.Int64Counter("newt_proxy_drops_total", + metric.WithDescription("Proxy drops due to write errors")) + mProxyAcceptsTotal, _ = meter.Int64Counter("newt_proxy_accept_total", + metric.WithDescription("Proxy connection accepts by protocol and result")) + mProxyConnDuration, _ = 
meter.Float64Histogram("newt_proxy_connection_duration_seconds", + metric.WithDescription("Duration of completed proxy connections"), + metric.WithUnit("s")) + mProxyConnectionsTotal, _ = meter.Int64Counter("newt_proxy_connections_total", + metric.WithDescription("Proxy connection lifecycle events by protocol")) + // Register a default callback for build info if version/commit set + reg, e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + if buildVersion == "" && buildCommit == "" { + return nil + } + attrs := []attribute.KeyValue{} + if buildVersion != "" { + attrs = append(attrs, attribute.String("version", buildVersion)) + } + if buildCommit != "" { + attrs = append(attrs, attribute.String("commit", buildCommit)) + } + if ShouldIncludeSiteLabels() { + attrs = append(attrs, siteAttrs()...) + } + o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...)) + return nil + }, mBuildInfo) + if e != nil { + otel.Handle(e) + } else { + // Provide a functional stopper that unregisters the callback + obsStopper = func() { _ = reg.Unregister() } + } + if mWSConnected != nil { + if regConn, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + val := wsConnectedState.Load() + o.ObserveInt64(mWSConnected, val, metric.WithAttributes(attrsWithSite()...)) + return nil + }, mWSConnected); err != nil { + otel.Handle(err) + } else { + wsConnStopper = func() { _ = regConn.Unregister() } + } + } + return nil +} + +// Observable registration: Newt can register a callback to report gauges. +// Call SetObservableCallback once to start observing online status, last +// heartbeat seconds, and active sessions. + +var ( + obsOnce sync.Once + obsStopper func() + proxyObsOnce sync.Once + proxyStopper func() + wsConnStopper func() +) + +// SetObservableCallback registers a single callback that will be invoked +// on collection. Use the provided observer to emit values for the observable +// gauges defined here. 
+// +// Example inside your code (where you have access to current state): +// +// telemetry.SetObservableCallback(func(ctx context.Context, o metric.Observer) error { +// o.ObserveInt64(mSiteOnline, 1) +// o.ObserveFloat64(mSiteLastHeartbeat, float64(lastHB.Unix())) +// o.ObserveInt64(mTunnelSessions, int64(len(activeSessions))) +// return nil +// }) +func SetObservableCallback(cb func(context.Context, metric.Observer) error) { + obsOnce.Do(func() { + reg, e := meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions) + if e != nil { + otel.Handle(e) + obsStopper = func() { + // no-op: registration failed; keep stopper callable + } + return + } + // Provide a functional stopper mirroring proxy/build-info behavior + obsStopper = func() { _ = reg.Unregister() } + }) +} + +// SetProxyObservableCallback registers a callback to observe proxy gauges. +func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) { + proxyObsOnce.Do(func() { + reg, e := meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte) + if e != nil { + otel.Handle(e) + proxyStopper = func() { + // no-op: registration failed; keep stopper callable + } + return + } + // Provide a functional stopper to unregister later if needed + proxyStopper = func() { _ = reg.Unregister() } + }) +} + +// Build info registration +func RegisterBuildInfo(version, commit string) { + buildVersion = version + buildCommit = commit +} + +// Config reloads +func IncConfigReload(ctx context.Context, result string) { + mConfigReloads.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("result", result), + )...)) +} + +// Helpers for counters/histograms + +func IncSiteRegistration(ctx context.Context, result string) { + attrs := []attribute.KeyValue{ + attribute.String("result", result), + } + mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +func AddTunnelBytes(ctx context.Context, tunnelID, direction 
string, n int64) { + attrs := []attribute.KeyValue{ + attribute.String("direction", direction), + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations. +func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) { + mTunnelBytes.Add(ctx, n, metric.WithAttributeSet(attrs)) +} + +// --- WebSocket helpers --- + +func ObserveWSConnectLatency(ctx context.Context, seconds float64, result, errorType string) { + attrs := []attribute.KeyValue{ + attribute.String("transport", "websocket"), + attribute.String("result", result), + } + if errorType != "" { + attrs = append(attrs, attribute.String("error_type", errorType)) + } + mWSConnectLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +func IncWSMessage(ctx context.Context, direction, msgType string) { + mWSMessages.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("direction", direction), + attribute.String("msg_type", msgType), + )...)) +} + +func IncWSDisconnect(ctx context.Context, reason, result string) { + mWSDisconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("reason", reason), + attribute.String("result", result), + )...)) +} + +func IncWSKeepaliveFailure(ctx context.Context, reason string) { + mWSKeepaliveFailure.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("reason", reason), + )...)) +} + +// SetWSConnectionState updates the backing gauge for the WebSocket connected state. +func SetWSConnectionState(connected bool) { + if connected { + wsConnectedState.Store(1) + } else { + wsConnectedState.Store(0) + } +} + +// IncWSReconnect increments the WebSocket reconnect counter with a bounded reason label. 
+func IncWSReconnect(ctx context.Context, reason string) { + if reason == "" { + reason = "unknown" + } + mWSReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("reason", reason), + )...)) +} + +func ObserveWSSessionDuration(ctx context.Context, seconds float64, result string) { + mWSSessionDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite( + attribute.String("result", result), + )...)) +} + +// --- Proxy helpers --- + +func ObserveProxyActiveConnsObs(o metric.Observer, value int64, attrs []attribute.KeyValue) { + o.ObserveInt64(mProxyActiveConns, value, metric.WithAttributes(attrs...)) +} + +func ObserveProxyBufferBytesObs(o metric.Observer, value int64, attrs []attribute.KeyValue) { + o.ObserveInt64(mProxyBufferBytes, value, metric.WithAttributes(attrs...)) +} + +func ObserveProxyAsyncBacklogObs(o metric.Observer, value int64, attrs []attribute.KeyValue) { + o.ObserveInt64(mProxyAsyncBacklogByte, value, metric.WithAttributes(attrs...)) +} + +func IncProxyDrops(ctx context.Context, tunnelID, protocol string) { + attrs := []attribute.KeyValue{ + attribute.String("protocol", protocol), + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +func IncProxyAccept(ctx context.Context, tunnelID, protocol, result, reason string) { + attrs := []attribute.KeyValue{ + attribute.String("protocol", protocol), + attribute.String("result", result), + } + if reason != "" { + attrs = append(attrs, attribute.String("reason", reason)) + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mProxyAcceptsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +func ObserveProxyConnectionDuration(ctx context.Context, tunnelID, protocol, result string, seconds float64) { + attrs := []attribute.KeyValue{ + 
attribute.String("protocol", protocol), + attribute.String("result", result), + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mProxyConnDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +// IncProxyConnectionEvent records proxy connection lifecycle events (opened/closed). +func IncProxyConnectionEvent(ctx context.Context, tunnelID, protocol, event string) { + if event == "" { + event = "unknown" + } + attrs := []attribute.KeyValue{ + attribute.String("protocol", protocol), + attribute.String("event", event), + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mProxyConnectionsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +// --- Config/PKI helpers --- + +func ObserveConfigApply(ctx context.Context, phase, result string, seconds float64) { + mConfigApply.Record(ctx, seconds, metric.WithAttributes(attrsWithSite( + attribute.String("phase", phase), + attribute.String("result", result), + )...)) +} + +func IncCertRotation(ctx context.Context, result string) { + mCertRotationTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("result", result), + )...)) +} + +func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) { + attrs := []attribute.KeyValue{ + attribute.String("transport", transport), + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) { + attrs := []attribute.KeyValue{ + attribute.String("initiator", initiator), + attribute.String("reason", reason), + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", 
tunnelID)) + } + mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +func IncConnAttempt(ctx context.Context, transport, result string) { + mConnAttempts.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("transport", transport), + attribute.String("result", result), + )...)) +} + +func IncConnError(ctx context.Context, transport, typ string) { + mConnErrors.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("transport", transport), + attribute.String("error_type", typ), + )...)) +} diff --git a/internal/telemetry/metrics_test_helper.go b/internal/telemetry/metrics_test_helper.go new file mode 100644 index 0000000..16aa1a3 --- /dev/null +++ b/internal/telemetry/metrics_test_helper.go @@ -0,0 +1,59 @@ +package telemetry + +import ( + "sync" + "time" +) + +func resetMetricsForTest() { + initOnce = sync.Once{} + obsOnce = sync.Once{} + proxyObsOnce = sync.Once{} + obsStopper = nil + proxyStopper = nil + if wsConnStopper != nil { + wsConnStopper() + } + wsConnStopper = nil + meter = nil + + mSiteRegistrations = nil + mSiteOnline = nil + mSiteLastHeartbeat = nil + + mTunnelSessions = nil + mTunnelBytes = nil + mTunnelLatency = nil + mReconnects = nil + + mConnAttempts = nil + mConnErrors = nil + + mConfigReloads = nil + mConfigApply = nil + mCertRotationTotal = nil + mProcessStartTime = nil + + mBuildInfo = nil + + mWSConnectLatency = nil + mWSMessages = nil + mWSDisconnects = nil + mWSKeepaliveFailure = nil + mWSSessionDuration = nil + mWSConnected = nil + mWSReconnects = nil + + mProxyActiveConns = nil + mProxyBufferBytes = nil + mProxyAsyncBacklogByte = nil + mProxyDropsTotal = nil + mProxyAcceptsTotal = nil + mProxyConnDuration = nil + mProxyConnectionsTotal = nil + + processStartUnix = float64(time.Now().UnixNano()) / 1e9 + wsConnectedState.Store(0) + includeTunnelIDVal.Store(false) + includeSiteLabelVal.Store(false) +} diff --git a/internal/telemetry/state_view.go b/internal/telemetry/state_view.go new 
file mode 100644 index 0000000..6c6b6de --- /dev/null +++ b/internal/telemetry/state_view.go @@ -0,0 +1,106 @@ +package telemetry + +import ( + "context" + "sync/atomic" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// StateView provides a read-only view for observable gauges. +// Implementations must be concurrency-safe and avoid blocking operations. +// All methods should be fast and use RLocks where applicable. +type StateView interface { + // ListSites returns a stable, low-cardinality list of site IDs to expose. + ListSites() []string + // Online returns whether the site is online. + Online(siteID string) (online bool, ok bool) + // LastHeartbeat returns the last heartbeat time for a site. + LastHeartbeat(siteID string) (t time.Time, ok bool) + // ActiveSessions returns the current number of active sessions for a site, + // aggregated across its tunnels (or the site-level count if your model tracks sessions per site). + ActiveSessions(siteID string) (n int64, ok bool) +} + +var ( + stateView atomic.Value // of type StateView +) + +// RegisterStateView sets the global StateView used by the default observable callback. +func RegisterStateView(v StateView) { + stateView.Store(v) + // If instruments are registered, ensure a callback exists. 
+ if v != nil { + SetObservableCallback(func(ctx context.Context, o metric.Observer) error { + if any := stateView.Load(); any != nil { + if sv, ok := any.(StateView); ok { + for _, siteID := range sv.ListSites() { + observeSiteOnlineFor(o, sv, siteID) + observeLastHeartbeatFor(o, sv, siteID) + observeSessionsFor(o, siteID, sv) + } + } + } + return nil + }) + } +} + +func observeSiteOnlineFor(o metric.Observer, sv StateView, siteID string) { + if online, ok := sv.Online(siteID); ok { + val := int64(0) + if online { + val = 1 + } + o.ObserveInt64(mSiteOnline, val, metric.WithAttributes( + attribute.String("site_id", siteID), + )) + } +} + +func observeLastHeartbeatFor(o metric.Observer, sv StateView, siteID string) { + if t, ok := sv.LastHeartbeat(siteID); ok { + ts := float64(t.UnixNano()) / 1e9 + o.ObserveFloat64(mSiteLastHeartbeat, ts, metric.WithAttributes( + attribute.String("site_id", siteID), + )) + } +} + +func observeSessionsFor(o metric.Observer, siteID string, any interface{}) { + if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok { + sessions := tm.SessionsByTunnel() + // If tunnel_id labels are enabled, preserve existing per-tunnel observations + if ShouldIncludeTunnelID() { + for tid, n := range sessions { + attrs := []attribute.KeyValue{ + attribute.String("site_id", siteID), + } + if tid != "" { + attrs = append(attrs, attribute.String("tunnel_id", tid)) + } + o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attrs...)) + } + return + } + // When tunnel_id is disabled, collapse per-tunnel counts into a single site-level value + var total int64 + for _, n := range sessions { + total += n + } + // If there are no per-tunnel entries, fall back to ActiveSessions() if available + if total == 0 { + if svAny := stateView.Load(); svAny != nil { + if sv, ok := svAny.(StateView); ok { + if n, ok2 := sv.ActiveSessions(siteID); ok2 { + total = n + } + } + } + } + o.ObserveInt64(mTunnelSessions, total, 
metric.WithAttributes(attribute.String("site_id", siteID))) + return + } +} diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go new file mode 100644 index 0000000..3d47a61 --- /dev/null +++ b/internal/telemetry/telemetry.go @@ -0,0 +1,384 @@ +package telemetry + +import ( + "context" + "errors" + "net/http" + "os" + "strings" + "sync/atomic" + "time" + + promclient "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "go.opentelemetry.io/contrib/instrumentation/runtime" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/exporters/prometheus" + "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" + "google.golang.org/grpc/credentials" +) + +// Config controls telemetry initialization via env flags. +// +// Defaults align with the issue requirements: +// - Prometheus exporter enabled by default (/metrics) +// - OTLP exporter disabled by default +// - Durations in seconds, bytes in raw bytes +// - Admin HTTP server address configurable (for mounting /metrics) +type Config struct { + ServiceName string + ServiceVersion string + + // Optional resource attributes + SiteID string + Region string + + PromEnabled bool + OTLPEnabled bool + + OTLPEndpoint string // host:port + OTLPInsecure bool + + MetricExportInterval time.Duration + AdminAddr string // e.g.: ":2112" + + // Optional build info for newt_build_info metric + BuildVersion string + BuildCommit string +} + +// FromEnv reads configuration from environment variables. 
+// +// NEWT_METRICS_PROMETHEUS_ENABLED (default: true) +// NEWT_METRICS_OTLP_ENABLED (default: false) +// OTEL_EXPORTER_OTLP_ENDPOINT (default: "localhost:4317") +// OTEL_EXPORTER_OTLP_INSECURE (default: true) +// OTEL_METRIC_EXPORT_INTERVAL (default: 15s) +// OTEL_SERVICE_NAME (default: "newt") +// OTEL_SERVICE_VERSION (default: "") +// NEWT_ADMIN_ADDR (default: ":2112") +func FromEnv() Config { + // Prefer explicit NEWT_* env vars, then fall back to OTEL_RESOURCE_ATTRIBUTES + site := getenv("NEWT_SITE_ID", "") + if site == "" { + site = getenv("NEWT_ID", "") + } + region := os.Getenv("NEWT_REGION") + if site == "" || region == "" { + if ra := os.Getenv("OTEL_RESOURCE_ATTRIBUTES"); ra != "" { + m := parseResourceAttributes(ra) + if site == "" { + site = m["site_id"] + } + if region == "" { + region = m["region"] + } + } + } + return Config{ + ServiceName: getenv("OTEL_SERVICE_NAME", "newt"), + ServiceVersion: os.Getenv("OTEL_SERVICE_VERSION"), + SiteID: site, + Region: region, + PromEnabled: getenv("NEWT_METRICS_PROMETHEUS_ENABLED", "true") == "true", + OTLPEnabled: getenv("NEWT_METRICS_OTLP_ENABLED", "false") == "true", + OTLPEndpoint: getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"), + OTLPInsecure: getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true", + MetricExportInterval: getdur("OTEL_METRIC_EXPORT_INTERVAL", 15*time.Second), + AdminAddr: getenv("NEWT_ADMIN_ADDR", ":2112"), + } +} + +// Setup holds initialized telemetry providers and (optionally) a /metrics handler. +// Call Shutdown when the process terminates to flush exporters. +type Setup struct { + MeterProvider *metric.MeterProvider + TracerProvider *trace.TracerProvider + + PrometheusHandler http.Handler // nil if Prometheus exporter disabled + + shutdowns []func(context.Context) error +} + +// Init configures OpenTelemetry metrics and (optionally) tracing. 
+// +// It sets a global MeterProvider and TracerProvider, registers runtime instrumentation, +// installs recommended histogram views for *_latency_seconds, and returns a Setup with +// a Shutdown method to flush exporters. +func Init(ctx context.Context, cfg Config) (*Setup, error) { + // Configure tunnel_id label inclusion from env (default true) + if getenv("NEWT_METRICS_INCLUDE_TUNNEL_ID", "true") == "true" { + includeTunnelIDVal.Store(true) + } else { + includeTunnelIDVal.Store(false) + } + if getenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") == "true" { + includeSiteLabelVal.Store(true) + } else { + includeSiteLabelVal.Store(false) + } + res := buildResource(ctx, cfg) + UpdateSiteInfo(cfg.SiteID, cfg.Region) + + s := &Setup{} + readers, promHandler, shutdowns, err := setupMetricExport(ctx, cfg, res) + if err != nil { + return nil, err + } + s.PrometheusHandler = promHandler + // Build provider + mp := buildMeterProvider(res, readers) + otel.SetMeterProvider(mp) + s.MeterProvider = mp + s.shutdowns = append(s.shutdowns, mp.Shutdown) + // Optional tracing + if cfg.OTLPEnabled { + if tp, shutdown := setupTracing(ctx, cfg, res); tp != nil { + otel.SetTracerProvider(tp) + s.TracerProvider = tp + s.shutdowns = append(s.shutdowns, func(c context.Context) error { + return errors.Join(shutdown(c), tp.Shutdown(c)) + }) + } + } + // Add metric exporter shutdowns + s.shutdowns = append(s.shutdowns, shutdowns...) 
+ // Runtime metrics + _ = runtime.Start(runtime.WithMeterProvider(mp)) + // Instruments + if err := registerInstruments(); err != nil { + return nil, err + } + if cfg.BuildVersion != "" || cfg.BuildCommit != "" { + RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit) + } + return s, nil +} + +func buildResource(ctx context.Context, cfg Config) *resource.Resource { + attrs := []attribute.KeyValue{ + semconv.ServiceName(cfg.ServiceName), + semconv.ServiceVersion(cfg.ServiceVersion), + } + if cfg.SiteID != "" { + attrs = append(attrs, attribute.String("site_id", cfg.SiteID)) + } + if cfg.Region != "" { + attrs = append(attrs, attribute.String("region", cfg.Region)) + } + res, _ := resource.New(ctx, resource.WithFromEnv(), resource.WithHost(), resource.WithAttributes(attrs...)) + return res +} + +func setupMetricExport(ctx context.Context, cfg Config, _ *resource.Resource) ([]metric.Reader, http.Handler, []func(context.Context) error, error) { + var readers []metric.Reader + var shutdowns []func(context.Context) error + var promHandler http.Handler + if cfg.PromEnabled { + reg := promclient.NewRegistry() + exp, err := prometheus.New(prometheus.WithRegisterer(reg)) + if err != nil { + return nil, nil, nil, err + } + readers = append(readers, exp) + promHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{}) + } + if cfg.OTLPEnabled { + mopts := []otlpmetricgrpc.Option{otlpmetricgrpc.WithEndpoint(cfg.OTLPEndpoint)} + if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { + mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs)) + } + if cfg.OTLPInsecure { + mopts = append(mopts, otlpmetricgrpc.WithInsecure()) + } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { + if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil { + mopts = append(mopts, otlpmetricgrpc.WithTLSCredentials(creds)) + } + } + mexp, err := otlpmetricgrpc.New(ctx, mopts...) 
+ if err != nil { + return nil, nil, nil, err + } + readers = append(readers, metric.NewPeriodicReader(mexp, metric.WithInterval(cfg.MetricExportInterval))) + shutdowns = append(shutdowns, mexp.Shutdown) + } + return readers, promHandler, shutdowns, nil +} + +func buildMeterProvider(res *resource.Resource, readers []metric.Reader) *metric.MeterProvider { + var mpOpts []metric.Option + mpOpts = append(mpOpts, metric.WithResource(res)) + for _, r := range readers { + mpOpts = append(mpOpts, metric.WithReader(r)) + } + mpOpts = append(mpOpts, metric.WithView(metric.NewView( + metric.Instrument{Name: "newt_*_latency_seconds"}, + metric.Stream{Aggregation: metric.AggregationExplicitBucketHistogram{Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}}}, + ))) + mpOpts = append(mpOpts, metric.WithView(metric.NewView( + metric.Instrument{Name: "newt_*"}, + metric.Stream{AttributeFilter: func(kv attribute.KeyValue) bool { + k := string(kv.Key) + switch k { + case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "initiator", "error_type", "msg_type", "phase", "version", "commit", "site_id", "region": + return true + default: + return false + } + }}, + ))) + return metric.NewMeterProvider(mpOpts...) +} + +func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*trace.TracerProvider, func(context.Context) error) { + topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)} + if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { + topts = append(topts, otlptracegrpc.WithHeaders(hdrs)) + } + if cfg.OTLPInsecure { + topts = append(topts, otlptracegrpc.WithInsecure()) + } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { + if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil { + topts = append(topts, otlptracegrpc.WithTLSCredentials(creds)) + } + } + exp, err := otlptracegrpc.New(ctx, topts...) 
+ if err != nil { + return nil, nil + } + tp := trace.NewTracerProvider(trace.WithBatcher(exp), trace.WithResource(res)) + return tp, exp.Shutdown +} + +// Shutdown flushes exporters and providers in reverse init order. +func (s *Setup) Shutdown(ctx context.Context) error { + var err error + for i := len(s.shutdowns) - 1; i >= 0; i-- { + err = errors.Join(err, s.shutdowns[i](ctx)) + } + return err +} + +func parseOTLPHeaders(h string) map[string]string { + m := map[string]string{} + if h == "" { + return m + } + pairs := strings.Split(h, ",") + for _, p := range pairs { + kv := strings.SplitN(strings.TrimSpace(p), "=", 2) + if len(kv) == 2 { + m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1]) + } + } + return m +} + +// parseResourceAttributes parses OTEL_RESOURCE_ATTRIBUTES formatted as k=v,k2=v2 +func parseResourceAttributes(s string) map[string]string { + m := map[string]string{} + if s == "" { + return m + } + parts := strings.Split(s, ",") + for _, p := range parts { + kv := strings.SplitN(strings.TrimSpace(p), "=", 2) + if len(kv) == 2 { + m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1]) + } + } + return m +} + +// Global site/region used to enrich metric labels. +var siteIDVal atomic.Value +var regionVal atomic.Value +var ( + includeTunnelIDVal atomic.Value // bool; default true + includeSiteLabelVal atomic.Value // bool; default false +) + +// UpdateSiteInfo updates the global site_id and region used for metric labels. +// Thread-safe via atomic.Value: subsequent metric emissions will include +// the new labels, prior emissions remain unchanged. 
+func UpdateSiteInfo(siteID, region string) { + if siteID != "" { + siteIDVal.Store(siteID) + } + if region != "" { + regionVal.Store(region) + } +} + +func getSiteID() string { + if v, ok := siteIDVal.Load().(string); ok { + return v + } + return "" +} + +func getRegion() string { + if v, ok := regionVal.Load().(string); ok { + return v + } + return "" +} + +// siteAttrs returns label KVs for site_id and region (if set). +func siteAttrs() []attribute.KeyValue { + var out []attribute.KeyValue + if s := getSiteID(); s != "" { + out = append(out, attribute.String("site_id", s)) + } + if r := getRegion(); r != "" { + out = append(out, attribute.String("region", r)) + } + return out +} + +// SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager). +func SiteLabelKVs() []attribute.KeyValue { + if !ShouldIncludeSiteLabels() { + return nil + } + return siteAttrs() +} + +// ShouldIncludeTunnelID returns whether tunnel_id labels should be emitted. +func ShouldIncludeTunnelID() bool { + if v, ok := includeTunnelIDVal.Load().(bool); ok { + return v + } + return true +} + +// ShouldIncludeSiteLabels returns whether site_id/region should be emitted as +// metric labels in addition to resource attributes. 
+func ShouldIncludeSiteLabels() bool { + if v, ok := includeSiteLabelVal.Load().(bool); ok { + return v + } + return false +} + +func getenv(k, d string) string { + if v := os.Getenv(k); v != "" { + return v + } + return d +} + +func getdur(k string, d time.Duration) time.Duration { + if v := os.Getenv(k); v != "" { + if p, e := time.ParseDuration(v); e == nil { + return p + } + } + return d +} diff --git a/internal/telemetry/telemetry_attrfilter_test.go b/internal/telemetry/telemetry_attrfilter_test.go new file mode 100644 index 0000000..ebbb3c2 --- /dev/null +++ b/internal/telemetry/telemetry_attrfilter_test.go @@ -0,0 +1,53 @@ +package telemetry + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "go.opentelemetry.io/otel/attribute" +) + +// Test that disallowed attributes are filtered from the exposition. +func TestAttributeFilterDropsUnknownKeys(t *testing.T) { + ctx := context.Background() + resetMetricsForTest() + t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") + cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"} + tel, err := Init(ctx, cfg) + if err != nil { + t.Fatalf("init: %v", err) + } + defer func() { _ = tel.Shutdown(context.Background()) }() + + if tel.PrometheusHandler == nil { + t.Fatalf("prom handler nil") + } + ts := httptest.NewServer(tel.PrometheusHandler) + defer ts.Close() + + // Add samples with disallowed attribute keys + for _, k := range []string{"forbidden", "site_id", "host"} { + set := attribute.NewSet(attribute.String(k, "x")) + AddTunnelBytesSet(ctx, 123, set) + } + time.Sleep(50 * time.Millisecond) + + resp, err := http.Get(ts.URL) + if err != nil { + t.Fatalf("GET: %v", err) + } + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + body := string(b) + if strings.Contains(body, "forbidden=") { + t.Fatalf("unexpected forbidden attribute leaked into metrics: %s", body) + } + if !strings.Contains(body, "site_id=\"x\"") { + t.Fatalf("expected 
allowed attribute site_id to be present in metrics, got: %s", body) + } +} diff --git a/internal/telemetry/telemetry_golden_test.go b/internal/telemetry/telemetry_golden_test.go new file mode 100644 index 0000000..62f41b8 --- /dev/null +++ b/internal/telemetry/telemetry_golden_test.go @@ -0,0 +1,76 @@ +package telemetry + +import ( + "bufio" + "context" + "io" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + "time" +) + +// Golden test that /metrics contains expected metric names. +func TestMetricsGoldenContains(t *testing.T) { + ctx := context.Background() + resetMetricsForTest() + t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") + cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0", BuildVersion: "test"} + tel, err := Init(ctx, cfg) + if err != nil { + t.Fatalf("telemetry init error: %v", err) + } + defer func() { _ = tel.Shutdown(context.Background()) }() + + if tel.PrometheusHandler == nil { + t.Fatalf("prom handler nil") + } + ts := httptest.NewServer(tel.PrometheusHandler) + defer ts.Close() + + // Trigger counters to ensure they appear in the scrape + IncConnAttempt(ctx, "websocket", "success") + IncWSReconnect(ctx, "io_error") + IncProxyConnectionEvent(ctx, "", "tcp", ProxyConnectionOpened) + if tel.MeterProvider != nil { + _ = tel.MeterProvider.ForceFlush(ctx) + } + time.Sleep(100 * time.Millisecond) + + var body string + for i := 0; i < 5; i++ { + resp, err := http.Get(ts.URL) + if err != nil { + t.Fatalf("GET metrics failed: %v", err) + } + b, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + body = string(b) + if strings.Contains(body, "newt_connection_attempts_total") { + break + } + time.Sleep(100 * time.Millisecond) + } + + f, err := os.Open(filepath.Join("testdata", "expected_contains.golden")) + if err != nil { + t.Fatalf("read golden: %v", err) + } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + needle := strings.TrimSpace(s.Text()) + if needle == "" { + 
continue + } + if !strings.Contains(body, needle) { + t.Fatalf("expected metrics body to contain %q. body=\n%s", needle, body) + } + } + if err := s.Err(); err != nil { + t.Fatalf("scan golden: %v", err) + } +} diff --git a/internal/telemetry/telemetry_smoke_test.go b/internal/telemetry/telemetry_smoke_test.go new file mode 100644 index 0000000..b736ca5 --- /dev/null +++ b/internal/telemetry/telemetry_smoke_test.go @@ -0,0 +1,65 @@ +package telemetry + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +// Smoke test that /metrics contains at least one newt_* metric when Prom exporter is enabled. +func TestMetricsSmoke(t *testing.T) { + ctx := context.Background() + resetMetricsForTest() + t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") + cfg := Config{ + ServiceName: "newt", + PromEnabled: true, + OTLPEnabled: false, + AdminAddr: "127.0.0.1:0", + BuildVersion: "test", + BuildCommit: "deadbeef", + MetricExportInterval: 5 * time.Second, + } + tel, err := Init(ctx, cfg) + if err != nil { + t.Fatalf("telemetry init error: %v", err) + } + defer func() { _ = tel.Shutdown(context.Background()) }() + + // Serve the Prom handler on a test server + if tel.PrometheusHandler == nil { + t.Fatalf("Prometheus handler nil; PromEnabled should enable it") + } + ts := httptest.NewServer(tel.PrometheusHandler) + defer ts.Close() + + // Record a simple metric and then fetch /metrics + IncConnAttempt(ctx, "websocket", "success") + if tel.MeterProvider != nil { + _ = tel.MeterProvider.ForceFlush(ctx) + } + // Give the exporter a tick to collect + time.Sleep(100 * time.Millisecond) + + var body string + for i := 0; i < 5; i++ { + resp, err := http.Get(ts.URL) + if err != nil { + t.Fatalf("GET /metrics failed: %v", err) + } + b, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + body = string(b) + if strings.Contains(body, "newt_connection_attempts_total") { + break + } + time.Sleep(100 * time.Millisecond) + } + if 
!strings.Contains(body, "newt_connection_attempts_total") { + t.Fatalf("expected newt_connection_attempts_total in metrics, got:\n%s", body) + } +} diff --git a/main.go b/main.go index b6ccc94..57ac17c 100644 --- a/main.go +++ b/main.go @@ -1,7 +1,9 @@ package main import ( + "context" "encoding/json" + "errors" "flag" "fmt" "net" @@ -22,6 +24,9 @@ import ( "github.com/fosrl/newt/updates" "github.com/fosrl/newt/websocket" + "github.com/fosrl/newt/internal/state" + "github.com/fosrl/newt/internal/telemetry" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "golang.zx2c4.com/wireguard/conn" "golang.zx2c4.com/wireguard/device" "golang.zx2c4.com/wireguard/tun" @@ -91,6 +96,14 @@ func (s *stringSlice) Set(value string) error { return nil } +const ( + fmtErrMarshaling = "Error marshaling data: %v" + fmtReceivedMsg = "Received: %+v" + topicWGRegister = "newt/wg/register" + msgNoTunnelOrProxy = "No tunnel IP or proxy manager available" + fmtErrParsingTargetData = "Error parsing target data: %v" +) + var ( endpoint string id string @@ -120,8 +133,17 @@ var ( preferEndpoint string healthMonitor *healthcheck.Monitor enforceHealthcheckCert bool - blueprintFile string - noCloud bool + // Build/version (can be overridden via -ldflags "-X main.newtVersion=...") + newtVersion = "version_replaceme" + + // Observability/metrics flags + metricsEnabled bool + otlpEnabled bool + adminAddr string + region string + metricsAsyncBytes bool + blueprintFile string + noCloud bool // New mTLS configuration variables tlsClientCert string @@ -133,6 +155,10 @@ var ( ) func main() { + // Prepare context for graceful shutdown and signal handling + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + // if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values endpoint = os.Getenv("PANGOLIN_ENDPOINT") id = os.Getenv("NEWT_ID") @@ -143,6 +169,14 @@ func main() { updownScript = 
os.Getenv("UPDOWN_SCRIPT") interfaceName = os.Getenv("INTERFACE") generateAndSaveKeyTo = os.Getenv("GENERATE_AND_SAVE_KEY_TO") + + // Metrics/observability env mirrors + metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED") + otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED") + adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR") + regionEnv := os.Getenv("NEWT_REGION") + asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES") + keepInterfaceEnv := os.Getenv("KEEP_INTERFACE") keepInterface = keepInterfaceEnv == "true" acceptClientsEnv := os.Getenv("ACCEPT_CLIENTS") @@ -286,6 +320,43 @@ func main() { flag.BoolVar(&noCloud, "no-cloud", false, "Disable cloud failover") } + // Metrics/observability flags (mirror ENV if unset) + if metricsEnabledEnv == "" { + flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter") + } else { + if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { + metricsEnabled = v + } else { + metricsEnabled = true + } + } + if otlpEnabledEnv == "" { + flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT") + } else { + if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { + otlpEnabled = v + } + } + if adminAddrEnv == "" { + flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address") + } else { + adminAddr = adminAddrEnv + } + // Async bytes toggle + if asyncBytesEnv == "" { + flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)") + } else { + if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { + metricsAsyncBytes = v + } + } + // Optional region flag (resource attribute) + if regionEnv == "" { + flag.StringVar(®ion, "region", "", "Optional region resource attribute (also NEWT_REGION)") + } else { + region = regionEnv + } + // do a --version check version := flag.Bool("version", false, "Print the version") @@ 
-300,12 +371,58 @@ func main() { loggerLevel := parseLogLevel(logLevel) logger.GetLogger().SetLevel(parseLogLevel(logLevel)) - newtVersion := "version_replaceme" + // Initialize telemetry after flags are parsed (so flags override env) + tcfg := telemetry.FromEnv() + tcfg.PromEnabled = metricsEnabled + tcfg.OTLPEnabled = otlpEnabled + if adminAddr != "" { + tcfg.AdminAddr = adminAddr + } + // Resource attributes (if available) + tcfg.SiteID = id + tcfg.Region = region + // Build info + tcfg.BuildVersion = newtVersion + tcfg.BuildCommit = os.Getenv("NEWT_COMMIT") + + tel, telErr := telemetry.Init(ctx, tcfg) + if telErr != nil { + logger.Warn("Telemetry init failed: %v", telErr) + } + if tel != nil { + // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled) + logger.Info("Starting metrics server on %s", tcfg.AdminAddr) + mux := http.NewServeMux() + mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) }) + if tel.PrometheusHandler != nil { + mux.Handle("/metrics", tel.PrometheusHandler) + } + admin := &http.Server{ + Addr: tcfg.AdminAddr, + Handler: otelhttp.NewHandler(mux, "newt-admin"), + ReadTimeout: 5 * time.Second, + WriteTimeout: 10 * time.Second, + ReadHeaderTimeout: 5 * time.Second, + IdleTimeout: 30 * time.Second, + } + go func() { + if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + logger.Warn("admin http error: %v", err) + } + }() + defer func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = admin.Shutdown(ctx) + }() + defer func() { _ = tel.Shutdown(context.Background()) }() + } + if *version { fmt.Println("Newt version " + newtVersion) os.Exit(0) } else { - logger.Info("Newt version " + newtVersion) + logger.Info("Newt version %s", newtVersion) } if err := updates.CheckForUpdate("fosrl", "newt", newtVersion); err != nil { @@ -376,6 +493,8 @@ func main() { } endpoint = client.GetConfig().Endpoint // Update 
endpoint from config id = client.GetConfig().ID // Update ID from config + // Update site labels for metrics with the resolved ID + telemetry.UpdateSiteInfo(id, region) // output env var values if set logger.Debug("Endpoint: %v", endpoint) @@ -484,6 +603,10 @@ func main() { // Register handlers for different message types client.RegisterHandler("newt/wg/connect", func(msg websocket.WSMessage) { logger.Debug("Received registration message") + regResult := "success" + defer func() { + telemetry.IncSiteRegistration(ctx, regResult) + }() if stopFunc != nil { stopFunc() // stop the ws from sending more requests stopFunc = nil // reset stopFunc to nil to avoid double stopping @@ -502,22 +625,25 @@ func main() { jsonData, err := json.Marshal(msg.Data) if err != nil { - logger.Info("Error marshaling data: %v", err) + logger.Info(fmtErrMarshaling, err) + regResult = "failure" return } if err := json.Unmarshal(jsonData, &wgData); err != nil { logger.Info("Error unmarshaling target data: %v", err) + regResult = "failure" return } - logger.Debug("Received: %+v", msg) + logger.Debug(fmtReceivedMsg, msg) tun, tnet, err = netstack.CreateNetTUN( []netip.Addr{netip.MustParseAddr(wgData.TunnelIP)}, []netip.Addr{netip.MustParseAddr(dns)}, mtuInt) if err != nil { logger.Error("Failed to create TUN device: %v", err) + regResult = "failure" } setDownstreamTNetstack(tnet) @@ -531,6 +657,7 @@ func main() { host, _, err := net.SplitHostPort(wgData.Endpoint) if err != nil { logger.Error("Failed to split endpoint: %v", err) + regResult = "failure" return } @@ -539,6 +666,7 @@ func main() { endpoint, err := resolveDomain(wgData.Endpoint) if err != nil { logger.Error("Failed to resolve endpoint: %v", err) + regResult = "failure" return } @@ -554,12 +682,14 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub err = dev.IpcSet(config) if err != nil { logger.Error("Failed to configure WireGuard device: %v", err) + regResult = "failure" } // Bring up the device err = 
dev.Up() if err != nil { logger.Error("Failed to bring up WireGuard device: %v", err) + regResult = "failure" } logger.Debug("WireGuard device created. Lets ping the server now...") @@ -572,9 +702,13 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub } // Use reliable ping for initial connection test logger.Debug("Testing initial connection with reliable ping...") - _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) + lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) + if err == nil && wgData.PublicKey != "" { + telemetry.ObserveTunnelLatency(ctx, wgData.PublicKey, "wireguard", lat.Seconds()) + } if err != nil { logger.Warn("Initial reliable ping failed, but continuing: %v", err) + regResult = "failure" } else { logger.Debug("Initial connection test successful") } @@ -585,11 +719,14 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub // as the pings will continue in the background if !connected { logger.Debug("Starting ping check") - pingStopChan = startPingCheck(tnet, wgData.ServerIP, client) + pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey) } // Create proxy manager pm = proxy.NewProxyManager(tnet) + pm.SetAsyncBytes(metricsAsyncBytes) + // Set tunnel_id for metrics (WireGuard peer public key) + pm.SetTunnelID(wgData.PublicKey) connected = true @@ -626,10 +763,19 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) { logger.Info("Received reconnect message") + if wgData.PublicKey != "" { + telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest) + } // Close the WireGuard device and TUN closeWgTunnel() + // Clear metrics attrs and sessions for the tunnel + if pm != nil { + pm.ClearTunnelID() + state.Global().ClearTunnel(wgData.PublicKey) + } + // Mark as disconnected connected = false @@ -648,6 +794,9 @@ 
persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) { logger.Info("Received termination message") + if wgData.PublicKey != "" { + telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest) + } // Close the WireGuard device and TUN closeWgTunnel() @@ -675,7 +824,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub jsonData, err := json.Marshal(msg.Data) if err != nil { - logger.Info("Error marshaling data: %v", err) + logger.Info(fmtErrMarshaling, err) return } if err := json.Unmarshal(jsonData, &exitNodeData); err != nil { @@ -716,7 +865,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }, } - stopFunc = client.SendMessageInterval("newt/wg/register", map[string]interface{}{ + stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{ "publicKey": publicKey.String(), "pingResults": pingResults, "newtVersion": newtVersion, @@ -819,7 +968,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub } // Send the ping results to the cloud for selection - stopFunc = client.SendMessageInterval("newt/wg/register", map[string]interface{}{ + stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{ "publicKey": publicKey.String(), "pingResults": pingResults, "newtVersion": newtVersion, @@ -829,17 +978,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }) client.RegisterHandler("newt/tcp/add", func(msg websocket.WSMessage) { - logger.Debug("Received: %+v", msg) + logger.Debug(fmtReceivedMsg, msg) // if there is no wgData or pm, we can't add targets if wgData.TunnelIP == "" || pm == nil { - logger.Info("No tunnel IP or proxy manager available") + logger.Info(msgNoTunnelOrProxy) return } targetData, err := parseTargetData(msg.Data) if err != nil { - logger.Info("Error parsing 
target data: %v", err) + logger.Info(fmtErrParsingTargetData, err) return } @@ -854,17 +1003,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }) client.RegisterHandler("newt/udp/add", func(msg websocket.WSMessage) { - logger.Info("Received: %+v", msg) + logger.Info(fmtReceivedMsg, msg) // if there is no wgData or pm, we can't add targets if wgData.TunnelIP == "" || pm == nil { - logger.Info("No tunnel IP or proxy manager available") + logger.Info(msgNoTunnelOrProxy) return } targetData, err := parseTargetData(msg.Data) if err != nil { - logger.Info("Error parsing target data: %v", err) + logger.Info(fmtErrParsingTargetData, err) return } @@ -879,17 +1028,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }) client.RegisterHandler("newt/udp/remove", func(msg websocket.WSMessage) { - logger.Info("Received: %+v", msg) + logger.Info(fmtReceivedMsg, msg) // if there is no wgData or pm, we can't add targets if wgData.TunnelIP == "" || pm == nil { - logger.Info("No tunnel IP or proxy manager available") + logger.Info(msgNoTunnelOrProxy) return } targetData, err := parseTargetData(msg.Data) if err != nil { - logger.Info("Error parsing target data: %v", err) + logger.Info(fmtErrParsingTargetData, err) return } @@ -904,17 +1053,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }) client.RegisterHandler("newt/tcp/remove", func(msg websocket.WSMessage) { - logger.Info("Received: %+v", msg) + logger.Info(fmtReceivedMsg, msg) // if there is no wgData or pm, we can't add targets if wgData.TunnelIP == "" || pm == nil { - logger.Info("No tunnel IP or proxy manager available") + logger.Info(msgNoTunnelOrProxy) return } targetData, err := parseTargetData(msg.Data) if err != nil { - logger.Info("Error parsing target data: %v", err) + logger.Info(fmtErrParsingTargetData, err) return } @@ -998,7 +1147,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), 
fixKey(wgData.Pub jsonData, err := json.Marshal(msg.Data) if err != nil { - logger.Info("Error marshaling data: %v", err) + logger.Info(fmtErrMarshaling, err) return } if err := json.Unmarshal(jsonData, &sshPublicKeyData); err != nil { @@ -1155,9 +1304,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub } if err := healthMonitor.EnableTarget(requestData.ID); err != nil { - logger.Error("Failed to enable health check target %s: %v", requestData.ID, err) + logger.Error("Failed to enable health check target %d: %v", requestData.ID, err) } else { - logger.Info("Enabled health check target: %s", requestData.ID) + logger.Info("Enabled health check target: %d", requestData.ID) } }) @@ -1180,9 +1329,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub } if err := healthMonitor.DisableTarget(requestData.ID); err != nil { - logger.Error("Failed to disable health check target %s: %v", requestData.ID, err) + logger.Error("Failed to disable health check target %d: %v", requestData.ID, err) } else { - logger.Info("Disabled health check target: %s", requestData.ID) + logger.Info("Disabled health check target: %d", requestData.ID) } }) @@ -1252,7 +1401,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub } // Send registration message to the server for backward compatibility - err := client.SendMessage("newt/wg/register", map[string]interface{}{ + err := client.SendMessage(topicWGRegister, map[string]interface{}{ "publicKey": publicKey.String(), "newtVersion": newtVersion, "backwardsCompatible": true, diff --git a/proxy/manager.go b/proxy/manager.go index bf10322..cef5fa6 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -1,18 +1,28 @@ package proxy import ( + "context" + "errors" "fmt" "io" "net" + "os" "strings" "sync" + "sync/atomic" "time" + "github.com/fosrl/newt/internal/state" + "github.com/fosrl/newt/internal/telemetry" "github.com/fosrl/newt/logger" + 
"go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" "golang.zx2c4.com/wireguard/tun/netstack" "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet" ) +const errUnsupportedProtoFmt = "unsupported protocol: %s" + // Target represents a proxy target with its address and port type Target struct { Address string @@ -28,6 +38,90 @@ type ProxyManager struct { udpConns []*gonet.UDPConn running bool mutex sync.RWMutex + + // telemetry (multi-tunnel) + currentTunnelID string + tunnels map[string]*tunnelEntry + asyncBytes bool + flushStop chan struct{} +} + +// tunnelEntry holds per-tunnel attributes and (optional) async counters. +type tunnelEntry struct { + attrInTCP attribute.Set + attrOutTCP attribute.Set + attrInUDP attribute.Set + attrOutUDP attribute.Set + + bytesInTCP atomic.Uint64 + bytesOutTCP atomic.Uint64 + bytesInUDP atomic.Uint64 + bytesOutUDP atomic.Uint64 + + activeTCP atomic.Int64 + activeUDP atomic.Int64 +} + +// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set. 
+type countingWriter struct { + ctx context.Context + w io.Writer + set attribute.Set + pm *ProxyManager + ent *tunnelEntry + out bool // false=in, true=out + proto string // "tcp" or "udp" +} + +func (cw *countingWriter) Write(p []byte) (int, error) { + n, err := cw.w.Write(p) + if n > 0 { + if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil { + switch cw.proto { + case "tcp": + if cw.out { + cw.ent.bytesOutTCP.Add(uint64(n)) + } else { + cw.ent.bytesInTCP.Add(uint64(n)) + } + case "udp": + if cw.out { + cw.ent.bytesOutUDP.Add(uint64(n)) + } else { + cw.ent.bytesInUDP.Add(uint64(n)) + } + } + } else { + telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set) + } + } + return n, err +} + +func classifyProxyError(err error) string { + if err == nil { + return "" + } + if errors.Is(err, net.ErrClosed) { + return "closed" + } + if ne, ok := err.(net.Error); ok { + if ne.Timeout() { + return "timeout" + } + if ne.Temporary() { + return "temporary" + } + } + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "refused"): + return "refused" + case strings.Contains(msg, "reset"): + return "reset" + default: + return "io_error" + } } // NewProxyManager creates a new proxy manager instance @@ -38,9 +132,77 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager { udpTargets: make(map[string]map[int]string), listeners: make([]*gonet.TCPListener, 0), udpConns: make([]*gonet.UDPConn, 0), + tunnels: make(map[string]*tunnelEntry), } } +// SetTunnelID sets the WireGuard peer public key used as tunnel_id label. 
+func (pm *ProxyManager) SetTunnelID(id string) { + pm.mutex.Lock() + defer pm.mutex.Unlock() + pm.currentTunnelID = id + if _, ok := pm.tunnels[id]; !ok { + pm.tunnels[id] = &tunnelEntry{} + } + e := pm.tunnels[id] + // include site labels if available + site := telemetry.SiteLabelKVs() + build := func(base []attribute.KeyValue) attribute.Set { + if telemetry.ShouldIncludeTunnelID() { + base = append([]attribute.KeyValue{attribute.String("tunnel_id", id)}, base...) + } + base = append(site, base...) + return attribute.NewSet(base...) + } + e.attrInTCP = build([]attribute.KeyValue{ + attribute.String("direction", "ingress"), + attribute.String("protocol", "tcp"), + }) + e.attrOutTCP = build([]attribute.KeyValue{ + attribute.String("direction", "egress"), + attribute.String("protocol", "tcp"), + }) + e.attrInUDP = build([]attribute.KeyValue{ + attribute.String("direction", "ingress"), + attribute.String("protocol", "udp"), + }) + e.attrOutUDP = build([]attribute.KeyValue{ + attribute.String("direction", "egress"), + attribute.String("protocol", "udp"), + }) +} + +// ClearTunnelID clears cached attribute sets for the current tunnel. 
+func (pm *ProxyManager) ClearTunnelID() { + pm.mutex.Lock() + defer pm.mutex.Unlock() + id := pm.currentTunnelID + if id == "" { + return + } + if e, ok := pm.tunnels[id]; ok { + // final flush for this tunnel + inTCP := e.bytesInTCP.Swap(0) + outTCP := e.bytesOutTCP.Swap(0) + inUDP := e.bytesInUDP.Swap(0) + outUDP := e.bytesOutUDP.Swap(0) + if inTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) + } + if outTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) + } + if inUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) + } + if outUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) + } + delete(pm.tunnels, id) + } + pm.currentTunnelID = "" +} + // init function without tnet func NewProxyManagerWithoutTNet() *ProxyManager { return &ProxyManager{ @@ -75,7 +237,7 @@ func (pm *ProxyManager) AddTarget(proto, listenIP string, port int, targetAddr s } pm.udpTargets[listenIP][port] = targetAddr default: - return fmt.Errorf("unsupported protocol: %s", proto) + return fmt.Errorf(errUnsupportedProtoFmt, proto) } if pm.running { @@ -124,13 +286,28 @@ func (pm *ProxyManager) RemoveTarget(proto, listenIP string, port int) error { return fmt.Errorf("target not found: %s:%d", listenIP, port) } default: - return fmt.Errorf("unsupported protocol: %s", proto) + return fmt.Errorf(errUnsupportedProtoFmt, proto) } return nil } // Start begins listening for all configured proxy targets func (pm *ProxyManager) Start() error { + // Register proxy observables once per process + telemetry.SetProxyObservableCallback(func(ctx context.Context, o metric.Observer) error { + pm.mutex.RLock() + defer pm.mutex.RUnlock() + for _, e := range pm.tunnels { + // active connections + telemetry.ObserveProxyActiveConnsObs(o, e.activeTCP.Load(), e.attrOutTCP.ToSlice()) + telemetry.ObserveProxyActiveConnsObs(o, e.activeUDP.Load(), e.attrOutUDP.ToSlice()) + 
// backlog bytes (sum of unflushed counters) + b := int64(e.bytesInTCP.Load() + e.bytesOutTCP.Load() + e.bytesInUDP.Load() + e.bytesOutUDP.Load()) + telemetry.ObserveProxyAsyncBacklogObs(o, b, e.attrOutTCP.ToSlice()) + telemetry.ObserveProxyBufferBytesObs(o, b, e.attrOutTCP.ToSlice()) + } + return nil + }) pm.mutex.Lock() defer pm.mutex.Unlock() @@ -160,6 +337,75 @@ func (pm *ProxyManager) Start() error { return nil } +func (pm *ProxyManager) SetAsyncBytes(b bool) { + pm.mutex.Lock() + defer pm.mutex.Unlock() + pm.asyncBytes = b + if b && pm.flushStop == nil { + pm.flushStop = make(chan struct{}) + go pm.flushLoop() + } +} +func (pm *ProxyManager) flushLoop() { + flushInterval := 2 * time.Second + if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" { + if d, err := time.ParseDuration(v); err == nil && d > 0 { + if d/2 < flushInterval { + flushInterval = d / 2 + } + } + } + ticker := time.NewTicker(flushInterval) + defer ticker.Stop() + for { + select { + case <-ticker.C: + pm.mutex.RLock() + for _, e := range pm.tunnels { + inTCP := e.bytesInTCP.Swap(0) + outTCP := e.bytesOutTCP.Swap(0) + inUDP := e.bytesInUDP.Swap(0) + outUDP := e.bytesOutUDP.Swap(0) + if inTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) + } + if outTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) + } + if inUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) + } + if outUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) + } + } + pm.mutex.RUnlock() + case <-pm.flushStop: + pm.mutex.RLock() + for _, e := range pm.tunnels { + inTCP := e.bytesInTCP.Swap(0) + outTCP := e.bytesOutTCP.Swap(0) + inUDP := e.bytesInUDP.Swap(0) + outUDP := e.bytesOutUDP.Swap(0) + if inTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) + } + if outTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), 
e.attrOutTCP) + } + if inUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) + } + if outUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) + } + } + pm.mutex.RUnlock() + return + } + } +} + func (pm *ProxyManager) Stop() error { pm.mutex.Lock() defer pm.mutex.Unlock() @@ -227,7 +473,7 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr go pm.handleUDPProxy(conn, targetAddr) default: - return fmt.Errorf("unsupported protocol: %s", proto) + return fmt.Errorf(errUnsupportedProtoFmt, proto) } logger.Info("Started %s proxy to %s", proto, targetAddr) @@ -236,54 +482,84 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr return nil } +// getEntry returns per-tunnel entry or nil. +func (pm *ProxyManager) getEntry(id string) *tunnelEntry { + pm.mutex.RLock() + e := pm.tunnels[id] + pm.mutex.RUnlock() + return e +} + func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) { for { conn, err := listener.Accept() if err != nil { - // Check if we're shutting down or the listener was closed + telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "tcp", "failure", classifyProxyError(err)) if !pm.running { return } - - // Check for specific network errors that indicate the listener is closed if ne, ok := err.(net.Error); ok && !ne.Temporary() { logger.Info("TCP listener closed, stopping proxy handler for %v", listener.Addr()) return } - logger.Error("Error accepting TCP connection: %v", err) - // Don't hammer the CPU if we hit a temporary error time.Sleep(100 * time.Millisecond) continue } - go func() { + tunnelID := pm.currentTunnelID + telemetry.IncProxyAccept(context.Background(), tunnelID, "tcp", "success", "") + telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "tcp", telemetry.ProxyConnectionOpened) + if tunnelID != "" { + state.Global().IncSessions(tunnelID) + if e := 
pm.getEntry(tunnelID); e != nil { + e.activeTCP.Add(1) + } + } + + go func(tunnelID string, accepted net.Conn) { + connStart := time.Now() target, err := net.Dial("tcp", targetAddr) if err != nil { logger.Error("Error connecting to target: %v", err) - conn.Close() + accepted.Close() + telemetry.IncProxyAccept(context.Background(), tunnelID, "tcp", "failure", classifyProxyError(err)) + telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "tcp", telemetry.ProxyConnectionClosed) + telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "tcp", "failure", time.Since(connStart).Seconds()) return } - // Create a WaitGroup to ensure both copy operations complete + entry := pm.getEntry(tunnelID) + if entry == nil { + entry = &tunnelEntry{} + } var wg sync.WaitGroup wg.Add(2) - go func() { + go func(ent *tunnelEntry) { defer wg.Done() - io.Copy(target, conn) - target.Close() - }() + cw := &countingWriter{ctx: context.Background(), w: target, set: ent.attrInTCP, pm: pm, ent: ent, out: false, proto: "tcp"} + _, _ = io.Copy(cw, accepted) + _ = target.Close() + }(entry) - go func() { + go func(ent *tunnelEntry) { defer wg.Done() - io.Copy(conn, target) - conn.Close() - }() + cw := &countingWriter{ctx: context.Background(), w: accepted, set: ent.attrOutTCP, pm: pm, ent: ent, out: true, proto: "tcp"} + _, _ = io.Copy(cw, target) + _ = accepted.Close() + }(entry) - // Wait for both copies to complete wg.Wait() - }() + if tunnelID != "" { + state.Global().DecSessions(tunnelID) + if e := pm.getEntry(tunnelID); e != nil { + e.activeTCP.Add(-1) + } + } + telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "tcp", "success", time.Since(connStart).Seconds()) + telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "tcp", telemetry.ProxyConnectionClosed) + }(tunnelID, conn) } } @@ -326,6 +602,18 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { } clientKey := remoteAddr.String() + // bytes from 
client -> target (direction=in) + if pm.currentTunnelID != "" && n > 0 { + if pm.asyncBytes { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.bytesInUDP.Add(uint64(n)) + } + } else { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP) + } + } + } clientsMutex.RLock() targetConn, exists := clientConns[clientKey] clientsMutex.RUnlock() @@ -334,28 +622,44 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { targetUDPAddr, err := net.ResolveUDPAddr("udp", targetAddr) if err != nil { logger.Error("Error resolving target address: %v", err) + telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "udp", "failure", "resolve") continue } targetConn, err = net.DialUDP("udp", nil, targetUDPAddr) if err != nil { logger.Error("Error connecting to target: %v", err) + telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "udp", "failure", classifyProxyError(err)) continue } + tunnelID := pm.currentTunnelID + telemetry.IncProxyAccept(context.Background(), tunnelID, "udp", "success", "") + telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "udp", telemetry.ProxyConnectionOpened) + // Only increment activeUDP after a successful DialUDP + if e := pm.getEntry(tunnelID); e != nil { + e.activeUDP.Add(1) + } clientsMutex.Lock() clientConns[clientKey] = targetConn clientsMutex.Unlock() - go func(clientKey string, targetConn *net.UDPConn, remoteAddr net.Addr) { + go func(clientKey string, targetConn *net.UDPConn, remoteAddr net.Addr, tunnelID string) { + start := time.Now() + result := "success" defer func() { // Always clean up when this goroutine exits clientsMutex.Lock() if storedConn, exists := clientConns[clientKey]; exists && storedConn == targetConn { delete(clientConns, clientKey) targetConn.Close() + if e := pm.getEntry(tunnelID); e != nil { + e.activeUDP.Add(-1) + } } clientsMutex.Unlock() + 
telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "udp", result, time.Since(start).Seconds()) + telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "udp", telemetry.ProxyConnectionClosed) }() buffer := make([]byte, 65507) @@ -363,25 +667,52 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { n, _, err := targetConn.ReadFromUDP(buffer) if err != nil { logger.Error("Error reading from target: %v", err) + result = "failure" return // defer will handle cleanup } + // bytes from target -> client (direction=out) + if pm.currentTunnelID != "" && n > 0 { + if pm.asyncBytes { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.bytesOutUDP.Add(uint64(n)) + } + } else { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP) + } + } + } + _, err = conn.WriteTo(buffer[:n], remoteAddr) if err != nil { logger.Error("Error writing to client: %v", err) + telemetry.IncProxyDrops(context.Background(), pm.currentTunnelID, "udp") + result = "failure" return // defer will handle cleanup } } - }(clientKey, targetConn, remoteAddr) + }(clientKey, targetConn, remoteAddr, tunnelID) } - _, err = targetConn.Write(buffer[:n]) + written, err := targetConn.Write(buffer[:n]) if err != nil { logger.Error("Error writing to target: %v", err) + telemetry.IncProxyDrops(context.Background(), pm.currentTunnelID, "udp") targetConn.Close() clientsMutex.Lock() delete(clientConns, clientKey) clientsMutex.Unlock() + } else if pm.currentTunnelID != "" && written > 0 { + if pm.asyncBytes { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.bytesInUDP.Add(uint64(written)) + } + } else { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP) + } + } } } } diff --git a/stub.go b/stub.go index ec91299..3bdbe19 100644 --- a/stub.go +++ b/stub.go @@ -8,25 +8,27 @@ import ( ) func 
setupClientsNative(client *websocket.Client, host string) { - return // This function is not implemented for non-Linux systems. + _ = client + _ = host + // No-op for non-Linux systems } func closeWgServiceNative() { // No-op for non-Linux systems - return } func clientsOnConnectNative() { // No-op for non-Linux systems - return } func clientsHandleNewtConnectionNative(publicKey, endpoint string) { + _ = publicKey + _ = endpoint // No-op for non-Linux systems - return } func clientsAddProxyTargetNative(pm *proxy.ProxyManager, tunnelIp string) { + _ = pm + _ = tunnelIp // No-op for non-Linux systems - return } diff --git a/util.go b/util.go index 72d2bda..dc48f19 100644 --- a/util.go +++ b/util.go @@ -2,6 +2,7 @@ package main import ( "bytes" + "context" "encoding/base64" "encoding/hex" "encoding/json" @@ -14,6 +15,7 @@ import ( "math/rand" + "github.com/fosrl/newt/internal/telemetry" "github.com/fosrl/newt/logger" "github.com/fosrl/newt/proxy" "github.com/fosrl/newt/websocket" @@ -24,6 +26,8 @@ import ( "gopkg.in/yaml.v3" ) +const msgHealthFileWriteFailed = "Failed to write health file: %v" + func fixKey(key string) string { // Remove any whitespace key = strings.TrimSpace(key) @@ -176,7 +180,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC if healthFile != "" { err := os.WriteFile(healthFile, []byte("ok"), 0644) if err != nil { - logger.Warn("Failed to write health file: %v", err) + logger.Warn(msgHealthFileWriteFailed, err) } } return stopChan, nil @@ -217,11 +221,13 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC if healthFile != "" { err := os.WriteFile(healthFile, []byte("ok"), 0644) if err != nil { - logger.Warn("Failed to write health file: %v", err) + logger.Warn(msgHealthFileWriteFailed, err) } } - return } + case <-pingStopChan: + // Stop the goroutine when signaled + return } } }() @@ -230,7 +236,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC 
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background") } -func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} { +func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} { maxInterval := 6 * time.Second currentInterval := pingInterval consecutiveFailures := 0 @@ -293,6 +299,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien if !connectionLost { connectionLost = true logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) + if tunnelID != "" { + telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout) + } stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second) // Send registration message to the server for backward compatibility err := client.SendMessage("newt/wg/register", map[string]interface{}{ @@ -319,6 +328,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien } else { // Track recent latencies recentLatencies = append(recentLatencies, latency) + // Record tunnel latency (limit sampling to this periodic check) + if tunnelID != "" { + telemetry.ObserveTunnelLatency(context.Background(), tunnelID, "wireguard", latency.Seconds()) + } if len(recentLatencies) > 10 { recentLatencies = recentLatencies[1:] } @@ -468,7 +481,8 @@ func updateTargets(pm *proxy.ProxyManager, action string, tunnelIP string, proto continue } - if action == "add" { + switch action { + case "add": target := parts[1] + ":" + parts[2] // Call updown script if provided @@ -494,7 +508,7 @@ func updateTargets(pm *proxy.ProxyManager, action string, tunnelIP string, proto // Add the new target pm.AddTarget(proto, tunnelIP, port, processedTarget) - } else if action == "remove" { + case "remove": logger.Info("Removing target with port %d", port) target := 
parts[1] + ":" + parts[2] @@ -512,6 +526,8 @@ func updateTargets(pm *proxy.ProxyManager, action string, tunnelIP string, proto logger.Error("Failed to remove target: %v", err) return err } + default: + logger.Info("Unknown action: %s", action) } } diff --git a/websocket/client.go b/websocket/client.go index c580f0e..a3ba757 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -7,6 +7,7 @@ import ( "encoding/json" "fmt" "io" + "net" "net/http" "net/url" "os" @@ -18,6 +19,11 @@ import ( "github.com/fosrl/newt/logger" "github.com/gorilla/websocket" + + "context" + + "github.com/fosrl/newt/internal/telemetry" + "go.opentelemetry.io/otel" ) type Client struct { @@ -37,6 +43,8 @@ type Client struct { writeMux sync.Mutex clientType string // Type of client (e.g., "newt", "olm") tlsConfig TLSConfig + metricsCtxMu sync.RWMutex + metricsCtx context.Context configNeedsSave bool // Flag to track if config needs to be saved } @@ -81,6 +89,26 @@ func (c *Client) OnTokenUpdate(callback func(token string)) { c.onTokenUpdate = callback } +func (c *Client) metricsContext() context.Context { + c.metricsCtxMu.RLock() + defer c.metricsCtxMu.RUnlock() + if c.metricsCtx != nil { + return c.metricsCtx + } + return context.Background() +} + +func (c *Client) setMetricsContext(ctx context.Context) { + c.metricsCtxMu.Lock() + c.metricsCtx = ctx + c.metricsCtxMu.Unlock() +} + +// MetricsContext exposes the context used for telemetry emission when a connection is active. 
+func (c *Client) MetricsContext() context.Context { + return c.metricsContext() +} + // NewClient creates a new websocket client func NewClient(clientType string, ID, secret string, endpoint string, pingInterval time.Duration, pingTimeout time.Duration, opts ...ClientOption) (*Client, error) { config := &Config{ @@ -140,6 +168,7 @@ func (c *Client) Close() error { // Set connection status to false c.setConnected(false) + telemetry.SetWSConnectionState(false) // Close the WebSocket connection gracefully if c.conn != nil { @@ -170,7 +199,11 @@ func (c *Client) SendMessage(messageType string, data interface{}) error { c.writeMux.Lock() defer c.writeMux.Unlock() - return c.conn.WriteJSON(msg) + if err := c.conn.WriteJSON(msg); err != nil { + return err + } + telemetry.IncWSMessage(c.metricsContext(), "out", "text") + return nil } func (c *Client) SendMessageInterval(messageType string, data interface{}, interval time.Duration) (stop func()) { @@ -265,8 +298,12 @@ func (c *Client) getToken() (string, error) { return "", fmt.Errorf("failed to marshal token request data: %w", err) } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + // Create a new request - req, err := http.NewRequest( + req, err := http.NewRequestWithContext( + ctx, "POST", baseEndpoint+"/api/v1/auth/"+c.clientType+"/get-token", bytes.NewBuffer(jsonData), @@ -288,6 +325,8 @@ func (c *Client) getToken() (string, error) { } resp, err := client.Do(req) if err != nil { + telemetry.IncConnAttempt(ctx, "auth", "failure") + telemetry.IncConnError(ctx, "auth", classifyConnError(err)) return "", fmt.Errorf("failed to request new token: %w", err) } defer resp.Body.Close() @@ -295,6 +334,16 @@ func (c *Client) getToken() (string, error) { if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) + telemetry.IncConnAttempt(ctx, "auth", "failure") + etype := 
"io_error" + if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { + etype = "auth_failed" + } + telemetry.IncConnError(ctx, "auth", etype) + // Reconnect reason mapping for auth failures + if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { + telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonAuthError) + } return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) } @@ -313,10 +362,55 @@ func (c *Client) getToken() (string, error) { } logger.Debug("Received token: %s", tokenResp.Data.Token) + telemetry.IncConnAttempt(ctx, "auth", "success") return tokenResp.Data.Token, nil } +// classifyConnError maps to fixed, low-cardinality error_type values. +// Allowed enum: dial_timeout, tls_handshake, auth_failed, io_error +func classifyConnError(err error) string { + if err == nil { + return "" + } + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"): + return "tls_handshake" + case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout") || strings.Contains(msg, "deadline exceeded"): + return "dial_timeout" + case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"): + return "auth_failed" + default: + // Group remaining network/socket errors as io_error to avoid label explosion + return "io_error" + } +} + +func classifyWSDisconnect(err error) (result, reason string) { + if err == nil { + return "success", "normal" + } + if websocket.IsCloseError(err, websocket.CloseNormalClosure) { + return "success", "normal" + } + if ne, ok := err.(net.Error); ok && ne.Timeout() { + return "error", "timeout" + } + if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) { + return "error", "unexpected_close" + } + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "eof"): 
+ return "error", "eof" + case strings.Contains(msg, "reset"): + return "error", "connection_reset" + default: + return "error", "read_error" + } +} + func (c *Client) connectWithRetry() { for { select { @@ -335,9 +429,13 @@ func (c *Client) connectWithRetry() { } func (c *Client) establishConnection() error { + ctx := context.Background() + // Get token for authentication token, err := c.getToken() if err != nil { + telemetry.IncConnAttempt(ctx, "websocket", "failure") + telemetry.IncConnError(ctx, "websocket", classifyConnError(err)) return fmt.Errorf("failed to get token: %w", err) } @@ -370,7 +468,12 @@ func (c *Client) establishConnection() error { q.Set("clientType", c.clientType) u.RawQuery = q.Encode() - // Connect to WebSocket + // Connect to WebSocket (optional span) + tr := otel.Tracer("newt") + ctx, span := tr.Start(ctx, "ws.connect") + defer span.End() + + start := time.Now() dialer := websocket.DefaultDialer // Use new TLS configuration method @@ -392,18 +495,42 @@ func (c *Client) establishConnection() error { logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable") } - conn, _, err := dialer.Dial(u.String(), nil) + conn, _, err := dialer.DialContext(ctx, u.String(), nil) + lat := time.Since(start).Seconds() if err != nil { + telemetry.IncConnAttempt(ctx, "websocket", "failure") + etype := classifyConnError(err) + telemetry.IncConnError(ctx, "websocket", etype) + telemetry.ObserveWSConnectLatency(ctx, lat, "failure", etype) + // Map handshake-related errors to reconnect reasons where appropriate + if etype == "tls_handshake" { + telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonHandshakeError) + } else if etype == "dial_timeout" { + telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonTimeout) + } else { + telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonError) + } + telemetry.IncWSReconnect(ctx, etype) return fmt.Errorf("failed to connect to WebSocket: 
%w", err) } + telemetry.IncConnAttempt(ctx, "websocket", "success") + telemetry.ObserveWSConnectLatency(ctx, lat, "success", "") c.conn = conn c.setConnected(true) + telemetry.SetWSConnectionState(true) + c.setMetricsContext(ctx) + sessionStart := time.Now() + // Wire up pong handler for metrics + c.conn.SetPongHandler(func(appData string) error { + telemetry.IncWSMessage(c.metricsContext(), "in", "pong") + return nil + }) // Start the ping monitor go c.pingMonitor() // Start the read pump with disconnect detection - go c.readPumpWithDisconnectDetection() + go c.readPumpWithDisconnectDetection(sessionStart) if c.onConnect != nil { err := c.saveConfig() @@ -496,6 +623,9 @@ func (c *Client) pingMonitor() { } c.writeMux.Lock() err := c.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(c.pingTimeout)) + if err == nil { + telemetry.IncWSMessage(c.metricsContext(), "out", "ping") + } c.writeMux.Unlock() if err != nil { // Check if we're shutting down before logging error and reconnecting @@ -505,6 +635,8 @@ func (c *Client) pingMonitor() { return default: logger.Error("Ping failed: %v", err) + telemetry.IncWSKeepaliveFailure(c.metricsContext(), "ping_write") + telemetry.IncWSReconnect(c.metricsContext(), "ping_write") c.reconnect() return } @@ -514,17 +646,26 @@ func (c *Client) pingMonitor() { } // readPumpWithDisconnectDetection reads messages and triggers reconnect on error -func (c *Client) readPumpWithDisconnectDetection() { +func (c *Client) readPumpWithDisconnectDetection(started time.Time) { + ctx := c.metricsContext() + disconnectReason := "shutdown" + disconnectResult := "success" + defer func() { if c.conn != nil { c.conn.Close() } + if !started.IsZero() { + telemetry.ObserveWSSessionDuration(ctx, time.Since(started).Seconds(), disconnectResult) + } + telemetry.IncWSDisconnect(ctx, disconnectReason, disconnectResult) // Only attempt reconnect if we're not shutting down select { case <-c.done: // Shutting down, don't reconnect return default: + 
telemetry.IncWSReconnect(ctx, disconnectReason) c.reconnect() } }() @@ -532,23 +673,33 @@ func (c *Client) readPumpWithDisconnectDetection() { for { select { case <-c.done: + disconnectReason = "shutdown" + disconnectResult = "success" return default: var msg WSMessage err := c.conn.ReadJSON(&msg) + if err == nil { + telemetry.IncWSMessage(c.metricsContext(), "in", "text") + } if err != nil { // Check if we're shutting down before logging error select { case <-c.done: // Expected during shutdown, don't log as error logger.Debug("WebSocket connection closed during shutdown") + disconnectReason = "shutdown" + disconnectResult = "success" return default: // Unexpected error during normal operation - if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure, websocket.CloseNormalClosure) { - logger.Error("WebSocket read error: %v", err) - } else { - logger.Debug("WebSocket connection closed: %v", err) + disconnectResult, disconnectReason = classifyWSDisconnect(err) + if disconnectResult == "error" { + if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure, websocket.CloseNormalClosure) { + logger.Error("WebSocket read error: %v", err) + } else { + logger.Debug("WebSocket connection closed: %v", err) + } } return // triggers reconnect via defer } @@ -565,6 +716,7 @@ func (c *Client) readPumpWithDisconnectDetection() { func (c *Client) reconnect() { c.setConnected(false) + telemetry.SetWSConnectionState(false) if c.conn != nil { c.conn.Close() c.conn = nil diff --git a/wg/wg.go b/wg/wg.go index a14e2c3..4b9e7f7 100644 --- a/wg/wg.go +++ b/wg/wg.go @@ -3,6 +3,7 @@ package wg import ( + "context" "encoding/json" "errors" "fmt" @@ -13,16 +14,19 @@ import ( "sync" "time" + "math/rand" + "github.com/fosrl/newt/logger" "github.com/fosrl/newt/network" "github.com/fosrl/newt/websocket" "github.com/vishvananda/netlink" "golang.org/x/crypto/chacha20poly1305" "golang.org/x/crypto/curve25519" - 
"golang.org/x/exp/rand" "golang.zx2c4.com/wireguard/conn" "golang.zx2c4.com/wireguard/wgctrl" "golang.zx2c4.com/wireguard/wgctrl/wgtypes" + + "github.com/fosrl/newt/internal/telemetry" ) type WgConfig struct { @@ -106,7 +110,7 @@ func FindAvailableUDPPort(minPort, maxPort uint16) (uint16, error) { } // Fisher-Yates shuffle to randomize the port order - rand.Seed(uint64(time.Now().UnixNano())) + rand.Seed(time.Now().UnixNano()) for i := len(portRange) - 1; i > 0; i-- { j := rand.Intn(i + 1) portRange[i], portRange[j] = portRange[j], portRange[i] @@ -280,6 +284,15 @@ func (s *WireGuardService) LoadRemoteConfig() error { } func (s *WireGuardService) handleConfig(msg websocket.WSMessage) { + ctx := context.Background() + if s.client != nil { + ctx = s.client.MetricsContext() + } + result := "success" + defer func() { + telemetry.IncConfigReload(ctx, result) + }() + var config WgConfig logger.Debug("Received message: %v", msg) @@ -288,11 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) { jsonData, err := json.Marshal(msg.Data) if err != nil { logger.Info("Error marshaling data: %v", err) + result = "failure" return } if err := json.Unmarshal(jsonData, &config); err != nil { logger.Info("Error unmarshaling target data: %v", err) + result = "failure" return } s.config = config @@ -302,13 +317,29 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) { s.stopGetConfig = nil } - // Ensure the WireGuard interface and peers are configured - if err := s.ensureWireguardInterface(config); err != nil { - logger.Error("Failed to ensure WireGuard interface: %v", err) + // telemetry: config reload success + // Optional reconnect reason mapping: config change + if s.serverPubKey != "" { + telemetry.IncReconnect(ctx, s.serverPubKey, "client", telemetry.ReasonConfigChange) } + // Ensure the WireGuard interface and peers are configured + start := time.Now() + if err := s.ensureWireguardInterface(config); err != nil { + logger.Error("Failed to 
ensure WireGuard interface: %v", err) + telemetry.ObserveConfigApply(ctx, "interface", "failure", time.Since(start).Seconds()) + result = "failure" + } else { + telemetry.ObserveConfigApply(ctx, "interface", "success", time.Since(start).Seconds()) + } + + startPeers := time.Now() if err := s.ensureWireguardPeers(config.Peers); err != nil { logger.Error("Failed to ensure WireGuard peers: %v", err) + telemetry.ObserveConfigApply(ctx, "peer", "failure", time.Since(startPeers).Seconds()) + result = "failure" + } else { + telemetry.ObserveConfigApply(ctx, "peer", "success", time.Since(startPeers).Seconds()) } } diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go index f6708e9..664d1f0 100644 --- a/wgnetstack/wgnetstack.go +++ b/wgnetstack/wgnetstack.go @@ -1,6 +1,7 @@ package wgnetstack import ( + "context" "crypto/rand" "encoding/base64" "encoding/hex" @@ -26,6 +27,8 @@ import ( "golang.zx2c4.com/wireguard/tun" "golang.zx2c4.com/wireguard/tun/netstack" "golang.zx2c4.com/wireguard/wgctrl/wgtypes" + + "github.com/fosrl/newt/internal/telemetry" ) type WgConfig struct { @@ -242,14 +245,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str return service, nil } +// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally. 
+func (s *WireGuardService) ReportRTT(seconds float64) { + if s.serverPubKey == "" { + return + } + telemetry.ObserveTunnelLatency(context.Background(), s.serverPubKey, "wireguard", seconds) +} + func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) { logger.Debug("Received: %+v", msg) // if there is no wgData or pm, we can't add targets if s.TunnelIP == "" || s.proxyManager == nil { logger.Info("No tunnel IP or proxy manager available") - return - } + return + } targetData, err := parseTargetData(msg.Data) if err != nil { logger.Error("Error parsing target data: %v", err) return } diff --git a/wgtester/wgtester.go b/wgtester/wgtester.go index 0035f05..26988f6 100644 --- a/wgtester/wgtester.go +++ b/wgtester/wgtester.go @@ -126,7 +126,7 @@ func (s *Server) Stop() { s.conn.Close() } s.isRunning = false - logger.Info(s.outputPrefix + "Server stopped") + logger.Info("%sServer stopped", s.outputPrefix) } // RestartWithNetstack stops the current server and restarts it with netstack @@ -161,7 +161,7 @@ func (s *Server) handleConnections() { // Set read deadline to avoid blocking forever err := s.conn.SetReadDeadline(time.Now().Add(1 * time.Second)) if err != nil { - logger.Error(s.outputPrefix+"Error setting read deadline: %v", err) + logger.Error("%sError setting read deadline: %v", s.outputPrefix, err) continue } @@ -187,7 +187,7 @@ func (s *Server) handleConnections() { case <-s.shutdownCh: return // Don't log error if we're shutting down default: - logger.Error(s.outputPrefix+"Error reading from UDP: %v", err) + logger.Error("%sError reading from UDP: %v", s.outputPrefix, err) } continue } @@ -219,7 +219,7 @@ func (s *Server) handleConnections() { copy(responsePacket[5:13], buffer[5:13]) // Log response being sent for debugging - logger.Debug(s.outputPrefix+"Sending response to %s", addr.String()) + logger.Debug("%sSending response to %s", s.outputPrefix, addr.String()) // Send the response packet - handle both regular UDP and netstack UDP if s.useNetstack { @@ -233,9 +233,9 @@ func (s *Server) handleConnections() { } 
if err != nil { - logger.Error(s.outputPrefix+"Error sending response: %v", err) + logger.Error("%sError sending response: %v", s.outputPrefix, err) } else { - logger.Debug(s.outputPrefix + "Response sent successfully") + logger.Debug("%sResponse sent successfully", s.outputPrefix) } } }