From 34d558a5a2f54ef64d2346077314d8a58553a958 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 21 Sep 2025 19:40:11 +0000 Subject: [PATCH 01/72] Bump golang.org/x/net from 0.43.0 to 0.44.0 Bumps [golang.org/x/net](https://github.com/golang/net) from 0.43.0 to 0.44.0. - [Commits](https://github.com/golang/net/compare/v0.43.0...v0.44.0) --- updated-dependencies: - dependency-name: golang.org/x/net dependency-version: 0.44.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 6 +++--- go.sum | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/go.mod b/go.mod index 061e828..d475835 100644 --- a/go.mod +++ b/go.mod @@ -7,9 +7,9 @@ require ( github.com/google/gopacket v1.1.19 github.com/gorilla/websocket v1.5.3 github.com/vishvananda/netlink v1.3.1 - golang.org/x/crypto v0.41.0 + golang.org/x/crypto v0.42.0 golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 - golang.org/x/net v0.43.0 + golang.org/x/net v0.44.0 golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10 gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c @@ -48,7 +48,7 @@ require ( go.opentelemetry.io/otel/metric v1.37.0 // indirect go.opentelemetry.io/otel/trace v1.37.0 // indirect golang.org/x/sync v0.16.0 // indirect - golang.org/x/sys v0.35.0 // indirect + golang.org/x/sys v0.36.0 // indirect golang.org/x/time v0.12.0 // indirect golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect ) diff --git a/go.sum b/go.sum index 51efaf1..4a0ff9d 100644 --- a/go.sum +++ b/go.sum @@ -105,8 +105,8 @@ go.opentelemetry.io/proto/otlp v1.6.0/go.mod h1:cicgGehlFuNdgZkcALOCh3VE6K/u2tAj golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod 
h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= -golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/crypto v0.42.0 h1:chiH31gIWm57EkTXpwnqf8qeuMUi0yekh6mT2AvFlqI= +golang.org/x/crypto v0.42.0/go.mod h1:4+rDnOTJhQCx2q7/j6rAN5XDw8kPjeaXEUR2eL94ix8= golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= @@ -117,8 +117,8 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= -golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I= +golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -129,12 +129,12 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys 
v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= -golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= +golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= -golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= +golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= From a63a27e3abf1edc8e943b9b8e987698c416fb154 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 21 Sep 2025 19:40:14 +0000 Subject: [PATCH 02/72] Bump golang.org/x/crypto from 0.41.0 to 0.42.0 Bumps [golang.org/x/crypto](https://github.com/golang/crypto) from 0.41.0 to 0.42.0. - [Commits](https://github.com/golang/crypto/compare/v0.41.0...v0.42.0) --- updated-dependencies: - dependency-name: golang.org/x/crypto dependency-version: 0.42.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 4 ++-- go.sum | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index 061e828..213cc9c 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/google/gopacket v1.1.19 github.com/gorilla/websocket v1.5.3 github.com/vishvananda/netlink v1.3.1 - golang.org/x/crypto v0.41.0 + golang.org/x/crypto v0.42.0 golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 golang.org/x/net v0.43.0 golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb @@ -48,7 +48,7 @@ require ( go.opentelemetry.io/otel/metric v1.37.0 // indirect go.opentelemetry.io/otel/trace v1.37.0 // indirect golang.org/x/sync v0.16.0 // indirect - golang.org/x/sys v0.35.0 // indirect + golang.org/x/sys v0.36.0 // indirect golang.org/x/time v0.12.0 // indirect golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect ) diff --git a/go.sum b/go.sum index 51efaf1..39def0b 100644 --- a/go.sum +++ b/go.sum @@ -105,8 +105,8 @@ go.opentelemetry.io/proto/otlp v1.6.0/go.mod h1:cicgGehlFuNdgZkcALOCh3VE6K/u2tAj golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= -golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/crypto v0.42.0 h1:chiH31gIWm57EkTXpwnqf8qeuMUi0yekh6mT2AvFlqI= +golang.org/x/crypto v0.42.0/go.mod h1:4+rDnOTJhQCx2q7/j6rAN5XDw8kPjeaXEUR2eL94ix8= golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc= golang.org/x/lint 
v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= @@ -129,12 +129,12 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= -golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= +golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= -golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= +golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= From 9c0f4599b8667d6955b0c32ae549d81f66a05598 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:13:01 +0200 Subject: [PATCH 03/72] Update dependencies for telemetry and metrics support --- go.mod | 57 +++++++++++++------ go.sum | 175 ++++++++++++++++++++++++++++++++------------------------- 2 files changed, 140 insertions(+), 92 deletions(-) diff --git a/go.mod b/go.mod index d475835..dfa73c0 100644 --- a/go.mod +++ b/go.mod @@ -3,33 +3,48 @@ module 
github.com/fosrl/newt go 1.25 require ( - github.com/docker/docker v28.3.3+incompatible + github.com/docker/docker v28.5.0+incompatible github.com/google/gopacket v1.1.19 github.com/gorilla/websocket v1.5.3 + github.com/prometheus/client_golang v1.23.2 github.com/vishvananda/netlink v1.3.1 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 + go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 + go.opentelemetry.io/otel v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 + go.opentelemetry.io/otel/exporters/prometheus v0.57.0 + go.opentelemetry.io/otel/metric v1.38.0 + go.opentelemetry.io/otel/sdk v1.38.0 + go.opentelemetry.io/otel/sdk/metric v1.38.0 golang.org/x/crypto v0.42.0 - golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 golang.org/x/net v0.44.0 golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10 + google.golang.org/grpc v1.75.1 gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c software.sslmate.com/src/go-pkcs12 v0.6.0 ) require ( - github.com/Microsoft/go-winio v0.6.2 // indirect - github.com/containerd/errdefs v1.0.0 // indirect + github.com/Microsoft/go-winio v0.6.0 // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/containerd/errdefs v0.3.0 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/distribution/reference v0.6.0 // indirect - github.com/docker/go-connections v0.5.0 // indirect - github.com/docker/go-units v0.5.0 // indirect + github.com/docker/go-connections v0.6.0 // indirect + github.com/docker/go-units v0.4.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/gogo/protobuf v1.3.2 
// indirect - github.com/google/btree v1.1.3 // indirect + github.com/google/btree v1.1.2 // indirect github.com/google/go-cmp v0.7.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect github.com/josharian/native v1.1.0 // indirect + github.com/klauspost/compress v1.18.0 // indirect github.com/mdlayher/genetlink v1.3.2 // indirect github.com/mdlayher/netlink v1.7.2 // indirect github.com/mdlayher/socket v0.5.1 // indirect @@ -37,18 +52,28 @@ require ( github.com/moby/sys/atomicwriter v0.1.0 // indirect github.com/moby/term v0.5.2 // indirect github.com/morikuni/aec v1.0.0 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect - github.com/opencontainers/image-spec v1.1.1 // indirect + github.com/opencontainers/image-spec v1.1.0 // indirect github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect github.com/vishvananda/netns v0.0.5 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect - go.opentelemetry.io/otel v1.37.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0 // indirect - go.opentelemetry.io/otel/metric v1.37.0 // indirect - go.opentelemetry.io/otel/trace v1.37.0 // indirect - golang.org/x/sync v0.16.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 // indirect + go.opentelemetry.io/otel/trace v1.38.0 // indirect + go.opentelemetry.io/proto/otlp v1.7.1 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect + golang.org/x/mod v0.27.0 // indirect + golang.org/x/sync v0.17.0 // indirect golang.org/x/sys v0.36.0 // indirect - 
golang.org/x/time v0.12.0 // indirect + golang.org/x/text v0.29.0 // indirect + golang.org/x/time v0.7.0 // indirect + golang.org/x/tools v0.36.0 // indirect golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect + google.golang.org/protobuf v1.36.8 // indirect ) diff --git a/go.sum b/go.sum index 4a0ff9d..5814d42 100644 --- a/go.sum +++ b/go.sum @@ -1,12 +1,15 @@ github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= -github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= -github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= -github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= -github.com/cenkalti/backoff/v5 v5.0.2 h1:rIfFVxEf1QsI7E1ZHfp/B4DF/6QBAUhmgkxc0H7Zss8= -github.com/cenkalti/backoff/v5 v5.0.2/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= -github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= -github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= +github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg= +github.com/Microsoft/go-winio v0.6.0/go.mod h1:cTAf44im0RAYeL23bpB+fzCyDH2MJiz2BO69KH/soAE= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= +github.com/cespare/xxhash/v2 v2.3.0 
h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/containerd/errdefs v0.3.0 h1:FSZgGOeK4yuT/+DnF07/Olde/q4KBoMsaamhXxIMDp4= +github.com/containerd/errdefs v0.3.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= @@ -15,12 +18,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI= -github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= -github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= -github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= -github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= -github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/docker/docker v28.5.0+incompatible h1:ZdSQoRUE9XxhFI/B8YLvhnEFMmYN9Pp8Egd2qcaFk1E= +github.com/docker/docker v28.5.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94= +github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE= +github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= 
+github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= @@ -28,10 +31,10 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= -github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= -github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU= +github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gopacket v1.1.19 h1:ves8RnFZPGiFnTS0uPQStjwru6uO6h+nlr9j6fL7kF8= @@ -40,12 +43,16 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= 
-github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= -github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= -github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= +github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mdlayher/genetlink v1.3.2 h1:KdrNKe+CTu+IbZnm/GVUMXSqBBLqcGpRDa0xkQy56gw= github.com/mdlayher/genetlink v1.3.2/go.mod h1:tcC3pkCrPUGIKKsCsp0B3AdaaKuHtaxoJRz3cc+528o= github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= @@ -64,103 +71,119 @@ github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 
h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= -github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= -github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= +github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= +github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= +github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= +github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= +github.com/prometheus/common v0.62.0/go.mod 
h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0= github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4= github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY= github.com/vishvananda/netns v0.0.5/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= -github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp 
v0.62.0/go.mod h1:NfchwuyNoMcZ5MLHwPrODwUF1HWCXWrL31s8gSAdIKY= -go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= -go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0 h1:dNzwXjZKpMpE2JhmO+9HsPl42NIXFIFSUSSs0fiqra0= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.36.0/go.mod h1:90PoxvaEB5n6AOdZvi+yWJQoE95U8Dhhw2bSyRqnTD0= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0 h1:nRVXXvf78e00EwY6Wp0YII8ww2JVWshZ20HfTlE11AM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0/go.mod h1:r49hO7CgrxY9Voaj3Xe8pANWtr0Oq916d0XAmOoCZAQ= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= -go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= -go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= -go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= -go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= -go.opentelemetry.io/proto/otlp v1.6.0 h1:jQjP+AQyTf+Fe7OKj/MfkDrmK4MNVtw2NpXsf9fefDI= -go.opentelemetry.io/proto/otlp v1.6.0/go.mod h1:cicgGehlFuNdgZkcALOCh3VE6K/u2tAjzlRhDwmVpZc= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg= +go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0 h1:PeBoRj6af6xMI7qCupwFvTbbnd49V7n5YpG6pg8iDYQ= 
+go.opentelemetry.io/contrib/instrumentation/runtime v0.63.0/go.mod h1:ingqBCtMCe8I4vpz/UVzCW6sxoqgZB37nao91mLQ3Bw= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 h1:aTL7F04bJHUlztTsNGJ2l+6he8c+y/b//eR0jjjemT4= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0/go.mod h1:kldtb7jDTeol0l3ewcmd8SDvx3EmIE7lyvqbasU3QC4= +go.opentelemetry.io/otel/exporters/prometheus v0.57.0 h1:AHh/lAP1BHrY5gBwk8ncc25FXWm/gmmY3BX258z5nuk= +go.opentelemetry.io/otel/exporters/prometheus v0.57.0/go.mod h1:QpFWz1QxqevfjwzYdbMb4Y1NnlJvqSGwyuU0B4iuc9c= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod 
h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= +go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.42.0 h1:chiH31gIWm57EkTXpwnqf8qeuMUi0yekh6mT2AvFlqI= golang.org/x/crypto v0.42.0/go.mod h1:4+rDnOTJhQCx2q7/j6rAN5XDw8kPjeaXEUR2eL94ix8= -golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4= -golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= -golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.27.0 h1:kb+q2PyFnEADO2IEF935ehFUXlWiNjJWtRNgBLSfbxQ= +golang.org/x/mod v0.27.0/go.mod h1:rWI627Fq0DEoudcK+MBkNkCe0EetEaDSwJJkCcjpazc= golang.org/x/net 
v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I= golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= -golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= +golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.3.3/go.mod 
h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= -golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= -golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= +golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= -golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= +golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 h1:B82qJJgjvYKsXS9jeunTOisW56dUokqW/FOteYJJ/yg= golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2/go.mod h1:deeaetjYA+DHMHg+sMSMI58GrEteJUUzzw7en6TJQcI= golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb 
h1:whnFRlWMcXI9d+ZbWg+4sHnLp52d5yiIPUxMBSt4X9A= golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb/go.mod h1:rpwXGsirqLqN2L0JDJQlwOboGHmptD5ZD6T2VmcqhTw= golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10 h1:3GDAcqdIg1ozBNLgPy4SLT84nfcBjr6rhGtXYtrkWLU= golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10/go.mod h1:T97yPqesLiNrOYxkwmhMI0ZIlJDm+p0PMR8eRVeR5tQ= -google.golang.org/genproto v0.0.0-20230920204549-e6e6cdab5c13 h1:vlzZttNJGVqTsRFU9AmdnrcO1Znh8Ew9kCD//yjigk0= -google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237 h1:Kog3KlB4xevJlAcbbbzPfRG0+X9fdoGM+UBRKVz6Wr0= -google.golang.org/genproto/googleapis/api v0.0.0-20250519155744-55703ea1f237/go.mod h1:ezi0AVyMKDWy5xAncvjLWH7UcLBB5n7y2fQ8MzjJcto= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250519155744-55703ea1f237 h1:cJfm9zPbe1e873mHJzmQ1nwVEeRDU/T1wXDK2kUSU34= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250519155744-55703ea1f237/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.72.1 h1:HR03wO6eyZ7lknl75XlxABNVLLFc2PAb6mHlYh756mA= -google.golang.org/grpc v1.72.1/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1:BIRfGDEjiHRrk0QKZe3Xv2ieMhtgRGeLcZQ0mIVn4EY= +google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE= +google.golang.org/genproto/googleapis/rpc 
v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc= +google.golang.org/grpc v1.75.1 h1:/ODCNEuf9VghjgO3rqLcfg8fiOP0nSluljWFlDxELLI= +google.golang.org/grpc v1.75.1/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= From 0405aebb45490da598bd1bb70026893d39670a06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:13:02 +0200 Subject: [PATCH 04/72] Expose admin/metrics endpoint in Dockerfile --- Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index b9c4d29..b9b6dea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,6 +22,9 @@ RUN apk --no-cache add ca-certificates tzdata COPY --from=builder /newt /usr/local/bin/ COPY entrypoint.sh / +# Admin/metrics endpoint (Prometheus scrape) +EXPOSE 2112 + RUN chmod +x /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] -CMD ["newt"] \ No newline at end of file +CMD ["newt"] From 85394d3255d1f7c22e21f2b87d4619da6fc6ae21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:13:02 +0200 Subject: [PATCH 05/72] Add flags and environment variables for telemetry and metrics configuration --- main.go | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/main.go b/main.go index 12849b1..1da756f 100644 --- a/main.go +++ b/main.go @@ -1,7 +1,9 @@ package main import ( + "context" "encoding/json" + "errors" "flag" "fmt" "net" @@ -22,6 
+24,9 @@ import ( "github.com/fosrl/newt/updates" "github.com/fosrl/newt/websocket" + "github.com/fosrl/newt/internal/state" + "github.com/fosrl/newt/internal/telemetry" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "golang.zx2c4.com/wireguard/conn" "golang.zx2c4.com/wireguard/device" "golang.zx2c4.com/wireguard/tun" @@ -115,6 +120,15 @@ var ( preferEndpoint string healthMonitor *healthcheck.Monitor enforceHealthcheckCert bool + // Build/version (can be overridden via -ldflags "-X main.newtVersion=...") + newtVersion = "version_replaceme" + + // Observability/metrics flags + metricsEnabled bool + otlpEnabled bool + adminAddr string + region string + metricsAsyncBytes bool // New mTLS configuration variables tlsClientCert string @@ -126,6 +140,10 @@ var ( ) func main() { + // Prepare context for graceful shutdown and signal handling + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + // if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values endpoint = os.Getenv("PANGOLIN_ENDPOINT") id = os.Getenv("NEWT_ID") @@ -141,6 +159,13 @@ func main() { useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE") enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT") + // Metrics/observability env mirrors + metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED") + otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED") + adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR") + regionEnv := os.Getenv("NEWT_REGION") + asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES") + keepInterface = keepInterfaceEnv == "true" acceptClients = acceptClientsEnv == "true" useNativeInterface = useNativeInterfaceEnv == "true" @@ -272,6 +297,43 @@ func main() { flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)") } + // Metrics/observability flags (mirror ENV if unset) + if metricsEnabledEnv == "" { + 
flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter") + } else { + if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { + metricsEnabled = v + } else { + metricsEnabled = true + } + } + if otlpEnabledEnv == "" { + flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT") + } else { + if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { + otlpEnabled = v + } + } + if adminAddrEnv == "" { + flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address") + } else { + adminAddr = adminAddrEnv + } + // Async bytes toggle + if asyncBytesEnv == "" { + flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)") + } else { + if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { + metricsAsyncBytes = v + } + } + // Optional region flag (resource attribute) + if regionEnv == "" { + flag.StringVar(®ion, "region", "", "Optional region resource attribute (also NEWT_REGION)") + } else { + region = regionEnv + } + // do a --version check version := flag.Bool("version", false, "Print the version") From a89f13870ca11ab6813a668228123efee42da229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:13:03 +0200 Subject: [PATCH 06/72] Initialize telemetry and start admin HTTP server for metrics export --- main.go | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/main.go b/main.go index 1da756f..e56c8db 100644 --- a/main.go +++ b/main.go @@ -348,7 +348,52 @@ func main() { loggerLevel := parseLogLevel(logLevel) logger.GetLogger().SetLevel(parseLogLevel(logLevel)) - newtVersion := "version_replaceme" + // Initialize telemetry after flags are parsed (so flags override env) + tcfg := telemetry.FromEnv() + tcfg.PromEnabled = metricsEnabled + tcfg.OTLPEnabled = otlpEnabled + 
if adminAddr != "" { + tcfg.AdminAddr = adminAddr + } + // Resource attributes (if available) + tcfg.SiteID = id + tcfg.Region = region + // Build info + tcfg.BuildVersion = newtVersion + tcfg.BuildCommit = os.Getenv("NEWT_COMMIT") + + tel, telErr := telemetry.Init(ctx, tcfg) + if telErr != nil { + logger.Warn("Telemetry init failed: %v", telErr) + } + if tel != nil { + // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled) + mux := http.NewServeMux() + mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) }) + if tel.PrometheusHandler != nil { + mux.Handle("/metrics", tel.PrometheusHandler) + } + admin := &http.Server{ + Addr: tcfg.AdminAddr, + Handler: otelhttp.NewHandler(mux, "newt-admin"), + ReadTimeout: 5 * time.Second, + WriteTimeout: 10 * time.Second, + ReadHeaderTimeout: 5 * time.Second, + IdleTimeout: 30 * time.Second, + } + go func() { + if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + logger.Warn("admin http error: %v", err) + } + }() + defer func() { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = admin.Shutdown(ctx) + }() + defer func() { _ = tel.Shutdown(context.Background()) }() + } + if *version { fmt.Println("Newt version " + newtVersion) os.Exit(0) From 496ff0734c9d476927fd4e51b641a0891ddb06bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:13:03 +0200 Subject: [PATCH 07/72] Integrate tunnel metrics and telemetry reporting throughout main application logic --- main.go | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/main.go b/main.go index e56c8db..025967a 100644 --- a/main.go +++ b/main.go @@ -664,7 +664,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub } // Use reliable ping for initial connection test logger.Debug("Testing initial connection with reliable ping...") - _, 
err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) + lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) + if err == nil && wgData.PublicKey != "" { + telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds()) + } if err != nil { logger.Warn("Initial reliable ping failed, but continuing: %v", err) } else { @@ -677,14 +680,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub // as the pings will continue in the background if !connected { logger.Debug("Starting ping check") - pingStopChan = startPingCheck(tnet, wgData.ServerIP, client) + pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey) } // Create proxy manager pm = proxy.NewProxyManager(tnet) + pm.SetAsyncBytes(metricsAsyncBytes) + // Set tunnel_id for metrics (WireGuard peer public key) + pm.SetTunnelID(wgData.PublicKey) connected = true + // telemetry: record a successful site registration (omit region unless available) + telemetry.IncSiteRegistration(context.Background(), id, "", "success") + // add the targets if there are any if len(wgData.Targets.TCP) > 0 { updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP}) @@ -718,10 +727,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) { logger.Info("Received reconnect message") + if wgData.PublicKey != "" { + telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request") + } // Close the WireGuard device and TUN closeWgTunnel() + // Clear metrics attrs and sessions for the tunnel + if pm != nil { + pm.ClearTunnelID() + state.Global().ClearTunnel(wgData.PublicKey) + } + + // Clear metrics attrs and sessions for the tunnel + if pm != nil { + pm.ClearTunnelID() + state.Global().ClearTunnel(wgData.PublicKey) + } + // Mark as disconnected connected = false @@ -738,6 +762,9 @@ 
persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) { logger.Info("Received termination message") + if wgData.PublicKey != "" { + telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request") + } // Close the WireGuard device and TUN closeWgTunnel() From ceef228665e88233b159b9273adf75d454ea4723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:13:03 +0200 Subject: [PATCH 08/72] Refactor ProxyManager for per-tunnel metrics, async bytes collection, and session counting --- proxy/manager.go | 247 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 241 insertions(+), 6 deletions(-) diff --git a/proxy/manager.go b/proxy/manager.go index bf10322..e2b7a79 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -1,14 +1,20 @@ package proxy import ( + "context" "fmt" "io" "net" + "os" "strings" "sync" + "sync/atomic" "time" + "github.com/fosrl/newt/internal/state" + "github.com/fosrl/newt/internal/telemetry" "github.com/fosrl/newt/logger" + "go.opentelemetry.io/otel/attribute" "golang.zx2c4.com/wireguard/tun/netstack" "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet" ) @@ -28,6 +34,60 @@ type ProxyManager struct { udpConns []*gonet.UDPConn running bool mutex sync.RWMutex + + // telemetry (multi-tunnel) + currentTunnelID string + tunnels map[string]*tunnelEntry + asyncBytes bool + flushStop chan struct{} +} + +// tunnelEntry holds per-tunnel attributes and (optional) async counters. +type tunnelEntry struct { + attrInTCP attribute.Set + attrOutTCP attribute.Set + attrInUDP attribute.Set + attrOutUDP attribute.Set + + bytesInTCP atomic.Uint64 + bytesOutTCP atomic.Uint64 + bytesInUDP atomic.Uint64 + bytesOutUDP atomic.Uint64 +} + +// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set. 
+type countingWriter struct { + ctx context.Context + w io.Writer + set attribute.Set + pm *ProxyManager + ent *tunnelEntry + out bool // false=in, true=out + proto string // "tcp" or "udp" +} + +func (cw *countingWriter) Write(p []byte) (int, error) { + n, err := cw.w.Write(p) + if n > 0 { + if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil { + if cw.proto == "tcp" { + if cw.out { + cw.ent.bytesOutTCP.Add(uint64(n)) + } else { + cw.ent.bytesInTCP.Add(uint64(n)) + } + } else if cw.proto == "udp" { + if cw.out { + cw.ent.bytesOutUDP.Add(uint64(n)) + } else { + cw.ent.bytesInUDP.Add(uint64(n)) + } + } + } else { + telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set) + } + } + return n, err } // NewProxyManager creates a new proxy manager instance @@ -38,9 +98,56 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager { udpTargets: make(map[string]map[int]string), listeners: make([]*gonet.TCPListener, 0), udpConns: make([]*gonet.UDPConn, 0), + tunnels: make(map[string]*tunnelEntry), } } +// SetTunnelID sets the WireGuard peer public key used as tunnel_id label. +func (pm *ProxyManager) SetTunnelID(id string) { + pm.mutex.Lock() + defer pm.mutex.Unlock() + pm.currentTunnelID = id + if _, ok := pm.tunnels[id]; !ok { + pm.tunnels[id] = &tunnelEntry{} + } + e := pm.tunnels[id] + e.attrInTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "tcp")) + e.attrOutTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "tcp")) + e.attrInUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "udp")) + e.attrOutUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "udp")) +} + +// ClearTunnelID clears cached attribute sets for the current tunnel. 
+func (pm *ProxyManager) ClearTunnelID() { + pm.mutex.Lock() + defer pm.mutex.Unlock() + id := pm.currentTunnelID + if id == "" { + return + } + if e, ok := pm.tunnels[id]; ok { + // final flush for this tunnel + inTCP := e.bytesInTCP.Swap(0) + outTCP := e.bytesOutTCP.Swap(0) + inUDP := e.bytesInUDP.Swap(0) + outUDP := e.bytesOutUDP.Swap(0) + if inTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) + } + if outTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) + } + if inUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) + } + if outUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) + } + delete(pm.tunnels, id) + } + pm.currentTunnelID = "" +} + // init function without tnet func NewProxyManagerWithoutTNet() *ProxyManager { return &ProxyManager{ @@ -160,6 +267,75 @@ func (pm *ProxyManager) Start() error { return nil } +func (pm *ProxyManager) SetAsyncBytes(b bool) { + pm.mutex.Lock() + defer pm.mutex.Unlock() + pm.asyncBytes = b + if b && pm.flushStop == nil { + pm.flushStop = make(chan struct{}) + go pm.flushLoop() + } +} +func (pm *ProxyManager) flushLoop() { + flushInterval := 2 * time.Second + if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" { + if d, err := time.ParseDuration(v); err == nil && d > 0 { + if d/2 < flushInterval { + flushInterval = d / 2 + } + } + } + ticker := time.NewTicker(flushInterval) + defer ticker.Stop() + for { + select { + case <-ticker.C: + pm.mutex.RLock() + for _, e := range pm.tunnels { + inTCP := e.bytesInTCP.Swap(0) + outTCP := e.bytesOutTCP.Swap(0) + inUDP := e.bytesInUDP.Swap(0) + outUDP := e.bytesOutUDP.Swap(0) + if inTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) + } + if outTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) + } + if inUDP > 0 { + 
telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) + } + if outUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) + } + } + pm.mutex.RUnlock() + case <-pm.flushStop: + pm.mutex.RLock() + for _, e := range pm.tunnels { + inTCP := e.bytesInTCP.Swap(0) + outTCP := e.bytesOutTCP.Swap(0) + inUDP := e.bytesInUDP.Swap(0) + outUDP := e.bytesOutUDP.Swap(0) + if inTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) + } + if outTCP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) + } + if inUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) + } + if outUDP > 0 { + telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) + } + } + pm.mutex.RUnlock() + return + } + } +} + func (pm *ProxyManager) Stop() error { pm.mutex.Lock() defer pm.mutex.Unlock() @@ -236,6 +412,14 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr return nil } +// getEntry returns per-tunnel entry or nil. 
+func (pm *ProxyManager) getEntry(id string) *tunnelEntry { + pm.mutex.RLock() + e := pm.tunnels[id] + pm.mutex.RUnlock() + return e +} + func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) { for { conn, err := listener.Accept() @@ -257,6 +441,11 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) continue } + // Count sessions only once per accepted TCP connection + if pm.currentTunnelID != "" { + state.Global().IncSessions(pm.currentTunnelID) + } + go func() { target, err := net.Dial("tcp", targetAddr) if err != nil { @@ -265,24 +454,35 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) return } + // already incremented on accept + // Create a WaitGroup to ensure both copy operations complete var wg sync.WaitGroup wg.Add(2) + // client -> target (direction=in) go func() { defer wg.Done() - io.Copy(target, conn) - target.Close() + e := pm.getEntry(pm.currentTunnelID) + cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"} + _, _ = io.Copy(cw, conn) + _ = target.Close() }() + // target -> client (direction=out) go func() { defer wg.Done() - io.Copy(conn, target) - conn.Close() + e := pm.getEntry(pm.currentTunnelID) + cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"} + _, _ = io.Copy(cw, target) + _ = conn.Close() }() - // Wait for both copies to complete + // Wait for both copies to complete then session -1 wg.Wait() + if pm.currentTunnelID != "" { + state.Global().DecSessions(pm.currentTunnelID) + } }() } } @@ -326,6 +526,18 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { } clientKey := remoteAddr.String() + // bytes from client -> target (direction=in) + if pm.currentTunnelID != "" && n > 0 { + if pm.asyncBytes { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.bytesInUDP.Add(uint64(n)) + } + } 
else { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP) + } + } + } clientsMutex.RLock() targetConn, exists := clientConns[clientKey] clientsMutex.RUnlock() @@ -366,6 +578,19 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { return // defer will handle cleanup } + // bytes from target -> client (direction=out) + if pm.currentTunnelID != "" && n > 0 { + if pm.asyncBytes { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.bytesOutUDP.Add(uint64(n)) + } + } else { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP) + } + } + } + _, err = conn.WriteTo(buffer[:n], remoteAddr) if err != nil { logger.Error("Error writing to client: %v", err) @@ -375,13 +600,23 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { }(clientKey, targetConn, remoteAddr) } - _, err = targetConn.Write(buffer[:n]) + written, err := targetConn.Write(buffer[:n]) if err != nil { logger.Error("Error writing to target: %v", err) targetConn.Close() clientsMutex.Lock() delete(clientConns, clientKey) clientsMutex.Unlock() + } else if pm.currentTunnelID != "" && written > 0 { + if pm.asyncBytes { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.bytesInUDP.Add(uint64(written)) + } + } else { + if e := pm.getEntry(pm.currentTunnelID); e != nil { + telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP) + } + } } } } From 0d55e35784b850fd8847a46fe1653d69c1466e1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:13:04 +0200 Subject: [PATCH 09/72] Add tunnel latency and reconnect telemetry to ping logic --- util.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/util.go b/util.go index 7d6da4f..dc19388 100644 --- a/util.go +++ b/util.go @@ -2,6 +2,7 @@ package main import ( "bytes" + 
"context" "encoding/base64" "encoding/hex" "encoding/json" @@ -14,6 +15,7 @@ import ( "math/rand" + "github.com/fosrl/newt/internal/telemetry" "github.com/fosrl/newt/logger" "github.com/fosrl/newt/proxy" "github.com/fosrl/newt/websocket" @@ -229,7 +231,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background") } -func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} { +func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} { maxInterval := 6 * time.Second currentInterval := pingInterval consecutiveFailures := 0 @@ -292,6 +294,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien if !connectionLost { connectionLost = true logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) + if tunnelID != "" { + telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout) + } stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second) // Send registration message to the server for backward compatibility err := client.SendMessage("newt/wg/register", map[string]interface{}{ @@ -318,6 +323,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien } else { // Track recent latencies recentLatencies = append(recentLatencies, latency) + // Record tunnel latency (limit sampling to this periodic check) + if tunnelID != "" { + telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds()) + } if len(recentLatencies) > 10 { recentLatencies = recentLatencies[1:] } From 660adcc72d12d7b3466d65750032c5c3c3a74a21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:13:04 +0200 Subject: [PATCH 10/72] Instrument 
authentication and WebSocket connection logic for telemetry events --- websocket/client.go | 64 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/websocket/client.go b/websocket/client.go index 0c0664a..c9ac264 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -18,6 +18,10 @@ import ( "github.com/fosrl/newt/logger" "github.com/gorilla/websocket" + + "context" + "github.com/fosrl/newt/internal/telemetry" + "go.opentelemetry.io/otel" ) type Client struct { @@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) { } resp, err := client.Do(req) if err != nil { + telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err)) return "", fmt.Errorf("failed to request new token: %w", err) } defer resp.Body.Close() @@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) { if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) + telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure") + bin := "http_other" + if resp.StatusCode >= 500 { + bin = "http_5xx" + } else if resp.StatusCode >= 400 { + bin = "http_4xx" + } + telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin) + // Reconnect reason mapping for auth failures + if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { + telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError) + } return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) } @@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) { } logger.Debug("Received token: %s", tokenResp.Data.Token) + telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success") return tokenResp.Data.Token, nil } +// classifyConnError maps common errors to low-cardinality 
error_type labels +func classifyConnError(err error) string { + if err == nil { + return "" + } + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"): + return "tls" + case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"): + return "timeout" + case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"): + return "dns" + case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"): + return "auth" + case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"): + return "io" + default: + return "other" + } +} + func (c *Client) connectWithRetry() { for { select { @@ -337,6 +377,10 @@ func (c *Client) establishConnection() error { // Get token for authentication token, err := c.getToken() if err != nil { + // telemetry: connection attempt failed before dialing + // site_id isn't globally available here; use client ID as site_id (low cardinality) + telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure") + telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err)) return fmt.Errorf("failed to get token: %w", err) } @@ -369,7 +413,11 @@ func (c *Client) establishConnection() error { q.Set("clientType", c.clientType) u.RawQuery = q.Encode() - // Connect to WebSocket + // Connect to WebSocket (optional span) + tr := otel.Tracer("newt") + spanCtx, span := tr.Start(context.Background(), "ws.connect") + defer span.End() + dialer := websocket.DefaultDialer // Use new TLS configuration method @@ -391,11 +439,23 @@ func (c *Client) establishConnection() error { logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable") } - conn, _, err := dialer.Dial(u.String(), 
nil) +conn, _, err := dialer.DialContext(spanCtx, u.String(), nil) if err != nil { + telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure") + etype := classifyConnError(err) + telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype) + // Map handshake-related errors to reconnect reasons where appropriate + if etype == "tls" { + telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError) + } else if etype == "timeout" { + telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout) + } else { + telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError) + } return fmt.Errorf("failed to connect to WebSocket: %w", err) } + telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success") c.conn = conn c.setConnected(true) From 8f7f9c417ce3e68fa2a2cc08a364a5ab5aaab671 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:13:05 +0200 Subject: [PATCH 11/72] Refactor WireGuard and netstack services for telemetry integration --- wg/wg.go | 15 +++++++++++++-- wgnetstack/wgnetstack.go | 13 +++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/wg/wg.go b/wg/wg.go index 3cee1a9..adf8df6 100644 --- a/wg/wg.go +++ b/wg/wg.go @@ -3,6 +3,7 @@ package wg import ( + "context" "encoding/json" "errors" "fmt" @@ -13,16 +14,19 @@ import ( "sync" "time" + "math/rand" + "github.com/fosrl/newt/logger" "github.com/fosrl/newt/network" "github.com/fosrl/newt/websocket" "github.com/vishvananda/netlink" "golang.org/x/crypto/chacha20poly1305" "golang.org/x/crypto/curve25519" - "golang.org/x/exp/rand" "golang.zx2c4.com/wireguard/conn" "golang.zx2c4.com/wireguard/wgctrl" "golang.zx2c4.com/wireguard/wgctrl/wgtypes" + + "github.com/fosrl/newt/internal/telemetry" ) type WgConfig struct { @@ -106,7 +110,7 @@ func FindAvailableUDPPort(minPort, maxPort uint16) (uint16, error) { } // Fisher-Yates 
shuffle to randomize the port order - rand.Seed(uint64(time.Now().UnixNano())) + rand.Seed(time.Now().UnixNano()) for i := len(portRange) - 1; i > 0; i-- { j := rand.Intn(i + 1) portRange[i], portRange[j] = portRange[j], portRange[i] @@ -298,6 +302,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) { s.stopGetConfig = nil } + // telemetry: config reload success + telemetry.IncConfigReload(context.Background(), "success") + // Optional reconnect reason mapping: config change + if s.serverPubKey != "" { + telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange) + } + // Ensure the WireGuard interface and peers are configured if err := s.ensureWireguardInterface(config); err != nil { logger.Error("Failed to ensure WireGuard interface: %v", err) diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go index 6684c40..09f160e 100644 --- a/wgnetstack/wgnetstack.go +++ b/wgnetstack/wgnetstack.go @@ -1,6 +1,7 @@ package wgnetstack import ( + "context" "crypto/rand" "encoding/base64" "encoding/hex" @@ -26,6 +27,8 @@ import ( "golang.zx2c4.com/wireguard/tun" "golang.zx2c4.com/wireguard/tun/netstack" "golang.zx2c4.com/wireguard/wgctrl/wgtypes" + + "github.com/fosrl/newt/internal/telemetry" ) type WgConfig struct { @@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str return service, nil } +// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally. 
+func (s *WireGuardService) ReportRTT(seconds float64) { + if s.serverPubKey == "" { return } + telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds) +} + func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) { logger.Debug("Received: %+v", msg) // if there is no wgData or pm, we can't add targets if s.TunnelIP == "" || s.proxyManager == nil { logger.Info("No tunnel IP or proxy manager available") - return - } + return +} targetData, err := parseTargetData(msg.Data) if err != nil { From 2d4f656852f69cc5466ba76363d8301f07aec354 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:15:36 +0200 Subject: [PATCH 12/72] Add telemetry metrics and constants for improved observability --- internal/telemetry/constants.go | 19 ++ internal/telemetry/constants_test.go | 32 +++ internal/telemetry/metrics.go | 231 +++++++++++++++ internal/telemetry/state_view.go | 63 +++++ internal/telemetry/telemetry.go | 265 ++++++++++++++++++ .../telemetry/telemetry_attrfilter_test.go | 43 +++ internal/telemetry/telemetry_golden_test.go | 50 ++++ internal/telemetry/telemetry_smoke_test.go | 54 ++++ 8 files changed, 757 insertions(+) create mode 100644 internal/telemetry/constants.go create mode 100644 internal/telemetry/constants_test.go create mode 100644 internal/telemetry/metrics.go create mode 100644 internal/telemetry/state_view.go create mode 100644 internal/telemetry/telemetry.go create mode 100644 internal/telemetry/telemetry_attrfilter_test.go create mode 100644 internal/telemetry/telemetry_golden_test.go create mode 100644 internal/telemetry/telemetry_smoke_test.go diff --git a/internal/telemetry/constants.go b/internal/telemetry/constants.go new file mode 100644 index 0000000..bc117bf --- /dev/null +++ b/internal/telemetry/constants.go @@ -0,0 +1,19 @@ +package telemetry + +// Protocol labels (low-cardinality) +const ( + ProtocolTCP = "tcp" + ProtocolUDP = "udp" +) + +// Reconnect reason bins 
(fixed, low-cardinality) +const ( + ReasonServerRequest = "server_request" + ReasonTimeout = "timeout" + ReasonPeerClose = "peer_close" + ReasonNetworkChange = "network_change" + ReasonAuthError = "auth_error" + ReasonHandshakeError = "handshake_error" + ReasonConfigChange = "config_change" + ReasonError = "error" +) diff --git a/internal/telemetry/constants_test.go b/internal/telemetry/constants_test.go new file mode 100644 index 0000000..e95fb52 --- /dev/null +++ b/internal/telemetry/constants_test.go @@ -0,0 +1,32 @@ +package telemetry + +import "testing" + +func TestAllowedConstants(t *testing.T) { + allowedReasons := map[string]struct{}{ + ReasonServerRequest: {}, + ReasonTimeout: {}, + ReasonPeerClose: {}, + ReasonNetworkChange: {}, + ReasonAuthError: {}, + ReasonHandshakeError: {}, + ReasonConfigChange: {}, + ReasonError: {}, + } + for k := range allowedReasons { + if k == "" { + t.Fatalf("empty reason constant") + } + } + + allowedProtocols := map[string]struct{}{ + ProtocolTCP: {}, + ProtocolUDP: {}, + } + for k := range allowedProtocols { + if k == "" { + t.Fatalf("empty protocol constant") + } + } +} + diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go new file mode 100644 index 0000000..130fbd3 --- /dev/null +++ b/internal/telemetry/metrics.go @@ -0,0 +1,231 @@ +package telemetry + +import ( + "context" + "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Instruments and helpers for Newt metrics following the naming, units, and +// low-cardinality label guidance from the issue description. +// +// Counters end with _total, durations are in seconds, sizes in bytes. +// Only low-cardinality stable labels are supported: site_id, tunnel_id, +// transport, direction, result, reason, error_type, region. 
+var ( + initOnce sync.Once + + meter metric.Meter + + // Site / Registration + mSiteRegistrations metric.Int64Counter + mSiteOnline metric.Int64ObservableGauge + mSiteLastHeartbeat metric.Float64ObservableGauge + + // Tunnel / Sessions + mTunnelSessions metric.Int64ObservableGauge + mTunnelBytes metric.Int64Counter + mTunnelLatency metric.Float64Histogram + mReconnects metric.Int64Counter + + // Connection / NAT + mConnAttempts metric.Int64Counter + mConnErrors metric.Int64Counter + + // Config/Restart + mConfigReloads metric.Int64Counter + mRestartCount metric.Int64Counter + + // Build info + mBuildInfo metric.Int64ObservableGauge + + buildVersion string + buildCommit string +) + +func registerInstruments() error { + var err error + initOnce.Do(func() { + meter = otel.Meter("newt") + + // Site / Registration + mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total", + metric.WithDescription("Total site registration attempts")) + if err != nil { + return + } + mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online", + metric.WithDescription("Site online (0/1)")) + if err != nil { + return + } + mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_seconds", + metric.WithDescription("Seconds since last site heartbeat")) + if err != nil { + return + } + + // Tunnel / Sessions + mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions", + metric.WithDescription("Active tunnel sessions")) + if err != nil { + return + } + mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total", + metric.WithDescription("Tunnel bytes in/out")) + if err != nil { + return + } + mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds", + metric.WithDescription("Per-tunnel latency in seconds")) + if err != nil { + return + } + mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total", + metric.WithDescription("Tunnel reconnect events")) + if err != nil { + return + } + + // 
Connection / NAT + mConnAttempts, err = meter.Int64Counter("newt_connection_attempts_total", + metric.WithDescription("Connection attempts")) + if err != nil { + return + } + mConnErrors, err = meter.Int64Counter("newt_connection_errors_total", + metric.WithDescription("Connection errors by type")) + if err != nil { + return + } + + // Config/Restart + mConfigReloads, _ = meter.Int64Counter("newt_config_reloads_total", + metric.WithDescription("Configuration reloads")) + mRestartCount, _ = meter.Int64Counter("newt_restart_count_total", + metric.WithDescription("Process restart count (incremented on start)")) + + // Build info gauge (value 1 with version/commit attributes) + mBuildInfo, _ = meter.Int64ObservableGauge("newt_build_info", + metric.WithDescription("Newt build information (value is always 1)")) + + // Register a default callback for build info if version/commit set + meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + if buildVersion == "" && buildCommit == "" { + return nil + } + attrs := []attribute.KeyValue{} + if buildVersion != "" { + attrs = append(attrs, attribute.String("version", buildVersion)) + } + if buildCommit != "" { + attrs = append(attrs, attribute.String("commit", buildCommit)) + } + o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...)) + return nil + }, mBuildInfo) + }) + return err +} + +// Observable registration: Newt can register a callback to report gauges. +// Call SetObservableCallback once to start observing online status, last +// heartbeat seconds, and active sessions. + +var ( + obsOnce sync.Once + obsStopper func() +) + +// SetObservableCallback registers a single callback that will be invoked +// on collection. Use the provided observer to emit values for the observable +// gauges defined here. 
+// +// Example inside your code (where you have access to current state): +// +// telemetry.SetObservableCallback(func(ctx context.Context, o metric.Observer) error { +// o.ObserveInt64(mSiteOnline, 1, attribute.String("site_id", siteID)) +// o.ObserveFloat64(mSiteLastHeartbeat, time.Since(lastHB).Seconds(), attribute.String("site_id", siteID)) +// o.ObserveInt64(mTunnelSessions, int64(len(activeSessions)), attribute.String("site_id", siteID)) +// return nil +// }) +func SetObservableCallback(cb func(context.Context, metric.Observer) error) { + obsOnce.Do(func() { + meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions) + obsStopper = func() { /* no-op; otel callbacks are unregistered when provider shuts down */ } + }) +} + +// Build info registration +func RegisterBuildInfo(version, commit string) { + buildVersion = version + buildCommit = commit + // Increment restart count on boot + mRestartCount.Add(context.Background(), 1) +} + +// Config reloads +func IncConfigReload(ctx context.Context, result string) { + mConfigReloads.Add(ctx, 1, metric.WithAttributes(attribute.String("result", result))) +} + +// Helpers for counters/histograms + +func IncSiteRegistration(ctx context.Context, siteID, region, result string) { + attrs := []attribute.KeyValue{ + attribute.String("site_id", siteID), + attribute.String("result", result), + } + if region != "" { + attrs = append(attrs, attribute.String("region", region)) + } + mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrs...)) +} + +func AddTunnelBytes(ctx context.Context, siteID, tunnelID, direction string, n int64) { + mTunnelBytes.Add(ctx, n, metric.WithAttributes( + attribute.String("site_id", siteID), + attribute.String("tunnel_id", tunnelID), + attribute.String("direction", direction), + )) +} + +// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations. 
+func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) { + mTunnelBytes.Add(ctx, n, metric.WithAttributeSet(attrs)) +} + +func ObserveTunnelLatency(ctx context.Context, siteID, tunnelID, transport string, seconds float64) { + mTunnelLatency.Record(ctx, seconds, metric.WithAttributes( + attribute.String("site_id", siteID), + attribute.String("tunnel_id", tunnelID), + attribute.String("transport", transport), + )) +} + +func IncReconnect(ctx context.Context, siteID, tunnelID, reason string) { + mReconnects.Add(ctx, 1, metric.WithAttributes( + attribute.String("site_id", siteID), + attribute.String("tunnel_id", tunnelID), + attribute.String("reason", reason), + )) +} + +func IncConnAttempt(ctx context.Context, siteID, transport, result string) { + mConnAttempts.Add(ctx, 1, metric.WithAttributes( + attribute.String("site_id", siteID), + attribute.String("transport", transport), + attribute.String("result", result), + )) +} + +func IncConnError(ctx context.Context, siteID, transport, typ string) { + mConnErrors.Add(ctx, 1, metric.WithAttributes( + attribute.String("site_id", siteID), + attribute.String("transport", transport), + attribute.String("error_type", typ), + )) +} diff --git a/internal/telemetry/state_view.go b/internal/telemetry/state_view.go new file mode 100644 index 0000000..4c97ddf --- /dev/null +++ b/internal/telemetry/state_view.go @@ -0,0 +1,63 @@ +package telemetry + +import ( + "context" + "sync/atomic" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// StateView provides a read-only view for observable gauges. +// Implementations must be concurrency-safe and avoid blocking operations. +// All methods should be fast and use RLocks where applicable. +type StateView interface { + // ListSites returns a stable, low-cardinality list of site IDs to expose. + ListSites() []string + // Online returns whether the site is online. 
+ Online(siteID string) (online bool, ok bool) + // LastHeartbeat returns the last heartbeat time for a site. + LastHeartbeat(siteID string) (t time.Time, ok bool) + // ActiveSessions returns the current number of active sessions for a site (across tunnels), + // or scoped to site if your model is site-scoped. + ActiveSessions(siteID string) (n int64, ok bool) +} + +var ( + stateView atomic.Value // of type StateView +) + +// RegisterStateView sets the global StateView used by the default observable callback. +func RegisterStateView(v StateView) { + stateView.Store(v) + // If instruments are registered, ensure a callback exists. + if v != nil { + SetObservableCallback(func(ctx context.Context, o metric.Observer) error { + if any := stateView.Load(); any != nil { + if sv, ok := any.(StateView); ok { + for _, siteID := range sv.ListSites() { + if online, ok := sv.Online(siteID); ok { + val := int64(0) + if online { + val = 1 + } + o.ObserveInt64(mSiteOnline, val, metric.WithAttributes(attribute.String("site_id", siteID))) + } + if t, ok := sv.LastHeartbeat(siteID); ok { + secs := time.Since(t).Seconds() + o.ObserveFloat64(mSiteLastHeartbeat, secs, metric.WithAttributes(attribute.String("site_id", siteID))) + } + // If the view supports per-tunnel sessions, report them labeled by tunnel_id. 
+ if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok { + for tid, n := range tm.SessionsByTunnel() { + o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attribute.String("tunnel_id", tid))) + } + } + } + } + } + return nil + }) + } +} diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go new file mode 100644 index 0000000..20a25c0 --- /dev/null +++ b/internal/telemetry/telemetry.go @@ -0,0 +1,265 @@ +package telemetry + +import ( + "context" + "errors" + "net/http" + "os" + "strings" + "time" + + promclient "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "go.opentelemetry.io/contrib/instrumentation/runtime" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/exporters/prometheus" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" + "google.golang.org/grpc/credentials" +) + +// Config controls telemetry initialization via env flags. 
+// +// Defaults align with the issue requirements: +// - Prometheus exporter enabled by default (/metrics) +// - OTLP exporter disabled by default +// - Durations in seconds, bytes in raw bytes +// - Admin HTTP server address configurable (for mounting /metrics) +type Config struct { + ServiceName string + ServiceVersion string + + // Optional resource attributes + SiteID string + Region string + + PromEnabled bool + OTLPEnabled bool + + OTLPEndpoint string // host:port + OTLPInsecure bool + + MetricExportInterval time.Duration + AdminAddr string // e.g.: ":2112" + + // Optional build info for newt_build_info metric + BuildVersion string + BuildCommit string +} + +// FromEnv reads configuration from environment variables. +// +// NEWT_METRICS_PROMETHEUS_ENABLED (default: true) +// NEWT_METRICS_OTLP_ENABLED (default: false) +// OTEL_EXPORTER_OTLP_ENDPOINT (default: "localhost:4317") +// OTEL_EXPORTER_OTLP_INSECURE (default: true) +// OTEL_METRIC_EXPORT_INTERVAL (default: 15s) +// OTEL_SERVICE_NAME (default: "newt") +// OTEL_SERVICE_VERSION (default: "") +// NEWT_ADMIN_ADDR (default: ":2112") +func FromEnv() Config { + return Config{ + ServiceName: getenv("OTEL_SERVICE_NAME", "newt"), + ServiceVersion: os.Getenv("OTEL_SERVICE_VERSION"), + Region: os.Getenv("NEWT_REGION"), + PromEnabled: getenv("NEWT_METRICS_PROMETHEUS_ENABLED", "true") == "true", + OTLPEnabled: getenv("NEWT_METRICS_OTLP_ENABLED", "false") == "true", + OTLPEndpoint: getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"), + OTLPInsecure: getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true", + MetricExportInterval: getdur("OTEL_METRIC_EXPORT_INTERVAL", 15*time.Second), + AdminAddr: getenv("NEWT_ADMIN_ADDR", "127.0.0.1:2112"), + } +} + +// Setup holds initialized telemetry providers and (optionally) a /metrics handler. +// Call Shutdown when the process terminates to flush exporters. 
+type Setup struct { + MeterProvider *sdkmetric.MeterProvider + TracerProvider *sdktrace.TracerProvider + + PrometheusHandler http.Handler // nil if Prometheus exporter disabled + + shutdowns []func(context.Context) error +} + +// Init configures OpenTelemetry metrics and (optionally) tracing. +// +// It sets a global MeterProvider and TracerProvider, registers runtime instrumentation, +// installs recommended histogram views for *_latency_seconds, and returns a Setup with +// a Shutdown method to flush exporters. +func Init(ctx context.Context, cfg Config) (*Setup, error) { + res, _ := resource.New(ctx, + resource.WithFromEnv(), + resource.WithHost(), + resource.WithAttributes( + semconv.ServiceName(cfg.ServiceName), + semconv.ServiceVersion(cfg.ServiceVersion), + // Optional resource attributes + attribute.String("site_id", cfg.SiteID), + attribute.String("region", cfg.Region), + ), + ) + + s := &Setup{} + + // Build metric readers/exporters + var readers []sdkmetric.Reader + + // Prometheus exporter exposes a native /metrics handler for scraping + if cfg.PromEnabled { + reg := promclient.NewRegistry() + exp, err := prometheus.New(prometheus.WithRegisterer(reg)) + if err != nil { + return nil, err + } + readers = append(readers, exp) + s.PrometheusHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{}) + } + + // Optional OTLP metric exporter (gRPC) + if cfg.OTLPEnabled { + mopts := []otlpmetricgrpc.Option{otlpmetricgrpc.WithEndpoint(cfg.OTLPEndpoint)} + // Headers support via OTEL_EXPORTER_OTLP_HEADERS (k=v,k2=v2) + if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { + mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs)) + } + if cfg.OTLPInsecure { + mopts = append(mopts, otlpmetricgrpc.WithInsecure()) + } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { + creds, cerr := credentials.NewClientTLSFromFile(certFile, "") + if cerr == nil { + mopts = append(mopts, 
otlpmetricgrpc.WithTLSCredentials(creds)) + } + } + mexp, err := otlpmetricgrpc.New(ctx, mopts...) + if err != nil { + return nil, err + } + readers = append(readers, sdkmetric.NewPeriodicReader(mexp, sdkmetric.WithInterval(cfg.MetricExportInterval))) + s.shutdowns = append(s.shutdowns, mexp.Shutdown) + } + + // Build provider options iteratively (WithReader is not variadic) + var mpOpts []sdkmetric.Option + mpOpts = append(mpOpts, sdkmetric.WithResource(res)) + for _, r := range readers { + mpOpts = append(mpOpts, sdkmetric.WithReader(r)) + } + // Default view for latency histograms in seconds. + mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView( + sdkmetric.Instrument{ + Name: "newt_*_latency_seconds", + }, + sdkmetric.Stream{ + Aggregation: sdkmetric.AggregationExplicitBucketHistogram{ + Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}, + }, + }, + ))) + // Attribute whitelist: only allow expected low-cardinality keys on newt_* instruments. + mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView( + sdkmetric.Instrument{Name: "newt_*"}, + sdkmetric.Stream{ + AttributeFilter: func(kv attribute.KeyValue) bool { + k := string(kv.Key) + switch k { + case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type": + return true + default: + return false + } + }, + }, + ))) + mp := sdkmetric.NewMeterProvider(mpOpts...) 
+ otel.SetMeterProvider(mp) + s.MeterProvider = mp + s.shutdowns = append(s.shutdowns, mp.Shutdown) + + // Optional tracing (OTLP over gRPC) + if cfg.OTLPEnabled { + topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)} + if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { + topts = append(topts, otlptracegrpc.WithHeaders(hdrs)) + } + if cfg.OTLPInsecure { + topts = append(topts, otlptracegrpc.WithInsecure()) + } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { + creds, cerr := credentials.NewClientTLSFromFile(certFile, "") + if cerr == nil { + topts = append(topts, otlptracegrpc.WithTLSCredentials(creds)) + } + } + exp, err := otlptracegrpc.New(ctx, topts...) + if err == nil { + tp := sdktrace.NewTracerProvider( + sdktrace.WithBatcher(exp), + sdktrace.WithResource(res), + ) + otel.SetTracerProvider(tp) + s.TracerProvider = tp + s.shutdowns = append(s.shutdowns, func(ctx context.Context) error { + return errors.Join(exp.Shutdown(ctx), tp.Shutdown(ctx)) + }) + } + } + + // Export Go runtime metrics (goroutines, GC, mem, etc.) + _ = runtime.Start(runtime.WithMeterProvider(mp)) + + // Register instruments after provider is set + if err := registerInstruments(); err != nil { + return nil, err + } + // Optional build info metric + if cfg.BuildVersion != "" || cfg.BuildCommit != "" { + RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit) + } + + return s, nil +} + +// Shutdown flushes exporters and providers in reverse init order. 
+func (s *Setup) Shutdown(ctx context.Context) error { + var err error + for i := len(s.shutdowns) - 1; i >= 0; i-- { + err = errors.Join(err, s.shutdowns[i](ctx)) + } + return err +} + +func parseOTLPHeaders(h string) map[string]string { + m := map[string]string{} + if h == "" { + return m + } + pairs := strings.Split(h, ",") + for _, p := range pairs { + kv := strings.SplitN(strings.TrimSpace(p), "=", 2) + if len(kv) == 2 { + m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1]) + } + } + return m +} + +func getenv(k, d string) string { + if v := os.Getenv(k); v != "" { + return v + } + return d +} + +func getdur(k string, d time.Duration) time.Duration { + if v := os.Getenv(k); v != "" { + if p, e := time.ParseDuration(v); e == nil { + return p + } + } + return d +} diff --git a/internal/telemetry/telemetry_attrfilter_test.go b/internal/telemetry/telemetry_attrfilter_test.go new file mode 100644 index 0000000..461888f --- /dev/null +++ b/internal/telemetry/telemetry_attrfilter_test.go @@ -0,0 +1,43 @@ +package telemetry + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "go.opentelemetry.io/otel/attribute" +) + +// Test that disallowed attributes are filtered from the exposition. 
+func TestAttributeFilterDropsUnknownKeys(t *testing.T) { + ctx := context.Background() +cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"} + tel, err := Init(ctx, cfg) + if err != nil { t.Fatalf("init: %v", err) } + defer func() { _ = tel.Shutdown(context.Background()) }() + + if tel.PrometheusHandler == nil { t.Fatalf("prom handler nil") } + ts := httptest.NewServer(tel.PrometheusHandler) + defer ts.Close() + +// Add samples with disallowed attribute keys + for _, k := range []string{"forbidden", "site_id", "host"} { + set := attribute.NewSet(attribute.String(k, "x")) + AddTunnelBytesSet(ctx, 123, set) + } + time.Sleep(50 * time.Millisecond) + + resp, err := http.Get(ts.URL) + if err != nil { t.Fatalf("GET: %v", err) } + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + body := string(b) + if strings.Contains(body, "forbidden=") { + t.Fatalf("unexpected forbidden attribute leaked into metrics: %s", body) + } +} + diff --git a/internal/telemetry/telemetry_golden_test.go b/internal/telemetry/telemetry_golden_test.go new file mode 100644 index 0000000..91dcbd2 --- /dev/null +++ b/internal/telemetry/telemetry_golden_test.go @@ -0,0 +1,50 @@ +package telemetry + +import ( + "bufio" + "context" + "io" + "net/http" + "net/http/httptest" + "os" + "strings" + "testing" + "time" +) + +// Golden test that /metrics contains expected metric names. 
+func TestMetricsGoldenContains(t *testing.T) { + ctx := context.Background() +cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0", BuildVersion: "test"} + tel, err := Init(ctx, cfg) + if err != nil { t.Fatalf("telemetry init error: %v", err) } + defer func() { _ = tel.Shutdown(context.Background()) }() + + if tel.PrometheusHandler == nil { t.Fatalf("prom handler nil") } + ts := httptest.NewServer(tel.PrometheusHandler) + defer ts.Close() + + // Trigger a counter + IncConnAttempt(ctx, "ignored", "websocket", "success") + time.Sleep(100 * time.Millisecond) + + resp, err := http.Get(ts.URL) + if err != nil { t.Fatalf("GET metrics failed: %v", err) } + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + body := string(b) + + f, err := os.Open("internal/telemetry/testdata/expected_contains.golden") + if err != nil { t.Fatalf("read golden: %v", err) } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + needle := strings.TrimSpace(s.Text()) + if needle == "" { continue } + if !strings.Contains(body, needle) { + t.Fatalf("expected metrics body to contain %q. body=\n%s", needle, body) + } + } + if err := s.Err(); err != nil { t.Fatalf("scan golden: %v", err) } +} + diff --git a/internal/telemetry/telemetry_smoke_test.go b/internal/telemetry/telemetry_smoke_test.go new file mode 100644 index 0000000..b820af1 --- /dev/null +++ b/internal/telemetry/telemetry_smoke_test.go @@ -0,0 +1,54 @@ +package telemetry + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +// Smoke test that /metrics contains at least one newt_* metric when Prom exporter is enabled. 
+func TestMetricsSmoke(t *testing.T) { + ctx := context.Background() + cfg := Config{ + ServiceName: "newt", + PromEnabled: true, + OTLPEnabled: false, + AdminAddr: "127.0.0.1:0", + BuildVersion: "test", + BuildCommit: "deadbeef", + MetricExportInterval: 5 * time.Second, + } + tel, err := Init(ctx, cfg) + if err != nil { + t.Fatalf("telemetry init error: %v", err) + } + defer func() { _ = tel.Shutdown(context.Background()) }() + + // Serve the Prom handler on a test server + if tel.PrometheusHandler == nil { + t.Fatalf("Prometheus handler nil; PromEnabled should enable it") + } + ts := httptest.NewServer(tel.PrometheusHandler) + defer ts.Close() + + // Record a simple metric and then fetch /metrics + IncConnAttempt(ctx, "site-1", "websocket", "success") + // Give the exporter a tick to collect + time.Sleep(100 * time.Millisecond) + + resp, err := http.Get(ts.URL) + if err != nil { + t.Fatalf("GET /metrics failed: %v", err) + } + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + body := string(b) + if !strings.Contains(body, "newt_connection_attempts_total") { + t.Fatalf("expected newt_connection_attempts_total in metrics, got:\n%s", body) + } +} + From 09e9bd9493b9e52df854772992ee5c5390b49e22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:16:17 +0200 Subject: [PATCH 13/72] Implement TelemetryView for thread-safe session management and observability --- internal/state/telemetry_view.go | 80 ++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 internal/state/telemetry_view.go diff --git a/internal/state/telemetry_view.go b/internal/state/telemetry_view.go new file mode 100644 index 0000000..fb1d44a --- /dev/null +++ b/internal/state/telemetry_view.go @@ -0,0 +1,80 @@ +package state + +import ( + "sync" + "sync/atomic" + "time" + + "github.com/fosrl/newt/internal/telemetry" +) + +// TelemetryView is a minimal, thread-safe implementation to feed observables. 
+// Since one Newt process represents one site, we expose a single logical site. +// site_id is a resource attribute, so we do not emit per-site labels here. +type TelemetryView struct { + online atomic.Bool + lastHBUnix atomic.Int64 // unix seconds + // per-tunnel sessions + sessMu sync.RWMutex + sessions map[string]*atomic.Int64 +} + +var ( + globalView atomic.Pointer[TelemetryView] +) + +// Global returns a singleton TelemetryView. +func Global() *TelemetryView { + if v := globalView.Load(); v != nil { return v } + v := &TelemetryView{ sessions: make(map[string]*atomic.Int64) } + globalView.Store(v) + telemetry.RegisterStateView(v) + return v +} + +// Instrumentation helpers +func (v *TelemetryView) IncSessions(tunnelID string) { + v.sessMu.Lock(); defer v.sessMu.Unlock() + c := v.sessions[tunnelID] + if c == nil { c = &atomic.Int64{}; v.sessions[tunnelID] = c } + c.Add(1) +} +func (v *TelemetryView) DecSessions(tunnelID string) { + v.sessMu.Lock(); defer v.sessMu.Unlock() + if c := v.sessions[tunnelID]; c != nil { + c.Add(-1) + if c.Load() <= 0 { delete(v.sessions, tunnelID) } + } +} +func (v *TelemetryView) ClearTunnel(tunnelID string) { + v.sessMu.Lock(); defer v.sessMu.Unlock() + delete(v.sessions, tunnelID) +} +func (v *TelemetryView) SetOnline(b bool) { v.online.Store(b) } +func (v *TelemetryView) TouchHeartbeat() { v.lastHBUnix.Store(time.Now().Unix()) } + +// --- telemetry.StateView interface --- + +func (v *TelemetryView) ListSites() []string { return []string{"self"} } +func (v *TelemetryView) Online(_ string) (bool, bool) { return v.online.Load(), true } +func (v *TelemetryView) LastHeartbeat(_ string) (time.Time, bool) { + sec := v.lastHBUnix.Load() + if sec == 0 { return time.Time{}, false } + return time.Unix(sec, 0), true +} +func (v *TelemetryView) ActiveSessions(_ string) (int64, bool) { + // aggregated sessions (not used for per-tunnel gauge) + v.sessMu.RLock(); defer v.sessMu.RUnlock() + var sum int64 + for _, c := range v.sessions { if c != 
nil { sum += c.Load() } } + return sum, true +} + +// Extended accessor used by telemetry callback to publish per-tunnel samples. +func (v *TelemetryView) SessionsByTunnel() map[string]int64 { + v.sessMu.RLock(); defer v.sessMu.RUnlock() + out := make(map[string]int64, len(v.sessions)) + for id, c := range v.sessions { if c != nil && c.Load() > 0 { out[id] = c.Load() } } + return out +} + From 0f83489f11c74f04561ce996f43652d24dd59f9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:16:44 +0200 Subject: [PATCH 14/72] Add OpenTelemetry configuration and observability documentation --- docker-compose.metrics.yml | 33 +++++++ docs/observability.md | 169 +++++++++++++++++++++++++++++++++++ examples/otel-collector.yaml | 41 +++++++++ examples/prometheus.yml | 11 +++ 4 files changed, 254 insertions(+) create mode 100644 docker-compose.metrics.yml create mode 100644 docs/observability.md create mode 100644 examples/otel-collector.yaml create mode 100644 examples/prometheus.yml diff --git a/docker-compose.metrics.yml b/docker-compose.metrics.yml new file mode 100644 index 0000000..76b92a8 --- /dev/null +++ b/docker-compose.metrics.yml @@ -0,0 +1,33 @@ +services: + collector: + image: otel/opentelemetry-collector:0.111.0 + command: ["--config=/etc/otelcol/config.yaml"] + volumes: + - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC in + - "8889:8889" # Prometheus scrape out + + newt: + build: . 
+ image: newt:dev + environment: + OTEL_SERVICE_NAME: newt + NEWT_METRICS_PROMETHEUS_ENABLED: "true" + NEWT_METRICS_OTLP_ENABLED: "true" + OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317" + OTEL_EXPORTER_OTLP_INSECURE: "true" + OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative" + NEWT_ADMIN_ADDR: "0.0.0.0:2112" + ports: + - "2112:2112" + depends_on: + - collector + + prometheus: + image: prom/prometheus:v2.55.0 + volumes: + - ./examples/prometheus.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "9090:9090" + diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 0000000..3e9e890 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,169 @@ +# OpenTelemetry Observability for Newt + +This document describes how Newt exposes metrics using the OpenTelemetry (OTel) Go SDK, how to enable Prometheus scraping, and how to send data to an OpenTelemetry Collector for further export. + +Goals + +- Provide a /metrics endpoint in Prometheus exposition format (via OTel Prometheus exporter) +- Keep metrics backend-agnostic; optional OTLP export to a Collector +- Use OTel semantic conventions where applicable and enforce SI units +- Low-cardinality, stable labels only + +Enable via flags (ENV mirrors) + +- --metrics (default: true) ↔ NEWT_METRICS_PROMETHEUS_ENABLED +- --metrics-admin-addr (default: 127.0.0.1:2112) ↔ NEWT_ADMIN_ADDR +- --otlp (default: false) ↔ NEWT_METRICS_OTLP_ENABLED + +Enable exporters via environment variables (no code changes required) + +- NEWT_METRICS_PROMETHEUS_ENABLED=true|false (default: true) +- NEWT_METRICS_OTLP_ENABLED=true|false (default: false) +- OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 +- OTEL_EXPORTER_OTLP_INSECURE=true|false (default: true for dev) +- OTEL_SERVICE_NAME=newt (default) +- OTEL_SERVICE_VERSION= +- OTEL_RESOURCE_ATTRIBUTES=service.instance.id=,site_id= +- OTEL_METRIC_EXPORT_INTERVAL=15s (default) +- NEWT_ADMIN_ADDR=127.0.0.1:2112 (default admin HTTP with /metrics) + +Runtime behavior + +- 
When Prometheus exporter is enabled, Newt serves /metrics on NEWT_ADMIN_ADDR (default :2112) +- When OTLP is enabled, metrics and traces are exported to OTLP gRPC endpoint +- Go runtime metrics (goroutines, GC, memory) are exported automatically + +Metric catalog (initial) + +- newt_site_registrations_total (counter) labels: result, region (optional); site_id is a resource attribute +- newt_site_online (observable gauge) no labels (0/1) +- newt_site_last_heartbeat_seconds (observable gauge) no labels +- newt_tunnel_sessions (observable gauge) labels: tunnel_id, transport +- newt_tunnel_bytes_total (counter) labels: tunnel_id, direction (in|out) +- newt_tunnel_latency_seconds (histogram) labels: tunnel_id, transport +- newt_tunnel_reconnects_total (counter) labels: tunnel_id, reason +- newt_connection_attempts_total (counter) labels: transport, result +- newt_connection_errors_total (counter) labels: transport, error_type + +Conventions + +- Durations in seconds, names end with _seconds +- Sizes in bytes, names end with _bytes +- Counters end with _total +- Labels must be low-cardinality and stable + +Histogram buckets + +- Latency (seconds): 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30 + +Local quickstart + +1) Direct Prometheus scrape (do not also scrape the Collector) + NEWT_METRICS_PROMETHEUS_ENABLED=true \ + NEWT_METRICS_OTLP_ENABLED=false \ + NEWT_ADMIN_ADDR="127.0.0.1:2112" \ + ./newt + + curl -s | grep ^newt_ + +2) Using the Collector (compose-style) + NEWT_METRICS_PROMETHEUS_ENABLED=true \ + NEWT_METRICS_OTLP_ENABLED=true \ + OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \ + OTEL_EXPORTER_OTLP_INSECURE=true \ + OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=cumulative \ + ./newt + + Collector config example: examples/otel-collector.yaml + Prometheus scrape config: examples/prometheus.yml + +Adding new metrics + +- Use helpers in internal/telemetry/metrics.go for counters/histograms +- Keep labels low-cardinality +- Add observable gauges 
through SetObservableCallback + +Optional tracing + +- When --otlp is enabled, you can wrap outbound HTTP clients with otelhttp.NewTransport to create spans for HTTP requests to Pangolin. This affects traces only and does not add metric labels. + +OTLP TLS example + +- Enable TLS to Collector with a custom CA and headers: + +``` +NEWT_METRICS_OTLP_ENABLED=true \ +OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \ +OTEL_EXPORTER_OTLP_INSECURE=false \ +OTEL_EXPORTER_OTLP_CERTIFICATE=/etc/otel/custom-ca.pem \ +OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer abc123,tenant=acme" \ +./newt +``` + +Prometheus scrape strategy (choose one) +A) Scrape Newt directly: + +``` +global: + scrape_interval: 15s +scrape_configs: + - job_name: newt + static_configs: + - targets: ["newt:2112"] +``` + +B) Scrape the Collector’s Prometheus exporter: + +``` +global: + scrape_interval: 15s +scrape_configs: + - job_name: otel-collector + static_configs: + - targets: ["collector:8889"] +``` + +Reason mapping (source → reason) + +- Server instructs reconnect/terminate → server_request +- Heartbeat/Ping threshold exceeded → timeout +- Peer closed connection gracefully → peer_close +- Route/Interface change detected → network_change +- Auth/token failure (HTTP 401/403) → auth_error +- TLS/WG handshake error → handshake_error +- Config reloaded/applied (causing reconnection) → config_change +- Other/unclassified errors → error + +PromQL snippets + +- Throughput in (5m): + +``` +sum(rate(newt_tunnel_bytes_total{direction="in"}[5m])) +``` + +- P95 latency (seconds): + +``` +histogram_quantile(0.95, sum(rate(newt_tunnel_latency_seconds_bucket[5m])) by (le)) +``` + +- Active sessions: + +``` +sum(newt_tunnel_sessions) +``` + +Compatibility notes + +- Gauges do not use the _total suffix (e.g., newt_tunnel_sessions). +- site_id is a resource attribute (one process = one site). tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels. 
+- Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both. +- Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write. +- No free text in labels; use only the enumerated constants for reason and protocol. + +Troubleshooting + +- curl :2112/metrics – ensure endpoint is reachable and includes newt_* metrics +- Check Collector logs for OTLP connection issues +- Verify Prometheus Targets are UP and scraping Newt or Collector diff --git a/examples/otel-collector.yaml b/examples/otel-collector.yaml new file mode 100644 index 0000000..c2b6854 --- /dev/null +++ b/examples/otel-collector.yaml @@ -0,0 +1,41 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + +processors: + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + batch: {} + transform/promote: + metric_statements: + - context: datapoint + statements: + - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where IsMapKey(resource.attributes, "service.instance.id") + - set(attributes["site_id"], resource.attributes["site_id"]) where IsMapKey(resource.attributes, "site_id") + resourcedetection: + detectors: [env, host] + timeout: 5s + +exporters: + prometheus: + endpoint: 0.0.0.0:8889 + send_timestamps: true + prometheusremotewrite: + # Replace with your remote_write endpoint (Mimir/Cortex/VictoriaMetrics/Thanos Receive) + endpoint: http://mimir:9009/api/v1/push + +service: + pipelines: + metrics: + receivers: [otlp] + processors: [memory_limiter, resourcedetection, batch, transform/promote] + exporters: [prometheus, prometheusremotewrite] + traces: + receivers: [otlp] + processors: [memory_limiter, resourcedetection, batch] + exporters: [] + diff --git a/examples/prometheus.yml b/examples/prometheus.yml new file mode 100644 index 0000000..5323b20 --- /dev/null +++ b/examples/prometheus.yml @@ -0,0 +1,11 @@ +global: + scrape_interval: 15s + 
+scrape_configs: + - job_name: newt + static_configs: + - targets: ["newt:2112"] + - job_name: otel-collector + static_configs: + - targets: ["collector:8889"] + From b53fb70778908560154c56323d7cc0e136186c88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 09:17:05 +0200 Subject: [PATCH 15/72] feat: Implement telemetry for reconnect reasons and RTT reporting - Added telemetry hooks to track reconnect reasons for WireGuard connections, including server requests and authentication errors. - Introduced RTT reporting to telemetry for better latency monitoring. - Enhanced metrics configuration with flags for Prometheus and OTLP exporters. - Implemented graceful shutdown and signal handling in the main application. - Updated WebSocket client to classify connection errors and report them to telemetry. - Added support for async byte counting in metrics. - Improved handling of reconnect scenarios in the WireGuard service. - Added documentation for applying patches and rollback procedures. 
--- artifacts/test-results.txt | 13 + .../testdata/expected_contains.golden | 3 + patches/00_all_changes.patch | 802 ++++++++++++++++++ patches/01_proxy_multitunnel.patch | 301 +++++++ patches/02_reconnect_reasons.patch | 422 +++++++++ patches/02_reconnect_rtt.patch | 466 ++++++++++ patches/03_constants_docs.patch | 0 patches/03_wg_rtt_hook.patch | 44 + patches/04_tests_docs.patch | 0 patches/HOWTO-APPLY.md | 25 + 10 files changed, 2076 insertions(+) create mode 100644 artifacts/test-results.txt create mode 100644 internal/telemetry/testdata/expected_contains.golden create mode 100644 patches/00_all_changes.patch create mode 100644 patches/01_proxy_multitunnel.patch create mode 100644 patches/02_reconnect_reasons.patch create mode 100644 patches/02_reconnect_rtt.patch create mode 100644 patches/03_constants_docs.patch create mode 100644 patches/03_wg_rtt_hook.patch create mode 100644 patches/04_tests_docs.patch create mode 100644 patches/HOWTO-APPLY.md diff --git a/artifacts/test-results.txt b/artifacts/test-results.txt new file mode 100644 index 0000000..db66eb8 --- /dev/null +++ b/artifacts/test-results.txt @@ -0,0 +1,13 @@ +FAIL github.com/fosrl/newt [setup failed] +FAIL github.com/fosrl/newt/docker [setup failed] +FAIL github.com/fosrl/newt/internal/state [setup failed] +FAIL github.com/fosrl/newt/internal/telemetry [setup failed] +FAIL github.com/fosrl/newt/proxy [setup failed] +FAIL github.com/fosrl/newt/websocket [setup failed] +FAIL github.com/fosrl/newt/wgnetstack [setup failed] +? github.com/fosrl/newt/healthcheck [no test files] +? github.com/fosrl/newt/logger [no test files] +? github.com/fosrl/newt/network [no test files] +? 
github.com/fosrl/newt/updates [no test files] +FAIL github.com/fosrl/newt/wgtester [build failed] +FAIL diff --git a/internal/telemetry/testdata/expected_contains.golden b/internal/telemetry/testdata/expected_contains.golden new file mode 100644 index 0000000..48123dd --- /dev/null +++ b/internal/telemetry/testdata/expected_contains.golden @@ -0,0 +1,3 @@ +newt_connection_attempts_total +newt_build_info + diff --git a/patches/00_all_changes.patch b/patches/00_all_changes.patch new file mode 100644 index 0000000..ed7a234 --- /dev/null +++ b/patches/00_all_changes.patch @@ -0,0 +1,802 @@ +diff --git a/Dockerfile b/Dockerfile +index b9c4d29..b9b6dea 100644 +--- a/Dockerfile ++++ b/Dockerfile +@@ -22,6 +22,9 @@ RUN apk --no-cache add ca-certificates tzdata + COPY --from=builder /newt /usr/local/bin/ + COPY entrypoint.sh / + ++# Admin/metrics endpoint (Prometheus scrape) ++EXPOSE 2112 ++ + RUN chmod +x /entrypoint.sh + ENTRYPOINT ["/entrypoint.sh"] +-CMD ["newt"] +\ No newline at end of file ++CMD ["newt"] +diff --git a/go.mod b/go.mod +index d475835..5909955 100644 +--- a/go.mod ++++ b/go.mod +@@ -7,6 +7,14 @@ require ( + github.com/google/gopacket v1.1.19 + github.com/gorilla/websocket v1.5.3 + github.com/vishvananda/netlink v1.3.1 ++ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 ++ go.opentelemetry.io/contrib/instrumentation/runtime v0.62.0 ++ go.opentelemetry.io/otel v1.37.0 ++ go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.37.0 ++ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.37.0 ++ go.opentelemetry.io/otel/sdk/metric v1.37.0 ++ go.opentelemetry.io/otel/sdk/trace v1.37.0 ++ go.opentelemetry.io/otel/semconv v1.26.0 + golang.org/x/crypto v0.42.0 + golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 + golang.org/x/net v0.44.0 +diff --git a/main.go b/main.go +index 12849b1..c223b75 100644 +--- a/main.go ++++ b/main.go +@@ -1,7 +1,9 @@ + package main + + import ( ++ "context" + "encoding/json" ++ 
"errors" + "flag" + "fmt" + "net" +@@ -22,6 +24,9 @@ import ( + "github.com/fosrl/newt/updates" + "github.com/fosrl/newt/websocket" + ++ "github.com/fosrl/newt/internal/state" ++ "github.com/fosrl/newt/internal/telemetry" ++ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" + "golang.zx2c4.com/wireguard/conn" + "golang.zx2c4.com/wireguard/device" + "golang.zx2c4.com/wireguard/tun" +@@ -116,6 +121,13 @@ var ( + healthMonitor *healthcheck.Monitor + enforceHealthcheckCert bool + ++ // Observability/metrics flags ++ metricsEnabled bool ++ otlpEnabled bool ++ adminAddr string ++ region string ++ metricsAsyncBytes bool ++ + // New mTLS configuration variables + tlsClientCert string + tlsClientKey string +@@ -126,6 +138,10 @@ var ( + ) + + func main() { ++ // Prepare context for graceful shutdown and signal handling ++ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) ++ defer stop() ++ + // if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values + endpoint = os.Getenv("PANGOLIN_ENDPOINT") + id = os.Getenv("NEWT_ID") +@@ -141,6 +157,13 @@ func main() { + useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE") + enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT") + ++ // Metrics/observability env mirrors ++ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED") ++ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED") ++ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR") ++ regionEnv := os.Getenv("NEWT_REGION") ++ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES") ++ + keepInterface = keepInterfaceEnv == "true" + acceptClients = acceptClientsEnv == "true" + useNativeInterface = useNativeInterfaceEnv == "true" +@@ -272,6 +295,35 @@ func main() { + flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)") + } + ++ // Metrics/observability flags (mirror ENV if unset) ++ if metricsEnabledEnv == 
"" { ++ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter") ++ } else { ++ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true } ++ } ++ if otlpEnabledEnv == "" { ++ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT") ++ } else { ++ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v } ++ } ++ if adminAddrEnv == "" { ++ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address") ++ } else { ++ adminAddr = adminAddrEnv ++ } ++ // Async bytes toggle ++ if asyncBytesEnv == "" { ++ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)") ++ } else { ++ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v } ++ } ++ // Optional region flag (resource attribute) ++ if regionEnv == "" { ++ flag.StringVar(®ion, "region", "", "Optional region resource attribute (also NEWT_REGION)") ++ } else { ++ region = regionEnv ++ } ++ + // do a --version check + version := flag.Bool("version", false, "Print the version") + +@@ -286,6 +338,50 @@ func main() { + loggerLevel := parseLogLevel(logLevel) + logger.GetLogger().SetLevel(parseLogLevel(logLevel)) + ++ // Initialize telemetry after flags are parsed (so flags override env) ++ tcfg := telemetry.FromEnv() ++ tcfg.PromEnabled = metricsEnabled ++ tcfg.OTLPEnabled = otlpEnabled ++ if adminAddr != "" { tcfg.AdminAddr = adminAddr } ++ // Resource attributes (if available) ++ tcfg.SiteID = id ++ tcfg.Region = region ++ // Build info ++ tcfg.BuildVersion = newtVersion ++ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT") ++ ++ tel, telErr := telemetry.Init(ctx, tcfg) ++ if telErr != nil { ++ logger.Warn("Telemetry init failed: %v", telErr) ++ } ++ if tel != nil { ++ // Admin HTTP server (exposes /metrics when Prometheus 
exporter is enabled) ++ mux := http.NewServeMux() ++ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) }) ++ if tel.PrometheusHandler != nil { ++ mux.Handle("/metrics", tel.PrometheusHandler) ++ } ++ admin := &http.Server{ ++ Addr: tcfg.AdminAddr, ++ Handler: otelhttp.NewHandler(mux, "newt-admin"), ++ ReadTimeout: 5 * time.Second, ++ WriteTimeout: 10 * time.Second, ++ ReadHeaderTimeout: 5 * time.Second, ++ IdleTimeout: 30 * time.Second, ++ } ++ go func() { ++ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { ++ logger.Warn("admin http error: %v", err) ++ } ++ }() ++ defer func() { ++ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) ++ defer cancel() ++ _ = admin.Shutdown(ctx) ++ }() ++ defer func() { _ = tel.Shutdown(context.Background()) }() ++ } ++ + newtVersion := "version_replaceme" + if *version { + fmt.Println("Newt version " + newtVersion) +@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + } + // Use reliable ping for initial connection test + logger.Debug("Testing initial connection with reliable ping...") +- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) ++ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) ++ if err == nil && wgData.PublicKey != "" { ++ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds()) ++ } + if err != nil { + logger.Warn("Initial reliable ping failed, but continuing: %v", err) + } else { +@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + // as the pings will continue in the background + if !connected { + logger.Debug("Starting ping check") +- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client) ++ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey) + } + + // Create proxy manager + pm = proxy.NewProxyManager(tnet) 
++ pm.SetAsyncBytes(metricsAsyncBytes) ++ // Set tunnel_id for metrics (WireGuard peer public key) ++ pm.SetTunnelID(wgData.PublicKey) + + connected = true + ++ // telemetry: record a successful site registration (omit region unless available) ++ telemetry.IncSiteRegistration(context.Background(), id, "", "success") ++ + // add the targets if there are any + if len(wgData.Targets.TCP) > 0 { + updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP}) +@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + + client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) { + logger.Info("Received reconnect message") ++ if wgData.PublicKey != "" { ++ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request") ++ } + + // Close the WireGuard device and TUN + closeWgTunnel() + ++ // Clear metrics attrs and sessions for the tunnel ++ if pm != nil { ++ pm.ClearTunnelID() ++ state.Global().ClearTunnel(wgData.PublicKey) ++ } ++ ++ // Clear metrics attrs and sessions for the tunnel ++ if pm != nil { ++ pm.ClearTunnelID() ++ state.Global().ClearTunnel(wgData.PublicKey) ++ } ++ + // Mark as disconnected + connected = false + +@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + + client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) { + logger.Info("Received termination message") ++ if wgData.PublicKey != "" { ++ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request") ++ } + + // Close the WireGuard device and TUN + closeWgTunnel() +diff --git a/proxy/manager.go b/proxy/manager.go +index bf10322..86c47a8 100644 +--- a/proxy/manager.go ++++ b/proxy/manager.go +@@ -1,16 +1,22 @@ + package proxy + + import ( ++ "context" + "fmt" + "io" + "net" ++ "os" + "strings" + "sync" ++ "sync/atomic" + "time" + ++ "github.com/fosrl/newt/internal/state" ++ 
"github.com/fosrl/newt/internal/telemetry" + "github.com/fosrl/newt/logger" + "golang.zx2c4.com/wireguard/tun/netstack" + "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet" ++ "go.opentelemetry.io/otel/attribute" + ) + + // Target represents a proxy target with its address and port +@@ -28,6 +34,52 @@ type ProxyManager struct { + udpConns []*gonet.UDPConn + running bool + mutex sync.RWMutex ++ ++ // telemetry (multi-tunnel) ++ currentTunnelID string ++ tunnels map[string]*tunnelEntry ++ asyncBytes bool ++ flushStop chan struct{} ++} ++ ++// tunnelEntry holds per-tunnel attributes and (optional) async counters. ++type tunnelEntry struct { ++ attrInTCP attribute.Set ++ attrOutTCP attribute.Set ++ attrInUDP attribute.Set ++ attrOutUDP attribute.Set ++ ++ bytesInTCP atomic.Uint64 ++ bytesOutTCP atomic.Uint64 ++ bytesInUDP atomic.Uint64 ++ bytesOutUDP atomic.Uint64 ++} ++ ++// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set. ++type countingWriter struct { ++ ctx context.Context ++ w io.Writer ++ set attribute.Set ++ pm *ProxyManager ++ ent *tunnelEntry ++ out bool // false=in, true=out ++ proto string // "tcp" or "udp" ++} ++ ++func (cw *countingWriter) Write(p []byte) (int, error) { ++ n, err := cw.w.Write(p) ++ if n > 0 { ++ if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil { ++ if cw.proto == "tcp" { ++ if cw.out { cw.ent.bytesOutTCP.Add(uint64(n)) } else { cw.ent.bytesInTCP.Add(uint64(n)) } ++ } else if cw.proto == "udp" { ++ if cw.out { cw.ent.bytesOutUDP.Add(uint64(n)) } else { cw.ent.bytesInUDP.Add(uint64(n)) } ++ } ++ } else { ++ telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set) ++ } ++ } ++ return n, err + } + + // NewProxyManager creates a new proxy manager instance +@@ -38,9 +90,46 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager { + udpTargets: make(map[string]map[int]string), + listeners: make([]*gonet.TCPListener, 0), + udpConns: make([]*gonet.UDPConn, 0), ++ tunnels: make(map[string]*tunnelEntry), 
+ } + } + ++// SetTunnelID sets the WireGuard peer public key used as tunnel_id label. ++func (pm *ProxyManager) SetTunnelID(id string) { ++ pm.mutex.Lock() ++ defer pm.mutex.Unlock() ++ pm.currentTunnelID = id ++ if _, ok := pm.tunnels[id]; !ok { ++ pm.tunnels[id] = &tunnelEntry{} ++ } ++ e := pm.tunnels[id] ++ e.attrInTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "tcp")) ++ e.attrOutTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "tcp")) ++ e.attrInUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "udp")) ++ e.attrOutUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "udp")) ++} ++ ++// ClearTunnelID clears cached attribute sets for the current tunnel. ++func (pm *ProxyManager) ClearTunnelID() { ++ pm.mutex.Lock() ++ defer pm.mutex.Unlock() ++ id := pm.currentTunnelID ++ if id == "" { return } ++ if e, ok := pm.tunnels[id]; ok { ++ // final flush for this tunnel ++ inTCP := e.bytesInTCP.Swap(0) ++ outTCP := e.bytesOutTCP.Swap(0) ++ inUDP := e.bytesInUDP.Swap(0) ++ outUDP := e.bytesOutUDP.Swap(0) ++ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) } ++ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) } ++ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) } ++ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) } ++ delete(pm.tunnels, id) ++ } ++ pm.currentTunnelID = "" ++} ++ + // init function without tnet + func NewProxyManagerWithoutTNet() *ProxyManager { + return &ProxyManager{ +@@ -160,6 +249,57 @@ func (pm *ProxyManager) Start() error { + return nil + } + ++func (pm *ProxyManager) 
SetAsyncBytes(b bool) { ++ pm.mutex.Lock() ++ defer pm.mutex.Unlock() ++ pm.asyncBytes = b ++ if b && pm.flushStop == nil { ++ pm.flushStop = make(chan struct{}) ++ go pm.flushLoop() ++ } ++} ++func (pm *ProxyManager) flushLoop() { ++ flushInterval := 2 * time.Second ++ if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" { ++ if d, err := time.ParseDuration(v); err == nil && d > 0 { ++ if d/2 < flushInterval { flushInterval = d / 2 } ++ } ++ } ++ ticker := time.NewTicker(flushInterval) ++ defer ticker.Stop() ++ for { ++ select { ++ case <-ticker.C: ++ pm.mutex.RLock() ++ for _, e := range pm.tunnels { ++ inTCP := e.bytesInTCP.Swap(0) ++ outTCP := e.bytesOutTCP.Swap(0) ++ inUDP := e.bytesInUDP.Swap(0) ++ outUDP := e.bytesOutUDP.Swap(0) ++ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) } ++ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) } ++ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) } ++ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) } ++ } ++ pm.mutex.RUnlock() ++ case <-pm.flushStop: ++ pm.mutex.RLock() ++ for _, e := range pm.tunnels { ++ inTCP := e.bytesInTCP.Swap(0) ++ outTCP := e.bytesOutTCP.Swap(0) ++ inUDP := e.bytesInUDP.Swap(0) ++ outUDP := e.bytesOutUDP.Swap(0) ++ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) } ++ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) } ++ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) } ++ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) } ++ } ++ pm.mutex.RUnlock() ++ return ++ } ++ } ++} ++ + func (pm *ProxyManager) Stop() error { + pm.mutex.Lock() + defer pm.mutex.Unlock() +@@ -236,6 +376,14 @@ func (pm *ProxyManager) startTarget(proto, listenIP 
string, port int, targetAddr + return nil + } + ++// getEntry returns per-tunnel entry or nil. ++func (pm *ProxyManager) getEntry(id string) *tunnelEntry { ++ pm.mutex.RLock() ++ e := pm.tunnels[id] ++ pm.mutex.RUnlock() ++ return e ++} ++ + func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) { + for { + conn, err := listener.Accept() +@@ -257,6 +405,9 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) + continue + } + ++// Count sessions only once per accepted TCP connection ++ if pm.tunnelID != "" { state.Global().IncSessions(pm.tunnelID) } ++ + go func() { + target, err := net.Dial("tcp", targetAddr) + if err != nil { +@@ -265,24 +416,33 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) + return + } + ++ // already incremented on accept ++ + // Create a WaitGroup to ensure both copy operations complete + var wg sync.WaitGroup + wg.Add(2) + ++ // client -> target (direction=in) + go func() { + defer wg.Done() +- io.Copy(target, conn) +- target.Close() ++e := pm.getEntry(pm.currentTunnelID) ++cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"} ++ _, _ = io.Copy(cw, conn) ++ _ = target.Close() + }() + ++ // target -> client (direction=out) + go func() { + defer wg.Done() +- io.Copy(conn, target) +- conn.Close() ++e := pm.getEntry(pm.currentTunnelID) ++cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"} ++ _, _ = io.Copy(cw, target) ++ _ = conn.Close() + }() + +- // Wait for both copies to complete ++ // Wait for both copies to complete then session -1 + wg.Wait() ++ if pm.tunnelID != "" { state.Global().DecSessions(pm.tunnelID) } + }() + } + } +@@ -326,6 +486,14 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { + } + + clientKey := remoteAddr.String() ++ // bytes from client -> target (direction=in) ++if 
pm.currentTunnelID != "" && n > 0 { ++if pm.asyncBytes { ++ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(n)) } ++ } else { ++ if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP) } ++ } ++ } + clientsMutex.RLock() + targetConn, exists := clientConns[clientKey] + clientsMutex.RUnlock() +@@ -366,6 +534,15 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { + return // defer will handle cleanup + } + ++ // bytes from target -> client (direction=out) ++ if pm.currentTunnelID != "" && n > 0 { ++ if pm.asyncBytes { ++ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesOutUDP.Add(uint64(n)) } ++ } else { ++if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP) } ++ } ++ } ++ + _, err = conn.WriteTo(buffer[:n], remoteAddr) + if err != nil { + logger.Error("Error writing to client: %v", err) +@@ -375,13 +552,19 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { + }(clientKey, targetConn, remoteAddr) + } + +- _, err = targetConn.Write(buffer[:n]) ++ written, err := targetConn.Write(buffer[:n]) + if err != nil { + logger.Error("Error writing to target: %v", err) + targetConn.Close() + clientsMutex.Lock() + delete(clientConns, clientKey) + clientsMutex.Unlock() ++} else if pm.currentTunnelID != "" && written > 0 { ++ if pm.asyncBytes { ++ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(written)) } ++ } else { ++if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP) } ++ } + } + } + } +diff --git a/util.go b/util.go +index 7d6da4f..c1f4915 100644 +--- a/util.go ++++ b/util.go +@@ -17,6 +17,7 @@ import ( + "github.com/fosrl/newt/logger" + "github.com/fosrl/newt/proxy" + "github.com/fosrl/newt/websocket" ++ 
"github.com/fosrl/newt/internal/telemetry" + "golang.org/x/net/icmp" + "golang.org/x/net/ipv4" + "golang.zx2c4.com/wireguard/device" +@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC + return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background") + } + +-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} { ++func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} { + maxInterval := 6 * time.Second + currentInterval := pingInterval + consecutiveFailures := 0 +@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien + if !connectionLost { + connectionLost = true + logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) ++ if tunnelID != "" { ++ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout) ++ } + stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second) + // Send registration message to the server for backward compatibility + err := client.SendMessage("newt/wg/register", map[string]interface{}{ +@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien + } else { + // Track recent latencies + recentLatencies = append(recentLatencies, latency) ++ // Record tunnel latency (limit sampling to this periodic check) ++ if tunnelID != "" { ++ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds()) ++ } + if len(recentLatencies) > 10 { + recentLatencies = recentLatencies[1:] + } +diff --git a/websocket/client.go b/websocket/client.go +index 0c0664a..c9ac264 100644 +--- a/websocket/client.go ++++ b/websocket/client.go +@@ -18,6 +18,10 @@ import ( + + "github.com/fosrl/newt/logger" + "github.com/gorilla/websocket" ++ ++ 
"context" ++ "github.com/fosrl/newt/internal/telemetry" ++ "go.opentelemetry.io/otel" + ) + + type Client struct { +@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) { + } + resp, err := client.Do(req) + if err != nil { ++ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err)) + return "", fmt.Errorf("failed to request new token: %w", err) + } + defer resp.Body.Close() +@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) { + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure") ++ bin := "http_other" ++ if resp.StatusCode >= 500 { ++ bin = "http_5xx" ++ } else if resp.StatusCode >= 400 { ++ bin = "http_4xx" ++ } ++ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin) ++ // Reconnect reason mapping for auth failures ++ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError) ++ } + return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) + } + +@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) { + } + + logger.Debug("Received token: %s", tokenResp.Data.Token) ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success") + + return tokenResp.Data.Token, nil + } + ++// classifyConnError maps common errors to low-cardinality error_type labels ++func classifyConnError(err error) string { ++ if err == nil { ++ return "" ++ } ++ msg := strings.ToLower(err.Error()) ++ switch { ++ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"): ++ return "tls" ++ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"): ++ return "timeout" ++ case 
strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"): ++ return "dns" ++ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"): ++ return "auth" ++ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"): ++ return "io" ++ default: ++ return "other" ++ } ++} ++ + func (c *Client) connectWithRetry() { + for { + select { +@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error { + // Get token for authentication + token, err := c.getToken() + if err != nil { ++ // telemetry: connection attempt failed before dialing ++ // site_id isn't globally available here; use client ID as site_id (low cardinality) ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure") ++ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err)) + return fmt.Errorf("failed to get token: %w", err) + } + +@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error { + q.Set("clientType", c.clientType) + u.RawQuery = q.Encode() + +- // Connect to WebSocket ++ // Connect to WebSocket (optional span) ++ tr := otel.Tracer("newt") ++ spanCtx, span := tr.Start(context.Background(), "ws.connect") ++ defer span.End() ++ + dialer := websocket.DefaultDialer + + // Use new TLS configuration method +@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error { + logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable") + } + +- conn, _, err := dialer.Dial(u.String(), nil) ++conn, _, err := dialer.DialContext(spanCtx, u.String(), nil) + if err != nil { ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure") ++ etype := classifyConnError(err) ++ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", 
etype) ++ // Map handshake-related errors to reconnect reasons where appropriate ++ if etype == "tls" { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError) ++ } else if etype == "timeout" { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout) ++ } else { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError) ++ } + return fmt.Errorf("failed to connect to WebSocket: %w", err) + } + ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success") + c.conn = conn + c.setConnected(true) + +diff --git a/wg/wg.go b/wg/wg.go +index 3cee1a9..a765279 100644 +--- a/wg/wg.go ++++ b/wg/wg.go +@@ -3,6 +3,7 @@ + package wg + + import ( ++ "context" + "encoding/json" + "errors" + "fmt" +@@ -23,6 +24,8 @@ import ( + "golang.zx2c4.com/wireguard/conn" + "golang.zx2c4.com/wireguard/wgctrl" + "golang.zx2c4.com/wireguard/wgctrl/wgtypes" ++ ++ "github.com/fosrl/newt/internal/telemetry" + ) + + type WgConfig struct { +@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) { + s.stopGetConfig = nil + } + ++ // telemetry: config reload success ++ telemetry.IncConfigReload(context.Background(), "success") ++ // Optional reconnect reason mapping: config change ++ if s.serverPubKey != "" { ++ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange) ++ } ++ + // Ensure the WireGuard interface and peers are configured + if err := s.ensureWireguardInterface(config); err != nil { + logger.Error("Failed to ensure WireGuard interface: %v", err) +diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go +index 6684c40..09f160e 100644 +--- a/wgnetstack/wgnetstack.go ++++ b/wgnetstack/wgnetstack.go +@@ -1,6 +1,7 @@ + package wgnetstack + + import ( ++ "context" + "crypto/rand" + "encoding/base64" + "encoding/hex" +@@ -26,6 +27,8 @@ import ( + "golang.zx2c4.com/wireguard/tun" + 
"golang.zx2c4.com/wireguard/tun/netstack" + "golang.zx2c4.com/wireguard/wgctrl/wgtypes" ++ ++ "github.com/fosrl/newt/internal/telemetry" + ) + + type WgConfig struct { +@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str + return service, nil + } + ++// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally. ++func (s *WireGuardService) ReportRTT(seconds float64) { ++ if s.serverPubKey == "" { return } ++ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds) ++} ++ + func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) { + logger.Debug("Received: %+v", msg) + + // if there is no wgData or pm, we can't add targets + if s.TunnelIP == "" || s.proxyManager == nil { + logger.Info("No tunnel IP or proxy manager available") +- return +- } ++ return ++} + + targetData, err := parseTargetData(msg.Data) + if err != nil { diff --git a/patches/01_proxy_multitunnel.patch b/patches/01_proxy_multitunnel.patch new file mode 100644 index 0000000..c4aafb6 --- /dev/null +++ b/patches/01_proxy_multitunnel.patch @@ -0,0 +1,301 @@ +diff --git a/proxy/manager.go b/proxy/manager.go +index bf10322..86c47a8 100644 +--- a/proxy/manager.go ++++ b/proxy/manager.go +@@ -1,16 +1,22 @@ + package proxy + + import ( ++ "context" + "fmt" + "io" + "net" ++ "os" + "strings" + "sync" ++ "sync/atomic" + "time" + ++ "github.com/fosrl/newt/internal/state" ++ "github.com/fosrl/newt/internal/telemetry" + "github.com/fosrl/newt/logger" + "golang.zx2c4.com/wireguard/tun/netstack" + "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet" ++ "go.opentelemetry.io/otel/attribute" + ) + + // Target represents a proxy target with its address and port +@@ -28,6 +34,52 @@ type ProxyManager struct { + udpConns []*gonet.UDPConn + running bool + mutex sync.RWMutex ++ ++ // telemetry (multi-tunnel) ++ currentTunnelID string ++ tunnels map[string]*tunnelEntry ++ asyncBytes bool ++ flushStop chan struct{} ++} 
++ ++// tunnelEntry holds per-tunnel attributes and (optional) async counters. ++type tunnelEntry struct { ++ attrInTCP attribute.Set ++ attrOutTCP attribute.Set ++ attrInUDP attribute.Set ++ attrOutUDP attribute.Set ++ ++ bytesInTCP atomic.Uint64 ++ bytesOutTCP atomic.Uint64 ++ bytesInUDP atomic.Uint64 ++ bytesOutUDP atomic.Uint64 ++} ++ ++// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set. ++type countingWriter struct { ++ ctx context.Context ++ w io.Writer ++ set attribute.Set ++ pm *ProxyManager ++ ent *tunnelEntry ++ out bool // false=in, true=out ++ proto string // "tcp" or "udp" ++} ++ ++func (cw *countingWriter) Write(p []byte) (int, error) { ++ n, err := cw.w.Write(p) ++ if n > 0 { ++ if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil { ++ if cw.proto == "tcp" { ++ if cw.out { cw.ent.bytesOutTCP.Add(uint64(n)) } else { cw.ent.bytesInTCP.Add(uint64(n)) } ++ } else if cw.proto == "udp" { ++ if cw.out { cw.ent.bytesOutUDP.Add(uint64(n)) } else { cw.ent.bytesInUDP.Add(uint64(n)) } ++ } ++ } else { ++ telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set) ++ } ++ } ++ return n, err + } + + // NewProxyManager creates a new proxy manager instance +@@ -38,9 +90,46 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager { + udpTargets: make(map[string]map[int]string), + listeners: make([]*gonet.TCPListener, 0), + udpConns: make([]*gonet.UDPConn, 0), ++ tunnels: make(map[string]*tunnelEntry), + } + } + ++// SetTunnelID sets the WireGuard peer public key used as tunnel_id label. 
++func (pm *ProxyManager) SetTunnelID(id string) { ++ pm.mutex.Lock() ++ defer pm.mutex.Unlock() ++ pm.currentTunnelID = id ++ if _, ok := pm.tunnels[id]; !ok { ++ pm.tunnels[id] = &tunnelEntry{} ++ } ++ e := pm.tunnels[id] ++ e.attrInTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "tcp")) ++ e.attrOutTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "tcp")) ++ e.attrInUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "udp")) ++ e.attrOutUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "udp")) ++} ++ ++// ClearTunnelID clears cached attribute sets for the current tunnel. ++func (pm *ProxyManager) ClearTunnelID() { ++ pm.mutex.Lock() ++ defer pm.mutex.Unlock() ++ id := pm.currentTunnelID ++ if id == "" { return } ++ if e, ok := pm.tunnels[id]; ok { ++ // final flush for this tunnel ++ inTCP := e.bytesInTCP.Swap(0) ++ outTCP := e.bytesOutTCP.Swap(0) ++ inUDP := e.bytesInUDP.Swap(0) ++ outUDP := e.bytesOutUDP.Swap(0) ++ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) } ++ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) } ++ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) } ++ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) } ++ delete(pm.tunnels, id) ++ } ++ pm.currentTunnelID = "" ++} ++ + // init function without tnet + func NewProxyManagerWithoutTNet() *ProxyManager { + return &ProxyManager{ +@@ -160,6 +249,57 @@ func (pm *ProxyManager) Start() error { + return nil + } + ++func (pm *ProxyManager) SetAsyncBytes(b bool) { ++ pm.mutex.Lock() ++ defer pm.mutex.Unlock() ++ pm.asyncBytes = b ++ if 
b && pm.flushStop == nil { ++ pm.flushStop = make(chan struct{}) ++ go pm.flushLoop() ++ } ++} ++func (pm *ProxyManager) flushLoop() { ++ flushInterval := 2 * time.Second ++ if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" { ++ if d, err := time.ParseDuration(v); err == nil && d > 0 { ++ if d/2 < flushInterval { flushInterval = d / 2 } ++ } ++ } ++ ticker := time.NewTicker(flushInterval) ++ defer ticker.Stop() ++ for { ++ select { ++ case <-ticker.C: ++ pm.mutex.RLock() ++ for _, e := range pm.tunnels { ++ inTCP := e.bytesInTCP.Swap(0) ++ outTCP := e.bytesOutTCP.Swap(0) ++ inUDP := e.bytesInUDP.Swap(0) ++ outUDP := e.bytesOutUDP.Swap(0) ++ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) } ++ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) } ++ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) } ++ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) } ++ } ++ pm.mutex.RUnlock() ++ case <-pm.flushStop: ++ pm.mutex.RLock() ++ for _, e := range pm.tunnels { ++ inTCP := e.bytesInTCP.Swap(0) ++ outTCP := e.bytesOutTCP.Swap(0) ++ inUDP := e.bytesInUDP.Swap(0) ++ outUDP := e.bytesOutUDP.Swap(0) ++ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) } ++ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) } ++ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) } ++ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) } ++ } ++ pm.mutex.RUnlock() ++ return ++ } ++ } ++} ++ + func (pm *ProxyManager) Stop() error { + pm.mutex.Lock() + defer pm.mutex.Unlock() +@@ -236,6 +376,14 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr + return nil + } + ++// getEntry returns per-tunnel entry or nil. 
++func (pm *ProxyManager) getEntry(id string) *tunnelEntry { ++ pm.mutex.RLock() ++ e := pm.tunnels[id] ++ pm.mutex.RUnlock() ++ return e ++} ++ + func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) { + for { + conn, err := listener.Accept() +@@ -257,6 +405,9 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) + continue + } + ++// Count sessions only once per accepted TCP connection ++ if pm.tunnelID != "" { state.Global().IncSessions(pm.tunnelID) } ++ + go func() { + target, err := net.Dial("tcp", targetAddr) + if err != nil { +@@ -265,24 +416,33 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) + return + } + ++ // already incremented on accept ++ + // Create a WaitGroup to ensure both copy operations complete + var wg sync.WaitGroup + wg.Add(2) + ++ // client -> target (direction=in) + go func() { + defer wg.Done() +- io.Copy(target, conn) +- target.Close() ++e := pm.getEntry(pm.currentTunnelID) ++cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"} ++ _, _ = io.Copy(cw, conn) ++ _ = target.Close() + }() + ++ // target -> client (direction=out) + go func() { + defer wg.Done() +- io.Copy(conn, target) +- conn.Close() ++e := pm.getEntry(pm.currentTunnelID) ++cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"} ++ _, _ = io.Copy(cw, target) ++ _ = conn.Close() + }() + +- // Wait for both copies to complete ++ // Wait for both copies to complete then session -1 + wg.Wait() ++ if pm.tunnelID != "" { state.Global().DecSessions(pm.tunnelID) } + }() + } + } +@@ -326,6 +486,14 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { + } + + clientKey := remoteAddr.String() ++ // bytes from client -> target (direction=in) ++if pm.currentTunnelID != "" && n > 0 { ++if pm.asyncBytes { ++ if e := 
pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(n)) } ++ } else { ++ if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP) } ++ } ++ } + clientsMutex.RLock() + targetConn, exists := clientConns[clientKey] + clientsMutex.RUnlock() +@@ -366,6 +534,15 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { + return // defer will handle cleanup + } + ++ // bytes from target -> client (direction=out) ++ if pm.currentTunnelID != "" && n > 0 { ++ if pm.asyncBytes { ++ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesOutUDP.Add(uint64(n)) } ++ } else { ++if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP) } ++ } ++ } ++ + _, err = conn.WriteTo(buffer[:n], remoteAddr) + if err != nil { + logger.Error("Error writing to client: %v", err) +@@ -375,13 +552,19 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { + }(clientKey, targetConn, remoteAddr) + } + +- _, err = targetConn.Write(buffer[:n]) ++ written, err := targetConn.Write(buffer[:n]) + if err != nil { + logger.Error("Error writing to target: %v", err) + targetConn.Close() + clientsMutex.Lock() + delete(clientConns, clientKey) + clientsMutex.Unlock() ++} else if pm.currentTunnelID != "" && written > 0 { ++ if pm.asyncBytes { ++ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(written)) } ++ } else { ++if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP) } ++ } + } + } + } diff --git a/patches/02_reconnect_reasons.patch b/patches/02_reconnect_reasons.patch new file mode 100644 index 0000000..c70560f --- /dev/null +++ b/patches/02_reconnect_reasons.patch @@ -0,0 +1,422 @@ +diff --git a/main.go b/main.go +index 12849b1..c223b75 100644 +--- a/main.go ++++ b/main.go +@@ -1,7 +1,9 @@ + package 
main + + import ( ++ "context" + "encoding/json" ++ "errors" + "flag" + "fmt" + "net" +@@ -22,6 +24,9 @@ import ( + "github.com/fosrl/newt/updates" + "github.com/fosrl/newt/websocket" + ++ "github.com/fosrl/newt/internal/state" ++ "github.com/fosrl/newt/internal/telemetry" ++ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" + "golang.zx2c4.com/wireguard/conn" + "golang.zx2c4.com/wireguard/device" + "golang.zx2c4.com/wireguard/tun" +@@ -116,6 +121,13 @@ var ( + healthMonitor *healthcheck.Monitor + enforceHealthcheckCert bool + ++ // Observability/metrics flags ++ metricsEnabled bool ++ otlpEnabled bool ++ adminAddr string ++ region string ++ metricsAsyncBytes bool ++ + // New mTLS configuration variables + tlsClientCert string + tlsClientKey string +@@ -126,6 +138,10 @@ var ( + ) + + func main() { ++ // Prepare context for graceful shutdown and signal handling ++ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) ++ defer stop() ++ + // if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values + endpoint = os.Getenv("PANGOLIN_ENDPOINT") + id = os.Getenv("NEWT_ID") +@@ -141,6 +157,13 @@ func main() { + useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE") + enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT") + ++ // Metrics/observability env mirrors ++ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED") ++ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED") ++ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR") ++ regionEnv := os.Getenv("NEWT_REGION") ++ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES") ++ + keepInterface = keepInterfaceEnv == "true" + acceptClients = acceptClientsEnv == "true" + useNativeInterface = useNativeInterfaceEnv == "true" +@@ -272,6 +295,35 @@ func main() { + flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)") + } + ++ // Metrics/observability 
flags (mirror ENV if unset) ++ if metricsEnabledEnv == "" { ++ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter") ++ } else { ++ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true } ++ } ++ if otlpEnabledEnv == "" { ++ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT") ++ } else { ++ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v } ++ } ++ if adminAddrEnv == "" { ++ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address") ++ } else { ++ adminAddr = adminAddrEnv ++ } ++ // Async bytes toggle ++ if asyncBytesEnv == "" { ++ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)") ++ } else { ++ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v } ++ } ++ // Optional region flag (resource attribute) ++ if regionEnv == "" { ++ flag.StringVar(®ion, "region", "", "Optional region resource attribute (also NEWT_REGION)") ++ } else { ++ region = regionEnv ++ } ++ + // do a --version check + version := flag.Bool("version", false, "Print the version") + +@@ -286,6 +338,50 @@ func main() { + loggerLevel := parseLogLevel(logLevel) + logger.GetLogger().SetLevel(parseLogLevel(logLevel)) + ++ // Initialize telemetry after flags are parsed (so flags override env) ++ tcfg := telemetry.FromEnv() ++ tcfg.PromEnabled = metricsEnabled ++ tcfg.OTLPEnabled = otlpEnabled ++ if adminAddr != "" { tcfg.AdminAddr = adminAddr } ++ // Resource attributes (if available) ++ tcfg.SiteID = id ++ tcfg.Region = region ++ // Build info ++ tcfg.BuildVersion = newtVersion ++ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT") ++ ++ tel, telErr := telemetry.Init(ctx, tcfg) ++ if telErr != nil { ++ logger.Warn("Telemetry init failed: %v", telErr) ++ } ++ if tel != nil { ++ // 
Admin HTTP server (exposes /metrics when Prometheus exporter is enabled) ++ mux := http.NewServeMux() ++ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) }) ++ if tel.PrometheusHandler != nil { ++ mux.Handle("/metrics", tel.PrometheusHandler) ++ } ++ admin := &http.Server{ ++ Addr: tcfg.AdminAddr, ++ Handler: otelhttp.NewHandler(mux, "newt-admin"), ++ ReadTimeout: 5 * time.Second, ++ WriteTimeout: 10 * time.Second, ++ ReadHeaderTimeout: 5 * time.Second, ++ IdleTimeout: 30 * time.Second, ++ } ++ go func() { ++ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { ++ logger.Warn("admin http error: %v", err) ++ } ++ }() ++ defer func() { ++ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) ++ defer cancel() ++ _ = admin.Shutdown(ctx) ++ }() ++ defer func() { _ = tel.Shutdown(context.Background()) }() ++ } ++ + newtVersion := "version_replaceme" + if *version { + fmt.Println("Newt version " + newtVersion) +@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + } + // Use reliable ping for initial connection test + logger.Debug("Testing initial connection with reliable ping...") +- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) ++ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) ++ if err == nil && wgData.PublicKey != "" { ++ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds()) ++ } + if err != nil { + logger.Warn("Initial reliable ping failed, but continuing: %v", err) + } else { +@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + // as the pings will continue in the background + if !connected { + logger.Debug("Starting ping check") +- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client) ++ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey) + } + + // 
Create proxy manager + pm = proxy.NewProxyManager(tnet) ++ pm.SetAsyncBytes(metricsAsyncBytes) ++ // Set tunnel_id for metrics (WireGuard peer public key) ++ pm.SetTunnelID(wgData.PublicKey) + + connected = true + ++ // telemetry: record a successful site registration (omit region unless available) ++ telemetry.IncSiteRegistration(context.Background(), id, "", "success") ++ + // add the targets if there are any + if len(wgData.Targets.TCP) > 0 { + updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP}) +@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + + client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) { + logger.Info("Received reconnect message") ++ if wgData.PublicKey != "" { ++ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request") ++ } + + // Close the WireGuard device and TUN + closeWgTunnel() + ++ // Clear metrics attrs and sessions for the tunnel ++ if pm != nil { ++ pm.ClearTunnelID() ++ state.Global().ClearTunnel(wgData.PublicKey) ++ } ++ ++ // Clear metrics attrs and sessions for the tunnel ++ if pm != nil { ++ pm.ClearTunnelID() ++ state.Global().ClearTunnel(wgData.PublicKey) ++ } ++ + // Mark as disconnected + connected = false + +@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + + client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) { + logger.Info("Received termination message") ++ if wgData.PublicKey != "" { ++ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request") ++ } + + // Close the WireGuard device and TUN + closeWgTunnel() +diff --git a/util.go b/util.go +index 7d6da4f..c1f4915 100644 +--- a/util.go ++++ b/util.go +@@ -17,6 +17,7 @@ import ( + "github.com/fosrl/newt/logger" + "github.com/fosrl/newt/proxy" + "github.com/fosrl/newt/websocket" ++ "github.com/fosrl/newt/internal/telemetry" + 
"golang.org/x/net/icmp" + "golang.org/x/net/ipv4" + "golang.zx2c4.com/wireguard/device" +@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC + return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background") + } + +-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} { ++func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} { + maxInterval := 6 * time.Second + currentInterval := pingInterval + consecutiveFailures := 0 +@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien + if !connectionLost { + connectionLost = true + logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) ++ if tunnelID != "" { ++ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout) ++ } + stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second) + // Send registration message to the server for backward compatibility + err := client.SendMessage("newt/wg/register", map[string]interface{}{ +@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien + } else { + // Track recent latencies + recentLatencies = append(recentLatencies, latency) ++ // Record tunnel latency (limit sampling to this periodic check) ++ if tunnelID != "" { ++ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds()) ++ } + if len(recentLatencies) > 10 { + recentLatencies = recentLatencies[1:] + } +diff --git a/websocket/client.go b/websocket/client.go +index 0c0664a..c9ac264 100644 +--- a/websocket/client.go ++++ b/websocket/client.go +@@ -18,6 +18,10 @@ import ( + + "github.com/fosrl/newt/logger" + "github.com/gorilla/websocket" ++ ++ "context" ++ 
"github.com/fosrl/newt/internal/telemetry" ++ "go.opentelemetry.io/otel" + ) + + type Client struct { +@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) { + } + resp, err := client.Do(req) + if err != nil { ++ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err)) + return "", fmt.Errorf("failed to request new token: %w", err) + } + defer resp.Body.Close() +@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) { + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure") ++ bin := "http_other" ++ if resp.StatusCode >= 500 { ++ bin = "http_5xx" ++ } else if resp.StatusCode >= 400 { ++ bin = "http_4xx" ++ } ++ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin) ++ // Reconnect reason mapping for auth failures ++ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError) ++ } + return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) + } + +@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) { + } + + logger.Debug("Received token: %s", tokenResp.Data.Token) ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success") + + return tokenResp.Data.Token, nil + } + ++// classifyConnError maps common errors to low-cardinality error_type labels ++func classifyConnError(err error) string { ++ if err == nil { ++ return "" ++ } ++ msg := strings.ToLower(err.Error()) ++ switch { ++ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"): ++ return "tls" ++ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"): ++ return "timeout" ++ case strings.Contains(msg, "no 
such host") || strings.Contains(msg, "dns"): ++ return "dns" ++ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"): ++ return "auth" ++ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"): ++ return "io" ++ default: ++ return "other" ++ } ++} ++ + func (c *Client) connectWithRetry() { + for { + select { +@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error { + // Get token for authentication + token, err := c.getToken() + if err != nil { ++ // telemetry: connection attempt failed before dialing ++ // site_id isn't globally available here; use client ID as site_id (low cardinality) ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure") ++ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err)) + return fmt.Errorf("failed to get token: %w", err) + } + +@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error { + q.Set("clientType", c.clientType) + u.RawQuery = q.Encode() + +- // Connect to WebSocket ++ // Connect to WebSocket (optional span) ++ tr := otel.Tracer("newt") ++ spanCtx, span := tr.Start(context.Background(), "ws.connect") ++ defer span.End() ++ + dialer := websocket.DefaultDialer + + // Use new TLS configuration method +@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error { + logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable") + } + +- conn, _, err := dialer.Dial(u.String(), nil) ++conn, _, err := dialer.DialContext(spanCtx, u.String(), nil) + if err != nil { ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure") ++ etype := classifyConnError(err) ++ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype) ++ // Map 
handshake-related errors to reconnect reasons where appropriate ++ if etype == "tls" { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError) ++ } else if etype == "timeout" { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout) ++ } else { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError) ++ } + return fmt.Errorf("failed to connect to WebSocket: %w", err) + } + ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success") + c.conn = conn + c.setConnected(true) + +diff --git a/wg/wg.go b/wg/wg.go +index 3cee1a9..a765279 100644 +--- a/wg/wg.go ++++ b/wg/wg.go +@@ -3,6 +3,7 @@ + package wg + + import ( ++ "context" + "encoding/json" + "errors" + "fmt" +@@ -23,6 +24,8 @@ import ( + "golang.zx2c4.com/wireguard/conn" + "golang.zx2c4.com/wireguard/wgctrl" + "golang.zx2c4.com/wireguard/wgctrl/wgtypes" ++ ++ "github.com/fosrl/newt/internal/telemetry" + ) + + type WgConfig struct { +@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) { + s.stopGetConfig = nil + } + ++ // telemetry: config reload success ++ telemetry.IncConfigReload(context.Background(), "success") ++ // Optional reconnect reason mapping: config change ++ if s.serverPubKey != "" { ++ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange) ++ } ++ + // Ensure the WireGuard interface and peers are configured + if err := s.ensureWireguardInterface(config); err != nil { + logger.Error("Failed to ensure WireGuard interface: %v", err) diff --git a/patches/02_reconnect_rtt.patch b/patches/02_reconnect_rtt.patch new file mode 100644 index 0000000..04a88f9 --- /dev/null +++ b/patches/02_reconnect_rtt.patch @@ -0,0 +1,466 @@ +diff --git a/main.go b/main.go +index 12849b1..c223b75 100644 +--- a/main.go ++++ b/main.go +@@ -1,7 +1,9 @@ + package main + + import ( ++ "context" + "encoding/json" ++ "errors" + 
"flag" + "fmt" + "net" +@@ -22,6 +24,9 @@ import ( + "github.com/fosrl/newt/updates" + "github.com/fosrl/newt/websocket" + ++ "github.com/fosrl/newt/internal/state" ++ "github.com/fosrl/newt/internal/telemetry" ++ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" + "golang.zx2c4.com/wireguard/conn" + "golang.zx2c4.com/wireguard/device" + "golang.zx2c4.com/wireguard/tun" +@@ -116,6 +121,13 @@ var ( + healthMonitor *healthcheck.Monitor + enforceHealthcheckCert bool + ++ // Observability/metrics flags ++ metricsEnabled bool ++ otlpEnabled bool ++ adminAddr string ++ region string ++ metricsAsyncBytes bool ++ + // New mTLS configuration variables + tlsClientCert string + tlsClientKey string +@@ -126,6 +138,10 @@ var ( + ) + + func main() { ++ // Prepare context for graceful shutdown and signal handling ++ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) ++ defer stop() ++ + // if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values + endpoint = os.Getenv("PANGOLIN_ENDPOINT") + id = os.Getenv("NEWT_ID") +@@ -141,6 +157,13 @@ func main() { + useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE") + enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT") + ++ // Metrics/observability env mirrors ++ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED") ++ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED") ++ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR") ++ regionEnv := os.Getenv("NEWT_REGION") ++ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES") ++ + keepInterface = keepInterfaceEnv == "true" + acceptClients = acceptClientsEnv == "true" + useNativeInterface = useNativeInterfaceEnv == "true" +@@ -272,6 +295,35 @@ func main() { + flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)") + } + ++ // Metrics/observability flags (mirror ENV if unset) ++ if metricsEnabledEnv == "" { ++ 
flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter") ++ } else { ++ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true } ++ } ++ if otlpEnabledEnv == "" { ++ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT") ++ } else { ++ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v } ++ } ++ if adminAddrEnv == "" { ++ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address") ++ } else { ++ adminAddr = adminAddrEnv ++ } ++ // Async bytes toggle ++ if asyncBytesEnv == "" { ++ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)") ++ } else { ++ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v } ++ } ++ // Optional region flag (resource attribute) ++ if regionEnv == "" { ++ flag.StringVar(®ion, "region", "", "Optional region resource attribute (also NEWT_REGION)") ++ } else { ++ region = regionEnv ++ } ++ + // do a --version check + version := flag.Bool("version", false, "Print the version") + +@@ -286,6 +338,50 @@ func main() { + loggerLevel := parseLogLevel(logLevel) + logger.GetLogger().SetLevel(parseLogLevel(logLevel)) + ++ // Initialize telemetry after flags are parsed (so flags override env) ++ tcfg := telemetry.FromEnv() ++ tcfg.PromEnabled = metricsEnabled ++ tcfg.OTLPEnabled = otlpEnabled ++ if adminAddr != "" { tcfg.AdminAddr = adminAddr } ++ // Resource attributes (if available) ++ tcfg.SiteID = id ++ tcfg.Region = region ++ // Build info ++ tcfg.BuildVersion = newtVersion ++ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT") ++ ++ tel, telErr := telemetry.Init(ctx, tcfg) ++ if telErr != nil { ++ logger.Warn("Telemetry init failed: %v", telErr) ++ } ++ if tel != nil { ++ // Admin HTTP server (exposes /metrics when Prometheus exporter is 
enabled) ++ mux := http.NewServeMux() ++ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) }) ++ if tel.PrometheusHandler != nil { ++ mux.Handle("/metrics", tel.PrometheusHandler) ++ } ++ admin := &http.Server{ ++ Addr: tcfg.AdminAddr, ++ Handler: otelhttp.NewHandler(mux, "newt-admin"), ++ ReadTimeout: 5 * time.Second, ++ WriteTimeout: 10 * time.Second, ++ ReadHeaderTimeout: 5 * time.Second, ++ IdleTimeout: 30 * time.Second, ++ } ++ go func() { ++ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { ++ logger.Warn("admin http error: %v", err) ++ } ++ }() ++ defer func() { ++ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) ++ defer cancel() ++ _ = admin.Shutdown(ctx) ++ }() ++ defer func() { _ = tel.Shutdown(context.Background()) }() ++ } ++ + newtVersion := "version_replaceme" + if *version { + fmt.Println("Newt version " + newtVersion) +@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + } + // Use reliable ping for initial connection test + logger.Debug("Testing initial connection with reliable ping...") +- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) ++ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) ++ if err == nil && wgData.PublicKey != "" { ++ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds()) ++ } + if err != nil { + logger.Warn("Initial reliable ping failed, but continuing: %v", err) + } else { +@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + // as the pings will continue in the background + if !connected { + logger.Debug("Starting ping check") +- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client) ++ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey) + } + + // Create proxy manager + pm = proxy.NewProxyManager(tnet) ++ 
pm.SetAsyncBytes(metricsAsyncBytes) ++ // Set tunnel_id for metrics (WireGuard peer public key) ++ pm.SetTunnelID(wgData.PublicKey) + + connected = true + ++ // telemetry: record a successful site registration (omit region unless available) ++ telemetry.IncSiteRegistration(context.Background(), id, "", "success") ++ + // add the targets if there are any + if len(wgData.Targets.TCP) > 0 { + updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP}) +@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + + client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) { + logger.Info("Received reconnect message") ++ if wgData.PublicKey != "" { ++ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request") ++ } + + // Close the WireGuard device and TUN + closeWgTunnel() + ++ // Clear metrics attrs and sessions for the tunnel ++ if pm != nil { ++ pm.ClearTunnelID() ++ state.Global().ClearTunnel(wgData.PublicKey) ++ } ++ ++ // Clear metrics attrs and sessions for the tunnel ++ if pm != nil { ++ pm.ClearTunnelID() ++ state.Global().ClearTunnel(wgData.PublicKey) ++ } ++ + // Mark as disconnected + connected = false + +@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub + + client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) { + logger.Info("Received termination message") ++ if wgData.PublicKey != "" { ++ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request") ++ } + + // Close the WireGuard device and TUN + closeWgTunnel() +diff --git a/util.go b/util.go +index 7d6da4f..c1f4915 100644 +--- a/util.go ++++ b/util.go +@@ -17,6 +17,7 @@ import ( + "github.com/fosrl/newt/logger" + "github.com/fosrl/newt/proxy" + "github.com/fosrl/newt/websocket" ++ "github.com/fosrl/newt/internal/telemetry" + "golang.org/x/net/icmp" + "golang.org/x/net/ipv4" + 
"golang.zx2c4.com/wireguard/device" +@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC + return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background") + } + +-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} { ++func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} { + maxInterval := 6 * time.Second + currentInterval := pingInterval + consecutiveFailures := 0 +@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien + if !connectionLost { + connectionLost = true + logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) ++ if tunnelID != "" { ++ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout) ++ } + stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second) + // Send registration message to the server for backward compatibility + err := client.SendMessage("newt/wg/register", map[string]interface{}{ +@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien + } else { + // Track recent latencies + recentLatencies = append(recentLatencies, latency) ++ // Record tunnel latency (limit sampling to this periodic check) ++ if tunnelID != "" { ++ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds()) ++ } + if len(recentLatencies) > 10 { + recentLatencies = recentLatencies[1:] + } +diff --git a/websocket/client.go b/websocket/client.go +index 0c0664a..c9ac264 100644 +--- a/websocket/client.go ++++ b/websocket/client.go +@@ -18,6 +18,10 @@ import ( + + "github.com/fosrl/newt/logger" + "github.com/gorilla/websocket" ++ ++ "context" ++ "github.com/fosrl/newt/internal/telemetry" ++ "go.opentelemetry.io/otel" + ) + + type 
Client struct { +@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) { + } + resp, err := client.Do(req) + if err != nil { ++ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err)) + return "", fmt.Errorf("failed to request new token: %w", err) + } + defer resp.Body.Close() +@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) { + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure") ++ bin := "http_other" ++ if resp.StatusCode >= 500 { ++ bin = "http_5xx" ++ } else if resp.StatusCode >= 400 { ++ bin = "http_4xx" ++ } ++ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin) ++ // Reconnect reason mapping for auth failures ++ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError) ++ } + return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) + } + +@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) { + } + + logger.Debug("Received token: %s", tokenResp.Data.Token) ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success") + + return tokenResp.Data.Token, nil + } + ++// classifyConnError maps common errors to low-cardinality error_type labels ++func classifyConnError(err error) string { ++ if err == nil { ++ return "" ++ } ++ msg := strings.ToLower(err.Error()) ++ switch { ++ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"): ++ return "tls" ++ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"): ++ return "timeout" ++ case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"): ++ return "dns" ++ case 
strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"): ++ return "auth" ++ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"): ++ return "io" ++ default: ++ return "other" ++ } ++} ++ + func (c *Client) connectWithRetry() { + for { + select { +@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error { + // Get token for authentication + token, err := c.getToken() + if err != nil { ++ // telemetry: connection attempt failed before dialing ++ // site_id isn't globally available here; use client ID as site_id (low cardinality) ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure") ++ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err)) + return fmt.Errorf("failed to get token: %w", err) + } + +@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error { + q.Set("clientType", c.clientType) + u.RawQuery = q.Encode() + +- // Connect to WebSocket ++ // Connect to WebSocket (optional span) ++ tr := otel.Tracer("newt") ++ spanCtx, span := tr.Start(context.Background(), "ws.connect") ++ defer span.End() ++ + dialer := websocket.DefaultDialer + + // Use new TLS configuration method +@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error { + logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable") + } + +- conn, _, err := dialer.Dial(u.String(), nil) ++conn, _, err := dialer.DialContext(spanCtx, u.String(), nil) + if err != nil { ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure") ++ etype := classifyConnError(err) ++ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype) ++ // Map handshake-related errors to reconnect reasons where appropriate ++ if etype == 
"tls" { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError) ++ } else if etype == "timeout" { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout) ++ } else { ++ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError) ++ } + return fmt.Errorf("failed to connect to WebSocket: %w", err) + } + ++ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success") + c.conn = conn + c.setConnected(true) + +diff --git a/wg/wg.go b/wg/wg.go +index 3cee1a9..a765279 100644 +--- a/wg/wg.go ++++ b/wg/wg.go +@@ -3,6 +3,7 @@ + package wg + + import ( ++ "context" + "encoding/json" + "errors" + "fmt" +@@ -23,6 +24,8 @@ import ( + "golang.zx2c4.com/wireguard/conn" + "golang.zx2c4.com/wireguard/wgctrl" + "golang.zx2c4.com/wireguard/wgctrl/wgtypes" ++ ++ "github.com/fosrl/newt/internal/telemetry" + ) + + type WgConfig struct { +@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) { + s.stopGetConfig = nil + } + ++ // telemetry: config reload success ++ telemetry.IncConfigReload(context.Background(), "success") ++ // Optional reconnect reason mapping: config change ++ if s.serverPubKey != "" { ++ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange) ++ } ++ + // Ensure the WireGuard interface and peers are configured + if err := s.ensureWireguardInterface(config); err != nil { + logger.Error("Failed to ensure WireGuard interface: %v", err) +diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go +index 6684c40..09f160e 100644 +--- a/wgnetstack/wgnetstack.go ++++ b/wgnetstack/wgnetstack.go +@@ -1,6 +1,7 @@ + package wgnetstack + + import ( ++ "context" + "crypto/rand" + "encoding/base64" + "encoding/hex" +@@ -26,6 +27,8 @@ import ( + "golang.zx2c4.com/wireguard/tun" + "golang.zx2c4.com/wireguard/tun/netstack" + "golang.zx2c4.com/wireguard/wgctrl/wgtypes" ++ ++ 
"github.com/fosrl/newt/internal/telemetry" + ) + + type WgConfig struct { +@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str + return service, nil + } + ++// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally. ++func (s *WireGuardService) ReportRTT(seconds float64) { ++ if s.serverPubKey == "" { return } ++ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds) ++} ++ + func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) { + logger.Debug("Received: %+v", msg) + + // if there is no wgData or pm, we can't add targets + if s.TunnelIP == "" || s.proxyManager == nil { + logger.Info("No tunnel IP or proxy manager available") +- return +- } ++ return ++} + + targetData, err := parseTargetData(msg.Data) + if err != nil { diff --git a/patches/03_constants_docs.patch b/patches/03_constants_docs.patch new file mode 100644 index 0000000..e69de29 diff --git a/patches/03_wg_rtt_hook.patch b/patches/03_wg_rtt_hook.patch new file mode 100644 index 0000000..4d5eb19 --- /dev/null +++ b/patches/03_wg_rtt_hook.patch @@ -0,0 +1,44 @@ +diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go +index 6684c40..09f160e 100644 +--- a/wgnetstack/wgnetstack.go ++++ b/wgnetstack/wgnetstack.go +@@ -1,6 +1,7 @@ + package wgnetstack + + import ( ++ "context" + "crypto/rand" + "encoding/base64" + "encoding/hex" +@@ -26,6 +27,8 @@ import ( + "golang.zx2c4.com/wireguard/tun" + "golang.zx2c4.com/wireguard/tun/netstack" + "golang.zx2c4.com/wireguard/wgctrl/wgtypes" ++ ++ "github.com/fosrl/newt/internal/telemetry" + ) + + type WgConfig struct { +@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str + return service, nil + } + ++// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally. 
++func (s *WireGuardService) ReportRTT(seconds float64) { ++ if s.serverPubKey == "" { return } ++ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds) ++} ++ + func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) { + logger.Debug("Received: %+v", msg) + + // if there is no wgData or pm, we can't add targets + if s.TunnelIP == "" || s.proxyManager == nil { + logger.Info("No tunnel IP or proxy manager available") +- return +- } ++ return ++} + + targetData, err := parseTargetData(msg.Data) + if err != nil { diff --git a/patches/04_tests_docs.patch b/patches/04_tests_docs.patch new file mode 100644 index 0000000..e69de29 diff --git a/patches/HOWTO-APPLY.md b/patches/HOWTO-APPLY.md new file mode 100644 index 0000000..aaf0e53 --- /dev/null +++ b/patches/HOWTO-APPLY.md @@ -0,0 +1,25 @@ +# How to apply patches + +These patches were generated from the working tree without commits. You can apply them in one shot or in topic order. + +One shot (recommended during review): + +```bash +git apply patches/00_all_changes.patch +``` + +Topic order: + +```bash +git apply patches/01_proxy_multitunnel.patch +git apply patches/02_reconnect_rtt.patch +git apply patches/03_constants_docs.patch +``` + +Rollback (restore to HEAD and clean untracked files): + +```bash +git restore --source=HEAD --worktree --staged . 
+git clean -fd +``` + From 9ac4cee48d55b0636c0bcd6ba998828f49e89703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 11:09:20 +0200 Subject: [PATCH 16/72] feat: Add Docker Compose configuration for OpenTelemetry collector and Prometheus --- docker-compose-coolify.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 docker-compose-coolify.yml diff --git a/docker-compose-coolify.yml b/docker-compose-coolify.yml new file mode 100644 index 0000000..7073d12 --- /dev/null +++ b/docker-compose-coolify.yml @@ -0,0 +1,32 @@ +services: + collector: + image: otel/opentelemetry-collector:0.111.0 + command: ["--config=/etc/otelcol/config.yaml"] + volumes: + - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC in + - "8889:8889" # Prometheus scrape out + + newt: + build: . + image: newt:dev + environment: + OTEL_SERVICE_NAME: newt + NEWT_METRICS_PROMETHEUS_ENABLED: "true" + NEWT_METRICS_OTLP_ENABLED: "true" + OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317" + OTEL_EXPORTER_OTLP_INSECURE: "true" + OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative" + NEWT_ADMIN_ADDR: "0.0.0.0:2112" + ports: + - "2112:2112" + depends_on: + - collector + + prometheus: + image: prom/prometheus:v2.55.0 + volumes: + - ./examples/prometheus.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "9090:9090" From d31d08c1c8d08ac62d973e004ba63d71e1d1af97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 11:25:07 +0200 Subject: [PATCH 17/72] feat: Update Dockerfile to include installation of git and ca-certificates --- Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile b/Dockerfile index b9b6dea..2ae125d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,10 @@ +#ghcr.io/marcschaeferger/newt-private:1.0.0-otel +#tademsh/newt:1.0.0-otel FROM golang:1.25-alpine AS builder +# Install git and ca-certificates +RUN apk 
--no-cache add ca-certificates git tzdata + # Set the working directory inside the container WORKDIR /app From cfe52caa4a89db9fb4e4ec2eaf8e473106d81cce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 11:30:53 +0200 Subject: [PATCH 18/72] chore: No code changes made to the Dockerfile --- Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile b/Dockerfile index 2ae125d..24393c6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,6 +11,11 @@ WORKDIR /app # Copy go mod and sum files COPY go.mod go.sum ./ +# Coolify specific Test - set Go proxy to direct to avoid issues +# ENV GOSUMDB=off +ENV GOPROXY=direct +RUN go env | grep -E 'GOPROXY|GOSUMDB|GOPRIVATE' + # Download all dependencies RUN go mod download From 922591b26928d3125977e13e5d9cdf5b87cff3a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 11:36:23 +0200 Subject: [PATCH 19/72] chore: Update Dockerfile to enhance Go proxy settings and optimize build process --- Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 24393c6..55c9988 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,17 +13,17 @@ COPY go.mod go.sum ./ # Coolify specific Test - set Go proxy to direct to avoid issues # ENV GOSUMDB=off -ENV GOPROXY=direct -RUN go env | grep -E 'GOPROXY|GOSUMDB|GOPRIVATE' +ENV GOPROXY=https://goproxy.io,https://proxy.golang.org,direct +RUN go env | grep -E 'GOPROXY|GOSUMDB|GOPRIVATE' && go mod download # Download all dependencies -RUN go mod download +#RUN go mod download # Copy the source code into the container COPY . . 
# Build the application -RUN CGO_ENABLED=0 GOOS=linux go build -o /newt +RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /newt FROM alpine:3.22 AS runner From 3e9c74a65b1a9170cccab56b5d224ccd28e494ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 11:51:13 +0200 Subject: [PATCH 20/72] chore: Update OpenTelemetry collector image to version 0.136.0 --- docker-compose.metrics.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.metrics.yml b/docker-compose.metrics.yml index 76b92a8..89c152c 100644 --- a/docker-compose.metrics.yml +++ b/docker-compose.metrics.yml @@ -1,6 +1,6 @@ services: collector: - image: otel/opentelemetry-collector:0.111.0 + image: otel/opentelemetry-collector-contrib:0.136.0 command: ["--config=/etc/otelcol/config.yaml"] volumes: - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro From bd55269b39131a111c5070c763bf9d1718b1f2f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 12:37:16 +0200 Subject: [PATCH 21/72] feat: Add .env.example file and update docker-compose to use environment variables --- .env.example | 5 +++++ .gitignore | 1 + docker-compose.metrics.yml | 9 ++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..0697458 --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +# Copy this file to .env and fill in your values +# Required for connecting to Pangolin service +PANGOLIN_ENDPOINT=https://example.com +NEWT_ID=changeme-id +NEWT_SECRET=changeme-secret \ No newline at end of file diff --git a/.gitignore b/.gitignore index d14efa9..ee03c76 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ nohup.out *.iml certs/ newt_arm64 +.env diff --git a/docker-compose.metrics.yml b/docker-compose.metrics.yml index 89c152c..69746f1 100644 --- a/docker-compose.metrics.yml +++ b/docker-compose.metrics.yml @@ -11,6 
+11,8 @@ services: newt: build: . image: newt:dev + env_file: + - .env environment: OTEL_SERVICE_NAME: newt NEWT_METRICS_PROMETHEUS_ENABLED: "true" @@ -19,13 +21,18 @@ services: OTEL_EXPORTER_OTLP_INSECURE: "true" OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative" NEWT_ADMIN_ADDR: "0.0.0.0:2112" + # Base NEWT configuration + PANGOLIN_ENDPOINT: ${PANGOLIN_ENDPOINT} + NEWT_ID: ${NEWT_ID} + NEWT_SECRET: ${NEWT_SECRET} + LOG_LEVEL: "DEBUG" ports: - "2112:2112" depends_on: - collector prometheus: - image: prom/prometheus:v2.55.0 + image: prom/prometheus:v3.6.0 volumes: - ./examples/prometheus.yml:/etc/prometheus/prometheus.yml:ro ports: From 0b5e662abc9b269e8e7e4c9a018ed1c020734288 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 12:37:44 +0200 Subject: [PATCH 22/72] fix: Update otel-collector.yaml to correct resource attribute checks and streamline processor/exporter configuration --- examples/otel-collector.yaml | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/examples/otel-collector.yaml b/examples/otel-collector.yaml index c2b6854..6000013 100644 --- a/examples/otel-collector.yaml +++ b/examples/otel-collector.yaml @@ -9,33 +9,36 @@ processors: check_interval: 5s limit_percentage: 80 spike_limit_percentage: 25 - batch: {} - transform/promote: - metric_statements: - - context: datapoint - statements: - - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where IsMapKey(resource.attributes, "service.instance.id") - - set(attributes["site_id"], resource.attributes["site_id"]) where IsMapKey(resource.attributes, "site_id") resourcedetection: detectors: [env, host] timeout: 5s + batch: {} + transform/promote: + # optional, damit fehlende Keys nicht die Pipeline abbrechen: + error_mode: ignore + metric_statements: + - context: datapoint + statements: + - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where 
IsSet(resource.attributes["service.instance.id"]) + - set(attributes["site_id"], resource.attributes["site_id"]) where IsSet(resource.attributes["site_id"]) exporters: prometheus: endpoint: 0.0.0.0:8889 send_timestamps: true - prometheusremotewrite: - # Replace with your remote_write endpoint (Mimir/Cortex/VictoriaMetrics/Thanos Receive) - endpoint: http://mimir:9009/api/v1/push + # Falls du kein Remote-Write-Ziel hast, kommentiere es aus: + # prometheusremotewrite: + # endpoint: http://mimir:9009/api/v1/push + debug: + verbosity: basic service: pipelines: metrics: receivers: [otlp] - processors: [memory_limiter, resourcedetection, batch, transform/promote] - exporters: [prometheus, prometheusremotewrite] + processors: [memory_limiter, resourcedetection, transform/promote, batch] + exporters: [prometheus] # , prometheusremotewrite traces: receivers: [otlp] processors: [memory_limiter, resourcedetection, batch] - exporters: [] - + exporters: [debug] From f8fd8e1bc50924a8dda0228624f616a599aaf00a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 17:53:55 +0200 Subject: [PATCH 23/72] fix: Update otel-collector.yaml and docker-compose to correct endpoint configurations and enhance resource detection --- docker-compose.metrics.yml | 2 +- examples/otel-collector.yaml | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docker-compose.metrics.yml b/docker-compose.metrics.yml index 69746f1..7c8e889 100644 --- a/docker-compose.metrics.yml +++ b/docker-compose.metrics.yml @@ -20,7 +20,7 @@ services: OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317" OTEL_EXPORTER_OTLP_INSECURE: "true" OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative" - NEWT_ADMIN_ADDR: "0.0.0.0:2112" + NEWT_ADMIN_ADDR: ":2112" # Base NEWT configuration PANGOLIN_ENDPOINT: ${PANGOLIN_ENDPOINT} NEWT_ID: ${NEWT_ID} diff --git a/examples/otel-collector.yaml b/examples/otel-collector.yaml index 6000013..b00cb67 100644 --- 
a/examples/otel-collector.yaml +++ b/examples/otel-collector.yaml @@ -2,7 +2,7 @@ receivers: otlp: protocols: grpc: - endpoint: 0.0.0.0:4317 + endpoint: ":4317" processors: memory_limiter: @@ -10,7 +10,7 @@ processors: limit_percentage: 80 spike_limit_percentage: 25 resourcedetection: - detectors: [env, host] + detectors: [env, system] timeout: 5s batch: {} transform/promote: @@ -19,12 +19,12 @@ processors: metric_statements: - context: datapoint statements: - - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where IsSet(resource.attributes["service.instance.id"]) - - set(attributes["site_id"], resource.attributes["site_id"]) where IsSet(resource.attributes["site_id"]) + - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil + - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil exporters: prometheus: - endpoint: 0.0.0.0:8889 + endpoint: ":8889" send_timestamps: true # Falls du kein Remote-Write-Ziel hast, kommentiere es aus: # prometheusremotewrite: From a86b14d97dbea5610705141fa8b8b6fea625b036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 18:43:09 +0200 Subject: [PATCH 24/72] refactor: Simplify telemetry metrics by removing site_id and enhancing tunnel_id usage --- internal/telemetry/metrics.go | 31 ++++++++------------- internal/telemetry/state_view.go | 9 ++++-- internal/telemetry/telemetry.go | 21 ++++++++------ internal/telemetry/telemetry_golden_test.go | 2 +- internal/telemetry/telemetry_smoke_test.go | 2 +- main.go | 8 +++--- util.go | 4 +-- websocket/client.go | 26 ++++++++--------- wg/wg.go | 2 +- wgnetstack/wgnetstack.go | 2 +- 10 files changed, 53 insertions(+), 54 deletions(-) diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index 130fbd3..bd163ca 100644 --- a/internal/telemetry/metrics.go +++ 
b/internal/telemetry/metrics.go @@ -13,8 +13,8 @@ import ( // low-cardinality label guidance from the issue description. // // Counters end with _total, durations are in seconds, sizes in bytes. -// Only low-cardinality stable labels are supported: site_id, tunnel_id, -// transport, direction, result, reason, error_type, region. +// Only low-cardinality stable labels are supported: tunnel_id, +// transport, direction, result, reason, error_type. var ( initOnce sync.Once @@ -147,9 +147,9 @@ var ( // Example inside your code (where you have access to current state): // // telemetry.SetObservableCallback(func(ctx context.Context, o metric.Observer) error { -// o.ObserveInt64(mSiteOnline, 1, attribute.String("site_id", siteID)) -// o.ObserveFloat64(mSiteLastHeartbeat, time.Since(lastHB).Seconds(), attribute.String("site_id", siteID)) -// o.ObserveInt64(mTunnelSessions, int64(len(activeSessions)), attribute.String("site_id", siteID)) +// o.ObserveInt64(mSiteOnline, 1) +// o.ObserveFloat64(mSiteLastHeartbeat, time.Since(lastHB).Seconds()) +// o.ObserveInt64(mTunnelSessions, int64(len(activeSessions))) // return nil // }) func SetObservableCallback(cb func(context.Context, metric.Observer) error) { @@ -174,20 +174,15 @@ func IncConfigReload(ctx context.Context, result string) { // Helpers for counters/histograms -func IncSiteRegistration(ctx context.Context, siteID, region, result string) { +func IncSiteRegistration(ctx context.Context, result string) { attrs := []attribute.KeyValue{ - attribute.String("site_id", siteID), attribute.String("result", result), } - if region != "" { - attrs = append(attrs, attribute.String("region", region)) - } mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrs...)) } -func AddTunnelBytes(ctx context.Context, siteID, tunnelID, direction string, n int64) { +func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) { mTunnelBytes.Add(ctx, n, metric.WithAttributes( - attribute.String("site_id", siteID), 
attribute.String("tunnel_id", tunnelID), attribute.String("direction", direction), )) @@ -198,33 +193,29 @@ func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) { mTunnelBytes.Add(ctx, n, metric.WithAttributeSet(attrs)) } -func ObserveTunnelLatency(ctx context.Context, siteID, tunnelID, transport string, seconds float64) { +func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) { mTunnelLatency.Record(ctx, seconds, metric.WithAttributes( - attribute.String("site_id", siteID), attribute.String("tunnel_id", tunnelID), attribute.String("transport", transport), )) } -func IncReconnect(ctx context.Context, siteID, tunnelID, reason string) { +func IncReconnect(ctx context.Context, tunnelID, reason string) { mReconnects.Add(ctx, 1, metric.WithAttributes( - attribute.String("site_id", siteID), attribute.String("tunnel_id", tunnelID), attribute.String("reason", reason), )) } -func IncConnAttempt(ctx context.Context, siteID, transport, result string) { +func IncConnAttempt(ctx context.Context, transport, result string) { mConnAttempts.Add(ctx, 1, metric.WithAttributes( - attribute.String("site_id", siteID), attribute.String("transport", transport), attribute.String("result", result), )) } -func IncConnError(ctx context.Context, siteID, transport, typ string) { +func IncConnError(ctx context.Context, transport, typ string) { mConnErrors.Add(ctx, 1, metric.WithAttributes( - attribute.String("site_id", siteID), attribute.String("transport", transport), attribute.String("error_type", typ), )) diff --git a/internal/telemetry/state_view.go b/internal/telemetry/state_view.go index 4c97ddf..ec3f529 100644 --- a/internal/telemetry/state_view.go +++ b/internal/telemetry/state_view.go @@ -42,16 +42,19 @@ func RegisterStateView(v StateView) { if online { val = 1 } - o.ObserveInt64(mSiteOnline, val, metric.WithAttributes(attribute.String("site_id", siteID))) + o.ObserveInt64(mSiteOnline, val) } if t, ok := 
sv.LastHeartbeat(siteID); ok { secs := time.Since(t).Seconds() - o.ObserveFloat64(mSiteLastHeartbeat, secs, metric.WithAttributes(attribute.String("site_id", siteID))) + o.ObserveFloat64(mSiteLastHeartbeat, secs) } // If the view supports per-tunnel sessions, report them labeled by tunnel_id. if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok { for tid, n := range tm.SessionsByTunnel() { - o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attribute.String("tunnel_id", tid))) + o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes( + attribute.String("tunnel_id", tid), + attribute.String("transport", "tcp"), + )) } } } diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 20a25c0..d54e4d8 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -93,16 +93,21 @@ type Setup struct { // installs recommended histogram views for *_latency_seconds, and returns a Setup with // a Shutdown method to flush exporters. 
func Init(ctx context.Context, cfg Config) (*Setup, error) { + // Build resource with required attributes and only include optional ones when non-empty + attrs := []attribute.KeyValue{ + semconv.ServiceName(cfg.ServiceName), + semconv.ServiceVersion(cfg.ServiceVersion), + } + if cfg.SiteID != "" { + attrs = append(attrs, attribute.String("site_id", cfg.SiteID)) + } + if cfg.Region != "" { + attrs = append(attrs, attribute.String("region", cfg.Region)) + } res, _ := resource.New(ctx, resource.WithFromEnv(), resource.WithHost(), - resource.WithAttributes( - semconv.ServiceName(cfg.ServiceName), - semconv.ServiceVersion(cfg.ServiceVersion), - // Optional resource attributes - attribute.String("site_id", cfg.SiteID), - attribute.String("region", cfg.Region), - ), + resource.WithAttributes(attrs...), ) s := &Setup{} @@ -168,7 +173,7 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) { AttributeFilter: func(kv attribute.KeyValue) bool { k := string(kv.Key) switch k { - case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type": + case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type", "version", "commit": return true default: return false diff --git a/internal/telemetry/telemetry_golden_test.go b/internal/telemetry/telemetry_golden_test.go index 91dcbd2..3e6f896 100644 --- a/internal/telemetry/telemetry_golden_test.go +++ b/internal/telemetry/telemetry_golden_test.go @@ -25,7 +25,7 @@ cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0", defer ts.Close() // Trigger a counter - IncConnAttempt(ctx, "ignored", "websocket", "success") + IncConnAttempt(ctx, "websocket", "success") time.Sleep(100 * time.Millisecond) resp, err := http.Get(ts.URL) diff --git a/internal/telemetry/telemetry_smoke_test.go b/internal/telemetry/telemetry_smoke_test.go index b820af1..d51ea8e 100644 --- a/internal/telemetry/telemetry_smoke_test.go +++ b/internal/telemetry/telemetry_smoke_test.go @@ 
-36,7 +36,7 @@ func TestMetricsSmoke(t *testing.T) { defer ts.Close() // Record a simple metric and then fetch /metrics - IncConnAttempt(ctx, "site-1", "websocket", "success") + IncConnAttempt(ctx, "websocket", "success") // Give the exporter a tick to collect time.Sleep(100 * time.Millisecond) diff --git a/main.go b/main.go index 025967a..d3624c8 100644 --- a/main.go +++ b/main.go @@ -666,7 +666,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub logger.Debug("Testing initial connection with reliable ping...") lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) if err == nil && wgData.PublicKey != "" { - telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds()) + telemetry.ObserveTunnelLatency(context.Background(), wgData.PublicKey, "wireguard", lat.Seconds()) } if err != nil { logger.Warn("Initial reliable ping failed, but continuing: %v", err) @@ -692,7 +692,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub connected = true // telemetry: record a successful site registration (omit region unless available) - telemetry.IncSiteRegistration(context.Background(), id, "", "success") + telemetry.IncSiteRegistration(context.Background(), "success") // add the targets if there are any if len(wgData.Targets.TCP) > 0 { @@ -728,7 +728,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) { logger.Info("Received reconnect message") if wgData.PublicKey != "" { - telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request") + telemetry.IncReconnect(context.Background(), wgData.PublicKey, "server_request") } // Close the WireGuard device and TUN @@ -763,7 +763,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) { 
logger.Info("Received termination message") if wgData.PublicKey != "" { - telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request") + telemetry.IncReconnect(context.Background(), wgData.PublicKey, "server_request") } // Close the WireGuard device and TUN diff --git a/util.go b/util.go index dc19388..25cdb9d 100644 --- a/util.go +++ b/util.go @@ -295,7 +295,7 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien connectionLost = true logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) if tunnelID != "" { - telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout) + telemetry.IncReconnect(context.Background(), tunnelID, telemetry.ReasonTimeout) } stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second) // Send registration message to the server for backward compatibility @@ -325,7 +325,7 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien recentLatencies = append(recentLatencies, latency) // Record tunnel latency (limit sampling to this periodic check) if tunnelID != "" { - telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds()) + telemetry.ObserveTunnelLatency(context.Background(), tunnelID, "wireguard", latency.Seconds()) } if len(recentLatencies) > 10 { recentLatencies = recentLatencies[1:] diff --git a/websocket/client.go b/websocket/client.go index c9ac264..e38a6c9 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -291,7 +291,7 @@ func (c *Client) getToken() (string, error) { } resp, err := client.Do(req) if err != nil { - telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err)) + telemetry.IncConnError(context.Background(), "auth", classifyConnError(err)) return "", fmt.Errorf("failed to request new token: %w", err) } defer resp.Body.Close() @@ 
-299,17 +299,17 @@ func (c *Client) getToken() (string, error) { if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) - telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure") + telemetry.IncConnAttempt(context.Background(), "auth", "failure") bin := "http_other" if resp.StatusCode >= 500 { bin = "http_5xx" } else if resp.StatusCode >= 400 { bin = "http_4xx" } - telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin) + telemetry.IncConnError(context.Background(), "auth", bin) // Reconnect reason mapping for auth failures if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { - telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError) + telemetry.IncReconnect(context.Background(), c.config.ID, telemetry.ReasonAuthError) } return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) } @@ -329,7 +329,7 @@ func (c *Client) getToken() (string, error) { } logger.Debug("Received token: %s", tokenResp.Data.Token) - telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success") + telemetry.IncConnAttempt(context.Background(), "auth", "success") return tokenResp.Data.Token, nil } @@ -379,8 +379,8 @@ func (c *Client) establishConnection() error { if err != nil { // telemetry: connection attempt failed before dialing // site_id isn't globally available here; use client ID as site_id (low cardinality) - telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure") - telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err)) + telemetry.IncConnAttempt(context.Background(), "websocket", "failure") + telemetry.IncConnError(context.Background(), "websocket", classifyConnError(err)) return fmt.Errorf("failed to get token: %w", err) } @@ -441,21 
+441,21 @@ func (c *Client) establishConnection() error { conn, _, err := dialer.DialContext(spanCtx, u.String(), nil) if err != nil { - telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure") + telemetry.IncConnAttempt(context.Background(), "websocket", "failure") etype := classifyConnError(err) - telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype) + telemetry.IncConnError(context.Background(), "websocket", etype) // Map handshake-related errors to reconnect reasons where appropriate if etype == "tls" { - telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError) + telemetry.IncReconnect(context.Background(), c.config.ID, telemetry.ReasonHandshakeError) } else if etype == "timeout" { - telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout) + telemetry.IncReconnect(context.Background(), c.config.ID, telemetry.ReasonTimeout) } else { - telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError) + telemetry.IncReconnect(context.Background(), c.config.ID, telemetry.ReasonError) } return fmt.Errorf("failed to connect to WebSocket: %w", err) } - telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success") + telemetry.IncConnAttempt(context.Background(), "websocket", "success") c.conn = conn c.setConnected(true) diff --git a/wg/wg.go b/wg/wg.go index adf8df6..1607427 100644 --- a/wg/wg.go +++ b/wg/wg.go @@ -306,7 +306,7 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) { telemetry.IncConfigReload(context.Background(), "success") // Optional reconnect reason mapping: config change if s.serverPubKey != "" { - telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange) + telemetry.IncReconnect(context.Background(), s.serverPubKey, telemetry.ReasonConfigChange) } // Ensure the WireGuard interface and peers are configured diff --git 
a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go index 09f160e..dd7d493 100644 --- a/wgnetstack/wgnetstack.go +++ b/wgnetstack/wgnetstack.go @@ -246,7 +246,7 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str // ReportRTT allows reporting native RTTs to telemetry, rate-limited externally. func (s *WireGuardService) ReportRTT(seconds float64) { if s.serverPubKey == "" { return } - telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds) + telemetry.ObserveTunnelLatency(context.Background(), s.serverPubKey, "wireguard", seconds) } func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) { From 427ab67bb5391670092669f482a2859f9e493cc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 18:45:02 +0200 Subject: [PATCH 25/72] fix: Update observability documentation to clarify resource attributes and scraping strategy --- docs/observability.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/observability.md b/docs/observability.md index 3e9e890..6267372 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -35,7 +35,8 @@ Runtime behavior Metric catalog (initial) -- newt_site_registrations_total (counter) labels: result, region (optional); site_id is a resource attribute +- newt_build_info (gauge) labels: version, commit; value is always 1 +- newt_site_registrations_total (counter) labels: result; site_id and region are resource attributes - newt_site_online (observable gauge) no labels (0/1) - newt_site_last_heartbeat_seconds (observable gauge) no labels - newt_tunnel_sessions (observable gauge) labels: tunnel_id, transport @@ -101,6 +102,9 @@ OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer abc123,tenant=acme" \ ``` Prometheus scrape strategy (choose one) + +Important: Do not scrape both Newt (2112) and the Collector’s Prometheus exporter (8889) at the same time for the same process. 
Doing so will double-count cumulative counters. + A) Scrape Newt directly: ``` @@ -157,7 +161,7 @@ sum(newt_tunnel_sessions) Compatibility notes - Gauges do not use the _total suffix (e.g., newt_tunnel_sessions). -- site_id is a resource attribute (one process = one site). tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels. +- site_id and region are resource attributes (one process = one site). Only non-empty resource attributes are exported. tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels. - Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both. - Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write. - No free text in labels; use only the enumerated constants for reason and protocol. From d745aa79d4c0873c709d31aade0fc510f8198b7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 18:45:40 +0200 Subject: [PATCH 26/72] feat: Add Grafana dashboard and Prometheus datasource configuration files --- .../grafana/dashboards/newt-overview.json | 898 ++++++++++++++++++ .../provisioning/dashboards/dashboard.yaml | 9 + .../provisioning/datasources/prometheus.yaml | 9 + 3 files changed, 916 insertions(+) create mode 100644 examples/grafana/dashboards/newt-overview.json create mode 100644 examples/grafana/provisioning/dashboards/dashboard.yaml create mode 100644 examples/grafana/provisioning/datasources/prometheus.yaml diff --git a/examples/grafana/dashboards/newt-overview.json b/examples/grafana/dashboards/newt-overview.json new file mode 100644 index 0000000..2f3a539 --- /dev/null +++ b/examples/grafana/dashboards/newt-overview.json @@ -0,0 +1,898 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": 
"dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 500 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_goroutine_count", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Goroutines", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 256 + }, + { + "color": "red", + "value": 512 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "editorMode": "code", + "expr": "go_memory_gc_goal_bytes / 1024 / 1024", + "format": "time_series", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "GC Target Heap (MiB)", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 10 + }, + { + "color": "red", + "value": 25 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(http_server_request_duration_seconds_count[$__rate_interval]))", + "instant": false, + "legendFormat": "req/s", + "refId": "A" + } + ], + "title": "HTTP Requests / s", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 0.1 + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + 
"reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(newt_connection_errors_total{site_id=~\"$site_id\"}[$__rate_interval]))", + "instant": false, + "legendFormat": "errors/s", + "refId": "A" + } + ], + "title": "Connection Errors / s", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(go_memory_used_bytes)", + "legendFormat": "Used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memory_gc_goal_bytes", + "legendFormat": "GC Goal", + "refId": "B" + } + ], + "title": "Go Heap Usage vs GC Goal", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + 
"expr": "rate(go_memory_allocations_total[$__rate_interval])", + "legendFormat": "Allocations/s", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(go_memory_allocated_bytes_total[$__rate_interval])", + "legendFormat": "Allocated bytes/s", + "refId": "B" + } + ], + "title": "Allocation Activity", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "HTTP Request Duration Quantiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 8, + 
"options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(http_server_request_duration_seconds_count[$__rate_interval])) by (http_response_status_code)", + "legendFormat": "{{http_response_status_code}}", + "refId": "A" + } + ], + "title": "HTTP Requests by Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(newt_connection_attempts_total{site_id=~\"$site_id\"}[$__rate_interval])) by (transport, result)", + "legendFormat": "{{transport}} • {{result}}", + "refId": "A" + } + ], + "title": "Connection Attempts by Transport/Result", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": 
"sum(rate(newt_connection_errors_total{site_id=~\"$site_id\"}[$__rate_interval])) by (transport, error_type)", + "legendFormat": "{{transport}} • {{error_type}}", + "refId": "A" + } + ], + "title": "Connection Errors by Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 3, + "mappings": [], + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Tunnel Latency Quantiles", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateTurbo" + }, + "dataFormat": "tsbuckets", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], 
+ "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 34 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "id": 12, + "legend": { + "show": false + }, + "options": { + "calculate": true, + "cellGap": 2, + "cellSize": "auto", + "color": { + "exponent": 0.5 + }, + "exemplars": { + "color": "rgba(255,255,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "tooltip": { + "mode": "single", + "show": true + }, + "xAxis": { + "show": true + }, + "yAxis": { + "decimals": 3, + "show": true + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Tunnel Latency Bucket Rate", + "type": "heatmap" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": [ + "newt", + "otel" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "label": "Datasource", + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(target_info, site_id)", + "hide": 0, + "includeAll": true, + "label": "Site", + "multi": true, + "name": "site_id", + "options": [], + "query": { + "query": "label_values(target_info, site_id)", + "refId": "SiteIdVar" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", 
+ "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\"}, tunnel_id)", + "hide": 0, + "includeAll": true, + "label": "Tunnel", + "multi": true, + "name": "tunnel_id", + "options": [], + "query": { + "query": "label_values(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\"}, tunnel_id)", + "refId": "TunnelVar" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Newt Overview", + "uid": "newt-overview", + "version": 1, + "weekStart": "" +} diff --git a/examples/grafana/provisioning/dashboards/dashboard.yaml b/examples/grafana/provisioning/dashboards/dashboard.yaml new file mode 100644 index 0000000..0acac20 --- /dev/null +++ b/examples/grafana/provisioning/dashboards/dashboard.yaml @@ -0,0 +1,9 @@ +apiVersion: 1 +providers: + - name: "newt" + folder: "Newt" + type: file + disableDeletion: false + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards diff --git a/examples/grafana/provisioning/datasources/prometheus.yaml b/examples/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 0000000..4efb4f7 --- /dev/null +++ b/examples/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,9 @@ +apiVersion: 1 +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + uid: prometheus + isDefault: true + editable: true From d907ae9e84637cd8976915983c5734f7d96acb7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= 
Date: Tue, 7 Oct 2025 23:55:23 +0200 Subject: [PATCH 27/72] fix: Remove unnecessary blank line in prometheus.yml --- examples/prometheus.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/prometheus.yml b/examples/prometheus.yml index 5323b20..c3018a5 100644 --- a/examples/prometheus.yml +++ b/examples/prometheus.yml @@ -8,4 +8,3 @@ scrape_configs: - job_name: otel-collector static_configs: - targets: ["collector:8889"] - From 59e8d79404f57428ecf2f38111b685c3608238e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Tue, 7 Oct 2025 23:55:47 +0200 Subject: [PATCH 28/72] chore: Update docker-compose.metrics.yml for improved service configuration --- docker-compose.metrics.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docker-compose.metrics.yml b/docker-compose.metrics.yml index 7c8e889..e3eb21d 100644 --- a/docker-compose.metrics.yml +++ b/docker-compose.metrics.yml @@ -38,3 +38,18 @@ services: ports: - "9090:9090" + grafana: + image: grafana/grafana:latest + container_name: newt-metrics-grafana + restart: unless-stopped + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + ports: + - "3005:3000" + depends_on: + - prometheus + volumes: + - ./examples/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro + - ./examples/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro + - ./examples/grafana/dashboards:/var/lib/grafana/dashboards:ro \ No newline at end of file From d91c6ef1689922799125d25db8ac841f41ff3fd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:00:56 +0200 Subject: [PATCH 29/72] fix: Update observability documentation to correct code block syntax and improve clarity --- docs/observability.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/observability.md b/docs/observability.md index 6267372..93e5595 100644 --- a/docs/observability.md +++ 
b/docs/observability.md @@ -1,3 +1,4 @@ + # OpenTelemetry Observability for Newt This document describes how Newt exposes metrics using the OpenTelemetry (OTel) Go SDK, how to enable Prometheus scraping, and how to send data to an OpenTelemetry Collector for further export. @@ -92,7 +93,7 @@ OTLP TLS example - Enable TLS to Collector with a custom CA and headers: -``` +```sh NEWT_METRICS_OTLP_ENABLED=true \ OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \ OTEL_EXPORTER_OTLP_INSECURE=false \ @@ -107,7 +108,7 @@ Important: Do not scrape both Newt (2112) and the Collector’s Prometheus expor A) Scrape Newt directly: -``` +```yaml global: scrape_interval: 15s scrape_configs: @@ -118,7 +119,7 @@ scrape_configs: B) Scrape the Collector’s Prometheus exporter: -``` +```yaml global: scrape_interval: 15s scrape_configs: @@ -142,19 +143,19 @@ PromQL snippets - Throughput in (5m): -``` +```sh sum(rate(newt_tunnel_bytes_total{direction="in"}[5m])) ``` - P95 latency (seconds): -``` +```sh histogram_quantile(0.95, sum(rate(newt_tunnel_latency_seconds_bucket[5m])) by (le)) ``` - Active sessions: -``` +```sh sum(newt_tunnel_sessions) ``` From 62407b0c740168623a8b27d4f71c357954d225e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:02:44 +0200 Subject: [PATCH 30/72] remove: removed test results --- artifacts/test-results.txt | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 artifacts/test-results.txt diff --git a/artifacts/test-results.txt b/artifacts/test-results.txt deleted file mode 100644 index db66eb8..0000000 --- a/artifacts/test-results.txt +++ /dev/null @@ -1,13 +0,0 @@ -FAIL github.com/fosrl/newt [setup failed] -FAIL github.com/fosrl/newt/docker [setup failed] -FAIL github.com/fosrl/newt/internal/state [setup failed] -FAIL github.com/fosrl/newt/internal/telemetry [setup failed] -FAIL github.com/fosrl/newt/proxy [setup failed] -FAIL github.com/fosrl/newt/websocket [setup failed] -FAIL github.com/fosrl/newt/wgnetstack 
[setup failed] -? github.com/fosrl/newt/healthcheck [no test files] -? github.com/fosrl/newt/logger [no test files] -? github.com/fosrl/newt/network [no test files] -? github.com/fosrl/newt/updates [no test files] -FAIL github.com/fosrl/newt/wgtester [build failed] -FAIL From 1e88fb86b40ec6c774a8f1a7e7f01ddad8787ffc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:09:17 +0200 Subject: [PATCH 31/72] feat(telemetry,metrics): allow site_id/region in attribute filter; read site_id from NEWT_SITE_ID/NEWT_ID or OTEL_RESOURCE_ATTRIBUTES; propagate site_id/region labels across metrics; include site labels in build_info; seed global site info --- internal/telemetry/metrics.go | 35 ++++++++----- internal/telemetry/telemetry.go | 89 +++++++++++++++++++++++++++++++-- 2 files changed, 108 insertions(+), 16 deletions(-) diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index bd163ca..2f4b005 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -46,6 +46,14 @@ var ( buildCommit string ) +// attrsWithSite appends global site/region labels when present. +func attrsWithSite(extra ...attribute.KeyValue) []attribute.KeyValue { + attrs := make([]attribute.KeyValue, 0, len(extra)+2) + attrs = append(attrs, extra...) + attrs = append(attrs, siteAttrs()...) + return attrs +} + func registerInstruments() error { var err error initOnce.Do(func() { @@ -124,6 +132,7 @@ func registerInstruments() error { if buildCommit != "" { attrs = append(attrs, attribute.String("commit", buildCommit)) } + attrs = append(attrs, siteAttrs()...) 
o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...)) return nil }, mBuildInfo) @@ -169,7 +178,9 @@ func RegisterBuildInfo(version, commit string) { // Config reloads func IncConfigReload(ctx context.Context, result string) { - mConfigReloads.Add(ctx, 1, metric.WithAttributes(attribute.String("result", result))) + mConfigReloads.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("result", result), + )...)) } // Helpers for counters/histograms @@ -178,14 +189,14 @@ func IncSiteRegistration(ctx context.Context, result string) { attrs := []attribute.KeyValue{ attribute.String("result", result), } - mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrs...)) + mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) } func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) { - mTunnelBytes.Add(ctx, n, metric.WithAttributes( + mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite( attribute.String("tunnel_id", tunnelID), attribute.String("direction", direction), - )) + )...)) } // AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations. 
@@ -194,29 +205,29 @@ func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) { } func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) { - mTunnelLatency.Record(ctx, seconds, metric.WithAttributes( + mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite( attribute.String("tunnel_id", tunnelID), attribute.String("transport", transport), - )) + )...)) } func IncReconnect(ctx context.Context, tunnelID, reason string) { - mReconnects.Add(ctx, 1, metric.WithAttributes( + mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite( attribute.String("tunnel_id", tunnelID), attribute.String("reason", reason), - )) + )...)) } func IncConnAttempt(ctx context.Context, transport, result string) { - mConnAttempts.Add(ctx, 1, metric.WithAttributes( + mConnAttempts.Add(ctx, 1, metric.WithAttributes(attrsWithSite( attribute.String("transport", transport), attribute.String("result", result), - )) + )...)) } func IncConnError(ctx context.Context, transport, typ string) { - mConnErrors.Add(ctx, 1, metric.WithAttributes( + mConnErrors.Add(ctx, 1, metric.WithAttributes(attrsWithSite( attribute.String("transport", transport), attribute.String("error_type", typ), - )) + )...)) } diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index d54e4d8..30efd46 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -6,6 +6,7 @@ import ( "net/http" "os" "strings" + "sync/atomic" "time" promclient "github.com/prometheus/client_golang/prometheus" @@ -63,16 +64,34 @@ type Config struct { // OTEL_SERVICE_VERSION (default: "") // NEWT_ADMIN_ADDR (default: ":2112") func FromEnv() Config { + // Prefer explicit NEWT_* env vars, then fall back to OTEL_RESOURCE_ATTRIBUTES + site := getenv("NEWT_SITE_ID", "") + if site == "" { + site = getenv("NEWT_ID", "") + } + region := os.Getenv("NEWT_REGION") + if site == "" || region == "" { + if ra := 
os.Getenv("OTEL_RESOURCE_ATTRIBUTES"); ra != "" { + m := parseResourceAttributes(ra) + if site == "" { + site = m["site_id"] + } + if region == "" { + region = m["region"] + } + } + } return Config{ ServiceName: getenv("OTEL_SERVICE_NAME", "newt"), ServiceVersion: os.Getenv("OTEL_SERVICE_VERSION"), - Region: os.Getenv("NEWT_REGION"), + SiteID: site, + Region: region, PromEnabled: getenv("NEWT_METRICS_PROMETHEUS_ENABLED", "true") == "true", OTLPEnabled: getenv("NEWT_METRICS_OTLP_ENABLED", "false") == "true", OTLPEndpoint: getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"), OTLPInsecure: getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true", MetricExportInterval: getdur("OTEL_METRIC_EXPORT_INTERVAL", 15*time.Second), - AdminAddr: getenv("NEWT_ADMIN_ADDR", "127.0.0.1:2112"), + AdminAddr: getenv("NEWT_ADMIN_ADDR", "*********:2112"), } } @@ -110,6 +129,9 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) { resource.WithAttributes(attrs...), ) + // Seed global site/region for label propagation + UpdateSiteInfo(cfg.SiteID, cfg.Region) + s := &Setup{} // Build metric readers/exporters @@ -166,14 +188,14 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) { }, }, ))) - // Attribute whitelist: only allow expected low-cardinality keys on newt_* instruments. +// Attribute whitelist: only allow expected low-cardinality keys on newt_* instruments. 
mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView( sdkmetric.Instrument{Name: "newt_*"}, sdkmetric.Stream{ AttributeFilter: func(kv attribute.KeyValue) bool { k := string(kv.Key) switch k { - case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type", "version", "commit": + case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type", "version", "commit", "site_id", "region": return true default: return false @@ -253,6 +275,65 @@ func parseOTLPHeaders(h string) map[string]string { return m } +// parseResourceAttributes parses OTEL_RESOURCE_ATTRIBUTES formatted as k=v,k2=v2 +func parseResourceAttributes(s string) map[string]string { + m := map[string]string{} + if s == "" { + return m + } + parts := strings.Split(s, ",") + for _, p := range parts { + kv := strings.SplitN(strings.TrimSpace(p), "=", 2) + if len(kv) == 2 { + m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1]) + } + } + return m +} + +// Global site/region used to enrich metric labels. +var siteIDVal atomic.Value +var regionVal atomic.Value + +// UpdateSiteInfo updates the global site_id and region used for metric labels. +func UpdateSiteInfo(siteID, region string) { + if siteID != "" { + siteIDVal.Store(siteID) + } + if region != "" { + regionVal.Store(region) + } +} + +func getSiteID() string { + if v, ok := siteIDVal.Load().(string); ok { + return v + } + return "" +} + +func getRegion() string { + if v, ok := regionVal.Load().(string); ok { + return v + } + return "" +} + +// siteAttrs returns label KVs for site_id and region (if set). +func siteAttrs() []attribute.KeyValue { + var out []attribute.KeyValue + if s := getSiteID(); s != "" { + out = append(out, attribute.String("site_id", s)) + } + if r := getRegion(); r != "" { + out = append(out, attribute.String("region", r)) + } + return out +} + +// SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager). 
+func SiteLabelKVs() []attribute.KeyValue { return siteAttrs() } + func getenv(k, d string) string { if v := os.Getenv(k); v != "" { return v From 83c3ae5cf950416fe4d72ba8ead309c183487a21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:09:30 +0200 Subject: [PATCH 32/72] feat(telemetry/state_view): add site_id label to gauges and set tunnel_sessions transport=wireguard (no hardcoded tcp) --- internal/telemetry/state_view.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/internal/telemetry/state_view.go b/internal/telemetry/state_view.go index ec3f529..fe57dc3 100644 --- a/internal/telemetry/state_view.go +++ b/internal/telemetry/state_view.go @@ -42,18 +42,23 @@ func RegisterStateView(v StateView) { if online { val = 1 } - o.ObserveInt64(mSiteOnline, val) + o.ObserveInt64(mSiteOnline, val, metric.WithAttributes( + attribute.String("site_id", getSiteID()), + )) } if t, ok := sv.LastHeartbeat(siteID); ok { secs := time.Since(t).Seconds() - o.ObserveFloat64(mSiteLastHeartbeat, secs) + o.ObserveFloat64(mSiteLastHeartbeat, secs, metric.WithAttributes( + attribute.String("site_id", getSiteID()), + )) } // If the view supports per-tunnel sessions, report them labeled by tunnel_id. 
if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok { for tid, n := range tm.SessionsByTunnel() { o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes( + attribute.String("site_id", getSiteID()), attribute.String("tunnel_id", tid), - attribute.String("transport", "tcp"), + attribute.String("transport", "wireguard"), )) } } From 09fcb369633f3b3e69ba666514c8477aa7d8cd4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:09:44 +0200 Subject: [PATCH 33/72] fix(main): remove duplicate ClearTunnelID/State and call telemetry.UpdateSiteInfo after resolving client ID --- main.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/main.go b/main.go index d3624c8..6c68431 100644 --- a/main.go +++ b/main.go @@ -469,6 +469,8 @@ func main() { } endpoint = client.GetConfig().Endpoint // Update endpoint from config id = client.GetConfig().ID // Update ID from config + // Update site labels for metrics with the resolved ID + telemetry.UpdateSiteInfo(id, region) // output env var values if set logger.Debug("Endpoint: %v", endpoint) @@ -740,12 +742,6 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub state.Global().ClearTunnel(wgData.PublicKey) } - // Clear metrics attrs and sessions for the tunnel - if pm != nil { - pm.ClearTunnelID() - state.Global().ClearTunnel(wgData.PublicKey) - } - // Mark as disconnected connected = false From 31514f26df6684b3b7359951754ef49e5f2e5994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:10:03 +0200 Subject: [PATCH 34/72] feat(proxy): add site_id (and optional region) to bytes attribute sets for tunnel metrics --- proxy/manager.go | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/proxy/manager.go b/proxy/manager.go index e2b7a79..2fd731e 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -111,10 +111,28 @@ func (pm *ProxyManager) SetTunnelID(id 
string) { pm.tunnels[id] = &tunnelEntry{} } e := pm.tunnels[id] - e.attrInTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "tcp")) - e.attrOutTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "tcp")) - e.attrInUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "udp")) - e.attrOutUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "udp")) + // include site labels if available + site := telemetry.SiteLabelKVs() + e.attrInTCP = attribute.NewSet(append(site, + attribute.String("tunnel_id", id), + attribute.String("direction", "in"), + attribute.String("protocol", "tcp"), + )...) + e.attrOutTCP = attribute.NewSet(append(site, + attribute.String("tunnel_id", id), + attribute.String("direction", "out"), + attribute.String("protocol", "tcp"), + )...) + e.attrInUDP = attribute.NewSet(append(site, + attribute.String("tunnel_id", id), + attribute.String("direction", "in"), + attribute.String("protocol", "udp"), + )...) + e.attrOutUDP = attribute.NewSet(append(site, + attribute.String("tunnel_id", id), + attribute.String("direction", "out"), + attribute.String("protocol", "udp"), + )...) } // ClearTunnelID clears cached attribute sets for the current tunnel. 
From 31f70e50320ada1b37044158d8cbfe48d9748391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:10:17 +0200 Subject: [PATCH 35/72] test(telemetry): assert allowed attribute site_id appears in metrics exposition --- internal/telemetry/telemetry_attrfilter_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/telemetry/telemetry_attrfilter_test.go b/internal/telemetry/telemetry_attrfilter_test.go index 461888f..6c54afe 100644 --- a/internal/telemetry/telemetry_attrfilter_test.go +++ b/internal/telemetry/telemetry_attrfilter_test.go @@ -39,5 +39,8 @@ cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"} if strings.Contains(body, "forbidden=") { t.Fatalf("unexpected forbidden attribute leaked into metrics: %s", body) } + if !strings.Contains(body, "site_id=\"x\"") { + t.Fatalf("expected allowed attribute site_id to be present in metrics, got: %s", body) + } } From f86031f4583575fe885ddab8eeeff6477802b47c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:10:54 +0200 Subject: [PATCH 36/72] docs: update observability catalog to include site_id labels and clarify transport vs protocol; add METRICS_RECOMMENDATIONS.md with roadmap and ops guidance --- docs/METRICS_RECOMMENDATIONS.md | 66 +++++++++++++++++++++++++++++++++ docs/observability.md | 28 ++++++++------ 2 files changed, 82 insertions(+), 12 deletions(-) create mode 100644 docs/METRICS_RECOMMENDATIONS.md diff --git a/docs/METRICS_RECOMMENDATIONS.md b/docs/METRICS_RECOMMENDATIONS.md new file mode 100644 index 0000000..e1dfbf9 --- /dev/null +++ b/docs/METRICS_RECOMMENDATIONS.md @@ -0,0 +1,66 @@ +# Newt Metrics: Recommendations, Gaps, and Roadmap + +This document captures the current state of Newt metrics, prioritized fixes, and a pragmatic roadmap for near-term improvements. 
+ +1) Current setup (summary) +- Export: Prometheus exposition (default), optional OTLP (gRPC) +- Existing instruments: + - Sites: newt_site_registrations_total, newt_site_online (0/1), newt_site_last_heartbeat_seconds + - Tunnel/Traffic: newt_tunnel_sessions, newt_tunnel_bytes_total, newt_tunnel_latency_seconds, newt_tunnel_reconnects_total + - Connection lifecycle: newt_connection_attempts_total, newt_connection_errors_total + - Operations: newt_config_reloads_total, newt_restart_count_total, newt_build_info + - Go runtime: GC, heap, goroutines via runtime instrumentation + +2) Main issues addressed now +- Attribute filter (allow-list) extended to include site_id and region in addition to existing keys (tunnel_id, transport, protocol, direction, result, reason, error_type, version, commit). +- site_id and region propagation: site_id is now attached as a metric label across newt_*; region is added as a metric label when set. Both remain resource attributes for consistency with OTEL. +- Label semantics clarified: + - transport: control-plane mechanism (e.g., websocket, wireguard) + - protocol: L4 payload type (tcp, udp) + - newt_tunnel_bytes_total uses protocol and direction, not transport. +- Robustness improvements: removed duplicate clear logic on reconnect; avoided empty site_id by reading NEWT_SITE_ID/NEWT_ID and OTEL_RESOURCE_ATTRIBUTES. + +3) Remaining gaps and deviations +- Some call sites still need initiator label on reconnect outcomes (client vs server). This is planned. +- WebSocket and Proxy metrics (connect latency, messages, active connections, buffer/drops, async backlog) are planned additions. +- Config apply duration and cert rotation counters are planned. 
+ +4) Roadmap (phased) +- Phase 1 (done in this iteration) + - Fix attribute filter (site_id, region) + - Propagate site_id (and optional region) across metrics + - Correct label semantics (transport vs protocol); fix sessions transport labelling + - Documentation alignment +- Phase 2 (next) + - WebSocket: newt_websocket_connect_latency_seconds; newt_websocket_messages_total{direction,msg_type} + - Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_drops_total, newt_proxy_async_backlog_bytes + - Reconnect: add initiator label (client/server) + - Config & PKI: newt_config_apply_seconds{phase,result}; newt_cert_rotation_total{result} + +5) Operational guidance +- Do not double scrape: scrape either Newt (/metrics) or the Collector’s Prometheus exporter (not both) to avoid double-counting cumulative counters. +- For high cardinality tunnel_id, consider relabeling or dropping per-tunnel series in Prometheus to control cardinality. +- OTLP troubleshooting: enable TLS via OTEL_EXPORTER_OTLP_CERTIFICATE, use OTEL_EXPORTER_OTLP_HEADERS for auth; verify endpoint reachability. + +6) Example alerts/recording rules (suggestions) +- Reconnect spikes: + - increase(newt_tunnel_reconnects_total[5m]) by (site_id) +- Sustained connection errors: + - rate(newt_connection_errors_total[5m]) by (site_id,transport,error_type) +- Heartbeat gaps: + - max_over_time(newt_site_last_heartbeat_seconds[15m]) by (site_id) +- Proxy drops: + - increase(newt_proxy_drops_total[5m]) by (site_id,protocol) +- WebSocket connect p95 (when added): + - histogram_quantile(0.95, sum(rate(newt_websocket_connect_latency_seconds_bucket[5m])) by (le,site_id)) + +7) Collector configuration +- Direct scrape variant requires no attribute promotion since site_id is already a metric label. +- Transform/promote variant remains optional for environments that rely on resource-to-label promotion. 
+ +8) Testing +- curl :2112/metrics | grep ^newt_ +- Verify presence of site_id across series; region appears when set. +- Ensure disallowed attributes are filtered; allowed (site_id) retained. + + diff --git a/docs/observability.md b/docs/observability.md index 93e5595..1aa7a77 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -36,16 +36,16 @@ Runtime behavior Metric catalog (initial) -- newt_build_info (gauge) labels: version, commit; value is always 1 -- newt_site_registrations_total (counter) labels: result; site_id and region are resource attributes -- newt_site_online (observable gauge) no labels (0/1) -- newt_site_last_heartbeat_seconds (observable gauge) no labels -- newt_tunnel_sessions (observable gauge) labels: tunnel_id, transport -- newt_tunnel_bytes_total (counter) labels: tunnel_id, direction (in|out) -- newt_tunnel_latency_seconds (histogram) labels: tunnel_id, transport -- newt_tunnel_reconnects_total (counter) labels: tunnel_id, reason -- newt_connection_attempts_total (counter) labels: transport, result -- newt_connection_errors_total (counter) labels: transport, error_type +- newt_build_info (gauge) labels: version, commit, site_id[, region]; value is always 1 +- newt_site_registrations_total (counter) labels: result, site_id[, region] +- newt_site_online (observable gauge) labels: site_id (0/1) +- newt_site_last_heartbeat_seconds (observable gauge) labels: site_id +- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id, transport (transport e.g. 
wireguard) +- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (in|out) +- newt_tunnel_latency_seconds (histogram) labels: site_id, tunnel_id, transport (e.g., wireguard) +- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, reason +- newt_connection_attempts_total (counter) labels: site_id, transport, result +- newt_connection_errors_total (counter) labels: site_id, transport, error_type Conventions @@ -162,10 +162,14 @@ sum(newt_tunnel_sessions) Compatibility notes - Gauges do not use the _total suffix (e.g., newt_tunnel_sessions). -- site_id and region are resource attributes (one process = one site). Only non-empty resource attributes are exported. tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels. +- site_id is emitted as both resource attribute and metric label on all newt_* series; region is included as a metric label only when set. tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels. - Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both. - Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write. -- No free text in labels; use only the enumerated constants for reason and protocol. +- No free text in labels; use only the enumerated constants for reason, protocol (tcp|udp), and transport (e.g., websocket|wireguard). + +Further reading + +- See docs/METRICS_RECOMMENDATIONS.md for roadmap, label guidance (transport vs protocol), and example alerts. 
Troubleshooting From d74065a71b0b9558caeab5ede4bc0c05a483692e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:30:07 +0200 Subject: [PATCH 37/72] feat(phase2): websocket connect latency and message counters; proxy active/buffer/drops gauges and counters; config apply histogram; reconnect initiator label; update call-sites --- internal/telemetry/metrics.go | 109 ++++++++++++++++++++++++++++++-- internal/telemetry/telemetry.go | 2 +- main.go | 4 +- proxy/manager.go | 50 ++++++++++++--- util.go | 12 ++-- websocket/client.go | 29 +++++++-- wg/wg.go | 10 ++- 7 files changed, 187 insertions(+), 29 deletions(-) diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index 2f4b005..2b332e4 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -36,12 +36,24 @@ var ( mConnErrors metric.Int64Counter // Config/Restart - mConfigReloads metric.Int64Counter - mRestartCount metric.Int64Counter + mConfigReloads metric.Int64Counter + mRestartCount metric.Int64Counter + mConfigApply metric.Float64Histogram + mCertRotationTotal metric.Int64Counter // Build info mBuildInfo metric.Int64ObservableGauge + // WebSocket + mWSConnectLatency metric.Float64Histogram + mWSMessages metric.Int64Counter + + // Proxy + mProxyActiveConns metric.Int64ObservableGauge + mProxyBufferBytes metric.Int64ObservableGauge + mProxyAsyncBacklogByte metric.Int64ObservableGauge + mProxyDropsTotal metric.Int64Counter + buildVersion string buildCommit string ) @@ -115,11 +127,31 @@ func registerInstruments() error { metric.WithDescription("Configuration reloads")) mRestartCount, _ = meter.Int64Counter("newt_restart_count_total", metric.WithDescription("Process restart count (incremented on start)")) + mConfigApply, _ = meter.Float64Histogram("newt_config_apply_seconds", + metric.WithDescription("Configuration apply duration in seconds")) + mCertRotationTotal, _ = meter.Int64Counter("newt_cert_rotation_total", + 
metric.WithDescription("Certificate rotation events (success/failure)")) // Build info gauge (value 1 with version/commit attributes) mBuildInfo, _ = meter.Int64ObservableGauge("newt_build_info", metric.WithDescription("Newt build information (value is always 1)")) + // WebSocket + mWSConnectLatency, _ = meter.Float64Histogram("newt_websocket_connect_latency_seconds", + metric.WithDescription("WebSocket connect latency in seconds")) + mWSMessages, _ = meter.Int64Counter("newt_websocket_messages_total", + metric.WithDescription("WebSocket messages by direction and type")) + + // Proxy + mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections", + metric.WithDescription("Proxy active connections per tunnel and protocol")) + mProxyBufferBytes, _ = meter.Int64ObservableGauge("newt_proxy_buffer_bytes", + metric.WithDescription("Proxy buffer bytes (may approximate async backlog)")) + mProxyAsyncBacklogByte, _ = meter.Int64ObservableGauge("newt_proxy_async_backlog_bytes", + metric.WithDescription("Unflushed async byte backlog per tunnel and protocol")) + mProxyDropsTotal, _ = meter.Int64Counter("newt_proxy_drops_total", + metric.WithDescription("Proxy drops due to write errors")) + // Register a default callback for build info if version/commit set meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { if buildVersion == "" && buildCommit == "" { @@ -145,8 +177,10 @@ func registerInstruments() error { // heartbeat seconds, and active sessions. var ( - obsOnce sync.Once - obsStopper func() + obsOnce sync.Once + obsStopper func() + proxyObsOnce sync.Once + proxyStopper func() ) // SetObservableCallback registers a single callback that will be invoked @@ -168,6 +202,14 @@ func SetObservableCallback(cb func(context.Context, metric.Observer) error) { }) } +// SetProxyObservableCallback registers a callback to observe proxy gauges. 
+func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) { + proxyObsOnce.Do(func() { + meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte) + proxyStopper = func() {} + }) +} + // Build info registration func RegisterBuildInfo(version, commit string) { buildVersion = version @@ -204,6 +246,62 @@ func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) { mTunnelBytes.Add(ctx, n, metric.WithAttributeSet(attrs)) } +// --- WebSocket helpers --- + +func ObserveWSConnectLatency(ctx context.Context, seconds float64, result, errorType string) { + attrs := []attribute.KeyValue{ + attribute.String("transport", "websocket"), + attribute.String("result", result), + } + if errorType != "" { + attrs = append(attrs, attribute.String("error_type", errorType)) + } + mWSConnectLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +func IncWSMessage(ctx context.Context, direction, msgType string) { + mWSMessages.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("direction", direction), + attribute.String("msg_type", msgType), + )...)) +} + +// --- Proxy helpers --- + +func ObserveProxyActiveConnsObs(o metric.Observer, value int64, attrs []attribute.KeyValue) { + o.ObserveInt64(mProxyActiveConns, value, metric.WithAttributes(attrs...)) +} + +func ObserveProxyBufferBytesObs(o metric.Observer, value int64, attrs []attribute.KeyValue) { + o.ObserveInt64(mProxyBufferBytes, value, metric.WithAttributes(attrs...)) +} + +func ObserveProxyAsyncBacklogObs(o metric.Observer, value int64, attrs []attribute.KeyValue) { + o.ObserveInt64(mProxyAsyncBacklogByte, value, metric.WithAttributes(attrs...)) +} + +func IncProxyDrops(ctx context.Context, tunnelID, protocol string) { + mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("tunnel_id", tunnelID), + attribute.String("protocol", protocol), + )...)) +} + +// --- Config/PKI helpers --- + 
+func ObserveConfigApply(ctx context.Context, phase, result string, seconds float64) { + mConfigApply.Record(ctx, seconds, metric.WithAttributes(attrsWithSite( + attribute.String("phase", phase), + attribute.String("result", result), + )...)) +} + +func IncCertRotation(ctx context.Context, result string) { + mCertRotationTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("result", result), + )...)) +} + func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) { mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite( attribute.String("tunnel_id", tunnelID), @@ -211,9 +309,10 @@ func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, secon )...)) } -func IncReconnect(ctx context.Context, tunnelID, reason string) { +func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) { mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite( attribute.String("tunnel_id", tunnelID), + attribute.String("initiator", initiator), attribute.String("reason", reason), )...)) } diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 30efd46..9b2a84c 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -195,7 +195,7 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) { AttributeFilter: func(kv attribute.KeyValue) bool { k := string(kv.Key) switch k { - case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type", "version", "commit", "site_id", "region": + case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "initiator", "error_type", "version", "commit", "site_id", "region": return true default: return false diff --git a/main.go b/main.go index 6c68431..8360888 100644 --- a/main.go +++ b/main.go @@ -730,7 +730,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub client.RegisterHandler("newt/wg/reconnect", func(msg 
websocket.WSMessage) { logger.Info("Received reconnect message") if wgData.PublicKey != "" { - telemetry.IncReconnect(context.Background(), wgData.PublicKey, "server_request") + telemetry.IncReconnect(context.Background(), wgData.PublicKey, "server", telemetry.ReasonServerRequest) } // Close the WireGuard device and TUN @@ -759,7 +759,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) { logger.Info("Received termination message") if wgData.PublicKey != "" { - telemetry.IncReconnect(context.Background(), wgData.PublicKey, "server_request") + telemetry.IncReconnect(context.Background(), wgData.PublicKey, "server", telemetry.ReasonServerRequest) } // Close the WireGuard device and TUN diff --git a/proxy/manager.go b/proxy/manager.go index 2fd731e..9cdcf43 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -53,6 +53,9 @@ type tunnelEntry struct { bytesOutTCP atomic.Uint64 bytesInUDP atomic.Uint64 bytesOutUDP atomic.Uint64 + + activeTCP atomic.Int64 + activeUDP atomic.Int64 } // countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set. 
@@ -256,6 +259,21 @@ func (pm *ProxyManager) RemoveTarget(proto, listenIP string, port int) error { // Start begins listening for all configured proxy targets func (pm *ProxyManager) Start() error { + // Register proxy observables once per process + telemetry.SetProxyObservableCallback(func(ctx context.Context, o metric.Observer) error { + pm.mutex.RLock() + defer pm.mutex.RUnlock() + for _, e := range pm.tunnels { + // active connections + telemetry.ObserveProxyActiveConnsObs(o, e.activeTCP.Load(), e.attrOutTCP.ToSlice()) + telemetry.ObserveProxyActiveConnsObs(o, e.activeUDP.Load(), e.attrOutUDP.ToSlice()) + // backlog bytes (sum of unflushed counters) + b := int64(e.bytesInTCP.Load()+e.bytesOutTCP.Load()+e.bytesInUDP.Load()+e.bytesOutUDP.Load()) + telemetry.ObserveProxyAsyncBacklogObs(o, b, e.attrOutTCP.ToSlice()) + telemetry.ObserveProxyBufferBytesObs(o, b, e.attrOutTCP.ToSlice()) + } + return nil + }) pm.mutex.Lock() defer pm.mutex.Unlock() @@ -462,6 +480,9 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) // Count sessions only once per accepted TCP connection if pm.currentTunnelID != "" { state.Global().IncSessions(pm.currentTunnelID) + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.activeTCP.Add(1) + } } go func() { @@ -500,6 +521,9 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) wg.Wait() if pm.currentTunnelID != "" { state.Global().DecSessions(pm.currentTunnelID) + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.activeTCP.Add(-1) + } } }() } @@ -567,7 +591,10 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { continue } - targetConn, err = net.DialUDP("udp", nil, targetUDPAddr) + targetConn, err = net.DialUDP("udp", nil, targetUDPAddr) + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.activeUDP.Add(1) + } if err != nil { logger.Error("Error connecting to target: %v", err) continue @@ -584,6 +611,9 @@ func (pm *ProxyManager) 
handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { if storedConn, exists := clientConns[clientKey]; exists && storedConn == targetConn { delete(clientConns, clientKey) targetConn.Close() + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.activeUDP.Add(-1) + } } clientsMutex.Unlock() }() @@ -612,20 +642,22 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { _, err = conn.WriteTo(buffer[:n], remoteAddr) if err != nil { logger.Error("Error writing to client: %v", err) + telemetry.IncProxyDrops(context.Background(), pm.currentTunnelID, "udp") return // defer will handle cleanup } } }(clientKey, targetConn, remoteAddr) } - written, err := targetConn.Write(buffer[:n]) - if err != nil { - logger.Error("Error writing to target: %v", err) - targetConn.Close() - clientsMutex.Lock() - delete(clientConns, clientKey) - clientsMutex.Unlock() - } else if pm.currentTunnelID != "" && written > 0 { + written, err := targetConn.Write(buffer[:n]) + if err != nil { + logger.Error("Error writing to target: %v", err) + telemetry.IncProxyDrops(context.Background(), pm.currentTunnelID, "udp") + targetConn.Close() + clientsMutex.Lock() + delete(clientConns, clientKey) + clientsMutex.Unlock() + } else if pm.currentTunnelID != "" && written > 0 { if pm.asyncBytes { if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(written)) diff --git a/util.go b/util.go index 25cdb9d..64bf24d 100644 --- a/util.go +++ b/util.go @@ -291,12 +291,12 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien // More lenient threshold for declaring connection lost under load failureThreshold := 4 if consecutiveFailures >= failureThreshold && currentInterval < maxInterval { - if !connectionLost { - connectionLost = true - logger.Warn("Connection to server lost after %d failures. 
Continuous reconnection attempts will be made.", consecutiveFailures) - if tunnelID != "" { - telemetry.IncReconnect(context.Background(), tunnelID, telemetry.ReasonTimeout) - } + if !connectionLost { + connectionLost = true + logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) + if tunnelID != "" { + telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout) + } stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second) // Send registration message to the server for backward compatibility err := client.SendMessage("newt/wg/register", map[string]interface{}{ diff --git a/websocket/client.go b/websocket/client.go index e38a6c9..1f4dc49 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -173,7 +173,11 @@ func (c *Client) SendMessage(messageType string, data interface{}) error { c.writeMux.Lock() defer c.writeMux.Unlock() - return c.conn.WriteJSON(msg) + if err := c.conn.WriteJSON(msg); err != nil { + return err + } + telemetry.IncWSMessage(context.Background(), "out", "text") + return nil } func (c *Client) SendMessageInterval(messageType string, data interface{}, interval time.Duration) (stop func()) { @@ -418,6 +422,7 @@ func (c *Client) establishConnection() error { spanCtx, span := tr.Start(context.Background(), "ws.connect") defer span.End() + start := time.Now() dialer := websocket.DefaultDialer // Use new TLS configuration method @@ -440,24 +445,32 @@ func (c *Client) establishConnection() error { } conn, _, err := dialer.DialContext(spanCtx, u.String(), nil) + lat := time.Since(start).Seconds() if err != nil { telemetry.IncConnAttempt(context.Background(), "websocket", "failure") etype := classifyConnError(err) telemetry.IncConnError(context.Background(), "websocket", etype) + telemetry.ObserveWSConnectLatency(context.Background(), lat, "failure", etype) // Map handshake-related errors to reconnect 
reasons where appropriate if etype == "tls" { - telemetry.IncReconnect(context.Background(), c.config.ID, telemetry.ReasonHandshakeError) + telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonHandshakeError) } else if etype == "timeout" { - telemetry.IncReconnect(context.Background(), c.config.ID, telemetry.ReasonTimeout) + telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonTimeout) } else { - telemetry.IncReconnect(context.Background(), c.config.ID, telemetry.ReasonError) + telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonError) } return fmt.Errorf("failed to connect to WebSocket: %w", err) } telemetry.IncConnAttempt(context.Background(), "websocket", "success") + telemetry.ObserveWSConnectLatency(context.Background(), lat, "success", "") c.conn = conn c.setConnected(true) + // Wire up pong handler for metrics + c.conn.SetPongHandler(func(appData string) error { + telemetry.IncWSMessage(context.Background(), "in", "pong") + return nil + }) // Start the ping monitor go c.pingMonitor() @@ -554,7 +567,10 @@ func (c *Client) pingMonitor() { return } c.writeMux.Lock() - err := c.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(c.pingTimeout)) + err := c.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(c.pingTimeout)) + if err == nil { + telemetry.IncWSMessage(context.Background(), "out", "ping") + } c.writeMux.Unlock() if err != nil { // Check if we're shutting down before logging error and reconnecting @@ -595,6 +611,9 @@ func (c *Client) readPumpWithDisconnectDetection() { default: var msg WSMessage err := c.conn.ReadJSON(&msg) + if err == nil { + telemetry.IncWSMessage(context.Background(), "in", "text") + } if err != nil { // Check if we're shutting down before logging error select { diff --git a/wg/wg.go b/wg/wg.go index 1607427..eccd64f 100644 --- a/wg/wg.go +++ b/wg/wg.go @@ -306,16 +306,24 @@ func (s *WireGuardService) 
handleConfig(msg websocket.WSMessage) { telemetry.IncConfigReload(context.Background(), "success") // Optional reconnect reason mapping: config change if s.serverPubKey != "" { - telemetry.IncReconnect(context.Background(), s.serverPubKey, telemetry.ReasonConfigChange) + telemetry.IncReconnect(context.Background(), s.serverPubKey, "client", telemetry.ReasonConfigChange) } // Ensure the WireGuard interface and peers are configured + start := time.Now() if err := s.ensureWireguardInterface(config); err != nil { logger.Error("Failed to ensure WireGuard interface: %v", err) + telemetry.ObserveConfigApply(context.Background(), "interface", "failure", time.Since(start).Seconds()) + } else { + telemetry.ObserveConfigApply(context.Background(), "interface", "success", time.Since(start).Seconds()) } + startPeers := time.Now() if err := s.ensureWireguardPeers(config.Peers); err != nil { logger.Error("Failed to ensure WireGuard peers: %v", err) + telemetry.ObserveConfigApply(context.Background(), "peer", "failure", time.Since(startPeers).Seconds()) + } else { + telemetry.ObserveConfigApply(context.Background(), "peer", "success", time.Since(startPeers).Seconds()) } } From 75d5e695d623fe292acceb586d2ecd6fb0052308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:32:39 +0200 Subject: [PATCH 38/72] fix: update IncReconnect for auth failures; import metric in proxy manager for observable callback --- proxy/manager.go | 1 + websocket/client.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/proxy/manager.go b/proxy/manager.go index 9cdcf43..eda7389 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -15,6 +15,7 @@ import ( "github.com/fosrl/newt/internal/telemetry" "github.com/fosrl/newt/logger" "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" "golang.zx2c4.com/wireguard/tun/netstack" "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet" ) diff --git a/websocket/client.go b/websocket/client.go index 
1f4dc49..5a7e91f 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -313,7 +313,7 @@ func (c *Client) getToken() (string, error) { telemetry.IncConnError(context.Background(), "auth", bin) // Reconnect reason mapping for auth failures if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { - telemetry.IncReconnect(context.Background(), c.config.ID, telemetry.ReasonAuthError) + telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonAuthError) } return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) } From 9ace45e71f388477d493eb87d9b9758d4a43290e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:43:53 +0200 Subject: [PATCH 39/72] fix(metrics): direction=ingress|egress for bytes; remove transport on tunnel_sessions; extend allow-list (msg_type, phase); add units for histograms and bytes; handle callback errors; normalize error_type taxonomy; HTTP error mapping to enums --- internal/telemetry/metrics.go | 33 ++++++++++++++++++++++---------- internal/telemetry/state_view.go | 1 - internal/telemetry/telemetry.go | 4 +++- proxy/manager.go | 8 ++++---- websocket/client.go | 28 ++++++++++++--------------- 5 files changed, 42 insertions(+), 32 deletions(-) diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index 2b332e4..ac17fb9 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -95,12 +95,14 @@ func registerInstruments() error { return } mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total", - metric.WithDescription("Tunnel bytes in/out")) + metric.WithDescription("Tunnel bytes ingress/egress"), + metric.WithUnit("By")) if err != nil { return } mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds", - metric.WithDescription("Per-tunnel latency in seconds")) + metric.WithDescription("Per-tunnel latency in seconds"), + 
metric.WithUnit("s")) if err != nil { return } @@ -128,7 +130,8 @@ func registerInstruments() error { mRestartCount, _ = meter.Int64Counter("newt_restart_count_total", metric.WithDescription("Process restart count (incremented on start)")) mConfigApply, _ = meter.Float64Histogram("newt_config_apply_seconds", - metric.WithDescription("Configuration apply duration in seconds")) + metric.WithDescription("Configuration apply duration in seconds"), + metric.WithUnit("s")) mCertRotationTotal, _ = meter.Int64Counter("newt_cert_rotation_total", metric.WithDescription("Certificate rotation events (success/failure)")) @@ -138,7 +141,8 @@ func registerInstruments() error { // WebSocket mWSConnectLatency, _ = meter.Float64Histogram("newt_websocket_connect_latency_seconds", - metric.WithDescription("WebSocket connect latency in seconds")) + metric.WithDescription("WebSocket connect latency in seconds"), + metric.WithUnit("s")) mWSMessages, _ = meter.Int64Counter("newt_websocket_messages_total", metric.WithDescription("WebSocket messages by direction and type")) @@ -146,14 +150,16 @@ func registerInstruments() error { mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections", metric.WithDescription("Proxy active connections per tunnel and protocol")) mProxyBufferBytes, _ = meter.Int64ObservableGauge("newt_proxy_buffer_bytes", - metric.WithDescription("Proxy buffer bytes (may approximate async backlog)")) + metric.WithDescription("Proxy buffer bytes (may approximate async backlog)"), + metric.WithUnit("By")) mProxyAsyncBacklogByte, _ = meter.Int64ObservableGauge("newt_proxy_async_backlog_bytes", - metric.WithDescription("Unflushed async byte backlog per tunnel and protocol")) + metric.WithDescription("Unflushed async byte backlog per tunnel and protocol"), + metric.WithUnit("By")) mProxyDropsTotal, _ = meter.Int64Counter("newt_proxy_drops_total", metric.WithDescription("Proxy drops due to write errors")) // Register a default callback for build info if 
version/commit set - meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + if e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { if buildVersion == "" && buildCommit == "" { return nil } @@ -167,7 +173,10 @@ func registerInstruments() error { attrs = append(attrs, siteAttrs()...) o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...)) return nil - }, mBuildInfo) + }, mBuildInfo); e != nil { + // forward to global OTel error handler; Init will continue but build_info will be missing + otel.Handle(e) + } }) return err } @@ -197,7 +206,9 @@ var ( // }) func SetObservableCallback(cb func(context.Context, metric.Observer) error) { obsOnce.Do(func() { - meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions) + if e := meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions); e != nil { + otel.Handle(e) + } obsStopper = func() { /* no-op; otel callbacks are unregistered when provider shuts down */ } }) } @@ -205,7 +216,9 @@ func SetObservableCallback(cb func(context.Context, metric.Observer) error) { // SetProxyObservableCallback registers a callback to observe proxy gauges. 
func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) { proxyObsOnce.Do(func() { - meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte) + if e := meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte); e != nil { + otel.Handle(e) + } proxyStopper = func() {} }) } diff --git a/internal/telemetry/state_view.go b/internal/telemetry/state_view.go index fe57dc3..8bb22e4 100644 --- a/internal/telemetry/state_view.go +++ b/internal/telemetry/state_view.go @@ -58,7 +58,6 @@ func RegisterStateView(v StateView) { o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes( attribute.String("site_id", getSiteID()), attribute.String("tunnel_id", tid), - attribute.String("transport", "wireguard"), )) } } diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 9b2a84c..7e3b819 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -195,7 +195,7 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) { AttributeFilter: func(kv attribute.KeyValue) bool { k := string(kv.Key) switch k { - case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "initiator", "error_type", "version", "commit", "site_id", "region": + case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "initiator", "error_type", "msg_type", "phase", "version", "commit", "site_id", "region": return true default: return false @@ -296,6 +296,8 @@ var siteIDVal atomic.Value var regionVal atomic.Value // UpdateSiteInfo updates the global site_id and region used for metric labels. +// Thread-safe via atomic.Value: subsequent metric emissions will include +// the new labels, prior emissions remain unchanged. 
func UpdateSiteInfo(siteID, region string) { if siteID != "" { siteIDVal.Store(siteID) diff --git a/proxy/manager.go b/proxy/manager.go index eda7389..cf15c66 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -119,22 +119,22 @@ func (pm *ProxyManager) SetTunnelID(id string) { site := telemetry.SiteLabelKVs() e.attrInTCP = attribute.NewSet(append(site, attribute.String("tunnel_id", id), - attribute.String("direction", "in"), + attribute.String("direction", "ingress"), attribute.String("protocol", "tcp"), )...) e.attrOutTCP = attribute.NewSet(append(site, attribute.String("tunnel_id", id), - attribute.String("direction", "out"), + attribute.String("direction", "egress"), attribute.String("protocol", "tcp"), )...) e.attrInUDP = attribute.NewSet(append(site, attribute.String("tunnel_id", id), - attribute.String("direction", "in"), + attribute.String("direction", "ingress"), attribute.String("protocol", "udp"), )...) e.attrOutUDP = attribute.NewSet(append(site, attribute.String("tunnel_id", id), - attribute.String("direction", "out"), + attribute.String("direction", "egress"), attribute.String("protocol", "udp"), )...) 
} diff --git a/websocket/client.go b/websocket/client.go index 5a7e91f..db9d810 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -304,13 +304,11 @@ func (c *Client) getToken() (string, error) { body, _ := io.ReadAll(resp.Body) logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) telemetry.IncConnAttempt(context.Background(), "auth", "failure") - bin := "http_other" - if resp.StatusCode >= 500 { - bin = "http_5xx" - } else if resp.StatusCode >= 400 { - bin = "http_4xx" + etype := "io_error" + if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { + etype = "auth_failed" } - telemetry.IncConnError(context.Background(), "auth", bin) + telemetry.IncConnError(context.Background(), "auth", etype) // Reconnect reason mapping for auth failures if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonAuthError) @@ -338,7 +336,8 @@ func (c *Client) getToken() (string, error) { return tokenResp.Data.Token, nil } -// classifyConnError maps common errors to low-cardinality error_type labels +// classifyConnError maps to fixed, low-cardinality error_type values. 
+// Allowed enum: dial_timeout, tls_handshake, auth_failed, io_error func classifyConnError(err error) string { if err == nil { return "" @@ -346,17 +345,14 @@ func classifyConnError(err error) string { msg := strings.ToLower(err.Error()) switch { case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"): - return "tls" - case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"): - return "timeout" - case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"): - return "dns" + return "tls_handshake" + case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout") || strings.Contains(msg, "deadline exceeded"): + return "dial_timeout" case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"): - return "auth" - case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"): - return "io" + return "auth_failed" default: - return "other" + // Group remaining network/socket errors as io_error to avoid label explosion + return "io_error" } } From 4a90e36a442afbb46fe3c027eae205d11bf02d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:46:01 +0200 Subject: [PATCH 40/72] docs+examples: document direction=ingress|egress, initiator and error_type enums; add cardinality relabel tips; provide Collector variants; add scripts/smoke-metrics.sh --- docs/observability.md | 26 ++++++++++++++++++---- examples/otel-collector.yaml | 39 +++++++++++++++++++++++---------- scripts/smoke-metrics.sh | 42 ++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 15 deletions(-) create mode 100644 scripts/smoke-metrics.sh diff --git a/docs/observability.md b/docs/observability.md index 1aa7a77..cf8de79 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -40,12 
+40,12 @@ Metric catalog (initial) - newt_site_registrations_total (counter) labels: result, site_id[, region] - newt_site_online (observable gauge) labels: site_id (0/1) - newt_site_last_heartbeat_seconds (observable gauge) labels: site_id -- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id, transport (transport e.g. wireguard) -- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (in|out) +- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id [transport optional when known] +- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (ingress|egress) - newt_tunnel_latency_seconds (histogram) labels: site_id, tunnel_id, transport (e.g., wireguard) -- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, reason +- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, initiator (client|server), reason - newt_connection_attempts_total (counter) labels: site_id, transport, result -- newt_connection_errors_total (counter) labels: site_id, transport, error_type +- newt_connection_errors_total (counter) labels: site_id, transport, error_type (dial_timeout|tls_handshake|auth_failed|io_error) Conventions @@ -171,6 +171,24 @@ Further reading - See docs/METRICS_RECOMMENDATIONS.md for roadmap, label guidance (transport vs protocol), and example alerts. +Cardinality tips + +- tunnel_id can grow in larger fleets. 
Use relabeling to drop or retain a subset, for example: + +``` +# Drop all tunnel_id on bytes to reduce series +- source_labels: [__name__] + regex: newt_tunnel_bytes_total + action: keep +- action: labeldrop + regex: tunnel_id + +# Or drop only high-churn tunnels +- source_labels: [tunnel_id] + regex: .* + action: drop +``` + Troubleshooting - curl :2112/metrics – ensure endpoint is reachable and includes newt_* metrics diff --git a/examples/otel-collector.yaml b/examples/otel-collector.yaml index b00cb67..408c6a6 100644 --- a/examples/otel-collector.yaml +++ b/examples/otel-collector.yaml @@ -1,3 +1,20 @@ +# Variant A: Direct scrape of Newt (/metrics) via Prometheus (no Collector needed) +# Note: Newt already exposes labels like site_id, protocol, direction. Do not promote +# resource attributes into labels when scraping Newt directly. +# +# Example Prometheus scrape config: +# global: +# scrape_interval: 15s +# scrape_configs: +# - job_name: newt +# static_configs: +# - targets: ["newt:2112"] +# +# Variant B: Use OTEL Collector (Newt -> OTLP -> Collector -> Prometheus) +# This pipeline scrapes metrics from the Collector's Prometheus exporter. +# Labels are already on datapoints; promotion from resource is OPTIONAL and typically NOT required. +# If you enable transform/promote below, ensure you do not duplicate labels. + receivers: otlp: protocols: @@ -13,20 +30,20 @@ processors: detectors: [env, system] timeout: 5s batch: {} - transform/promote: - # optional, damit fehlende Keys nicht die Pipeline abbrechen: - error_mode: ignore - metric_statements: - - context: datapoint - statements: - - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil - - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil + # OPTIONAL: Only enable if you need to promote resource attributes to labels. 
+ # WARNING: Newt already provides site_id as a label; avoid double-promotion. + # transform/promote: + # error_mode: ignore + # metric_statements: + # - context: datapoint + # statements: + # - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil + # - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil exporters: prometheus: endpoint: ":8889" send_timestamps: true - # Falls du kein Remote-Write-Ziel hast, kommentiere es aus: # prometheusremotewrite: # endpoint: http://mimir:9009/api/v1/push debug: @@ -36,8 +53,8 @@ service: pipelines: metrics: receivers: [otlp] - processors: [memory_limiter, resourcedetection, transform/promote, batch] - exporters: [prometheus] # , prometheusremotewrite + processors: [memory_limiter, resourcedetection, batch] # add transform/promote if you really need it + exporters: [prometheus] traces: receivers: [otlp] processors: [memory_limiter, resourcedetection, batch] diff --git a/scripts/smoke-metrics.sh b/scripts/smoke-metrics.sh new file mode 100644 index 0000000..e0eac32 --- /dev/null +++ b/scripts/smoke-metrics.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -euo pipefail + +NEWTHOST=${NEWTHOST:-localhost} +NEWTPORT=${NEWTPORT:-2112} +METRICS_URL="http://${NEWTHOST}:${NEWTPORT}/metrics" + +probe() { + local name=$1 + local pattern=$2 + echo "[probe] ${name}" + curl -sf "${METRICS_URL}" | grep -E "${pattern}" || { + echo "[warn] ${name} not found" + return 1 + } +} + +# Basic presence +probe "newt_* presence" "^newt_" || true + +# Site gauges with site_id +probe "site_online with site_id" "^newt_site_online\{.*site_id=\"[^\"]+\"" || true +probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_seconds\{.*site_id=\"[^\"]+\"" || true + +# Bytes with direction ingress/egress and protocol +probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || 
true +probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true + +# WebSocket metrics (when OTLP/WS used) +probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true +probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true + +# Proxy metrics (when proxy active) +probe "proxy active connections" "^newt_proxy_active_connections\{" || true +probe "proxy buffer bytes" "^newt_proxy_buffer_bytes\{" || true +probe "proxy drops total" "^newt_proxy_drops_total\{" || true + +# Config apply +probe "config apply seconds buckets" "^newt_config_apply_seconds_bucket\{" || true + +echo "Smoke checks completed (warnings above are acceptable if the feature isn't exercised yet)." + From f28d90595ba47cbb54c3b8a1515da7cd0c47400f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:46:41 +0200 Subject: [PATCH 41/72] fix(telemetry): adapt to RegisterCallback returning (Registration, error) --- internal/telemetry/metrics.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index ac17fb9..84c0f27 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -159,7 +159,7 @@ func registerInstruments() error { metric.WithDescription("Proxy drops due to write errors")) // Register a default callback for build info if version/commit set - if e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + if _, e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { if buildVersion == "" && buildCommit == "" { return nil } @@ -206,7 +206,7 @@ var ( // }) func SetObservableCallback(cb func(context.Context, metric.Observer) error) { obsOnce.Do(func() { - if e := meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions); e != nil { + if _, e := 
meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions); e != nil { otel.Handle(e) } obsStopper = func() { /* no-op; otel callbacks are unregistered when provider shuts down */ } @@ -216,7 +216,7 @@ func SetObservableCallback(cb func(context.Context, metric.Observer) error) { // SetProxyObservableCallback registers a callback to observe proxy gauges. func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) { proxyObsOnce.Do(func() { - if e := meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte); e != nil { + if _, e := meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte); e != nil { otel.Handle(e) } proxyStopper = func() {} From b20f7a02b2fe861d27111f8c7716a225866ee025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:53:40 +0200 Subject: [PATCH 42/72] feat(metrics): NEWT_METRICS_INCLUDE_TUNNEL_ID toggle; conditionally drop tunnel_id across bytes/sessions/proxy/reconnect; docs and smoke test updated; examples/prometheus.yml with relabels; docker-compose defaults avoid double-scrape --- docker-compose.metrics.yml | 32 ++++++++++++++++---------------- docs/observability.md | 5 +++-- examples/prometheus.yml | 21 ++++++++++++++++----- internal/telemetry/metrics.go | 27 ++++++++++++++++++--------- internal/telemetry/telemetry.go | 15 +++++++++++++++ proxy/manager.go | 27 +++++++++++++++------------ scripts/smoke-metrics.sh | 10 ++++++++++ 7 files changed, 93 insertions(+), 44 deletions(-) diff --git a/docker-compose.metrics.yml b/docker-compose.metrics.yml index e3eb21d..1dcb633 100644 --- a/docker-compose.metrics.yml +++ b/docker-compose.metrics.yml @@ -1,25 +1,17 @@ services: - collector: - image: otel/opentelemetry-collector-contrib:0.136.0 - command: ["--config=/etc/otelcol/config.yaml"] - volumes: - - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro - ports: - - "4317:4317" # OTLP gRPC in - - 
"8889:8889" # Prometheus scrape out + # Recommended Variant A: Direct Prometheus scrape of Newt (/metrics) + # Optional: You may add the Collector service and enable OTLP export, but do NOT + # scrape both Newt and the Collector for the same process. newt: build: . image: newt:dev - env_file: - - .env + env_file: + - .env environment: OTEL_SERVICE_NAME: newt NEWT_METRICS_PROMETHEUS_ENABLED: "true" - NEWT_METRICS_OTLP_ENABLED: "true" - OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317" - OTEL_EXPORTER_OTLP_INSECURE: "true" - OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative" + NEWT_METRICS_OTLP_ENABLED: "false" # avoid double-scrape by default NEWT_ADMIN_ADDR: ":2112" # Base NEWT configuration PANGOLIN_ENDPOINT: ${PANGOLIN_ENDPOINT} @@ -28,8 +20,16 @@ services: LOG_LEVEL: "DEBUG" ports: - "2112:2112" - depends_on: - - collector + + # Optional Variant B: Enable the Collector and switch Prometheus scrape to it. + # collector: + # image: otel/opentelemetry-collector-contrib:0.136.0 + # command: ["--config=/etc/otelcol/config.yaml"] + # volumes: + # - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro + # ports: + # - "4317:4317" # OTLP gRPC in + # - "8889:8889" # Prometheus scrape out prometheus: image: prom/prometheus:v3.6.0 diff --git a/docs/observability.md b/docs/observability.md index cf8de79..bae5fb7 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -49,8 +49,8 @@ Metric catalog (initial) Conventions -- Durations in seconds, names end with _seconds -- Sizes in bytes, names end with _bytes +- Durations in seconds (unit: s), names end with _seconds +- Sizes in bytes (unit: By), names end with _bytes - Counters end with _total - Labels must be low-cardinality and stable @@ -163,6 +163,7 @@ Compatibility notes - Gauges do not use the _total suffix (e.g., newt_tunnel_sessions). - site_id is emitted as both resource attribute and metric label on all newt_* series; region is included as a metric label only when set. 
tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels. +- NEWT_METRICS_INCLUDE_TUNNEL_ID (default: true) toggles whether tunnel_id is included as a label on bytes/sessions/proxy/reconnect metrics. Disable in high-cardinality environments. +- Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both. +- Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write. +- No free text in labels; use only the enumerated constants for reason, protocol (tcp|udp), and transport (e.g., websocket|wireguard). diff --git a/examples/prometheus.yml b/examples/prometheus.yml index c3018a5..89e82b4 100644 --- a/examples/prometheus.yml +++ b/examples/prometheus.yml @@ -2,9 +2,20 @@ global: scrape_interval: 15s scrape_configs: - - job_name: newt + - job_name: 'newt' + scrape_interval: 15s static_configs: - - targets: ["newt:2112"] - - job_name: otel-collector - static_configs: - - targets: ["collector:8889"] + - targets: ['newt:2112'] # /metrics + metric_relabel_configs: + # optional: drop the tunnel_id metric label + - action: labeldrop + regex: 'tunnel_id' + # optional: keep only specific sites + - action: keep + source_labels: [site_id] + regex: '(site-a|site-b)' + + # WARNING: Do not enable this together with the 'newt' job above or you will double-count.
+ # - job_name: 'otel-collector' + # static_configs: + # - targets: ['collector:8889'] diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index 84c0f27..2571379 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -248,10 +248,13 @@ func IncSiteRegistration(ctx context.Context, result string) { } func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) { - mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite( - attribute.String("tunnel_id", tunnelID), + attrs := []attribute.KeyValue{ attribute.String("direction", direction), - )...)) + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(attrs...)...)) } // AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations. @@ -316,18 +319,24 @@ func IncCertRotation(ctx context.Context, result string) { } func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) { - mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite( - attribute.String("tunnel_id", tunnelID), + attrs := []attribute.KeyValue{ attribute.String("transport", transport), - )...)) + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...)) } func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) { - mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite( - attribute.String("tunnel_id", tunnelID), + attrs := []attribute.KeyValue{ attribute.String("initiator", initiator), attribute.String("reason", reason), - )...)) + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mReconnects.Add(ctx, 1, 
metric.WithAttributes(attrsWithSite(attrs...)...)) } func IncConnAttempt(ctx context.Context, transport, result string) { diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 7e3b819..8eb0927 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -112,6 +112,12 @@ type Setup struct { // installs recommended histogram views for *_latency_seconds, and returns a Setup with // a Shutdown method to flush exporters. func Init(ctx context.Context, cfg Config) (*Setup, error) { + // Configure tunnel_id label inclusion from env (default true) + if getenv("NEWT_METRICS_INCLUDE_TUNNEL_ID", "true") == "true" { + includeTunnelIDVal.Store(true) + } else { + includeTunnelIDVal.Store(false) + } // Build resource with required attributes and only include optional ones when non-empty attrs := []attribute.KeyValue{ semconv.ServiceName(cfg.ServiceName), @@ -294,6 +300,7 @@ func parseResourceAttributes(s string) map[string]string { // Global site/region used to enrich metric labels. var siteIDVal atomic.Value var regionVal atomic.Value +var includeTunnelIDVal atomic.Value // bool; default true // UpdateSiteInfo updates the global site_id and region used for metric labels. // Thread-safe via atomic.Value: subsequent metric emissions will include @@ -336,6 +343,14 @@ func siteAttrs() []attribute.KeyValue { // SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager). func SiteLabelKVs() []attribute.KeyValue { return siteAttrs() } +// ShouldIncludeTunnelID returns whether tunnel_id labels should be emitted. 
+func ShouldIncludeTunnelID() bool { + if v, ok := includeTunnelIDVal.Load().(bool); ok { + return v + } + return true +} + func getenv(k, d string) string { if v := os.Getenv(k); v != "" { return v diff --git a/proxy/manager.go b/proxy/manager.go index cf15c66..3052f56 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -117,26 +117,29 @@ func (pm *ProxyManager) SetTunnelID(id string) { e := pm.tunnels[id] // include site labels if available site := telemetry.SiteLabelKVs() - e.attrInTCP = attribute.NewSet(append(site, - attribute.String("tunnel_id", id), + build := func(base []attribute.KeyValue) attribute.Set { + if telemetry.ShouldIncludeTunnelID() { + base = append([]attribute.KeyValue{attribute.String("tunnel_id", id)}, base...) + } + base = append(site, base...) + return attribute.NewSet(base...) + } + e.attrInTCP = build([]attribute.KeyValue{ attribute.String("direction", "ingress"), attribute.String("protocol", "tcp"), - )...) - e.attrOutTCP = attribute.NewSet(append(site, - attribute.String("tunnel_id", id), + }) + e.attrOutTCP = build([]attribute.KeyValue{ attribute.String("direction", "egress"), attribute.String("protocol", "tcp"), - )...) - e.attrInUDP = attribute.NewSet(append(site, - attribute.String("tunnel_id", id), + }) + e.attrInUDP = build([]attribute.KeyValue{ attribute.String("direction", "ingress"), attribute.String("protocol", "udp"), - )...) - e.attrOutUDP = attribute.NewSet(append(site, - attribute.String("tunnel_id", id), + }) + e.attrOutUDP = build([]attribute.KeyValue{ attribute.String("direction", "egress"), attribute.String("protocol", "udp"), - )...) + }) } // ClearTunnelID clears cached attribute sets for the current tunnel. 
diff --git a/scripts/smoke-metrics.sh b/scripts/smoke-metrics.sh index e0eac32..d2eb11f 100644 --- a/scripts/smoke-metrics.sh +++ b/scripts/smoke-metrics.sh @@ -26,6 +26,16 @@ probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_seconds\{.*site_i probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true probe "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\"" || true +# Optional: verify absence/presence of tunnel_id based on EXPECT_TUNNEL_ID (default true) +EXPECT_TUNNEL_ID=${EXPECT_TUNNEL_ID:-true} +if [ "$EXPECT_TUNNEL_ID" = "false" ]; then + echo "[probe] ensure tunnel_id label is absent when NEWT_METRICS_INCLUDE_TUNNEL_ID=false" + ! curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[fail] tunnel_id present but EXPECT_TUNNEL_ID=false"; exit 1; } +else + echo "[probe] ensure tunnel_id label is present (default)" + curl -sf "${METRICS_URL}" | grep -q "tunnel_id=\"" || { echo "[warn] tunnel_id not found (may be expected if no tunnel is active)"; } +fi + # WebSocket metrics (when OTLP/WS used) probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true From aea80200e002a31e60ba94da551411804b2d8db3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 00:58:30 +0200 Subject: [PATCH 43/72] docs: add Quickstart in observability; examples: add docker-compose.metrics.collector.yml and prometheus.with-collector.yml (collector-only scrape) --- docker-compose.metrics.collector.yml | 27 ++++++++++++++++++++++ docs/observability.md | 32 ++++++++++++++++++++++++++ examples/prometheus.with-collector.yml | 17 ++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 docker-compose.metrics.collector.yml create mode 100644 examples/prometheus.with-collector.yml diff --git 
a/docker-compose.metrics.collector.yml b/docker-compose.metrics.collector.yml new file mode 100644 index 0000000..e06c1eb --- /dev/null +++ b/docker-compose.metrics.collector.yml @@ -0,0 +1,27 @@ +services: + newt: + image: your/newt:latest + environment: + - NEWT_METRICS_PROMETHEUS_ENABLED=false # wichtig: direkte /metrics-Erfassung aus + - NEWT_METRICS_OTLP_ENABLED=true # OTLP an den Collector + # optional: + # - NEWT_METRICS_INCLUDE_TUNNEL_ID=false + # Falls Newt selbst Ports exponiert, hier NICHT 2112 mappen + # ports: [] + + otel-collector: + image: otel/opentelemetry-collector-contrib:latest + command: ["--config=/etc/otelcol/config.yaml"] + volumes: + - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC + - "8889:8889" # Prometheus Exporter (wird von Prometheus gescraped) + + prometheus: + image: prom/prometheus:latest + volumes: + - ./examples/prometheus.with-collector.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "9090:9090" + diff --git a/docs/observability.md b/docs/observability.md index bae5fb7..e77e2fd 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -190,6 +190,38 @@ Cardinality tips action: drop ``` +Quickstart: direkte Prometheus-Erfassung (empfohlen) + +``` +# Start (direkter /metrics-Scrape, keine Doppel-Erfassung) +docker compose -f docker-compose.metrics.yml up -d + +# Smoke-Checks +./scripts/smoke-metrics.sh +# Tunnel-IDs ausblenden (optional): +# EXPECT_TUNNEL_ID=false NEWT_METRICS_INCLUDE_TUNNEL_ID=false ./scripts/smoke-metrics.sh +``` + +- Prometheus UI: http://localhost:9090 +- Standard-Scrape-Intervall: 15s +- Kein OTLP aktiv (NEWT_METRICS_OTLP_ENABLED=false in docker-compose.metrics.yml) + +Häufige PromQL-Schnelltests + +``` +# Online-Status einer Site in den letzten 5 Minuten +max_over_time(newt_site_online{site_id="$site"}[5m]) + +# TCP egress-Bytes pro Site/Tunnel (10m) +sum by (site_id, tunnel_id) 
(increase(newt_tunnel_bytes_total{protocol="tcp",direction="egress"}[10m])) + +# WebSocket connect P95 +histogram_quantile(0.95, sum by (le, site_id) (rate(newt_websocket_connect_latency_seconds_bucket[5m]))) + +# Reconnects by initiator +sum by (initiator, reason) (increase(newt_tunnel_reconnects_total{site_id="$site"}[30m])) +``` + Troubleshooting - curl :2112/metrics – ensure endpoint is reachable and includes newt_* metrics diff --git a/examples/prometheus.with-collector.yml b/examples/prometheus.with-collector.yml new file mode 100644 index 0000000..829730d --- /dev/null +++ b/examples/prometheus.with-collector.yml @@ -0,0 +1,17 @@ +global: + scrape_interval: 15s + +scrape_configs: + # IMPORTANT: do NOT scrape Newt directly, only the Collector! + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:8889'] + + # optional: limit cardinality + metric_relabel_configs: + - action: labeldrop + regex: 'tunnel_id' + # - action: keep + # source_labels: [site_id] + # regex: '(site-a|site-b)' + From 744a74155610529e339b60d5d59a8217524bbd01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 01:01:33 +0200 Subject: [PATCH 44/72] docs(README): add Observability Quickstart section and link to docs/observability.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 42b306e..578fe3f 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Newt is a fully user space [WireGuard](https://www.wireguard.com/) tunnel client Newt is used with Pangolin and Gerbil as part of the larger system.
See documentation below: - [Full Documentation](https://docs.fossorial.io) +- Observability Quickstart: see docs/observability.md (Prometheus/OTel Collector setup, smoke tests) ## Preview @@ -107,6 +108,13 @@ Default locations: - **Windows**: `%PROGRAMDATA%\newt\newt-client\config.json` - **Linux/Others**: `~/.config/newt-client/config.json` +## Observability Quickstart + +For a quick start with Prometheus scraping and smoke checks, read the step-by-step guide in docs/observability.md. It includes: +- docker-compose.metrics.yml for direct /metrics scraping (recommended) +- docker-compose.metrics.collector.yml for the OTLP → Collector → Prometheus exporter path (no double-scrape) +- scripts/smoke-metrics.sh for basic verification + ## Examples **Note**: When both environment variables and CLI arguments are provided, CLI arguments take precedence. From ee2f8899ff545abba1f6e37fc9ed240f9311ff0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 01:06:13 +0200 Subject: [PATCH 45/72] refactor(telemetry): reduce cognitive complexity by splitting registerInstruments and Init; add unregister stoppers; extract state_view helpers --- internal/telemetry/metrics.go | 223 ++++++++++++++++--------------- internal/telemetry/state_view.go | 60 +++++---- internal/telemetry/telemetry.go | 184 ++++++++++--------------- 3 files changed, 224 insertions(+), 243 deletions(-) diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index 2571379..a8cecd3 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -70,117 +70,118 @@ func registerInstruments() error { var err error initOnce.Do(func() { meter = otel.Meter("newt") - - // Site / Registration - mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total", - metric.WithDescription("Total site registration attempts")) - if err != nil { - return - } - mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online", - 
metric.WithDescription("Site online (0/1)")) - if err != nil { - return - } - mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_seconds", - metric.WithDescription("Seconds since last site heartbeat")) - if err != nil { - return - } - - // Tunnel / Sessions - mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions", - metric.WithDescription("Active tunnel sessions")) - if err != nil { - return - } - mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total", - metric.WithDescription("Tunnel bytes ingress/egress"), - metric.WithUnit("By")) - if err != nil { - return - } - mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds", - metric.WithDescription("Per-tunnel latency in seconds"), - metric.WithUnit("s")) - if err != nil { - return - } - mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total", - metric.WithDescription("Tunnel reconnect events")) - if err != nil { - return - } - - // Connection / NAT - mConnAttempts, err = meter.Int64Counter("newt_connection_attempts_total", - metric.WithDescription("Connection attempts")) - if err != nil { - return - } - mConnErrors, err = meter.Int64Counter("newt_connection_errors_total", - metric.WithDescription("Connection errors by type")) - if err != nil { - return - } - - // Config/Restart - mConfigReloads, _ = meter.Int64Counter("newt_config_reloads_total", - metric.WithDescription("Configuration reloads")) - mRestartCount, _ = meter.Int64Counter("newt_restart_count_total", - metric.WithDescription("Process restart count (incremented on start)")) - mConfigApply, _ = meter.Float64Histogram("newt_config_apply_seconds", - metric.WithDescription("Configuration apply duration in seconds"), - metric.WithUnit("s")) - mCertRotationTotal, _ = meter.Int64Counter("newt_cert_rotation_total", - metric.WithDescription("Certificate rotation events (success/failure)")) - - // Build info gauge (value 1 with version/commit attributes) - mBuildInfo, _ = 
meter.Int64ObservableGauge("newt_build_info", - metric.WithDescription("Newt build information (value is always 1)")) - - // WebSocket - mWSConnectLatency, _ = meter.Float64Histogram("newt_websocket_connect_latency_seconds", - metric.WithDescription("WebSocket connect latency in seconds"), - metric.WithUnit("s")) - mWSMessages, _ = meter.Int64Counter("newt_websocket_messages_total", - metric.WithDescription("WebSocket messages by direction and type")) - - // Proxy - mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections", - metric.WithDescription("Proxy active connections per tunnel and protocol")) - mProxyBufferBytes, _ = meter.Int64ObservableGauge("newt_proxy_buffer_bytes", - metric.WithDescription("Proxy buffer bytes (may approximate async backlog)"), - metric.WithUnit("By")) - mProxyAsyncBacklogByte, _ = meter.Int64ObservableGauge("newt_proxy_async_backlog_bytes", - metric.WithDescription("Unflushed async byte backlog per tunnel and protocol"), - metric.WithUnit("By")) - mProxyDropsTotal, _ = meter.Int64Counter("newt_proxy_drops_total", - metric.WithDescription("Proxy drops due to write errors")) - - // Register a default callback for build info if version/commit set - if _, e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { - if buildVersion == "" && buildCommit == "" { - return nil - } - attrs := []attribute.KeyValue{} - if buildVersion != "" { - attrs = append(attrs, attribute.String("version", buildVersion)) - } - if buildCommit != "" { - attrs = append(attrs, attribute.String("commit", buildCommit)) - } - attrs = append(attrs, siteAttrs()...) 
- o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...)) - return nil - }, mBuildInfo); e != nil { - // forward to global OTel error handler; Init will continue but build_info will be missing - otel.Handle(e) - } + if e := registerSiteInstruments(); e != nil { err = e; return } + if e := registerTunnelInstruments(); e != nil { err = e; return } + if e := registerConnInstruments(); e != nil { err = e; return } + if e := registerConfigInstruments(); e != nil { err = e; return } + if e := registerBuildWSProxyInstruments(); e != nil { err = e; return } }) return err } +func registerSiteInstruments() error { + var err error + mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total", + metric.WithDescription("Total site registration attempts")) + if err != nil { return err } + mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online", + metric.WithDescription("Site online (0/1)")) + if err != nil { return err } + mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_seconds", + metric.WithDescription("Seconds since last site heartbeat")) + if err != nil { return err } + return nil +} + +func registerTunnelInstruments() error { + var err error + mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions", + metric.WithDescription("Active tunnel sessions")) + if err != nil { return err } + mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total", + metric.WithDescription("Tunnel bytes ingress/egress"), + metric.WithUnit("By")) + if err != nil { return err } + mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds", + metric.WithDescription("Per-tunnel latency in seconds"), + metric.WithUnit("s")) + if err != nil { return err } + mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total", + metric.WithDescription("Tunnel reconnect events")) + if err != nil { return err } + return nil +} + +func registerConnInstruments() error { + var err error + mConnAttempts, err 
= meter.Int64Counter("newt_connection_attempts_total", + metric.WithDescription("Connection attempts")) + if err != nil { return err } + mConnErrors, err = meter.Int64Counter("newt_connection_errors_total", + metric.WithDescription("Connection errors by type")) + if err != nil { return err } + return nil +} + +func registerConfigInstruments() error { + mConfigReloads, _ = meter.Int64Counter("newt_config_reloads_total", + metric.WithDescription("Configuration reloads")) + mRestartCount, _ = meter.Int64Counter("newt_restart_count_total", + metric.WithDescription("Process restart count (incremented on start)")) + mConfigApply, _ = meter.Float64Histogram("newt_config_apply_seconds", + metric.WithDescription("Configuration apply duration in seconds"), + metric.WithUnit("s")) + mCertRotationTotal, _ = meter.Int64Counter("newt_cert_rotation_total", + metric.WithDescription("Certificate rotation events (success/failure)")) + return nil +} + +func registerBuildWSProxyInstruments() error { + // Build info gauge (value 1 with version/commit attributes) + mBuildInfo, _ = meter.Int64ObservableGauge("newt_build_info", + metric.WithDescription("Newt build information (value is always 1)")) + // WebSocket + mWSConnectLatency, _ = meter.Float64Histogram("newt_websocket_connect_latency_seconds", + metric.WithDescription("WebSocket connect latency in seconds"), + metric.WithUnit("s")) + mWSMessages, _ = meter.Int64Counter("newt_websocket_messages_total", + metric.WithDescription("WebSocket messages by direction and type")) + // Proxy + mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections", + metric.WithDescription("Proxy active connections per tunnel and protocol")) + mProxyBufferBytes, _ = meter.Int64ObservableGauge("newt_proxy_buffer_bytes", + metric.WithDescription("Proxy buffer bytes (may approximate async backlog)"), + metric.WithUnit("By")) + mProxyAsyncBacklogByte, _ = meter.Int64ObservableGauge("newt_proxy_async_backlog_bytes", + 
metric.WithDescription("Unflushed async byte backlog per tunnel and protocol"), + metric.WithUnit("By")) + mProxyDropsTotal, _ = meter.Int64Counter("newt_proxy_drops_total", + metric.WithDescription("Proxy drops due to write errors")) + // Register a default callback for build info if version/commit set + reg, e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + if buildVersion == "" && buildCommit == "" { + return nil + } + attrs := []attribute.KeyValue{} + if buildVersion != "" { + attrs = append(attrs, attribute.String("version", buildVersion)) + } + if buildCommit != "" { + attrs = append(attrs, attribute.String("commit", buildCommit)) + } + attrs = append(attrs, siteAttrs()...) + o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...)) + return nil + }, mBuildInfo) + if e != nil { + otel.Handle(e) + } else { + // Provide a functional stopper that unregisters the callback + obsStopper = func() { _ = reg.Unregister(context.Background()) } + } + return nil +} + // Observable registration: Newt can register a callback to report gauges. // Call SetObservableCallback once to start observing online status, last // heartbeat seconds, and active sessions. @@ -216,10 +217,14 @@ func SetObservableCallback(cb func(context.Context, metric.Observer) error) { // SetProxyObservableCallback registers a callback to observe proxy gauges. 
func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) { proxyObsOnce.Do(func() { - if _, e := meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte); e != nil { + reg, e := meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte) + if e != nil { otel.Handle(e) + proxyStopper = func() {} + return } - proxyStopper = func() {} + // Provide a functional stopper to unregister later if needed + proxyStopper = func() { _ = reg.Unregister(context.Background()) } }) } diff --git a/internal/telemetry/state_view.go b/internal/telemetry/state_view.go index 8bb22e4..275217c 100644 --- a/internal/telemetry/state_view.go +++ b/internal/telemetry/state_view.go @@ -37,30 +37,9 @@ func RegisterStateView(v StateView) { if any := stateView.Load(); any != nil { if sv, ok := any.(StateView); ok { for _, siteID := range sv.ListSites() { - if online, ok := sv.Online(siteID); ok { - val := int64(0) - if online { - val = 1 - } - o.ObserveInt64(mSiteOnline, val, metric.WithAttributes( - attribute.String("site_id", getSiteID()), - )) - } - if t, ok := sv.LastHeartbeat(siteID); ok { - secs := time.Since(t).Seconds() - o.ObserveFloat64(mSiteLastHeartbeat, secs, metric.WithAttributes( - attribute.String("site_id", getSiteID()), - )) - } - // If the view supports per-tunnel sessions, report them labeled by tunnel_id. 
- if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok { - for tid, n := range tm.SessionsByTunnel() { - o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes( - attribute.String("site_id", getSiteID()), - attribute.String("tunnel_id", tid), - )) - } - } + observeSiteOnlineFor(o, sv, siteID) + observeLastHeartbeatFor(o, sv, siteID) + observeSessionsFor(o, any) } } } @@ -68,3 +47,36 @@ func RegisterStateView(v StateView) { }) } } + +func observeSiteOnlineFor(o metric.Observer, sv StateView, siteID string) { + if online, ok := sv.Online(siteID); ok { + val := int64(0) + if online { val = 1 } + o.ObserveInt64(mSiteOnline, val, metric.WithAttributes( + attribute.String("site_id", getSiteID()), + )) + } +} + +func observeLastHeartbeatFor(o metric.Observer, sv StateView, siteID string) { + if t, ok := sv.LastHeartbeat(siteID); ok { + secs := time.Since(t).Seconds() + o.ObserveFloat64(mSiteLastHeartbeat, secs, metric.WithAttributes( + attribute.String("site_id", getSiteID()), + )) + } +} + +func observeSessionsFor(o metric.Observer, any interface{}) { + if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok { + for tid, n := range tm.SessionsByTunnel() { + attrs := []attribute.KeyValue{ + attribute.String("site_id", getSiteID()), + } + if ShouldIncludeTunnelID() && tid != "" { + attrs = append(attrs, attribute.String("tunnel_id", tid)) + } + o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attrs...)) + } + } +} diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 8eb0927..c064f03 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -118,143 +118,107 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) { } else { includeTunnelIDVal.Store(false) } - // Build resource with required attributes and only include optional ones when non-empty + res := buildResource(ctx, cfg) + UpdateSiteInfo(cfg.SiteID, cfg.Region) + + s := &Setup{} + readers, promHandler, 
shutdowns, err := setupMetricExport(ctx, cfg, res) + if err != nil { return nil, err } + s.PrometheusHandler = promHandler + // Build provider + mp := buildMeterProvider(res, readers) + otel.SetMeterProvider(mp) + s.MeterProvider = mp + s.shutdowns = append(s.shutdowns, mp.Shutdown) + // Optional tracing + if cfg.OTLPEnabled { + if tp, exp := setupTracing(ctx, cfg, res); tp != nil { + otel.SetTracerProvider(tp) + s.TracerProvider = tp + s.shutdowns = append(s.shutdowns, func(c context.Context) error { + return errors.Join(exp.Shutdown(c), tp.Shutdown(c)) + }) + } + } + // Add metric exporter shutdowns + s.shutdowns = append(s.shutdowns, shutdowns...) + // Runtime metrics + _ = runtime.Start(runtime.WithMeterProvider(mp)) + // Instruments + if err := registerInstruments(); err != nil { return nil, err } + if cfg.BuildVersion != "" || cfg.BuildCommit != "" { RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit) } + return s, nil +} + +func buildResource(ctx context.Context, cfg Config) *resource.Resource { attrs := []attribute.KeyValue{ semconv.ServiceName(cfg.ServiceName), semconv.ServiceVersion(cfg.ServiceVersion), } - if cfg.SiteID != "" { - attrs = append(attrs, attribute.String("site_id", cfg.SiteID)) - } - if cfg.Region != "" { - attrs = append(attrs, attribute.String("region", cfg.Region)) - } - res, _ := resource.New(ctx, - resource.WithFromEnv(), - resource.WithHost(), - resource.WithAttributes(attrs...), - ) + if cfg.SiteID != "" { attrs = append(attrs, attribute.String("site_id", cfg.SiteID)) } + if cfg.Region != "" { attrs = append(attrs, attribute.String("region", cfg.Region)) } + res, _ := resource.New(ctx, resource.WithFromEnv(), resource.WithHost(), resource.WithAttributes(attrs...)) + return res +} - // Seed global site/region for label propagation - UpdateSiteInfo(cfg.SiteID, cfg.Region) - - s := &Setup{} - - // Build metric readers/exporters +func setupMetricExport(ctx context.Context, cfg Config, res *resource.Resource) ([]sdkmetric.Reader, 
http.Handler, []func(context.Context) error, error) { var readers []sdkmetric.Reader - - // Prometheus exporter exposes a native /metrics handler for scraping + var shutdowns []func(context.Context) error + var promHandler http.Handler if cfg.PromEnabled { reg := promclient.NewRegistry() exp, err := prometheus.New(prometheus.WithRegisterer(reg)) - if err != nil { - return nil, err - } + if err != nil { return nil, nil, nil, err } readers = append(readers, exp) - s.PrometheusHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{}) + promHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{}) } - - // Optional OTLP metric exporter (gRPC) if cfg.OTLPEnabled { mopts := []otlpmetricgrpc.Option{otlpmetricgrpc.WithEndpoint(cfg.OTLPEndpoint)} - // Headers support via OTEL_EXPORTER_OTLP_HEADERS (k=v,k2=v2) - if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { - mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs)) - } - if cfg.OTLPInsecure { - mopts = append(mopts, otlpmetricgrpc.WithInsecure()) - } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { - creds, cerr := credentials.NewClientTLSFromFile(certFile, "") - if cerr == nil { - mopts = append(mopts, otlpmetricgrpc.WithTLSCredentials(creds)) - } + if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs)) } + if cfg.OTLPInsecure { mopts = append(mopts, otlpmetricgrpc.WithInsecure()) } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { + if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil { mopts = append(mopts, otlpmetricgrpc.WithTLSCredentials(creds)) } } mexp, err := otlpmetricgrpc.New(ctx, mopts...) 
- if err != nil { - return nil, err - } + if err != nil { return nil, nil, nil, err } readers = append(readers, sdkmetric.NewPeriodicReader(mexp, sdkmetric.WithInterval(cfg.MetricExportInterval))) - s.shutdowns = append(s.shutdowns, mexp.Shutdown) + shutdowns = append(shutdowns, mexp.Shutdown) } + return readers, promHandler, shutdowns, nil +} - // Build provider options iteratively (WithReader is not variadic) +func buildMeterProvider(res *resource.Resource, readers []sdkmetric.Reader) *sdkmetric.MeterProvider { var mpOpts []sdkmetric.Option mpOpts = append(mpOpts, sdkmetric.WithResource(res)) - for _, r := range readers { - mpOpts = append(mpOpts, sdkmetric.WithReader(r)) - } - // Default view for latency histograms in seconds. + for _, r := range readers { mpOpts = append(mpOpts, sdkmetric.WithReader(r)) } mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView( - sdkmetric.Instrument{ - Name: "newt_*_latency_seconds", - }, - sdkmetric.Stream{ - Aggregation: sdkmetric.AggregationExplicitBucketHistogram{ - Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}, - }, - }, + sdkmetric.Instrument{Name: "newt_*_latency_seconds"}, + sdkmetric.Stream{Aggregation: sdkmetric.AggregationExplicitBucketHistogram{Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}}}, ))) -// Attribute whitelist: only allow expected low-cardinality keys on newt_* instruments. mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView( sdkmetric.Instrument{Name: "newt_*"}, - sdkmetric.Stream{ - AttributeFilter: func(kv attribute.KeyValue) bool { - k := string(kv.Key) - switch k { - case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "initiator", "error_type", "msg_type", "phase", "version", "commit", "site_id", "region": - return true - default: - return false - } - }, - }, - ))) - mp := sdkmetric.NewMeterProvider(mpOpts...) 
- otel.SetMeterProvider(mp) - s.MeterProvider = mp - s.shutdowns = append(s.shutdowns, mp.Shutdown) - - // Optional tracing (OTLP over gRPC) - if cfg.OTLPEnabled { - topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)} - if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { - topts = append(topts, otlptracegrpc.WithHeaders(hdrs)) - } - if cfg.OTLPInsecure { - topts = append(topts, otlptracegrpc.WithInsecure()) - } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { - creds, cerr := credentials.NewClientTLSFromFile(certFile, "") - if cerr == nil { - topts = append(topts, otlptracegrpc.WithTLSCredentials(creds)) + sdkmetric.Stream{AttributeFilter: func(kv attribute.KeyValue) bool { + k := string(kv.Key) + switch k { + case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "initiator", "error_type", "msg_type", "phase", "version", "commit", "site_id", "region": + return true + default: + return false } - } - exp, err := otlptracegrpc.New(ctx, topts...) - if err == nil { - tp := sdktrace.NewTracerProvider( - sdktrace.WithBatcher(exp), - sdktrace.WithResource(res), - ) - otel.SetTracerProvider(tp) - s.TracerProvider = tp - s.shutdowns = append(s.shutdowns, func(ctx context.Context) error { - return errors.Join(exp.Shutdown(ctx), tp.Shutdown(ctx)) - }) - } - } + }}, + ))) + return sdkmetric.NewMeterProvider(mpOpts...) +} - // Export Go runtime metrics (goroutines, GC, mem, etc.) 
- _ = runtime.Start(runtime.WithMeterProvider(mp)) - - // Register instruments after provider is set - if err := registerInstruments(); err != nil { - return nil, err +func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*sdktrace.TracerProvider, *otlptracegrpc.Exporter) { + topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)} + if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { topts = append(topts, otlptracegrpc.WithHeaders(hdrs)) } + if cfg.OTLPInsecure { topts = append(topts, otlptracegrpc.WithInsecure()) } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { + if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil { topts = append(topts, otlptracegrpc.WithTLSCredentials(creds)) } } - // Optional build info metric - if cfg.BuildVersion != "" || cfg.BuildCommit != "" { - RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit) - } - - return s, nil + exp, err := otlptracegrpc.New(ctx, topts...) + if err != nil { return nil, nil } + tp := sdktrace.NewTracerProvider(sdktrace.WithBatcher(exp), sdktrace.WithResource(res)) + return tp, exp } // Shutdown flushes exporters and providers in reverse init order. 
From 587e829e42a8112f1205dd3c468504887b1c4887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 01:07:08 +0200 Subject: [PATCH 46/72] fix(build): use Registration.Unregister() without context; return tracer shutdown func from setupTracing --- internal/telemetry/metrics.go | 4 ++-- internal/telemetry/telemetry.go | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index a8cecd3..e29c166 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -177,7 +177,7 @@ func registerBuildWSProxyInstruments() error { otel.Handle(e) } else { // Provide a functional stopper that unregisters the callback - obsStopper = func() { _ = reg.Unregister(context.Background()) } + obsStopper = func() { _ = reg.Unregister() } } return nil } @@ -224,7 +224,7 @@ func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) return } // Provide a functional stopper to unregister later if needed - proxyStopper = func() { _ = reg.Unregister(context.Background()) } + proxyStopper = func() { _ = reg.Unregister() } }) } diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index c064f03..d336d51 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -132,11 +132,11 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) { s.shutdowns = append(s.shutdowns, mp.Shutdown) // Optional tracing if cfg.OTLPEnabled { - if tp, exp := setupTracing(ctx, cfg, res); tp != nil { + if tp, shutdown := setupTracing(ctx, cfg, res); tp != nil { otel.SetTracerProvider(tp) s.TracerProvider = tp s.shutdowns = append(s.shutdowns, func(c context.Context) error { - return errors.Join(exp.Shutdown(c), tp.Shutdown(c)) + return errors.Join(shutdown(c), tp.Shutdown(c)) }) } } @@ -209,7 +209,7 @@ func buildMeterProvider(res *resource.Resource, readers []sdkmetric.Reader) *sdk return 
sdkmetric.NewMeterProvider(mpOpts...) } -func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*sdktrace.TracerProvider, *otlptracegrpc.Exporter) { +func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*sdktrace.TracerProvider, func(context.Context) error) { topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)} if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { topts = append(topts, otlptracegrpc.WithHeaders(hdrs)) } if cfg.OTLPInsecure { topts = append(topts, otlptracegrpc.WithInsecure()) } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { @@ -218,7 +218,7 @@ func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*sdk exp, err := otlptracegrpc.New(ctx, topts...) if err != nil { return nil, nil } tp := sdktrace.NewTracerProvider(sdktrace.WithBatcher(exp), sdktrace.WithResource(res)) - return tp, exp + return tp, exp.Shutdown } // Shutdown flushes exporters and providers in reverse init order. 
From e16881b7c87a3666c68d668969f947ed3442cc59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 01:09:18 +0200 Subject: [PATCH 47/72] fix(sonar): SetObservableCallback uses unregister stopper instead of empty function to satisfy S1186 --- internal/telemetry/metrics.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index e29c166..c6158fe 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -207,10 +207,14 @@ var ( // }) func SetObservableCallback(cb func(context.Context, metric.Observer) error) { obsOnce.Do(func() { - if _, e := meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions); e != nil { + reg, e := meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions) + if e != nil { otel.Handle(e) + obsStopper = func() {} + return } - obsStopper = func() { /* no-op; otel callbacks are unregistered when provider shuts down */ } + // Provide a functional stopper mirroring proxy/build-info behavior + obsStopper = func() { _ = reg.Unregister() } }) } From 84e659acdeb2316b3c58815d9cd76872c650447c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 01:12:51 +0200 Subject: [PATCH 48/72] docs(observability): update code blocks to specify language for better syntax highlighting --- docs/observability.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/observability.md b/docs/observability.md index e77e2fd..a652096 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -176,7 +176,7 @@ Cardinality tips - tunnel_id can grow in larger fleets. 
Use relabeling to drop or retain a subset, for example: -``` +```yaml # Drop all tunnel_id on bytes to reduce series - source_labels: [__name__] regex: newt_tunnel_bytes_total @@ -192,7 +192,7 @@ Cardinality tips Quickstart: direkte Prometheus-Erfassung (empfohlen) -``` +```sh # Start (direkter /metrics-Scrape, keine Doppel-Erfassung) docker compose -f docker-compose.metrics.yml up -d @@ -202,13 +202,13 @@ docker compose -f docker-compose.metrics.yml up -d # EXPECT_TUNNEL_ID=false NEWT_METRICS_INCLUDE_TUNNEL_ID=false ./scripts/smoke-metrics.sh ``` -- Prometheus UI: http://localhost:9090 +- Prometheus UI: - Standard-Scrape-Intervall: 15s - Kein OTLP aktiv (NEWT_METRICS_OTLP_ENABLED=false in docker-compose.metrics.yml) Häufige PromQL-Schnelltests -``` +```yaml # Online-Status einer Site in den letzten 5 Minuten max_over_time(newt_site_online{site_id="$site"}[5m]) From 60196455d1526626a641b3ba63add3314f289383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 07:33:11 +0200 Subject: [PATCH 49/72] fix(telemetry): improve error handling and formatting in telemetry setup functions --- internal/telemetry/telemetry.go | 62 ++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index d336d51..baa8220 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -91,7 +91,7 @@ func FromEnv() Config { OTLPEndpoint: getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"), OTLPInsecure: getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true", MetricExportInterval: getdur("OTEL_METRIC_EXPORT_INTERVAL", 15*time.Second), - AdminAddr: getenv("NEWT_ADMIN_ADDR", "*********:2112"), + AdminAddr: getenv("NEWT_ADMIN_ADDR", ":2112"), } } @@ -123,7 +123,9 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) { s := &Setup{} readers, promHandler, shutdowns, err := setupMetricExport(ctx, cfg, res) - if err != nil { 
return nil, err } + if err != nil { + return nil, err + } s.PrometheusHandler = promHandler // Build provider mp := buildMeterProvider(res, readers) @@ -145,8 +147,12 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) { // Runtime metrics _ = runtime.Start(runtime.WithMeterProvider(mp)) // Instruments - if err := registerInstruments(); err != nil { return nil, err } - if cfg.BuildVersion != "" || cfg.BuildCommit != "" { RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit) } + if err := registerInstruments(); err != nil { + return nil, err + } + if cfg.BuildVersion != "" || cfg.BuildCommit != "" { + RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit) + } return s, nil } @@ -155,8 +161,12 @@ func buildResource(ctx context.Context, cfg Config) *resource.Resource { semconv.ServiceName(cfg.ServiceName), semconv.ServiceVersion(cfg.ServiceVersion), } - if cfg.SiteID != "" { attrs = append(attrs, attribute.String("site_id", cfg.SiteID)) } - if cfg.Region != "" { attrs = append(attrs, attribute.String("region", cfg.Region)) } + if cfg.SiteID != "" { + attrs = append(attrs, attribute.String("site_id", cfg.SiteID)) + } + if cfg.Region != "" { + attrs = append(attrs, attribute.String("region", cfg.Region)) + } res, _ := resource.New(ctx, resource.WithFromEnv(), resource.WithHost(), resource.WithAttributes(attrs...)) return res } @@ -168,18 +178,28 @@ func setupMetricExport(ctx context.Context, cfg Config, res *resource.Resource) if cfg.PromEnabled { reg := promclient.NewRegistry() exp, err := prometheus.New(prometheus.WithRegisterer(reg)) - if err != nil { return nil, nil, nil, err } + if err != nil { + return nil, nil, nil, err + } readers = append(readers, exp) promHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{}) } if cfg.OTLPEnabled { mopts := []otlpmetricgrpc.Option{otlpmetricgrpc.WithEndpoint(cfg.OTLPEndpoint)} - if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { mopts = append(mopts, 
otlpmetricgrpc.WithHeaders(hdrs)) } - if cfg.OTLPInsecure { mopts = append(mopts, otlpmetricgrpc.WithInsecure()) } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { - if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil { mopts = append(mopts, otlpmetricgrpc.WithTLSCredentials(creds)) } + if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { + mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs)) + } + if cfg.OTLPInsecure { + mopts = append(mopts, otlpmetricgrpc.WithInsecure()) + } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { + if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil { + mopts = append(mopts, otlpmetricgrpc.WithTLSCredentials(creds)) + } } mexp, err := otlpmetricgrpc.New(ctx, mopts...) - if err != nil { return nil, nil, nil, err } + if err != nil { + return nil, nil, nil, err + } readers = append(readers, sdkmetric.NewPeriodicReader(mexp, sdkmetric.WithInterval(cfg.MetricExportInterval))) shutdowns = append(shutdowns, mexp.Shutdown) } @@ -189,7 +209,9 @@ func setupMetricExport(ctx context.Context, cfg Config, res *resource.Resource) func buildMeterProvider(res *resource.Resource, readers []sdkmetric.Reader) *sdkmetric.MeterProvider { var mpOpts []sdkmetric.Option mpOpts = append(mpOpts, sdkmetric.WithResource(res)) - for _, r := range readers { mpOpts = append(mpOpts, sdkmetric.WithReader(r)) } + for _, r := range readers { + mpOpts = append(mpOpts, sdkmetric.WithReader(r)) + } mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView( sdkmetric.Instrument{Name: "newt_*_latency_seconds"}, sdkmetric.Stream{Aggregation: sdkmetric.AggregationExplicitBucketHistogram{Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}}}, @@ -211,12 +233,20 @@ func buildMeterProvider(res *resource.Resource, readers []sdkmetric.Reader) *sdk func setupTracing(ctx context.Context, cfg Config, 
res *resource.Resource) (*sdktrace.TracerProvider, func(context.Context) error) { topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)} - if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { topts = append(topts, otlptracegrpc.WithHeaders(hdrs)) } - if cfg.OTLPInsecure { topts = append(topts, otlptracegrpc.WithInsecure()) } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { - if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil { topts = append(topts, otlptracegrpc.WithTLSCredentials(creds)) } + if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { + topts = append(topts, otlptracegrpc.WithHeaders(hdrs)) + } + if cfg.OTLPInsecure { + topts = append(topts, otlptracegrpc.WithInsecure()) + } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { + if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil { + topts = append(topts, otlptracegrpc.WithTLSCredentials(creds)) + } } exp, err := otlptracegrpc.New(ctx, topts...) - if err != nil { return nil, nil } + if err != nil { + return nil, nil + } tp := sdktrace.NewTracerProvider(sdktrace.WithBatcher(exp), sdktrace.WithResource(res)) return tp, exp.Shutdown } From 5cbda356372c074f77623576b3ad8d4e71307613 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 07:34:27 +0200 Subject: [PATCH 50/72] fix(docker-compose): update newt service configuration to use local build and environment file --- docker-compose.metrics.collector.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker-compose.metrics.collector.yml b/docker-compose.metrics.collector.yml index e06c1eb..36d9938 100644 --- a/docker-compose.metrics.collector.yml +++ b/docker-compose.metrics.collector.yml @@ -1,6 +1,9 @@ services: newt: - image: your/newt:latest + build: . 
+ image: newt:dev + env_file: + - .env environment: - NEWT_METRICS_PROMETHEUS_ENABLED=false # wichtig: direkte /metrics-Erfassung aus - NEWT_METRICS_OTLP_ENABLED=true # OTLP an den Collector From 20ddbb53823b1a66031a2844b98f0e151dc2644b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 08:12:20 +0200 Subject: [PATCH 51/72] fix(telemetry): update proxyStopper to be a no-op function when registration fails --- internal/telemetry/metrics.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index c6158fe..8c5e164 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -224,7 +224,9 @@ func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) reg, e := meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte) if e != nil { otel.Handle(e) - proxyStopper = func() {} + proxyStopper = func() { + // no-op: registration failed; keep stopper callable + } return } // Provide a functional stopper to unregister later if needed From ed127a2d612aa1ce26b1464a626c769709a03f65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 08:12:58 +0200 Subject: [PATCH 52/72] fix(docker-compose): update comments in metrics configuration for clarity and consistency --- docker-compose.metrics.collector.yml | 8 ++++---- examples/prometheus.yml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docker-compose.metrics.collector.yml b/docker-compose.metrics.collector.yml index 36d9938..8d0d536 100644 --- a/docker-compose.metrics.collector.yml +++ b/docker-compose.metrics.collector.yml @@ -5,11 +5,11 @@ services: env_file: - .env environment: - - NEWT_METRICS_PROMETHEUS_ENABLED=false # wichtig: direkte /metrics-Erfassung aus - - NEWT_METRICS_OTLP_ENABLED=true # OTLP an den Collector + - NEWT_METRICS_PROMETHEUS_ENABLED=false # important: disable direct 
/metrics scraping + - NEWT_METRICS_OTLP_ENABLED=true # OTLP to the Collector # optional: # - NEWT_METRICS_INCLUDE_TUNNEL_ID=false - # Falls Newt selbst Ports exponiert, hier NICHT 2112 mappen + # If Newt itself exposes ports, DO NOT map 2112 here # ports: [] otel-collector: @@ -19,7 +19,7 @@ services: - ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro ports: - "4317:4317" # OTLP gRPC - - "8889:8889" # Prometheus Exporter (wird von Prometheus gescraped) + - "8889:8889" # Prometheus Exporter (scraped by Prometheus) prometheus: image: prom/prometheus:latest diff --git a/examples/prometheus.yml b/examples/prometheus.yml index 89e82b4..8b73c5c 100644 --- a/examples/prometheus.yml +++ b/examples/prometheus.yml @@ -7,10 +7,10 @@ scrape_configs: static_configs: - targets: ['newt:2112'] # /metrics relabel_configs: - # optional: tunnel_id droppen + # optional: drop tunnel_id - action: labeldrop regex: 'tunnel_id' - # optional: nur bestimmte sites zulassen + # optional: allow only specific sites - action: keep source_labels: [site_id] regex: '(site-a|site-b)' From ae5129a7c73900a16b8907e2e859dd13cacc3382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 08:13:35 +0200 Subject: [PATCH 53/72] fix(sonar-telemetry): update observeSessionsFor function to include siteID and improve attribute handling --- internal/telemetry/metrics.go | 4 +++- internal/telemetry/state_view.go | 14 ++++++++------ internal/telemetry/telemetry.go | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index 8c5e164..c75ebb9 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -210,7 +210,9 @@ func SetObservableCallback(cb func(context.Context, metric.Observer) error) { reg, e := meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions) if e != nil { otel.Handle(e) - obsStopper = func() {} + obsStopper = func() { + // no-op: registration 
failed; keep stopper callable + } return } // Provide a functional stopper mirroring proxy/build-info behavior diff --git a/internal/telemetry/state_view.go b/internal/telemetry/state_view.go index 275217c..1a51452 100644 --- a/internal/telemetry/state_view.go +++ b/internal/telemetry/state_view.go @@ -39,7 +39,7 @@ func RegisterStateView(v StateView) { for _, siteID := range sv.ListSites() { observeSiteOnlineFor(o, sv, siteID) observeLastHeartbeatFor(o, sv, siteID) - observeSessionsFor(o, any) + observeSessionsFor(o, siteID, sv) } } } @@ -51,9 +51,11 @@ func RegisterStateView(v StateView) { func observeSiteOnlineFor(o metric.Observer, sv StateView, siteID string) { if online, ok := sv.Online(siteID); ok { val := int64(0) - if online { val = 1 } + if online { + val = 1 + } o.ObserveInt64(mSiteOnline, val, metric.WithAttributes( - attribute.String("site_id", getSiteID()), + attribute.String("site_id", siteID), )) } } @@ -62,16 +64,16 @@ func observeLastHeartbeatFor(o metric.Observer, sv StateView, siteID string) { if t, ok := sv.LastHeartbeat(siteID); ok { secs := time.Since(t).Seconds() o.ObserveFloat64(mSiteLastHeartbeat, secs, metric.WithAttributes( - attribute.String("site_id", getSiteID()), + attribute.String("site_id", siteID), )) } } -func observeSessionsFor(o metric.Observer, any interface{}) { +func observeSessionsFor(o metric.Observer, siteID string, any interface{}) { if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok { for tid, n := range tm.SessionsByTunnel() { attrs := []attribute.KeyValue{ - attribute.String("site_id", getSiteID()), + attribute.String("site_id", siteID), } if ShouldIncludeTunnelID() && tid != "" { attrs = append(attrs, attribute.String("tunnel_id", tid)) diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index baa8220..14100ec 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -171,7 +171,7 @@ func buildResource(ctx context.Context, cfg Config) 
*resource.Resource { return res } -func setupMetricExport(ctx context.Context, cfg Config, res *resource.Resource) ([]sdkmetric.Reader, http.Handler, []func(context.Context) error, error) { +func setupMetricExport(ctx context.Context, cfg Config, _ *resource.Resource) ([]sdkmetric.Reader, http.Handler, []func(context.Context) error, error) { var readers []sdkmetric.Reader var shutdowns []func(context.Context) error var promHandler http.Handler From fef9e8c76b313dd13d6a9ec32832ce2fe6191de2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 08:14:04 +0200 Subject: [PATCH 54/72] fix(websocket): improve error type handling in connection establishment and ping monitoring --- websocket/client.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/websocket/client.go b/websocket/client.go index db9d810..ee6c2e6 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -20,6 +20,7 @@ import ( "github.com/gorilla/websocket" "context" + "github.com/fosrl/newt/internal/telemetry" "go.opentelemetry.io/otel" ) @@ -440,7 +441,7 @@ func (c *Client) establishConnection() error { logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable") } -conn, _, err := dialer.DialContext(spanCtx, u.String(), nil) + conn, _, err := dialer.DialContext(spanCtx, u.String(), nil) lat := time.Since(start).Seconds() if err != nil { telemetry.IncConnAttempt(context.Background(), "websocket", "failure") @@ -448,9 +449,9 @@ conn, _, err := dialer.DialContext(spanCtx, u.String(), nil) telemetry.IncConnError(context.Background(), "websocket", etype) telemetry.ObserveWSConnectLatency(context.Background(), lat, "failure", etype) // Map handshake-related errors to reconnect reasons where appropriate - if etype == "tls" { + if etype == "tls_handshake" { telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonHandshakeError) - } else if etype == "timeout" { + } else if 
etype == "dial_timeout" { telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonTimeout) } else { telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonError) @@ -563,10 +564,10 @@ func (c *Client) pingMonitor() { return } c.writeMux.Lock() - err := c.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(c.pingTimeout)) - if err == nil { - telemetry.IncWSMessage(context.Background(), "out", "ping") - } + err := c.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(c.pingTimeout)) + if err == nil { + telemetry.IncWSMessage(context.Background(), "out", "ping") + } c.writeMux.Unlock() if err != nil { // Check if we're shutting down before logging error and reconnecting From 6ec0ab813c0f1cfa4d1104bc7cd2837363715d8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 08:14:25 +0200 Subject: [PATCH 55/72] fix(main): refactor logging messages and introduce constants for improved readability --- main.go | 56 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/main.go b/main.go index 8360888..959b906 100644 --- a/main.go +++ b/main.go @@ -91,6 +91,14 @@ func (s *stringSlice) Set(value string) error { return nil } +const ( + fmtErrMarshaling = "Error marshaling data: %v" + fmtReceivedMsg = "Received: %+v" + topicWGRegister = "newt/wg/register" + msgNoTunnelOrProxy = "No tunnel IP or proxy manager available" + fmtErrParsingTargetData = "Error parsing target data: %v" +) + var ( endpoint string id string @@ -398,7 +406,7 @@ func main() { fmt.Println("Newt version " + newtVersion) os.Exit(0) } else { - logger.Info("Newt version " + newtVersion) + logger.Info("Newt version %s", newtVersion) } if err := updates.CheckForUpdate("fosrl", "newt", newtVersion); err != nil { @@ -596,7 +604,7 @@ func main() { jsonData, err := json.Marshal(msg.Data) if err != nil { - logger.Info("Error marshaling data: 
%v", err) + logger.Info(fmtErrMarshaling, err) return } @@ -605,7 +613,7 @@ func main() { return } - logger.Debug("Received: %+v", msg) + logger.Debug(fmtReceivedMsg, msg) tun, tnet, err = netstack.CreateNetTUN( []netip.Addr{netip.MustParseAddr(wgData.TunnelIP)}, []netip.Addr{netip.MustParseAddr(dns)}, @@ -788,7 +796,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub jsonData, err := json.Marshal(msg.Data) if err != nil { - logger.Info("Error marshaling data: %v", err) + logger.Info(fmtErrMarshaling, err) return } if err := json.Unmarshal(jsonData, &exitNodeData); err != nil { @@ -829,7 +837,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }, } - stopFunc = client.SendMessageInterval("newt/wg/register", map[string]interface{}{ +stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{ "publicKey": publicKey.String(), "pingResults": pingResults, "newtVersion": newtVersion, @@ -932,7 +940,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub } // Send the ping results to the cloud for selection - stopFunc = client.SendMessageInterval("newt/wg/register", map[string]interface{}{ +stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{ "publicKey": publicKey.String(), "pingResults": pingResults, "newtVersion": newtVersion, @@ -942,17 +950,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }) client.RegisterHandler("newt/tcp/add", func(msg websocket.WSMessage) { - logger.Debug("Received: %+v", msg) + logger.Debug(fmtReceivedMsg, msg) // if there is no wgData or pm, we can't add targets if wgData.TunnelIP == "" || pm == nil { - logger.Info("No tunnel IP or proxy manager available") + logger.Info(msgNoTunnelOrProxy) return } targetData, err := parseTargetData(msg.Data) if err != nil { - logger.Info("Error parsing target data: %v", err) + logger.Info(fmtErrParsingTargetData, err) return } @@ 
-967,17 +975,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }) client.RegisterHandler("newt/udp/add", func(msg websocket.WSMessage) { - logger.Info("Received: %+v", msg) + logger.Info(fmtReceivedMsg, msg) // if there is no wgData or pm, we can't add targets if wgData.TunnelIP == "" || pm == nil { - logger.Info("No tunnel IP or proxy manager available") + logger.Info(msgNoTunnelOrProxy) return } targetData, err := parseTargetData(msg.Data) if err != nil { - logger.Info("Error parsing target data: %v", err) + logger.Info(fmtErrParsingTargetData, err) return } @@ -992,17 +1000,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }) client.RegisterHandler("newt/udp/remove", func(msg websocket.WSMessage) { - logger.Info("Received: %+v", msg) + logger.Info(fmtReceivedMsg, msg) // if there is no wgData or pm, we can't add targets if wgData.TunnelIP == "" || pm == nil { - logger.Info("No tunnel IP or proxy manager available") + logger.Info(msgNoTunnelOrProxy) return } targetData, err := parseTargetData(msg.Data) if err != nil { - logger.Info("Error parsing target data: %v", err) + logger.Info(fmtErrParsingTargetData, err) return } @@ -1017,17 +1025,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }) client.RegisterHandler("newt/tcp/remove", func(msg websocket.WSMessage) { - logger.Info("Received: %+v", msg) + logger.Info(fmtReceivedMsg, msg) // if there is no wgData or pm, we can't add targets if wgData.TunnelIP == "" || pm == nil { - logger.Info("No tunnel IP or proxy manager available") + logger.Info(msgNoTunnelOrProxy) return } targetData, err := parseTargetData(msg.Data) if err != nil { - logger.Info("Error parsing target data: %v", err) + logger.Info(fmtErrParsingTargetData, err) return } @@ -1111,7 +1119,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub jsonData, err := json.Marshal(msg.Data) if err != nil { - 
logger.Info("Error marshaling data: %v", err) + logger.Info(fmtErrMarshaling, err) return } if err := json.Unmarshal(jsonData, &sshPublicKeyData); err != nil { @@ -1268,9 +1276,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub } if err := healthMonitor.EnableTarget(requestData.ID); err != nil { - logger.Error("Failed to enable health check target %s: %v", requestData.ID, err) + logger.Error("Failed to enable health check target %d: %v", requestData.ID, err) } else { - logger.Info("Enabled health check target: %s", requestData.ID) + logger.Info("Enabled health check target: %d", requestData.ID) } }) @@ -1293,9 +1301,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub } if err := healthMonitor.DisableTarget(requestData.ID); err != nil { - logger.Error("Failed to disable health check target %s: %v", requestData.ID, err) + logger.Error("Failed to disable health check target %d: %v", requestData.ID, err) } else { - logger.Info("Disabled health check target: %s", requestData.ID) + logger.Info("Disabled health check target: %d", requestData.ID) } }) @@ -1340,7 +1348,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub } // Send registration message to the server for backward compatibility - err := client.SendMessage("newt/wg/register", map[string]interface{}{ + err := client.SendMessage(topicWGRegister, map[string]interface{}{ "publicKey": publicKey.String(), "newtVersion": newtVersion, "backwardsCompatible": true, From 77d56596ab370e4f210bd497d7ba1f8c20c2e3dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Wed, 8 Oct 2025 08:14:35 +0200 Subject: [PATCH 56/72] fix(wgtester): improve logging format for consistency and clarity --- wgtester/wgtester.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/wgtester/wgtester.go b/wgtester/wgtester.go index 0035f05..26988f6 100644 --- a/wgtester/wgtester.go +++ 
b/wgtester/wgtester.go @@ -126,7 +126,7 @@ func (s *Server) Stop() { s.conn.Close() } s.isRunning = false - logger.Info(s.outputPrefix + "Server stopped") + logger.Info("%sServer stopped", s.outputPrefix) } // RestartWithNetstack stops the current server and restarts it with netstack @@ -161,7 +161,7 @@ func (s *Server) handleConnections() { // Set read deadline to avoid blocking forever err := s.conn.SetReadDeadline(time.Now().Add(1 * time.Second)) if err != nil { - logger.Error(s.outputPrefix+"Error setting read deadline: %v", err) + logger.Error("%sError setting read deadline: %v", s.outputPrefix, err) continue } @@ -187,7 +187,7 @@ func (s *Server) handleConnections() { case <-s.shutdownCh: return // Don't log error if we're shutting down default: - logger.Error(s.outputPrefix+"Error reading from UDP: %v", err) + logger.Error("%sError reading from UDP: %v", s.outputPrefix, err) } continue } @@ -219,7 +219,7 @@ func (s *Server) handleConnections() { copy(responsePacket[5:13], buffer[5:13]) // Log response being sent for debugging - logger.Debug(s.outputPrefix+"Sending response to %s", addr.String()) + logger.Debug("%sSending response to %s", s.outputPrefix, addr.String()) // Send the response packet - handle both regular UDP and netstack UDP if s.useNetstack { @@ -233,9 +233,9 @@ func (s *Server) handleConnections() { } if err != nil { - logger.Error(s.outputPrefix+"Error sending response: %v", err) + logger.Error("%sError sending response: %v", s.outputPrefix, err) } else { - logger.Debug(s.outputPrefix + "Response sent successfully") + logger.Debug("%sResponse sent successfully", s.outputPrefix) } } } From 89274eb9a8e721b9fbc7cc197541f476ff906799 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:41:55 +0000 Subject: [PATCH 57/72] Bump the prod-minor-updates group across 1 directory with 4 updates Bumps the prod-minor-updates group with 3 updates in the / directory: 
[go.opentelemetry.io/otel/exporters/prometheus](https://github.com/open-telemetry/opentelemetry-go), [golang.org/x/crypto](https://github.com/golang/crypto) and [google.golang.org/grpc](https://github.com/grpc/grpc-go). Updates `go.opentelemetry.io/otel/exporters/prometheus` from 0.57.0 to 0.60.0 - [Release notes](https://github.com/open-telemetry/opentelemetry-go/releases) - [Changelog](https://github.com/open-telemetry/opentelemetry-go/blob/main/CHANGELOG.md) - [Commits](https://github.com/open-telemetry/opentelemetry-go/compare/exporters/prometheus/v0.57.0...exporters/prometheus/v0.60.0) Updates `golang.org/x/crypto` from 0.42.0 to 0.43.0 - [Commits](https://github.com/golang/crypto/compare/v0.42.0...v0.43.0) Updates `golang.org/x/net` from 0.44.0 to 0.45.0 - [Commits](https://github.com/golang/net/compare/v0.44.0...v0.45.0) Updates `google.golang.org/grpc` from 1.75.1 to 1.76.0 - [Release notes](https://github.com/grpc/grpc-go/releases) - [Commits](https://github.com/grpc/grpc-go/compare/v1.75.1...v1.76.0) --- updated-dependencies: - dependency-name: go.opentelemetry.io/otel/exporters/prometheus dependency-version: 0.60.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: prod-minor-updates - dependency-name: golang.org/x/crypto dependency-version: 0.43.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: prod-minor-updates - dependency-name: golang.org/x/net dependency-version: 0.45.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: prod-minor-updates - dependency-name: google.golang.org/grpc dependency-version: 1.76.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: prod-minor-updates ... 
Signed-off-by: dependabot[bot] --- go.mod | 21 +++++++++++---------- go.sum | 58 ++++++++++++++++++++++++++++++---------------------------- 2 files changed, 41 insertions(+), 38 deletions(-) diff --git a/go.mod b/go.mod index dfa73c0..79a7b41 100644 --- a/go.mod +++ b/go.mod @@ -13,15 +13,15 @@ require ( go.opentelemetry.io/otel v1.38.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 - go.opentelemetry.io/otel/exporters/prometheus v0.57.0 + go.opentelemetry.io/otel/exporters/prometheus v0.60.0 go.opentelemetry.io/otel/metric v1.38.0 go.opentelemetry.io/otel/sdk v1.38.0 go.opentelemetry.io/otel/sdk/metric v1.38.0 - golang.org/x/crypto v0.42.0 - golang.org/x/net v0.44.0 + golang.org/x/crypto v0.43.0 + golang.org/x/net v0.45.0 golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10 - google.golang.org/grpc v1.75.1 + google.golang.org/grpc v1.76.0 gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c software.sslmate.com/src/go-pkcs12 v0.6.0 ) @@ -42,9 +42,9 @@ require ( github.com/google/btree v1.1.2 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/uuid v1.6.0 // indirect + github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect github.com/josharian/native v1.1.0 // indirect - github.com/klauspost/compress v1.18.0 // indirect github.com/mdlayher/genetlink v1.3.2 // indirect github.com/mdlayher/netlink v1.7.2 // indirect github.com/mdlayher/socket v0.5.1 // indirect @@ -58,7 +58,8 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect - github.com/prometheus/procfs v0.16.1 // indirect + github.com/prometheus/otlptranslator v0.0.2 // indirect + github.com/prometheus/procfs v0.17.0 // indirect 
github.com/vishvananda/netns v0.0.5 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect @@ -66,12 +67,12 @@ require ( go.opentelemetry.io/otel/trace v1.38.0 // indirect go.opentelemetry.io/proto/otlp v1.7.1 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect - golang.org/x/mod v0.27.0 // indirect + golang.org/x/mod v0.28.0 // indirect golang.org/x/sync v0.17.0 // indirect - golang.org/x/sys v0.36.0 // indirect - golang.org/x/text v0.29.0 // indirect + golang.org/x/sys v0.37.0 // indirect + golang.org/x/text v0.30.0 // indirect golang.org/x/time v0.7.0 // indirect - golang.org/x/tools v0.36.0 // indirect + golang.org/x/tools v0.37.0 // indirect golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect diff --git a/go.sum b/go.sum index 5814d42..6c8c7e3 100644 --- a/go.sum +++ b/go.sum @@ -43,14 +43,18 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc h1:GN2Lv3MGO7AS6PrRoT6yV5+wkrOpcszoIsO4+4ds248= +github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= github.com/josharian/native v1.1.0/go.mod 
h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= -github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= -github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/mdlayher/genetlink v1.3.2 h1:KdrNKe+CTu+IbZnm/GVUMXSqBBLqcGpRDa0xkQy56gw= @@ -81,22 +85,18 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= -github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/client_model v0.6.2 
h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= -github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/prometheus/otlptranslator v0.0.2 h1:+1CdeLVrRQ6Psmhnobldo0kTp96Rj80DRXRd5OSnMEQ= +github.com/prometheus/otlptranslator v0.0.2/go.mod h1:P8AwMgdD7XEr6QRUJ2QWLpiAZTgTE2UYgjlu3svompI= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= @@ -121,8 +121,8 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4D go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 
h1:aTL7F04bJHUlztTsNGJ2l+6he8c+y/b//eR0jjjemT4= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0/go.mod h1:kldtb7jDTeol0l3ewcmd8SDvx3EmIE7lyvqbasU3QC4= -go.opentelemetry.io/otel/exporters/prometheus v0.57.0 h1:AHh/lAP1BHrY5gBwk8ncc25FXWm/gmmY3BX258z5nuk= -go.opentelemetry.io/otel/exporters/prometheus v0.57.0/go.mod h1:QpFWz1QxqevfjwzYdbMb4Y1NnlJvqSGwyuU0B4iuc9c= +go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo= +go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk= go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= @@ -139,16 +139,16 @@ go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= -golang.org/x/crypto v0.42.0 h1:chiH31gIWm57EkTXpwnqf8qeuMUi0yekh6mT2AvFlqI= -golang.org/x/crypto v0.42.0/go.mod h1:4+rDnOTJhQCx2q7/j6rAN5XDw8kPjeaXEUR2eL94ix8= +golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= +golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= -golang.org/x/mod v0.27.0 h1:kb+q2PyFnEADO2IEF935ehFUXlWiNjJWtRNgBLSfbxQ= -golang.org/x/mod v0.27.0/go.mod h1:rWI627Fq0DEoudcK+MBkNkCe0EetEaDSwJJkCcjpazc= +golang.org/x/mod v0.28.0 
h1:gQBtGhjxykdjY9YhZpSlZIsbnaE2+PgjfLWUQTnoZ1U= +golang.org/x/mod v0.28.0/go.mod h1:yfB/L0NOf/kmEbXjzCPOx1iK1fRutOydrCMsqRhEBxI= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= -golang.org/x/net v0.44.0 h1:evd8IRDyfNBMBTTY5XRF1vaZlD+EmWx6x8PkhR04H/I= -golang.org/x/net v0.44.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY= +golang.org/x/net v0.45.0 h1:RLBg5JKixCy82FtLJpeNlVM0nrSqpCRYzVU1n8kj0tM= +golang.org/x/net v0.45.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= @@ -156,16 +156,16 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= -golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= -golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= +golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= +golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= 
golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= -golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= -golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= +golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE= +golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 h1:B82qJJgjvYKsXS9jeunTOisW56dUokqW/FOteYJJ/yg= golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2/go.mod h1:deeaetjYA+DHMHg+sMSMI58GrEteJUUzzw7en6TJQcI= @@ -179,11 +179,13 @@ google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1: google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE= google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE= google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc= -google.golang.org/grpc v1.75.1 h1:/ODCNEuf9VghjgO3rqLcfg8fiOP0nSluljWFlDxELLI= -google.golang.org/grpc v1.75.1/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= +google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A= +google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c= google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod 
h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= From b62e18622e0f428d1ab88b887d85ee207c433f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 14:16:28 +0200 Subject: [PATCH 58/72] fix(manager, stub, util): enhance error handling and logging consistency --- proxy/manager.go | 13 ++++++++----- stub.go | 12 +++++++----- util.go | 15 ++++++++++----- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/proxy/manager.go b/proxy/manager.go index 3052f56..ac80d8f 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -20,6 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet" ) +const errUnsupportedProtoFmt = "unsupported protocol: %s" + // Target represents a proxy target with its address and port type Target struct { Address string @@ -74,13 +76,14 @@ func (cw *countingWriter) Write(p []byte) (int, error) { n, err := cw.w.Write(p) if n > 0 { if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil { - if cw.proto == "tcp" { + switch cw.proto { + case "tcp": if cw.out { cw.ent.bytesOutTCP.Add(uint64(n)) } else { cw.ent.bytesInTCP.Add(uint64(n)) } - } else if cw.proto == "udp" { + case "udp": if cw.out { cw.ent.bytesOutUDP.Add(uint64(n)) } else { @@ -207,7 +210,7 @@ func (pm *ProxyManager) AddTarget(proto, listenIP string, port int, targetAddr s } pm.udpTargets[listenIP][port] = targetAddr default: - return fmt.Errorf("unsupported protocol: %s", proto) + return fmt.Errorf(errUnsupportedProtoFmt, proto) } if pm.running { @@ -256,7 +259,7 @@ func (pm *ProxyManager) RemoveTarget(proto, listenIP 
string, port int) error { return fmt.Errorf("target not found: %s:%d", listenIP, port) } default: - return fmt.Errorf("unsupported protocol: %s", proto) + return fmt.Errorf(errUnsupportedProtoFmt, proto) } return nil } @@ -443,7 +446,7 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr go pm.handleUDPProxy(conn, targetAddr) default: - return fmt.Errorf("unsupported protocol: %s", proto) + return fmt.Errorf(errUnsupportedProtoFmt, proto) } logger.Info("Started %s proxy to %s", proto, targetAddr) diff --git a/stub.go b/stub.go index ec91299..3bdbe19 100644 --- a/stub.go +++ b/stub.go @@ -8,25 +8,27 @@ import ( ) func setupClientsNative(client *websocket.Client, host string) { - return // This function is not implemented for non-Linux systems. + _ = client + _ = host + // No-op for non-Linux systems } func closeWgServiceNative() { // No-op for non-Linux systems - return } func clientsOnConnectNative() { // No-op for non-Linux systems - return } func clientsHandleNewtConnectionNative(publicKey, endpoint string) { + _ = publicKey + _ = endpoint // No-op for non-Linux systems - return } func clientsAddProxyTargetNative(pm *proxy.ProxyManager, tunnelIp string) { + _ = pm + _ = tunnelIp // No-op for non-Linux systems - return } diff --git a/util.go b/util.go index 64bf24d..fa339e8 100644 --- a/util.go +++ b/util.go @@ -25,6 +25,8 @@ import ( "golang.zx2c4.com/wireguard/tun/netstack" ) +const msgHealthFileWriteFailed = "Failed to write health file: %v" + func fixKey(key string) string { // Remove any whitespace key = strings.TrimSpace(key) @@ -177,7 +179,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC if healthFile != "" { err := os.WriteFile(healthFile, []byte("ok"), 0644) if err != nil { - logger.Warn("Failed to write health file: %v", err) + logger.Warn(msgHealthFileWriteFailed, err) } } return stopChan, nil @@ -218,11 +220,11 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout 
time.Duration) (stopC if healthFile != "" { err := os.WriteFile(healthFile, []byte("ok"), 0644) if err != nil { - logger.Warn("Failed to write health file: %v", err) + logger.Warn(msgHealthFileWriteFailed, err) } } - return } + case <-pingStopChan: } } }() @@ -476,7 +478,8 @@ func updateTargets(pm *proxy.ProxyManager, action string, tunnelIP string, proto continue } - if action == "add" { + switch action { + case "add": target := parts[1] + ":" + parts[2] // Call updown script if provided @@ -502,7 +505,7 @@ func updateTargets(pm *proxy.ProxyManager, action string, tunnelIP string, proto // Add the new target pm.AddTarget(proto, tunnelIP, port, processedTarget) - } else if action == "remove" { + case "remove": logger.Info("Removing target with port %d", port) target := parts[1] + ":" + parts[2] @@ -520,6 +523,8 @@ func updateTargets(pm *proxy.ProxyManager, action string, tunnelIP string, proto logger.Error("Failed to remove target: %v", err) return err } + default: + logger.Info("Unknown action: %s", action) } } From 8d0e6be2c703acce1ef9fe40b4cc6c9a96aef88c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 14:17:24 +0200 Subject: [PATCH 59/72] fix(metrics): enhance documentation clarity and structure for metrics recommendations --- docs/METRICS_RECOMMENDATIONS.md | 94 ++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/docs/METRICS_RECOMMENDATIONS.md b/docs/METRICS_RECOMMENDATIONS.md index e1dfbf9..2ce365b 100644 --- a/docs/METRICS_RECOMMENDATIONS.md +++ b/docs/METRICS_RECOMMENDATIONS.md @@ -3,64 +3,70 @@ This document captures the current state of Newt metrics, prioritized fixes, and a pragmatic roadmap for near-term improvements. 
1) Current setup (summary) -- Export: Prometheus exposition (default), optional OTLP (gRPC) -- Existing instruments: - - Sites: newt_site_registrations_total, newt_site_online (0/1), newt_site_last_heartbeat_seconds - - Tunnel/Traffic: newt_tunnel_sessions, newt_tunnel_bytes_total, newt_tunnel_latency_seconds, newt_tunnel_reconnects_total - - Connection lifecycle: newt_connection_attempts_total, newt_connection_errors_total - - Operations: newt_config_reloads_total, newt_restart_count_total, newt_build_info - - Go runtime: GC, heap, goroutines via runtime instrumentation + + - Export: Prometheus exposition (default), optional OTLP (gRPC) + - Existing instruments: + - Sites: newt_site_registrations_total, newt_site_online (0/1), newt_site_last_heartbeat_seconds + - Tunnel/Traffic: newt_tunnel_sessions, newt_tunnel_bytes_total, newt_tunnel_latency_seconds, newt_tunnel_reconnects_total + - Connection lifecycle: newt_connection_attempts_total, newt_connection_errors_total + - Operations: newt_config_reloads_total, newt_restart_count_total, newt_build_info + - Go runtime: GC, heap, goroutines via runtime instrumentation 2) Main issues addressed now -- Attribute filter (allow-list) extended to include site_id and region in addition to existing keys (tunnel_id, transport, protocol, direction, result, reason, error_type, version, commit). -- site_id and region propagation: site_id is now attached as a metric label across newt_*; region is added as a metric label when set. Both remain resource attributes for consistency with OTEL. -- Label semantics clarified: - - transport: control-plane mechanism (e.g., websocket, wireguard) - - protocol: L4 payload type (tcp, udp) - - newt_tunnel_bytes_total uses protocol and direction, not transport. -- Robustness improvements: removed duplicate clear logic on reconnect; avoided empty site_id by reading NEWT_SITE_ID/NEWT_ID and OTEL_RESOURCE_ATTRIBUTES. 
+ + - Attribute filter (allow-list) extended to include site_id and region in addition to existing keys (tunnel_id, transport, protocol, direction, result, reason, error_type, version, commit). + - site_id and region propagation: site_id is now attached as a metric label across newt_*; region is added as a metric label when set. Both remain resource attributes for consistency with OTEL. + - Label semantics clarified: + - transport: control-plane mechanism (e.g., websocket, wireguard) + - protocol: L4 payload type (tcp, udp) + - newt_tunnel_bytes_total uses protocol and direction, not transport. + - Robustness improvements: removed duplicate clear logic on reconnect; avoided empty site_id by reading NEWT_SITE_ID/NEWT_ID and OTEL_RESOURCE_ATTRIBUTES. 3) Remaining gaps and deviations -- Some call sites still need initiator label on reconnect outcomes (client vs server). This is planned. -- WebSocket and Proxy metrics (connect latency, messages, active connections, buffer/drops, async backlog) are planned additions. -- Config apply duration and cert rotation counters are planned. + + - Some call sites still need initiator label on reconnect outcomes (client vs server). This is planned. + - WebSocket and Proxy metrics (connect latency, messages, active connections, buffer/drops, async backlog) are planned additions. + - Config apply duration and cert rotation counters are planned. 
4) Roadmap (phased) -- Phase 1 (done in this iteration) - - Fix attribute filter (site_id, region) - - Propagate site_id (and optional region) across metrics - - Correct label semantics (transport vs protocol); fix sessions transport labelling - - Documentation alignment -- Phase 2 (next) - - WebSocket: newt_websocket_connect_latency_seconds; newt_websocket_messages_total{direction,msg_type} - - Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_drops_total, newt_proxy_async_backlog_bytes - - Reconnect: add initiator label (client/server) - - Config & PKI: newt_config_apply_seconds{phase,result}; newt_cert_rotation_total{result} + + - Phase 1 (done in this iteration) + - Fix attribute filter (site_id, region) + - Propagate site_id (and optional region) across metrics + - Correct label semantics (transport vs protocol); fix sessions transport labelling + - Documentation alignment + - Phase 2 (next) + - WebSocket: newt_websocket_connect_latency_seconds; newt_websocket_messages_total{direction,msg_type} + - Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_drops_total, newt_proxy_async_backlog_bytes + - Reconnect: add initiator label (client/server) + - Config & PKI: newt_config_apply_seconds{phase,result}; newt_cert_rotation_total{result} 5) Operational guidance -- Do not double scrape: scrape either Newt (/metrics) or the Collector’s Prometheus exporter (not both) to avoid double-counting cumulative counters. -- For high cardinality tunnel_id, consider relabeling or dropping per-tunnel series in Prometheus to control cardinality. -- OTLP troubleshooting: enable TLS via OTEL_EXPORTER_OTLP_CERTIFICATE, use OTEL_EXPORTER_OTLP_HEADERS for auth; verify endpoint reachability. + + - Do not double scrape: scrape either Newt (/metrics) or the Collector’s Prometheus exporter (not both) to avoid double-counting cumulative counters. 
+ - For high cardinality tunnel_id, consider relabeling or dropping per-tunnel series in Prometheus to control cardinality. + - OTLP troubleshooting: enable TLS via OTEL_EXPORTER_OTLP_CERTIFICATE, use OTEL_EXPORTER_OTLP_HEADERS for auth; verify endpoint reachability. 6) Example alerts/recording rules (suggestions) -- Reconnect spikes: - - increase(newt_tunnel_reconnects_total[5m]) by (site_id) -- Sustained connection errors: - - rate(newt_connection_errors_total[5m]) by (site_id,transport,error_type) -- Heartbeat gaps: - - max_over_time(newt_site_last_heartbeat_seconds[15m]) by (site_id) -- Proxy drops: - - increase(newt_proxy_drops_total[5m]) by (site_id,protocol) -- WebSocket connect p95 (when added): - - histogram_quantile(0.95, sum(rate(newt_websocket_connect_latency_seconds_bucket[5m])) by (le,site_id)) + + - Reconnect spikes: + - increase(newt_tunnel_reconnects_total[5m]) by (site_id) + - Sustained connection errors: + - rate(newt_connection_errors_total[5m]) by (site_id,transport,error_type) + - Heartbeat gaps: + - max_over_time(newt_site_last_heartbeat_seconds[15m]) by (site_id) + - Proxy drops: + - increase(newt_proxy_drops_total[5m]) by (site_id,protocol) + - WebSocket connect p95 (when added): + - histogram_quantile(0.95, sum(rate(newt_websocket_connect_latency_seconds_bucket[5m])) by (le,site_id)) 7) Collector configuration -- Direct scrape variant requires no attribute promotion since site_id is already a metric label. -- Transform/promote variant remains optional for environments that rely on resource-to-label promotion. + + - Direct scrape variant requires no attribute promotion since site_id is already a metric label. + - Transform/promote variant remains optional for environments that rely on resource-to-label promotion. 8) Testing + - curl :2112/metrics | grep ^newt_ - Verify presence of site_id across series; region appears when set. - Ensure disallowed attributes are filtered; allowed (site_id) retained. 
- - From bd62da4cc9f780cf254eb5aeb770f14308d5c529 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 14:42:05 +0200 Subject: [PATCH 60/72] fix(docker-compose, prometheus, telemetry, proxy): standardize collector naming and improve error handling --- docker-compose-coolify.yml | 6 +-- docs/observability.md | 2 +- examples/prometheus.yml | 2 +- internal/telemetry/metrics.go | 70 ++++++++++++++++++++++++++--------- proxy/manager.go | 29 ++++++++------- 5 files changed, 73 insertions(+), 36 deletions(-) diff --git a/docker-compose-coolify.yml b/docker-compose-coolify.yml index 7073d12..e2bb1e6 100644 --- a/docker-compose-coolify.yml +++ b/docker-compose-coolify.yml @@ -1,5 +1,5 @@ services: - collector: + otel-collector: image: otel/opentelemetry-collector:0.111.0 command: ["--config=/etc/otelcol/config.yaml"] volumes: @@ -15,14 +15,14 @@ services: OTEL_SERVICE_NAME: newt NEWT_METRICS_PROMETHEUS_ENABLED: "true" NEWT_METRICS_OTLP_ENABLED: "true" - OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317" + OTEL_EXPORTER_OTLP_ENDPOINT: "otel-collector:4317" OTEL_EXPORTER_OTLP_INSECURE: "true" OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative" NEWT_ADMIN_ADDR: "0.0.0.0:2112" ports: - "2112:2112" depends_on: - - collector + - otel-collector prometheus: image: prom/prometheus:v2.55.0 diff --git a/docs/observability.md b/docs/observability.md index a652096..ae2d0d4 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -125,7 +125,7 @@ global: scrape_configs: - job_name: otel-collector static_configs: - - targets: ["collector:8889"] + - targets: ["otel-collector:8889"] ``` Reason mapping (source → reason) diff --git a/examples/prometheus.yml b/examples/prometheus.yml index 8b73c5c..9edb661 100644 --- a/examples/prometheus.yml +++ b/examples/prometheus.yml @@ -18,4 +18,4 @@ scrape_configs: # WARNING: Do not enable this together with the 'newt' job above or you will double-count. 
# - job_name: 'otel-collector' # static_configs: - # - targets: ['collector:8889'] + # - targets: ['otel-collector:8889'] diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index c75ebb9..1d11927 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -70,11 +70,26 @@ func registerInstruments() error { var err error initOnce.Do(func() { meter = otel.Meter("newt") - if e := registerSiteInstruments(); e != nil { err = e; return } - if e := registerTunnelInstruments(); e != nil { err = e; return } - if e := registerConnInstruments(); e != nil { err = e; return } - if e := registerConfigInstruments(); e != nil { err = e; return } - if e := registerBuildWSProxyInstruments(); e != nil { err = e; return } + if e := registerSiteInstruments(); e != nil { + err = e + return + } + if e := registerTunnelInstruments(); e != nil { + err = e + return + } + if e := registerConnInstruments(); e != nil { + err = e + return + } + if e := registerConfigInstruments(); e != nil { + err = e + return + } + if e := registerBuildWSProxyInstruments(); e != nil { + err = e + return + } }) return err } @@ -83,13 +98,19 @@ func registerSiteInstruments() error { var err error mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total", metric.WithDescription("Total site registration attempts")) - if err != nil { return err } + if err != nil { + return err + } mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online", metric.WithDescription("Site online (0/1)")) - if err != nil { return err } + if err != nil { + return err + } mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_seconds", metric.WithDescription("Seconds since last site heartbeat")) - if err != nil { return err } + if err != nil { + return err + } return nil } @@ -97,18 +118,26 @@ func registerTunnelInstruments() error { var err error mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions", 
metric.WithDescription("Active tunnel sessions")) - if err != nil { return err } + if err != nil { + return err + } mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total", metric.WithDescription("Tunnel bytes ingress/egress"), metric.WithUnit("By")) - if err != nil { return err } + if err != nil { + return err + } mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds", metric.WithDescription("Per-tunnel latency in seconds"), metric.WithUnit("s")) - if err != nil { return err } + if err != nil { + return err + } mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total", metric.WithDescription("Tunnel reconnect events")) - if err != nil { return err } + if err != nil { + return err + } return nil } @@ -116,10 +145,14 @@ func registerConnInstruments() error { var err error mConnAttempts, err = meter.Int64Counter("newt_connection_attempts_total", metric.WithDescription("Connection attempts")) - if err != nil { return err } + if err != nil { + return err + } mConnErrors, err = meter.Int64Counter("newt_connection_errors_total", metric.WithDescription("Connection errors by type")) - if err != nil { return err } + if err != nil { + return err + } return nil } @@ -310,10 +343,13 @@ func ObserveProxyAsyncBacklogObs(o metric.Observer, value int64, attrs []attribu } func IncProxyDrops(ctx context.Context, tunnelID, protocol string) { - mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite( - attribute.String("tunnel_id", tunnelID), + attrs := []attribute.KeyValue{ attribute.String("protocol", protocol), - )...)) + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) } // --- Config/PKI helpers --- diff --git a/proxy/manager.go b/proxy/manager.go index ac80d8f..ceaa12b 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -275,7 +275,7 @@ func (pm *ProxyManager) Start() error { 
telemetry.ObserveProxyActiveConnsObs(o, e.activeTCP.Load(), e.attrOutTCP.ToSlice()) telemetry.ObserveProxyActiveConnsObs(o, e.activeUDP.Load(), e.attrOutUDP.ToSlice()) // backlog bytes (sum of unflushed counters) - b := int64(e.bytesInTCP.Load()+e.bytesOutTCP.Load()+e.bytesInUDP.Load()+e.bytesOutUDP.Load()) + b := int64(e.bytesInTCP.Load() + e.bytesOutTCP.Load() + e.bytesInUDP.Load() + e.bytesOutUDP.Load()) telemetry.ObserveProxyAsyncBacklogObs(o, b, e.attrOutTCP.ToSlice()) telemetry.ObserveProxyBufferBytesObs(o, b, e.attrOutTCP.ToSlice()) } @@ -598,14 +598,15 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { continue } - targetConn, err = net.DialUDP("udp", nil, targetUDPAddr) - if e := pm.getEntry(pm.currentTunnelID); e != nil { - e.activeUDP.Add(1) - } + targetConn, err = net.DialUDP("udp", nil, targetUDPAddr) if err != nil { logger.Error("Error connecting to target: %v", err) continue } + // Only increment activeUDP after a successful DialUDP + if e := pm.getEntry(pm.currentTunnelID); e != nil { + e.activeUDP.Add(1) + } clientsMutex.Lock() clientConns[clientKey] = targetConn @@ -656,15 +657,15 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { }(clientKey, targetConn, remoteAddr) } - written, err := targetConn.Write(buffer[:n]) - if err != nil { - logger.Error("Error writing to target: %v", err) - telemetry.IncProxyDrops(context.Background(), pm.currentTunnelID, "udp") - targetConn.Close() - clientsMutex.Lock() - delete(clientConns, clientKey) - clientsMutex.Unlock() - } else if pm.currentTunnelID != "" && written > 0 { + written, err := targetConn.Write(buffer[:n]) + if err != nil { + logger.Error("Error writing to target: %v", err) + telemetry.IncProxyDrops(context.Background(), pm.currentTunnelID, "udp") + targetConn.Close() + clientsMutex.Lock() + delete(clientConns, clientKey) + clientsMutex.Unlock() + } else if pm.currentTunnelID != "" && written > 0 { if pm.asyncBytes { if e := 
pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(written)) From 34902208032df5e662f78fdd9fdd37a95a3e6255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 14:46:17 +0200 Subject: [PATCH 61/72] fix(docker-compose, prometheus): remove unnecessary comments and improve clarity --- docker-compose.metrics.yml | 4 ++-- examples/prometheus.with-collector.yml | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docker-compose.metrics.yml b/docker-compose.metrics.yml index 1dcb633..0366522 100644 --- a/docker-compose.metrics.yml +++ b/docker-compose.metrics.yml @@ -39,7 +39,7 @@ services: - "9090:9090" grafana: - image: grafana/grafana:latest + image: grafana/grafana:12.2.0 container_name: newt-metrics-grafana restart: unless-stopped environment: @@ -52,4 +52,4 @@ services: volumes: - ./examples/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro - ./examples/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro - - ./examples/grafana/dashboards:/var/lib/grafana/dashboards:ro \ No newline at end of file + - ./examples/grafana/dashboards:/var/lib/grafana/dashboards:ro diff --git a/examples/prometheus.with-collector.yml b/examples/prometheus.with-collector.yml index 829730d..ff4f3f8 100644 --- a/examples/prometheus.with-collector.yml +++ b/examples/prometheus.with-collector.yml @@ -14,4 +14,3 @@ scrape_configs: # - action: keep # source_labels: [site_id] # regex: '(site-a|site-b)' - From 3cd7329d8bb83c660ea4f3cd7c48cda9f7c96437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 14:47:49 +0200 Subject: [PATCH 62/72] fix(prometheus): update comment for clarity and consistency in scraping instructions --- examples/prometheus.with-collector.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/prometheus.with-collector.yml b/examples/prometheus.with-collector.yml index ff4f3f8..ddb67ea 100644 --- 
a/examples/prometheus.with-collector.yml +++ b/examples/prometheus.with-collector.yml @@ -2,7 +2,7 @@ global: scrape_interval: 15s scrape_configs: - # WICHTIG: Newt NICHT direkt scrapen, nur den Collector! + # IMPORTANT: Do not scrape Newt directly; scrape only the Collector! - job_name: 'otel-collector' static_configs: - targets: ['otel-collector:8889'] From c32828128f7a7efc7da44b2d3ff49b6f6f4da9c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 14:49:14 +0200 Subject: [PATCH 63/72] fix(readme): enhance clarity and structure of installation and documentation sections --- README.md | 192 ++++++++++++++++++++++++++---------------------------- 1 file changed, 94 insertions(+), 98 deletions(-) diff --git a/README.md b/README.md index 578fe3f..6ed8a8a 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,18 @@ + # Newt + [![PkgGoDev](https://pkg.go.dev/badge/github.com/fosrl/newt)](https://pkg.go.dev/github.com/fosrl/newt) [![GitHub License](https://img.shields.io/github/license/fosrl/newt)](https://github.com/fosrl/newt/blob/main/LICENSE) [![Go Report Card](https://goreportcard.com/badge/github.com/fosrl/newt)](https://goreportcard.com/report/github.com/fosrl/newt) Newt is a fully user space [WireGuard](https://www.wireguard.com/) tunnel client and TCP/UDP proxy, designed to securely expose private resources controlled by Pangolin. By using Newt, you don't need to manage complex WireGuard tunnels and NATing. -### Installation and Documentation +## Installation and Documentation Newt is used with Pangolin and Gerbil as part of the larger system. 
See documentation below: -- [Full Documentation](https://docs.fossorial.io) -- Observability Quickstart: see docs/observability.md (Prometheus/OTel Collector setup, smoke tests) +- [Full Documentation](https://docs.fossorial.io) +- Observability Quickstart: see `docs/observability.md` — canonical Prometheus/OTel Collector quickstart and smoke tests ## Preview @@ -34,63 +36,63 @@ When Newt receives WireGuard control messages, it will use the information encod ## CLI Args -- `id`: Newt ID generated by Pangolin to identify the client. -- `secret`: A unique secret (not shared and kept private) used to authenticate the client ID with the websocket in order to receive commands. -- `endpoint`: The endpoint where both Gerbil and Pangolin reside in order to connect to the websocket. +- `id`: Newt ID generated by Pangolin to identify the client. +- `secret`: A unique secret (not shared and kept private) used to authenticate the client ID with the websocket in order to receive commands. +- `endpoint`: The endpoint where both Gerbil and Pangolin reside in order to connect to the websocket. -- `mtu` (optional): MTU for the internal WG interface. Default: 1280 -- `dns` (optional): DNS server to use to resolve the endpoint. Default: 9.9.9.9 -- `log-level` (optional): The log level to use (DEBUG, INFO, WARN, ERROR, FATAL). Default: INFO -- `enforce-hc-cert` (optional): Enforce certificate validation for health checks. Default: false (accepts any cert) -- `docker-socket` (optional): Set the Docker socket to use the container discovery integration -- `ping-interval` (optional): Interval for pinging the server. Default: 3s -- `ping-timeout` (optional): Timeout for each ping. Default: 5s -- `updown` (optional): A script to be called when targets are added or removed. -- `tls-client-cert` (optional): Client certificate (p12 or pfx) for mTLS. See [mTLS](#mtls) -- `tls-client-cert` (optional): Path to client certificate (PEM format, optional if using PKCS12). 
See [mTLS](#mtls) -- `tls-client-key` (optional): Path to private key for mTLS (PEM format, optional if using PKCS12) -- `tls-ca-cert` (optional): Path to CA certificate to verify server (PEM format, optional if using PKCS12) -- `docker-enforce-network-validation` (optional): Validate the container target is on the same network as the newt process. Default: false -- `health-file` (optional): Check if connection to WG server (pangolin) is ok. creates a file if ok, removes it if not ok. Can be used with docker healtcheck to restart newt -- `accept-clients` (optional): Enable WireGuard server mode to accept incoming newt client connections. Default: false - - `generateAndSaveKeyTo` (optional): Path to save generated private key - - `native` (optional): Use native WireGuard interface when accepting clients (requires WireGuard kernel module and Linux, must run as root). Default: false (uses userspace netstack) - - `interface` (optional): Name of the WireGuard interface. Default: newt - - `keep-interface` (optional): Keep the WireGuard interface. Default: false +- `mtu` (optional): MTU for the internal WG interface. Default: 1280 +- `dns` (optional): DNS server to use to resolve the endpoint. Default: 9.9.9.9 +- `log-level` (optional): The log level to use (DEBUG, INFO, WARN, ERROR, FATAL). Default: INFO +- `enforce-hc-cert` (optional): Enforce certificate validation for health checks. Default: false (accepts any cert) +- `docker-socket` (optional): Set the Docker socket to use the container discovery integration +- `ping-interval` (optional): Interval for pinging the server. Default: 3s +- `ping-timeout` (optional): Timeout for each ping. Default: 5s +- `updown` (optional): A script to be called when targets are added or removed. +- `tls-client-cert` (optional): Client certificate (p12 or pfx) for mTLS. See [mTLS](#mtls) +- `tls-client-cert` (optional): Path to client certificate (PEM format, optional if using PKCS12). 
See [mTLS](#mtls) +- `tls-client-key` (optional): Path to private key for mTLS (PEM format, optional if using PKCS12) +- `tls-ca-cert` (optional): Path to CA certificate to verify server (PEM format, optional if using PKCS12) +- `docker-enforce-network-validation` (optional): Validate the container target is on the same network as the newt process. Default: false +- `health-file` (optional): Check if connection to WG server (pangolin) is ok. creates a file if ok, removes it if not ok. Can be used with docker healtcheck to restart newt +- `accept-clients` (optional): Enable WireGuard server mode to accept incoming newt client connections. Default: false + - `generateAndSaveKeyTo` (optional): Path to save generated private key + - `native` (optional): Use native WireGuard interface when accepting clients (requires WireGuard kernel module and Linux, must run as root). Default: false (uses userspace netstack) + - `interface` (optional): Name of the WireGuard interface. Default: newt + - `keep-interface` (optional): Keep the WireGuard interface. Default: false ## Environment Variables All CLI arguments can be set using environment variables as an alternative to command line flags. Environment variables are particularly useful when running Newt in containerized environments. -- `PANGOLIN_ENDPOINT`: Endpoint of your pangolin server (equivalent to `--endpoint`) -- `NEWT_ID`: Newt ID generated by Pangolin (equivalent to `--id`) -- `NEWT_SECRET`: Newt secret for authentication (equivalent to `--secret`) -- `MTU`: MTU for the internal WG interface. Default: 1280 (equivalent to `--mtu`) -- `DNS`: DNS server to use to resolve the endpoint. Default: 9.9.9.9 (equivalent to `--dns`) -- `LOG_LEVEL`: Log level (DEBUG, INFO, WARN, ERROR, FATAL). Default: INFO (equivalent to `--log-level`) -- `DOCKER_SOCKET`: Path to Docker socket for container discovery (equivalent to `--docker-socket`) -- `PING_INTERVAL`: Interval for pinging the server. 
Default: 3s (equivalent to `--ping-interval`) -- `PING_TIMEOUT`: Timeout for each ping. Default: 5s (equivalent to `--ping-timeout`) -- `UPDOWN_SCRIPT`: Path to updown script for target add/remove events (equivalent to `--updown`) -- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`) -- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`) -- `TLS_CLIENT_KEY`: Path to private key for mTLS (equivalent to `--tls-client-key`) -- `TLS_CA_CERT`: Path to CA certificate to verify server (equivalent to `--tls-ca-cert`) -- `DOCKER_ENFORCE_NETWORK_VALIDATION`: Validate container targets are on same network. Default: false (equivalent to `--docker-enforce-network-validation`) -- `ENFORCE_HC_CERT`: Enforce certificate validation for health checks. Default: false (equivalent to `--enforce-hc-cert`) -- `HEALTH_FILE`: Path to health file for connection monitoring (equivalent to `--health-file`) -- `ACCEPT_CLIENTS`: Enable WireGuard server mode. Default: false (equivalent to `--accept-clients`) -- `GENERATE_AND_SAVE_KEY_TO`: Path to save generated private key (equivalent to `--generateAndSaveKeyTo`) -- `USE_NATIVE_INTERFACE`: Use native WireGuard interface (Linux only). Default: false (equivalent to `--native`) -- `INTERFACE`: Name of the WireGuard interface. Default: newt (equivalent to `--interface`) -- `KEEP_INTERFACE`: Keep the WireGuard interface after shutdown. Default: false (equivalent to `--keep-interface`) -- `CONFIG_FILE`: Load the config json from this file instead of in the home folder. +- `PANGOLIN_ENDPOINT`: Endpoint of your pangolin server (equivalent to `--endpoint`) +- `NEWT_ID`: Newt ID generated by Pangolin (equivalent to `--id`) +- `NEWT_SECRET`: Newt secret for authentication (equivalent to `--secret`) +- `MTU`: MTU for the internal WG interface. Default: 1280 (equivalent to `--mtu`) +- `DNS`: DNS server to use to resolve the endpoint. 
Default: 9.9.9.9 (equivalent to `--dns`) +- `LOG_LEVEL`: Log level (DEBUG, INFO, WARN, ERROR, FATAL). Default: INFO (equivalent to `--log-level`) +- `DOCKER_SOCKET`: Path to Docker socket for container discovery (equivalent to `--docker-socket`) +- `PING_INTERVAL`: Interval for pinging the server. Default: 3s (equivalent to `--ping-interval`) +- `PING_TIMEOUT`: Timeout for each ping. Default: 5s (equivalent to `--ping-timeout`) +- `UPDOWN_SCRIPT`: Path to updown script for target add/remove events (equivalent to `--updown`) +- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`) +- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`) +- `TLS_CLIENT_KEY`: Path to private key for mTLS (equivalent to `--tls-client-key`) +- `TLS_CA_CERT`: Path to CA certificate to verify server (equivalent to `--tls-ca-cert`) +- `DOCKER_ENFORCE_NETWORK_VALIDATION`: Validate container targets are on same network. Default: false (equivalent to `--docker-enforce-network-validation`) +- `ENFORCE_HC_CERT`: Enforce certificate validation for health checks. Default: false (equivalent to `--enforce-hc-cert`) +- `HEALTH_FILE`: Path to health file for connection monitoring (equivalent to `--health-file`) +- `ACCEPT_CLIENTS`: Enable WireGuard server mode. Default: false (equivalent to `--accept-clients`) +- `GENERATE_AND_SAVE_KEY_TO`: Path to save generated private key (equivalent to `--generateAndSaveKeyTo`) +- `USE_NATIVE_INTERFACE`: Use native WireGuard interface (Linux only). Default: false (equivalent to `--native`) +- `INTERFACE`: Name of the WireGuard interface. Default: newt (equivalent to `--interface`) +- `KEEP_INTERFACE`: Keep the WireGuard interface after shutdown. Default: false (equivalent to `--keep-interface`) +- `CONFIG_FILE`: Load the config json from this file instead of in the home folder. 
## Loading secrets from files -You can use `CONFIG_FILE` to define a location of a config file to store the credentials between runs. +You can use `CONFIG_FILE` to define a location of a config file to store the credentials between runs. -``` +```sh $ cat ~/.config/newt-client/config.json { "id": "spmzu8rbpzj1qq6", @@ -100,26 +102,21 @@ $ cat ~/.config/newt-client/config.json } ``` -This file is also written to when newt first starts up. So you do not need to run every time with --id and secret if you have run it once! +This file is also written to when newt first starts up. So you do not need to run every time with --id and secret if you have run it once! -Default locations: +Default locations: - **macOS**: `~/Library/Application Support/newt-client/config.json` - **Windows**: `%PROGRAMDATA%\newt\newt-client\config.json` - **Linux/Others**: `~/.config/newt-client/config.json` -## Observability Quickstart - -For a quick start with Prometheus scraping and smoke checks, read the step-by-step guide in docs/observability.md. It includes: -- docker-compose.metrics.yml for direct /metrics scraping (recommended) -- docker-compose.metrics.collector.yml for the OTLP → Collector → Prometheus exporter path (no double-scrape) -- scripts/smoke-metrics.sh for basic verification + ## Examples **Note**: When both environment variables and CLI arguments are provided, CLI arguments take precedence. 
-- Example: +- Example: ```bash newt \ @@ -166,16 +163,16 @@ When the `--accept-clients` flag is enabled (or `ACCEPT_CLIENTS=true` environmen In client acceptance mode, Newt: -- **Creates a WireGuard service** that can accept incoming connections from other WireGuard clients -- **Starts a connection testing server** (WGTester) that responds to connectivity checks from remote clients -- **Manages peer configurations** dynamically based on Pangolin's instructions -- **Enables bidirectional communication** between the Newt instance and connected clients +- **Creates a WireGuard service** that can accept incoming connections from other WireGuard clients +- **Starts a connection testing server** (WGTester) that responds to connectivity checks from remote clients +- **Manages peer configurations** dynamically based on Pangolin's instructions +- **Enables bidirectional communication** between the Newt instance and connected clients ### Use Cases -- **Site-to-site connectivity**: Connect multiple locations through a central Newt instance -- **Client access to private networks**: Allow remote clients to access resources behind the Newt instance -- **Development environments**: Provide developers secure access to internal services +- **Site-to-site connectivity**: Connect multiple locations through a central Newt instance +- **Client access to private networks**: Allow remote clients to access resources behind the Newt instance +- **Development environments**: Provide developers secure access to internal services ### Client Tunneling Modes @@ -185,11 +182,11 @@ Newt supports two WireGuard tunneling modes: By default, Newt uses a fully userspace WireGuard implementation using [netstack](https://github.com/WireGuard/wireguard-go/blob/master/tun/netstack/examples/http_server.go). 
This mode: -- **Does not require root privileges** -- **Works on all supported platforms** (Linux, Windows, macOS) -- **Does not require WireGuard kernel module** to be installed -- **Runs entirely in userspace** - no system network interface is created -- **Is containerization-friendly** - works seamlessly in Docker containers +- **Does not require root privileges** +- **Works on all supported platforms** (Linux, Windows, macOS) +- **Does not require WireGuard kernel module** to be installed +- **Runs entirely in userspace** - no system network interface is created +- **Is containerization-friendly** - works seamlessly in Docker containers This is the recommended mode for most deployments, especially containerized environments. @@ -199,11 +196,11 @@ In this mode, TCP and UDP is proxied out of newt from the remote client using TC When using the `--native` flag or setting `USE_NATIVE_INTERFACE=true`, Newt uses the native WireGuard kernel module. This mode: -- **Requires root privileges** to create and manage network interfaces -- **Only works on Linux** with the WireGuard kernel module installed -- **Creates a real network interface** (e.g., `newt0`) on the system -- **May offer better performance** for high-throughput scenarios -- **Requires proper network permissions** and may conflict with existing network configurations +- **Requires root privileges** to create and manage network interfaces +- **Only works on Linux** with the WireGuard kernel module installed +- **Creates a real network interface** (e.g., `newt0`) on the system +- **May offer better performance** for high-throughput scenarios +- **Requires proper network permissions** and may conflict with existing network configurations In this mode it functions like a traditional VPN interface - all data arrives on the interface and you must get it to the destination (or access things locally). 
@@ -235,10 +232,10 @@ services: When client acceptance is enabled: -- **WGTester Server**: Runs on `port + 1` (e.g., if WireGuard uses port 51820, WGTester uses 51821) -- **Connection Testing**: Responds to UDP packets with magic header `0xDEADBEEF` for connectivity verification -- **Dynamic Configuration**: Peer configurations are managed remotely through Pangolin -- **Proxy Integration**: Can work with both userspace (netstack) and native WireGuard modes +- **WGTester Server**: Runs on `port + 1` (e.g., if WireGuard uses port 51820, WGTester uses 51821) +- **Connection Testing**: Responds to UDP packets with magic header `0xDEADBEEF` for connectivity verification +- **Dynamic Configuration**: Peer configurations are managed remotely through Pangolin +- **Proxy Integration**: Can work with both userspace (netstack) and native WireGuard modes **Note**: Client acceptance mode requires coordination with Pangolin for peer management and configuration distribution. @@ -252,24 +249,23 @@ You can specify the Docker socket path using the `--docker-socket` CLI argument Supported values include: -- Local UNIX socket (default): +- Local UNIX socket (default): >You must mount the socket file into the container using a volume, so Newt can access it. `unix:///var/run/docker.sock` -- TCP socket (e.g., via Docker Socket Proxy): +- TCP socket (e.g., via Docker Socket Proxy): `tcp://localhost:2375` -- HTTP/HTTPS endpoints (e.g., remote Docker APIs): +- HTTP/HTTPS endpoints (e.g., remote Docker APIs): `http://your-host:2375` -- SSH connections (experimental, requires SSH setup): +- SSH connections (experimental, requires SSH setup): `ssh://user@host` - ```yaml services: newt: @@ -284,16 +280,17 @@ services: - NEWT_SECRET=nnisrfsdfc7prqsp9ewo1dvtvci50j5uiqotez00dgap0ii2 - DOCKER_SOCKET=unix:///var/run/docker.sock ``` + >If you previously used just a path like `/var/run/docker.sock`, it still works — Newt assumes it is a UNIX socket by default. 
#### Hostnames vs IPs When the Docker Socket Integration is used, depending on the network which Newt is run with, either the hostname (generally considered the container name) or the IP address of the container will be sent to Pangolin. Here are some of the scenarios where IPs or hostname of the container will be utilised: -- **Running in Network Mode 'host'**: IP addresses will be used -- **Running in Network Mode 'bridge'**: IP addresses will be used -- **Running in docker-compose without a network specification**: Docker compose creates a network for the compose by default, hostnames will be used -- **Running on docker-compose with defined network**: Hostnames will be used +- **Running in Network Mode 'host'**: IP addresses will be used +- **Running in Network Mode 'bridge'**: IP addresses will be used +- **Running in docker-compose without a network specification**: Docker compose creates a network for the compose by default, hostnames will be used +- **Running on docker-compose with defined network**: Hostnames will be used ### Docker Enforce Network Validation @@ -329,12 +326,12 @@ Newt supports mutual TLS (mTLS) authentication if the server is configured to re > This is the original method and still supported. 
-* File must contain: +- File must contain: - * Client private key - * Public certificate - * CA certificate -* Encrypted `.p12` files are **not supported** + - Client private key + - Public certificate + - CA certificate +- Encrypted `.p12` files are **not supported** Example: @@ -350,9 +347,9 @@ newt \ You can now provide separate files for: -* `--tls-client-cert`: client certificate (`.crt` or `.pem`) -* `--tls-client-key`: client private key (`.key` or `.pem`) -* `--tls-ca-cert`: CA cert to verify the server +- `--tls-client-cert`: client certificate (`.crt` or `.pem`) +- `--tls-client-key`: client private key (`.key` or `.pem`) +- `--tls-ca-cert`: CA cert to verify the server Example: @@ -366,7 +363,6 @@ newt \ --tls-ca-cert ./ca.crt ``` - ```yaml services: newt: From 8d26de5f4da731181231327cca1135f0ad56773d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 15:29:24 +0200 Subject: [PATCH 64/72] fix(docker-compose): improve comments for clarity on port mapping and collector usage --- docker-compose.metrics.collector.yml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/docker-compose.metrics.collector.yml b/docker-compose.metrics.collector.yml index 8d0d536..040f410 100644 --- a/docker-compose.metrics.collector.yml +++ b/docker-compose.metrics.collector.yml @@ -9,8 +9,19 @@ services: - NEWT_METRICS_OTLP_ENABLED=true # OTLP to the Collector # optional: # - NEWT_METRICS_INCLUDE_TUNNEL_ID=false - # If Newt itself exposes ports, DO NOT map 2112 here - # ports: [] + # When using the Collector pattern, do NOT map the Newt admin/metrics port + # (2112) on the application service. Mapping 2112 here can cause port + # conflicts and may result in duplicated Prometheus scraping (app AND + # collector being scraped for the same metrics). 
Instead either: + # - leave ports unset on the app service (recommended), or + # - map 2112 only on a dedicated metrics/collector service that is + # responsible for exposing metrics to Prometheus. + # Example: do NOT map here + # ports: [] + # Example: map 2112 only on a collector service + # collector: + # ports: + # - "2112:2112" # collector's prometheus exporter (scraped by Prometheus) otel-collector: image: otel/opentelemetry-collector-contrib:latest From b68777e83aaa4834158dfa7791983d4f0cdeb138 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 15:29:45 +0200 Subject: [PATCH 65/72] fix(prometheus): clarify instructions regarding scraping the Collector --- examples/prometheus.with-collector.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/prometheus.with-collector.yml b/examples/prometheus.with-collector.yml index ddb67ea..ca465e3 100644 --- a/examples/prometheus.with-collector.yml +++ b/examples/prometheus.with-collector.yml @@ -7,7 +7,7 @@ scrape_configs: static_configs: - targets: ['otel-collector:8889'] - # optional: Kardinalität begrenzen + # optional: limit metric cardinality relabel_configs: - action: labeldrop regex: 'tunnel_id' From 4ef9737862a307ea4baf80d442da4957a5a29785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 15:29:53 +0200 Subject: [PATCH 66/72] fix(observability): enhance clarity and structure of metrics documentation --- docs/observability.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/observability.md b/docs/observability.md index ae2d0d4..6f71ecb 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -144,7 +144,7 @@ PromQL snippets - Throughput in (5m): ```sh -sum(rate(newt_tunnel_bytes_total{direction="in"}[5m])) +sum(rate(newt_tunnel_bytes_total{direction="ingress"}[5m])) ``` - P95 latency (seconds): From b6f5458ad995a2e8efe332adc614ccce1c578745 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 15:30:06 +0200 Subject: [PATCH 67/72] fix(telemetry): enhance session observation logic for tunnel IDs and site-level aggregation --- internal/telemetry/state_view.go | 36 +++++++++++++++++++++++++------- util.go | 14 +++++++------ 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/internal/telemetry/state_view.go b/internal/telemetry/state_view.go index 1a51452..071a405 100644 --- a/internal/telemetry/state_view.go +++ b/internal/telemetry/state_view.go @@ -71,14 +71,36 @@ func observeLastHeartbeatFor(o metric.Observer, sv StateView, siteID string) { func observeSessionsFor(o metric.Observer, siteID string, any interface{}) { if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok { - for tid, n := range tm.SessionsByTunnel() { - attrs := []attribute.KeyValue{ - attribute.String("site_id", siteID), + sessions := tm.SessionsByTunnel() + // If tunnel_id labels are enabled, preserve existing per-tunnel observations + if ShouldIncludeTunnelID() { + for tid, n := range sessions { + attrs := []attribute.KeyValue{ + attribute.String("site_id", siteID), + } + if tid != "" { + attrs = append(attrs, attribute.String("tunnel_id", tid)) + } + o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attrs...)) } - if ShouldIncludeTunnelID() && tid != "" { - attrs = append(attrs, attribute.String("tunnel_id", tid)) - } - o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attrs...)) + return } + // When tunnel_id is disabled, collapse per-tunnel counts into a single site-level value + var total int64 + for _, n := range sessions { + total += n + } + // If there are no per-tunnel entries, fall back to ActiveSessions() if available + if total == 0 { + if svAny := stateView.Load(); svAny != nil { + if sv, ok := svAny.(StateView); ok { + if n, ok2 := sv.ActiveSessions(siteID); ok2 { + total = n + } + } + } + } + o.ObserveInt64(mTunnelSessions, total, 
metric.WithAttributes(attribute.String("site_id", siteID))) + return } } diff --git a/util.go b/util.go index fa339e8..9f0d268 100644 --- a/util.go +++ b/util.go @@ -225,6 +225,8 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC } } case <-pingStopChan: + // Stop the goroutine when signaled + return } } }() @@ -293,12 +295,12 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien // More lenient threshold for declaring connection lost under load failureThreshold := 4 if consecutiveFailures >= failureThreshold && currentInterval < maxInterval { - if !connectionLost { - connectionLost = true - logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) - if tunnelID != "" { - telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout) - } + if !connectionLost { + connectionLost = true + logger.Warn("Connection to server lost after %d failures. 
Continuous reconnection attempts will be made.", consecutiveFailures) + if tunnelID != "" { + telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout) + } stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second) // Send registration message to the server for backward compatibility err := client.SendMessage("newt/wg/register", map[string]interface{}{ From 1a9f6c46852e28c319fbaf1170757bb6169eec4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 15:34:00 +0200 Subject: [PATCH 68/72] fix(github-actions): add permissions section for content read access in workflows --- .github/workflows/cicd.yml | 3 +++ .github/workflows/test.yml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 2364f89..7c463f5 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -1,5 +1,8 @@ name: CI/CD Pipeline +permissions: + contents: read + on: push: tags: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 79143df..8fba9ae 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,5 +1,8 @@ name: Run Tests +permissions: + contents: read + on: pull_request: branches: From 52e4a57cc1060acd12226696ab27003f4d6a5b4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 18:17:59 +0200 Subject: [PATCH 69/72] Enhance telemetry metrics and context propagation --- docs/otel-review.md | 64 +++++++++++++++++ internal/telemetry/metrics.go | 65 ++++++++++++++++- main.go | 25 ++++--- proxy/manager.go | 103 ++++++++++++++++++--------- websocket/client.go | 130 ++++++++++++++++++++++++++-------- wg/wg.go | 24 +++++-- 6 files changed, 330 insertions(+), 81 deletions(-) create mode 100644 docs/otel-review.md diff --git a/docs/otel-review.md b/docs/otel-review.md new file mode 100644 index 0000000..14cfb53 --- /dev/null +++ 
b/docs/otel-review.md @@ -0,0 +1,64 @@ +# OpenTelemetry Review + +## Metric inventory +The table below lists every instrument registered by `internal/telemetry/metrics.go`, the helper that emits it, and an example time-series. Attribute sets automatically add `site_id` (and optionally `region`) via `attrsWithSite` unless the observable callback overrides them. 【F:internal/telemetry/metrics.go†L23-L205】【F:internal/telemetry/metrics.go†L289-L403】 + +| Metric | Instrument & unit | Purpose | Emission path | Example series | +| --- | --- | --- | --- | --- | +| `newt_site_registrations_total` | Counter | Counts Pangolin registration attempts keyed by result (`success`, `failure`). | `telemetry.IncSiteRegistration` (called after registration completes). | `newt_site_registrations_total{result="success",site_id="abc"} 1` | +| `newt_site_online` | Observable gauge | 0/1 heartbeat for the active site, driven by the registered `StateView`. | `telemetry.SetObservableCallback` via `state.TelemetryView`. | `newt_site_online{site_id="self"} 1` | +| `newt_site_last_heartbeat_seconds` | Observable gauge | Seconds since the last Pangolin heartbeat. | Same callback as above using `state.TelemetryView.TouchHeartbeat`. | `newt_site_last_heartbeat_seconds{site_id="self"} 3.2` | +| `newt_tunnel_sessions` | Observable gauge | Active sessions per tunnel; collapses to site total when `tunnel_id` emission is disabled. | `state.TelemetryView.SessionsByTunnel` via `RegisterStateView`. | `newt_tunnel_sessions{site_id="self",tunnel_id="wgpub"} 2` | +| `newt_tunnel_bytes_total` | Counter (`By`) | Traffic accounting per tunnel, direction (`ingress`/`egress`), protocol (`tcp`/`udp`). | Proxy manager counting writers (`AddTunnelBytes`/`AddTunnelBytesSet`). | `newt_tunnel_bytes_total{direction="egress",protocol="tcp",site_id="self",tunnel_id="wgpub"} 8192` | +| `newt_tunnel_latency_seconds` | Histogram (`s`) | RTT samples from WireGuard stack and health pings per tunnel/transport. 
| `telemetry.ObserveTunnelLatency` from tunnel health checks. | `newt_tunnel_latency_seconds_bucket{transport="wireguard",le="0.05",tunnel_id="wgpub"} 4` | +| `newt_tunnel_reconnects_total` | Counter | Reconnect attempts bucketed by initiator (`client`/`server`) and reason enums. | `telemetry.IncReconnect` across websocket, WG, and utility flows. | `newt_tunnel_reconnects_total{initiator="client",reason="timeout",tunnel_id="wgpub"} 3` | +| `newt_connection_attempts_total` | Counter | Auth and WebSocket attempt counts by transport (`auth`, `websocket`) and result (`success`/`failure`). | `telemetry.IncConnAttempt` in auth/token and dial paths. | `newt_connection_attempts_total{transport="websocket",result="failure",site_id="self"} 2` | +| `newt_connection_errors_total` | Counter | Connection error tally keyed by transport and canonical error type (`dial_timeout`, `tls_handshake`, `auth_failed`, `io_error`). | `telemetry.IncConnError` in auth/websocket flows. | `newt_connection_errors_total{transport="auth",error_type="auth_failed",site_id="self"} 1` | +| `newt_config_reloads_total` | Counter | Successful/failed config reload attempts. | `telemetry.IncConfigReload` during WireGuard config reloads. | `newt_config_reloads_total{result="success",site_id="self"} 1` | +| `newt_restart_count_total` | Counter | Bumps to 1 at process boot for build info scrapers. | `telemetry.RegisterBuildInfo` called from `Init`. | `newt_restart_count_total{site_id="self"} 1` | +| `newt_config_apply_seconds` | Histogram (`s`) | Measures interface/peer apply duration per phase and result. | `telemetry.ObserveConfigApply` around config updates. | `newt_config_apply_seconds_bucket{phase="peer",result="success",le="0.1"} 5` | +| `newt_cert_rotation_total` | Counter | Certificate rotation events tagged by result. | `telemetry.IncCertRotation` during PKI updates. 
| `newt_cert_rotation_total{result="success",site_id="self"} 1` | +| `newt_build_info` | Observable gauge | Constant 1 with `version`/`commit` attributes to expose build metadata. | Callback registered in `registerBuildWSProxyInstruments`. | `newt_build_info{version="1.2.3",commit="abc123",site_id="self"} 1` | +| `newt_websocket_connect_latency_seconds` | Histogram (`s`) | Dial latency for Pangolin WebSocket connects annotated with result/error_type. | `telemetry.ObserveWSConnectLatency` inside `Client.establishConnection`. | `newt_websocket_connect_latency_seconds_bucket{result="success",transport="websocket",le="0.5"} 1` | +| `newt_websocket_messages_total` | Counter | Counts inbound/outbound WebSocket messages by direction and logical message type. | `telemetry.IncWSMessage` for ping/pong/text events. | `newt_websocket_messages_total{direction="out",msg_type="ping",site_id="self"} 4` | +| `newt_websocket_disconnects_total` | Counter | Tracks WebSocket disconnects grouped by `reason` (`shutdown`, `unexpected_close`, etc.) and `result`. | Emitted from `Client.readPumpWithDisconnectDetection` defer block. | `newt_websocket_disconnects_total{reason="unexpected_close",result="error",site_id="self"} 1` | +| `newt_websocket_keepalive_failures_total` | Counter | Failed WebSocket ping/pong keepalive attempts by reason. | Incremented in `Client.pingMonitor` when `WriteControl` fails. | `newt_websocket_keepalive_failures_total{reason="ping_write",site_id="self"} 1` | +| `newt_websocket_session_duration_seconds` | Histogram (`s`) | Duration of WebSocket sessions by outcome (`result`). | Observed when the read pump exits. | `newt_websocket_session_duration_seconds_sum{result="success",site_id="self"} 120` | +| `newt_proxy_active_connections` | Observable gauge | Active TCP/UDP proxy connections per tunnel and protocol. | Proxy manager callback via `SetProxyObservableCallback`. 
| `newt_proxy_active_connections{protocol="tcp",tunnel_id="wgpub"} 3` | +| `newt_proxy_buffer_bytes` | Observable gauge (`By`) | Size of proxy buffer pools (synchronous path) per tunnel/protocol. | Same proxy callback as above. | `newt_proxy_buffer_bytes{protocol="tcp",tunnel_id="wgpub"} 10240` | +| `newt_proxy_async_backlog_bytes` | Observable gauge (`By`) | Unflushed async byte backlog when deferred accounting is enabled. | Proxy callback when async accounting is turned on. | `newt_proxy_async_backlog_bytes{protocol="udp",tunnel_id="wgpub"} 4096` | +| `newt_proxy_drops_total` | Counter | Proxy write-drop events per protocol/tunnel. | `telemetry.IncProxyDrops` on UDP drop paths. | `newt_proxy_drops_total{protocol="udp",tunnel_id="wgpub"} 2` | +| `newt_proxy_accept_total` | Counter | Proxy accept attempts labelled by protocol, result, and reason. | `telemetry.IncProxyAccept` in TCP accept loop and UDP dial paths. | `newt_proxy_accept_total{protocol="tcp",result="failure",reason="timeout",site_id="self"} 1` | +| `newt_proxy_connection_duration_seconds` | Histogram (`s`) | Lifecycle duration for proxied TCP/UDP connections by result. | `telemetry.ObserveProxyConnectionDuration` when TCP/UDP handlers complete. | `newt_proxy_connection_duration_seconds_sum{protocol="udp",result="success",site_id="self"} 30` | + +In addition, Go runtime metrics are automatically exported when telemetry is initialised. 【F:internal/telemetry/telemetry.go†L147-L155】 + +## Tracing footprint +* Tracing is enabled only when OTLP export is turned on; `telemetry.Init` wires a batch `TracerProvider` and sets it globally. 【F:internal/telemetry/telemetry.go†L135-L155】 +* The admin HTTP mux (`/metrics`, `/healthz`) is wrapped with `otelhttp.NewHandler`, so any inbound admin requests produce spans. 
【F:main.go†L373-L387】 +* WebSocket dials create a `ws.connect` span around the outbound handshake, but subsequent control-plane HTTP requests (token fetch, blueprint sync) use plain `http.Client` without propagation. 【F:websocket/client.go†L417-L459】 + +Overall span coverage is limited to the WebSocket connect loop and admin server; tunnel setup, Docker discovery, config application, and health pings currently emit only metrics. + +## Guideline & best-practice adherence +* **Resource & exporter configuration:** `telemetry.FromEnv` honours OTEL env-vars, sets service name/version, and promotes `site_id`/`region` resource attributes before building the provider. Exporters default to Prometheus with optional OTLP, aligning with OTel deployment guidance. 【F:internal/telemetry/telemetry.go†L56-L206】 +* **Low-cardinality enforcement:** A view-level attribute allow-list retains only approved keys (`tunnel_id`, `transport`, `protocol`, etc.), protecting Prometheus cardinality while still surfacing `site_id`/`region`. 【F:internal/telemetry/telemetry.go†L209-L231】 +* **Units and naming:** Instrument helpers enforce `_total` suffixes for counters, `_seconds` for durations, and attach `metric.WithUnit("By"|"s")` for size/time metrics, matching OTel semantic conventions. 【F:internal/telemetry/metrics.go†L23-L192】 +* **Runtime metrics & shutdown:** The runtime instrumentation is enabled, and `Setup.Shutdown` drains exporters in reverse order to avoid data loss. 【F:internal/telemetry/telemetry.go†L147-L261】 +* **Site-aware observables:** `state.TelemetryView` provides thread-safe snapshots to feed `newt_site_online`/`_last_heartbeat_seconds`/`_tunnel_sessions`, ensuring gauges report cohesive per-site data even when `tunnel_id` labels are disabled. 【F:internal/state/telemetry_view.go†L11-L79】 + +## Gaps & recommended improvements +1. 
**Tracing coverage:** Instrument the Pangolin REST calls (`getToken`, blueprint downloads) with `otelhttp.NewTransport` or explicit spans, and consider spans for WireGuard handshake/config apply to enable end-to-end traces when OTLP is on. 【F:websocket/client.go†L240-L360】 +2. **Histogram coverage:** Introduce `newt_site_registration_latency_seconds` (bootstrap) and `newt_ping_roundtrip_seconds` (heartbeat) to capture SLO-critical latencies before release. Existing latency buckets (`0.005s` → `30s`) can be reused. 【F:internal/telemetry/telemetry.go†L209-L218】 +3. **Control-plane throughput:** Add `newt_websocket_payload_bytes_total` (direction/msg_type) or reuse the tunnel counter with a `transport="websocket"` label to quantify command traffic volume and detect back-pressure. +4. **Docker discovery metrics:** If Docker auto-discovery is enabled, expose counters for container add/remove events and failures so operators can trace missing backends to discovery issues. + +## Pre-release metric backlog +Prior to GA, we recommend landing the following high-value instruments: +* **Bootstrap latency:** `newt_site_registration_latency_seconds` histogram emitted around the initial Pangolin registration HTTP call to detect slow control-plane responses. +* **Session duration:** `newt_websocket_session_duration_seconds` histogram recorded when a WebSocket closes (result + reason) to quantify stability. +* **Heartbeat lag:** `newt_ping_roundtrip_seconds` histogram from ping/pong monitors to capture tunnel health, complementing the heartbeat gauge. +* **Proxy accept errors:** `newt_proxy_accept_errors_total` counter keyed by protocol/reason to surface listener pressure distinct from data-plane drops. +* **Discovery events:** `newt_discovery_events_total` counter with `action` (`add`, `remove`, `error`) and `source` (`docker`, `file`) to audit service inventory churn. 
+ +Implementing the above will round out visibility into control-plane responsiveness, connection stability, and discovery health while preserving the existing low-cardinality discipline. diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index 1d11927..5403e43 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -45,14 +45,19 @@ var ( mBuildInfo metric.Int64ObservableGauge // WebSocket - mWSConnectLatency metric.Float64Histogram - mWSMessages metric.Int64Counter + mWSConnectLatency metric.Float64Histogram + mWSMessages metric.Int64Counter + mWSDisconnects metric.Int64Counter + mWSKeepaliveFailure metric.Int64Counter + mWSSessionDuration metric.Float64Histogram // Proxy mProxyActiveConns metric.Int64ObservableGauge mProxyBufferBytes metric.Int64ObservableGauge mProxyAsyncBacklogByte metric.Int64ObservableGauge mProxyDropsTotal metric.Int64Counter + mProxyAcceptsTotal metric.Int64Counter + mProxyConnDuration metric.Float64Histogram buildVersion string buildCommit string @@ -179,6 +184,13 @@ func registerBuildWSProxyInstruments() error { metric.WithUnit("s")) mWSMessages, _ = meter.Int64Counter("newt_websocket_messages_total", metric.WithDescription("WebSocket messages by direction and type")) + mWSDisconnects, _ = meter.Int64Counter("newt_websocket_disconnects_total", + metric.WithDescription("WebSocket disconnects by reason/result")) + mWSKeepaliveFailure, _ = meter.Int64Counter("newt_websocket_keepalive_failures_total", + metric.WithDescription("WebSocket keepalive (ping/pong) failures")) + mWSSessionDuration, _ = meter.Float64Histogram("newt_websocket_session_duration_seconds", + metric.WithDescription("Duration of established WebSocket sessions"), + metric.WithUnit("s")) // Proxy mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections", metric.WithDescription("Proxy active connections per tunnel and protocol")) @@ -190,6 +202,11 @@ func registerBuildWSProxyInstruments() error { 
metric.WithUnit("By")) mProxyDropsTotal, _ = meter.Int64Counter("newt_proxy_drops_total", metric.WithDescription("Proxy drops due to write errors")) + mProxyAcceptsTotal, _ = meter.Int64Counter("newt_proxy_accept_total", + metric.WithDescription("Proxy connection accepts by protocol and result")) + mProxyConnDuration, _ = meter.Float64Histogram("newt_proxy_connection_duration_seconds", + metric.WithDescription("Duration of completed proxy connections"), + metric.WithUnit("s")) // Register a default callback for build info if version/commit set reg, e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { if buildVersion == "" && buildCommit == "" { @@ -328,6 +345,25 @@ func IncWSMessage(ctx context.Context, direction, msgType string) { )...)) } +func IncWSDisconnect(ctx context.Context, reason, result string) { + mWSDisconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("reason", reason), + attribute.String("result", result), + )...)) +} + +func IncWSKeepaliveFailure(ctx context.Context, reason string) { + mWSKeepaliveFailure.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("reason", reason), + )...)) +} + +func ObserveWSSessionDuration(ctx context.Context, seconds float64, result string) { + mWSSessionDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite( + attribute.String("result", result), + )...)) +} + // --- Proxy helpers --- func ObserveProxyActiveConnsObs(o metric.Observer, value int64, attrs []attribute.KeyValue) { @@ -352,6 +388,31 @@ func IncProxyDrops(ctx context.Context, tunnelID, protocol string) { mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) } +func IncProxyAccept(ctx context.Context, tunnelID, protocol, result, reason string) { + attrs := []attribute.KeyValue{ + attribute.String("protocol", protocol), + attribute.String("result", result), + } + if reason != "" { + attrs = append(attrs, attribute.String("reason", reason)) + } + if 
ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mProxyAcceptsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + +func ObserveProxyConnectionDuration(ctx context.Context, tunnelID, protocol, result string, seconds float64) { + attrs := []attribute.KeyValue{ + attribute.String("protocol", protocol), + attribute.String("result", result), + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mProxyConnDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + // --- Config/PKI helpers --- func ObserveConfigApply(ctx context.Context, phase, result string, seconds float64) { diff --git a/main.go b/main.go index 959b906..83f7524 100644 --- a/main.go +++ b/main.go @@ -586,6 +586,10 @@ func main() { // Register handlers for different message types client.RegisterHandler("newt/wg/connect", func(msg websocket.WSMessage) { logger.Info("Received registration message") + regResult := "success" + defer func() { + telemetry.IncSiteRegistration(ctx, regResult) + }() if stopFunc != nil { stopFunc() // stop the ws from sending more requests stopFunc = nil // reset stopFunc to nil to avoid double stopping @@ -605,11 +609,13 @@ func main() { jsonData, err := json.Marshal(msg.Data) if err != nil { logger.Info(fmtErrMarshaling, err) + regResult = "failure" return } if err := json.Unmarshal(jsonData, &wgData); err != nil { logger.Info("Error unmarshaling target data: %v", err) + regResult = "failure" return } @@ -620,6 +626,7 @@ func main() { mtuInt) if err != nil { logger.Error("Failed to create TUN device: %v", err) + regResult = "failure" } setDownstreamTNetstack(tnet) @@ -633,6 +640,7 @@ func main() { host, _, err := net.SplitHostPort(wgData.Endpoint) if err != nil { logger.Error("Failed to split endpoint: %v", err) + regResult = "failure" return } @@ -641,6 +649,7 @@ func main() { endpoint, err := 
resolveDomain(wgData.Endpoint) if err != nil { logger.Error("Failed to resolve endpoint: %v", err) + regResult = "failure" return } @@ -656,12 +665,14 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub err = dev.IpcSet(config) if err != nil { logger.Error("Failed to configure WireGuard device: %v", err) + regResult = "failure" } // Bring up the device err = dev.Up() if err != nil { logger.Error("Failed to bring up WireGuard device: %v", err) + regResult = "failure" } logger.Debug("WireGuard device created. Lets ping the server now...") @@ -676,10 +687,11 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub logger.Debug("Testing initial connection with reliable ping...") lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5) if err == nil && wgData.PublicKey != "" { - telemetry.ObserveTunnelLatency(context.Background(), wgData.PublicKey, "wireguard", lat.Seconds()) + telemetry.ObserveTunnelLatency(ctx, wgData.PublicKey, "wireguard", lat.Seconds()) } if err != nil { logger.Warn("Initial reliable ping failed, but continuing: %v", err) + regResult = "failure" } else { logger.Info("Initial connection test successful") } @@ -701,9 +713,6 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub connected = true - // telemetry: record a successful site registration (omit region unless available) - telemetry.IncSiteRegistration(context.Background(), "success") - // add the targets if there are any if len(wgData.Targets.TCP) > 0 { updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP}) @@ -738,7 +747,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) { logger.Info("Received reconnect message") if wgData.PublicKey != "" { - telemetry.IncReconnect(context.Background(), wgData.PublicKey, "server", telemetry.ReasonServerRequest) + 
telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest) } // Close the WireGuard device and TUN @@ -767,7 +776,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) { logger.Info("Received termination message") if wgData.PublicKey != "" { - telemetry.IncReconnect(context.Background(), wgData.PublicKey, "server", telemetry.ReasonServerRequest) + telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest) } // Close the WireGuard device and TUN @@ -837,7 +846,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub }, } -stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{ + stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{ "publicKey": publicKey.String(), "pingResults": pingResults, "newtVersion": newtVersion, @@ -940,7 +949,7 @@ stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{ } // Send the ping results to the cloud for selection -stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{ + stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{ "publicKey": publicKey.String(), "pingResults": pingResults, "newtVersion": newtVersion, diff --git a/proxy/manager.go b/proxy/manager.go index ceaa12b..31e7788 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -2,6 +2,7 @@ package proxy import ( "context" + "errors" "fmt" "io" "net" @@ -97,6 +98,32 @@ func (cw *countingWriter) Write(p []byte) (int, error) { return n, err } +func classifyProxyError(err error) string { + if err == nil { + return "" + } + if errors.Is(err, net.ErrClosed) { + return "closed" + } + if ne, ok := err.(net.Error); ok { + if ne.Timeout() { + return "timeout" + } + if ne.Temporary() { + return "temporary" + } + } + msg := strings.ToLower(err.Error()) + switch { + case 
strings.Contains(msg, "refused"): + return "refused" + case strings.Contains(msg, "reset"): + return "reset" + default: + return "io_error" + } +} + // NewProxyManager creates a new proxy manager instance func NewProxyManager(tnet *netstack.Net) *ProxyManager { return &ProxyManager{ @@ -467,72 +494,69 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) for { conn, err := listener.Accept() if err != nil { - // Check if we're shutting down or the listener was closed + telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "tcp", "failure", classifyProxyError(err)) if !pm.running { return } - - // Check for specific network errors that indicate the listener is closed if ne, ok := err.(net.Error); ok && !ne.Temporary() { logger.Info("TCP listener closed, stopping proxy handler for %v", listener.Addr()) return } - logger.Error("Error accepting TCP connection: %v", err) - // Don't hammer the CPU if we hit a temporary error time.Sleep(100 * time.Millisecond) continue } - // Count sessions only once per accepted TCP connection - if pm.currentTunnelID != "" { - state.Global().IncSessions(pm.currentTunnelID) - if e := pm.getEntry(pm.currentTunnelID); e != nil { + tunnelID := pm.currentTunnelID + telemetry.IncProxyAccept(context.Background(), tunnelID, "tcp", "success", "") + if tunnelID != "" { + state.Global().IncSessions(tunnelID) + if e := pm.getEntry(tunnelID); e != nil { e.activeTCP.Add(1) } } - go func() { + go func(tunnelID string, accepted net.Conn) { + connStart := time.Now() target, err := net.Dial("tcp", targetAddr) if err != nil { logger.Error("Error connecting to target: %v", err) - conn.Close() + accepted.Close() + telemetry.IncProxyAccept(context.Background(), tunnelID, "tcp", "failure", classifyProxyError(err)) + telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "tcp", "failure", time.Since(connStart).Seconds()) return } - // already incremented on accept - - // Create a WaitGroup to ensure 
both copy operations complete + entry := pm.getEntry(tunnelID) + if entry == nil { + entry = &tunnelEntry{} + } var wg sync.WaitGroup wg.Add(2) - // client -> target (direction=in) - go func() { + go func(ent *tunnelEntry) { defer wg.Done() - e := pm.getEntry(pm.currentTunnelID) - cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"} - _, _ = io.Copy(cw, conn) + cw := &countingWriter{ctx: context.Background(), w: target, set: ent.attrInTCP, pm: pm, ent: ent, out: false, proto: "tcp"} + _, _ = io.Copy(cw, accepted) _ = target.Close() - }() + }(entry) - // target -> client (direction=out) - go func() { + go func(ent *tunnelEntry) { defer wg.Done() - e := pm.getEntry(pm.currentTunnelID) - cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"} + cw := &countingWriter{ctx: context.Background(), w: accepted, set: ent.attrOutTCP, pm: pm, ent: ent, out: true, proto: "tcp"} _, _ = io.Copy(cw, target) - _ = conn.Close() - }() + _ = accepted.Close() + }(entry) - // Wait for both copies to complete then session -1 wg.Wait() - if pm.currentTunnelID != "" { - state.Global().DecSessions(pm.currentTunnelID) - if e := pm.getEntry(pm.currentTunnelID); e != nil { + if tunnelID != "" { + state.Global().DecSessions(tunnelID) + if e := pm.getEntry(tunnelID); e != nil { e.activeTCP.Add(-1) } } - }() + telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "tcp", "success", time.Since(connStart).Seconds()) + }(tunnelID, conn) } } @@ -595,16 +619,20 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { targetUDPAddr, err := net.ResolveUDPAddr("udp", targetAddr) if err != nil { logger.Error("Error resolving target address: %v", err) + telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "udp", "failure", "resolve") continue } targetConn, err = net.DialUDP("udp", nil, targetUDPAddr) if err != nil { 
logger.Error("Error connecting to target: %v", err) + telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "udp", "failure", classifyProxyError(err)) continue } + tunnelID := pm.currentTunnelID + telemetry.IncProxyAccept(context.Background(), tunnelID, "udp", "success", "") // Only increment activeUDP after a successful DialUDP - if e := pm.getEntry(pm.currentTunnelID); e != nil { + if e := pm.getEntry(tunnelID); e != nil { e.activeUDP.Add(1) } @@ -612,18 +640,21 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { clientConns[clientKey] = targetConn clientsMutex.Unlock() - go func(clientKey string, targetConn *net.UDPConn, remoteAddr net.Addr) { + go func(clientKey string, targetConn *net.UDPConn, remoteAddr net.Addr, tunnelID string) { + start := time.Now() + result := "success" defer func() { // Always clean up when this goroutine exits clientsMutex.Lock() if storedConn, exists := clientConns[clientKey]; exists && storedConn == targetConn { delete(clientConns, clientKey) targetConn.Close() - if e := pm.getEntry(pm.currentTunnelID); e != nil { + if e := pm.getEntry(tunnelID); e != nil { e.activeUDP.Add(-1) } } clientsMutex.Unlock() + telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "udp", result, time.Since(start).Seconds()) }() buffer := make([]byte, 65507) @@ -631,6 +662,7 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { n, _, err := targetConn.ReadFromUDP(buffer) if err != nil { logger.Error("Error reading from target: %v", err) + result = "failure" return // defer will handle cleanup } @@ -651,10 +683,11 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { if err != nil { logger.Error("Error writing to client: %v", err) telemetry.IncProxyDrops(context.Background(), pm.currentTunnelID, "udp") + result = "failure" return // defer will handle cleanup } } - }(clientKey, targetConn, remoteAddr) + }(clientKey, targetConn, 
remoteAddr, tunnelID) } written, err := targetConn.Write(buffer[:n]) diff --git a/websocket/client.go b/websocket/client.go index ee6c2e6..ba39202 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -7,6 +7,7 @@ import ( "encoding/json" "fmt" "io" + "net" "net/http" "net/url" "os" @@ -42,6 +43,8 @@ type Client struct { writeMux sync.Mutex clientType string // Type of client (e.g., "newt", "olm") tlsConfig TLSConfig + metricsCtxMu sync.RWMutex + metricsCtx context.Context } type ClientOption func(*Client) @@ -85,6 +88,26 @@ func (c *Client) OnTokenUpdate(callback func(token string)) { c.onTokenUpdate = callback } +func (c *Client) metricsContext() context.Context { + c.metricsCtxMu.RLock() + defer c.metricsCtxMu.RUnlock() + if c.metricsCtx != nil { + return c.metricsCtx + } + return context.Background() +} + +func (c *Client) setMetricsContext(ctx context.Context) { + c.metricsCtxMu.Lock() + c.metricsCtx = ctx + c.metricsCtxMu.Unlock() +} + +// MetricsContext exposes the context used for telemetry emission when a connection is active. 
+func (c *Client) MetricsContext() context.Context { + return c.metricsContext() +} + // NewClient creates a new websocket client func NewClient(clientType string, ID, secret string, endpoint string, pingInterval time.Duration, pingTimeout time.Duration, opts ...ClientOption) (*Client, error) { config := &Config{ @@ -177,7 +200,7 @@ func (c *Client) SendMessage(messageType string, data interface{}) error { if err := c.conn.WriteJSON(msg); err != nil { return err } - telemetry.IncWSMessage(context.Background(), "out", "text") + telemetry.IncWSMessage(c.metricsContext(), "out", "text") return nil } @@ -273,8 +296,12 @@ func (c *Client) getToken() (string, error) { return "", fmt.Errorf("failed to marshal token request data: %w", err) } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + // Create a new request - req, err := http.NewRequest( + req, err := http.NewRequestWithContext( + ctx, "POST", baseEndpoint+"/api/v1/auth/"+c.clientType+"/get-token", bytes.NewBuffer(jsonData), @@ -296,7 +323,8 @@ func (c *Client) getToken() (string, error) { } resp, err := client.Do(req) if err != nil { - telemetry.IncConnError(context.Background(), "auth", classifyConnError(err)) + telemetry.IncConnAttempt(ctx, "auth", "failure") + telemetry.IncConnError(ctx, "auth", classifyConnError(err)) return "", fmt.Errorf("failed to request new token: %w", err) } defer resp.Body.Close() @@ -304,15 +332,15 @@ func (c *Client) getToken() (string, error) { if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) - telemetry.IncConnAttempt(context.Background(), "auth", "failure") + telemetry.IncConnAttempt(ctx, "auth", "failure") etype := "io_error" if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { etype = "auth_failed" } - telemetry.IncConnError(context.Background(), "auth", etype) + 
telemetry.IncConnError(ctx, "auth", etype) // Reconnect reason mapping for auth failures if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { - telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonAuthError) + telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonAuthError) } return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body)) } @@ -332,7 +360,7 @@ func (c *Client) getToken() (string, error) { } logger.Debug("Received token: %s", tokenResp.Data.Token) - telemetry.IncConnAttempt(context.Background(), "auth", "success") + telemetry.IncConnAttempt(ctx, "auth", "success") return tokenResp.Data.Token, nil } @@ -357,6 +385,30 @@ func classifyConnError(err error) string { } } +func classifyWSDisconnect(err error) (result, reason string) { + if err == nil { + return "success", "normal" + } + if websocket.IsCloseError(err, websocket.CloseNormalClosure) { + return "success", "normal" + } + if ne, ok := err.(net.Error); ok && ne.Timeout() { + return "error", "timeout" + } + if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) { + return "error", "unexpected_close" + } + msg := strings.ToLower(err.Error()) + switch { + case strings.Contains(msg, "eof"): + return "error", "eof" + case strings.Contains(msg, "reset"): + return "error", "connection_reset" + default: + return "error", "read_error" + } +} + func (c *Client) connectWithRetry() { for { select { @@ -375,13 +427,13 @@ func (c *Client) connectWithRetry() { } func (c *Client) establishConnection() error { + ctx := context.Background() + // Get token for authentication token, err := c.getToken() if err != nil { - // telemetry: connection attempt failed before dialing - // site_id isn't globally available here; use client ID as site_id (low cardinality) - telemetry.IncConnAttempt(context.Background(), "websocket", "failure") - 
telemetry.IncConnError(context.Background(), "websocket", classifyConnError(err)) + telemetry.IncConnAttempt(ctx, "websocket", "failure") + telemetry.IncConnError(ctx, "websocket", classifyConnError(err)) return fmt.Errorf("failed to get token: %w", err) } @@ -416,7 +468,7 @@ func (c *Client) establishConnection() error { // Connect to WebSocket (optional span) tr := otel.Tracer("newt") - spanCtx, span := tr.Start(context.Background(), "ws.connect") + ctx, span := tr.Start(ctx, "ws.connect") defer span.End() start := time.Now() @@ -441,38 +493,40 @@ func (c *Client) establishConnection() error { logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable") } - conn, _, err := dialer.DialContext(spanCtx, u.String(), nil) + conn, _, err := dialer.DialContext(ctx, u.String(), nil) lat := time.Since(start).Seconds() if err != nil { - telemetry.IncConnAttempt(context.Background(), "websocket", "failure") + telemetry.IncConnAttempt(ctx, "websocket", "failure") etype := classifyConnError(err) - telemetry.IncConnError(context.Background(), "websocket", etype) - telemetry.ObserveWSConnectLatency(context.Background(), lat, "failure", etype) + telemetry.IncConnError(ctx, "websocket", etype) + telemetry.ObserveWSConnectLatency(ctx, lat, "failure", etype) // Map handshake-related errors to reconnect reasons where appropriate if etype == "tls_handshake" { - telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonHandshakeError) + telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonHandshakeError) } else if etype == "dial_timeout" { - telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonTimeout) + telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonTimeout) } else { - telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonError) + telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonError) } return 
fmt.Errorf("failed to connect to WebSocket: %w", err) } - telemetry.IncConnAttempt(context.Background(), "websocket", "success") - telemetry.ObserveWSConnectLatency(context.Background(), lat, "success", "") + telemetry.IncConnAttempt(ctx, "websocket", "success") + telemetry.ObserveWSConnectLatency(ctx, lat, "success", "") c.conn = conn c.setConnected(true) + c.setMetricsContext(ctx) + sessionStart := time.Now() // Wire up pong handler for metrics c.conn.SetPongHandler(func(appData string) error { - telemetry.IncWSMessage(context.Background(), "in", "pong") + telemetry.IncWSMessage(c.metricsContext(), "in", "pong") return nil }) // Start the ping monitor go c.pingMonitor() // Start the read pump with disconnect detection - go c.readPumpWithDisconnectDetection() + go c.readPumpWithDisconnectDetection(sessionStart) if c.onConnect != nil { err := c.saveConfig() @@ -566,7 +620,7 @@ func (c *Client) pingMonitor() { c.writeMux.Lock() err := c.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(c.pingTimeout)) if err == nil { - telemetry.IncWSMessage(context.Background(), "out", "ping") + telemetry.IncWSMessage(c.metricsContext(), "out", "ping") } c.writeMux.Unlock() if err != nil { @@ -577,6 +631,7 @@ func (c *Client) pingMonitor() { return default: logger.Error("Ping failed: %v", err) + telemetry.IncWSKeepaliveFailure(c.metricsContext(), "ping_write") c.reconnect() return } @@ -586,11 +641,19 @@ func (c *Client) pingMonitor() { } // readPumpWithDisconnectDetection reads messages and triggers reconnect on error -func (c *Client) readPumpWithDisconnectDetection() { +func (c *Client) readPumpWithDisconnectDetection(started time.Time) { + ctx := c.metricsContext() + disconnectReason := "shutdown" + disconnectResult := "success" + defer func() { if c.conn != nil { c.conn.Close() } + if !started.IsZero() { + telemetry.ObserveWSSessionDuration(ctx, time.Since(started).Seconds(), disconnectResult) + } + telemetry.IncWSDisconnect(ctx, disconnectReason, 
disconnectResult) // Only attempt reconnect if we're not shutting down select { case <-c.done: @@ -604,12 +667,14 @@ func (c *Client) readPumpWithDisconnectDetection() { for { select { case <-c.done: + disconnectReason = "shutdown" + disconnectResult = "success" return default: var msg WSMessage err := c.conn.ReadJSON(&msg) if err == nil { - telemetry.IncWSMessage(context.Background(), "in", "text") + telemetry.IncWSMessage(c.metricsContext(), "in", "text") } if err != nil { // Check if we're shutting down before logging error @@ -617,13 +682,18 @@ func (c *Client) readPumpWithDisconnectDetection() { case <-c.done: // Expected during shutdown, don't log as error logger.Debug("WebSocket connection closed during shutdown") + disconnectReason = "shutdown" + disconnectResult = "success" return default: // Unexpected error during normal operation - if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure, websocket.CloseNormalClosure) { - logger.Error("WebSocket read error: %v", err) - } else { - logger.Debug("WebSocket connection closed: %v", err) + disconnectResult, disconnectReason = classifyWSDisconnect(err) + if disconnectResult == "error" { + if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure, websocket.CloseNormalClosure) { + logger.Error("WebSocket read error: %v", err) + } else { + logger.Debug("WebSocket connection closed: %v", err) + } } return // triggers reconnect via defer } diff --git a/wg/wg.go b/wg/wg.go index eccd64f..0ab1919 100644 --- a/wg/wg.go +++ b/wg/wg.go @@ -280,6 +280,15 @@ func (s *WireGuardService) LoadRemoteConfig() error { } func (s *WireGuardService) handleConfig(msg websocket.WSMessage) { + ctx := context.Background() + if s.client != nil { + ctx = s.client.MetricsContext() + } + result := "success" + defer func() { + telemetry.IncConfigReload(ctx, result) + }() + var config WgConfig logger.Debug("Received message: %v", msg) @@ -288,11 +297,13 @@ func 
(s *WireGuardService) handleConfig(msg websocket.WSMessage) { jsonData, err := json.Marshal(msg.Data) if err != nil { logger.Info("Error marshaling data: %v", err) + result = "failure" return } if err := json.Unmarshal(jsonData, &config); err != nil { logger.Info("Error unmarshaling target data: %v", err) + result = "failure" return } s.config = config @@ -303,27 +314,28 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) { } // telemetry: config reload success - telemetry.IncConfigReload(context.Background(), "success") // Optional reconnect reason mapping: config change if s.serverPubKey != "" { - telemetry.IncReconnect(context.Background(), s.serverPubKey, "client", telemetry.ReasonConfigChange) + telemetry.IncReconnect(ctx, s.serverPubKey, "client", telemetry.ReasonConfigChange) } // Ensure the WireGuard interface and peers are configured start := time.Now() if err := s.ensureWireguardInterface(config); err != nil { logger.Error("Failed to ensure WireGuard interface: %v", err) - telemetry.ObserveConfigApply(context.Background(), "interface", "failure", time.Since(start).Seconds()) + telemetry.ObserveConfigApply(ctx, "interface", "failure", time.Since(start).Seconds()) + result = "failure" } else { - telemetry.ObserveConfigApply(context.Background(), "interface", "success", time.Since(start).Seconds()) + telemetry.ObserveConfigApply(ctx, "interface", "success", time.Since(start).Seconds()) } startPeers := time.Now() if err := s.ensureWireguardPeers(config.Peers); err != nil { logger.Error("Failed to ensure WireGuard peers: %v", err) - telemetry.ObserveConfigApply(context.Background(), "peer", "failure", time.Since(startPeers).Seconds()) + telemetry.ObserveConfigApply(ctx, "peer", "failure", time.Since(startPeers).Seconds()) + result = "failure" } else { - telemetry.ObserveConfigApply(context.Background(), "peer", "success", time.Since(startPeers).Seconds()) + telemetry.ObserveConfigApply(ctx, "peer", "success", time.Since(startPeers).Seconds()) } 
} From 46384e6242b7036180ade82c936c3d3ab4a2500d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 18:18:38 +0200 Subject: [PATCH 70/72] fix(metrics): update metrics recommendations and add OpenTelemetry review documentation --- docs/METRICS_RECOMMENDATIONS.md | 18 +++-- docs/observability.md | 34 ++++++--- docs/otel-review.md | 126 ++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 17 deletions(-) create mode 100644 docs/otel-review.md diff --git a/docs/METRICS_RECOMMENDATIONS.md b/docs/METRICS_RECOMMENDATIONS.md index 2ce365b..c085e06 100644 --- a/docs/METRICS_RECOMMENDATIONS.md +++ b/docs/METRICS_RECOMMENDATIONS.md @@ -10,6 +10,10 @@ This document captures the current state of Newt metrics, prioritized fixes, and - Tunnel/Traffic: newt_tunnel_sessions, newt_tunnel_bytes_total, newt_tunnel_latency_seconds, newt_tunnel_reconnects_total - Connection lifecycle: newt_connection_attempts_total, newt_connection_errors_total - Operations: newt_config_reloads_total, newt_restart_count_total, newt_build_info + - Operations: newt_config_reloads_total, newt_restart_count_total, newt_config_apply_seconds, newt_cert_rotation_total + - Build metadata: newt_build_info + - Control plane: newt_websocket_connect_latency_seconds, newt_websocket_messages_total + - Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_async_backlog_bytes, newt_proxy_drops_total - Go runtime: GC, heap, goroutines via runtime instrumentation 2) Main issues addressed now @@ -27,6 +31,10 @@ This document captures the current state of Newt metrics, prioritized fixes, and - Some call sites still need initiator label on reconnect outcomes (client vs server). This is planned. - WebSocket and Proxy metrics (connect latency, messages, active connections, buffer/drops, async backlog) are planned additions. - Config apply duration and cert rotation counters are planned. 
+ - Registration and config reload failures are not yet emitted; add failure code paths so result labels expose churn. + - Restart counter increments only when build metadata is provided; consider decoupling to count all boots. + - Metric helpers often use `context.Background()`. Where lightweight contexts exist (e.g., HTTP handlers), propagate them to ease future correlation. + - Tracing coverage is limited to admin HTTP and WebSocket connect spans; extend to blueprint fetches, proxy accept loops, and WireGuard updates when OTLP is enabled. 4) Roadmap (phased) @@ -40,6 +48,10 @@ This document captures the current state of Newt metrics, prioritized fixes, and - Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_drops_total, newt_proxy_async_backlog_bytes - Reconnect: add initiator label (client/server) - Config & PKI: newt_config_apply_seconds{phase,result}; newt_cert_rotation_total{result} + - WebSocket disconnect and keepalive failure counters + - Proxy connection lifecycle metrics (accept totals, duration histogram) + - Pangolin blueprint/config fetch latency and status metrics + - Certificate rotation duration histogram to complement success/failure counter 5) Operational guidance @@ -64,9 +76,3 @@ This document captures the current state of Newt metrics, prioritized fixes, and - Direct scrape variant requires no attribute promotion since site_id is already a metric label. - Transform/promote variant remains optional for environments that rely on resource-to-label promotion. - -8) Testing - -- curl :2112/metrics | grep ^newt_ -- Verify presence of site_id across series; region appears when set. -- Ensure disallowed attributes are filtered; allowed (site_id) retained. 
diff --git a/docs/observability.md b/docs/observability.md index 6f71ecb..ba19aac 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -34,18 +34,30 @@ Runtime behavior - When OTLP is enabled, metrics and traces are exported to OTLP gRPC endpoint - Go runtime metrics (goroutines, GC, memory) are exported automatically -Metric catalog (initial) +Metric catalog (current) -- newt_build_info (gauge) labels: version, commit, site_id[, region]; value is always 1 -- newt_site_registrations_total (counter) labels: result, site_id[, region] -- newt_site_online (observable gauge) labels: site_id (0/1) -- newt_site_last_heartbeat_seconds (observable gauge) labels: site_id -- newt_tunnel_sessions (observable gauge) labels: site_id, tunnel_id [transport optional when known] -- newt_tunnel_bytes_total (counter) labels: site_id, tunnel_id, protocol (tcp|udp), direction (ingress|egress) -- newt_tunnel_latency_seconds (histogram) labels: site_id, tunnel_id, transport (e.g., wireguard) -- newt_tunnel_reconnects_total (counter) labels: site_id, tunnel_id, initiator (client|server), reason -- newt_connection_attempts_total (counter) labels: site_id, transport, result -- newt_connection_errors_total (counter) labels: site_id, transport, error_type (dial_timeout|tls_handshake|auth_failed|io_error) +| Metric | Instrument | Key attributes | Purpose | Example | +| --- | --- | --- | --- | --- | +| `newt_build_info` | Observable gauge (Int64) | `version`, `commit`, `site_id`, `region` (optional) | Emits build metadata with value `1` for scrape-time verification. | `newt_build_info{version="1.5.0",site_id="acme-edge-1"} 1` | +| `newt_site_registrations_total` | Counter (Int64) | `result` (`success`/`failure`), `site_id`, `region` (optional) | Counts Pangolin registration attempts. 
| `newt_site_registrations_total{result="success",site_id="acme-edge-1"} 1` | +| `newt_site_online` | Observable gauge (Int64) | `site_id` | Reports whether the site is currently connected (`1`) or offline (`0`). | `newt_site_online{site_id="acme-edge-1"} 1` | +| `newt_site_last_heartbeat_seconds` | Observable gauge (Float64) | `site_id` | Time since the most recent Pangolin heartbeat. | `newt_site_last_heartbeat_seconds{site_id="acme-edge-1"} 2.4` | +| `newt_tunnel_sessions` | Observable gauge (Int64) | `site_id`, `tunnel_id` (when enabled) | Counts active tunnel sessions per peer; collapses to per-site when tunnel IDs are disabled. | `newt_tunnel_sessions{site_id="acme-edge-1",tunnel_id="wgpub..."} 3` | +| `newt_tunnel_bytes_total` | Counter (Int64) | `direction` (`ingress`/`egress`), `protocol` (`tcp`/`udp`), `tunnel_id` (optional), `site_id`, `region` (optional) | Measures proxied traffic volume across tunnels. | `newt_tunnel_bytes_total{direction="ingress",protocol="tcp",site_id="acme-edge-1"} 4096` | +| `newt_tunnel_latency_seconds` | Histogram (Float64) | `transport` (e.g., `wireguard`), `tunnel_id` (optional), `site_id`, `region` (optional) | Captures RTT or configuration-driven latency samples. | `newt_tunnel_latency_seconds_bucket{transport="wireguard",le="0.5"} 42` | +| `newt_tunnel_reconnects_total` | Counter (Int64) | `initiator` (`client`/`server`), `reason` (enumerated), `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks reconnect causes for troubleshooting flaps. | `newt_tunnel_reconnects_total{initiator="client",reason="timeout",site_id="acme-edge-1"} 5` | +| `newt_connection_attempts_total` | Counter (Int64) | `transport` (`auth`/`websocket`), `result`, `site_id`, `region` (optional) | Measures control-plane dial attempts and their outcomes. 
| `newt_connection_attempts_total{transport="websocket",result="success",site_id="acme-edge-1"} 8` | +| `newt_connection_errors_total` | Counter (Int64) | `transport`, `error_type`, `site_id`, `region` (optional) | Buckets connection failures by normalized error class. | `newt_connection_errors_total{transport="websocket",error_type="tls_handshake",site_id="acme-edge-1"} 1` | +| `newt_config_reloads_total` | Counter (Int64) | `result`, `site_id`, `region` (optional) | Counts remote blueprint/config reloads. | `newt_config_reloads_total{result="success",site_id="acme-edge-1"} 3` | +| `newt_restart_count_total` | Counter (Int64) | `site_id`, `region` (optional) | Increments once per process boot to detect restarts. | `newt_restart_count_total{site_id="acme-edge-1"} 1` | +| `newt_config_apply_seconds` | Histogram (Float64) | `phase` (`interface`/`peer`), `result`, `site_id`, `region` (optional) | Measures time spent applying WireGuard configuration phases. | `newt_config_apply_seconds_sum{phase="peer",result="success",site_id="acme-edge-1"} 0.48` | +| `newt_cert_rotation_total` | Counter (Int64) | `result`, `site_id`, `region` (optional) | Tracks client certificate rotation attempts. | `newt_cert_rotation_total{result="success",site_id="acme-edge-1"} 2` | +| `newt_websocket_connect_latency_seconds` | Histogram (Float64) | `transport="websocket"`, `result`, `error_type` (on failure), `site_id`, `region` (optional) | Measures WebSocket dial latency and exposes failure buckets. | `newt_websocket_connect_latency_seconds_bucket{result="success",le="0.5",site_id="acme-edge-1"} 9` | +| `newt_websocket_messages_total` | Counter (Int64) | `direction` (`in`/`out`), `msg_type` (`text`/`ping`/`pong`), `site_id`, `region` (optional) | Accounts for control WebSocket traffic volume by type. 
| `newt_websocket_messages_total{direction="out",msg_type="ping",site_id="acme-edge-1"} 12` | +| `newt_proxy_active_connections` | Observable gauge (Int64) | `protocol` (`tcp`/`udp`), `direction` (`ingress`/`egress`), `tunnel_id` (optional), `site_id`, `region` (optional) | Current proxy connections per tunnel and protocol. | `newt_proxy_active_connections{protocol="tcp",direction="egress",site_id="acme-edge-1"} 4` | +| `newt_proxy_buffer_bytes` | Observable gauge (Int64) | `protocol`, `direction`, `tunnel_id` (optional), `site_id`, `region` (optional) | Volume of buffered data awaiting flush in proxy queues. | `newt_proxy_buffer_bytes{protocol="udp",direction="egress",site_id="acme-edge-1"} 2048` | +| `newt_proxy_async_backlog_bytes` | Observable gauge (Int64) | `protocol`, `direction`, `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks async write backlog when deferred flushing is enabled. | `newt_proxy_async_backlog_bytes{protocol="tcp",direction="egress",site_id="acme-edge-1"} 512` | +| `newt_proxy_drops_total` | Counter (Int64) | `protocol`, `tunnel_id` (optional), `site_id`, `region` (optional) | Counts proxy drop events caused by downstream write errors. | `newt_proxy_drops_total{protocol="udp",site_id="acme-edge-1"} 1` | Conventions diff --git a/docs/otel-review.md b/docs/otel-review.md new file mode 100644 index 0000000..35c47d2 --- /dev/null +++ b/docs/otel-review.md @@ -0,0 +1,126 @@ +# Newt OpenTelemetry Review + +## Overview + +This document summarises the current OpenTelemetry (OTel) instrumentation in Newt, assesses +compliance with OTel guidelines, and lists concrete improvements to pursue before release. +It is based on the implementation in `internal/telemetry` and the call-sites that emit +metrics and traces across the code base. + +## Current metric instrumentation + +All instruments are registered in `internal/telemetry/metrics.go`. They are grouped +into site, tunnel, connection, configuration, build, WebSocket, and proxy domains. 
+A global attribute filter (see `buildMeterProvider`) constrains exposed label keys to +`site_id`, `region`, and a curated list of low-cardinality dimensions so that Prometheus +exports stay bounded. + +- **Site lifecycle**: `newt_site_registrations_total`, `newt_site_online`, and + `newt_site_last_heartbeat_seconds` capture registration attempts and liveness. They + are fed either manually (`IncSiteRegistration`) or via the `TelemetryView` state + callback that publishes observable gauges for the active site. +- **Tunnel health and usage**: Counters and histograms track bytes, latency, reconnects, + and active sessions per tunnel (`newt_tunnel_*` family). Attribute helpers respect + the `NEWT_METRICS_INCLUDE_TUNNEL_ID` toggle to keep cardinality manageable on larger + fleets. +- **Connection attempts**: `newt_connection_attempts_total` and + `newt_connection_errors_total` are emitted throughout the WebSocket client to classify + authentication, dial, and transport failures. +- **Operations/configuration**: `newt_config_reloads_total`, + `newt_restart_count_total`, `newt_config_apply_seconds`, and + `newt_cert_rotation_total` provide visibility into blueprint reloads, process boots, + configuration timings, and certificate rotation outcomes. +- **Build metadata**: `newt_build_info` records the binary version/commit together + with a monotonic restart counter when build information is supplied at startup. +- **WebSocket control-plane**: `newt_websocket_connect_latency_seconds` and + `newt_websocket_messages_total` report connect latency and ping/pong/text activity. +- **Proxy data-plane**: Observable gauges (`newt_proxy_active_connections`, + `newt_proxy_buffer_bytes`, `newt_proxy_async_backlog_bytes`) and the + `newt_proxy_drops_total` counter are fed from the proxy manager to monitor backlog + and drop behaviour alongside per-protocol byte counters. 
+ +Refer to `docs/observability.md` for a tabular catalogue with instrument types, +attributes, and sample exposition lines. + +## Tracing coverage + +Tracing is optional and enabled only when OTLP export is configured. When active: + +- The admin HTTP mux is wrapped with `otelhttp.NewHandler`, producing spans for + `/metrics` and `/healthz` requests. +- The WebSocket dial path creates a `ws.connect` span around the HTTP upgrade handshake. + +No other subsystems currently create spans, so data-plane operations, blueprint fetches, +Docker discovery, and WireGuard reconfiguration happen without trace context. + +## Guideline & best-practice alignment + +The implementation adheres to most OTel Go recommendations: + +- **Naming & units** – Every instrument follows the `newt_*` prefix with `_total` + suffixes for counters and `_seconds`/`_bytes` unit conventions. Histograms are + registered with explicit second-based buckets. +- **Resource attributes** – Service name/version and optional `site_id`/`region` + populate the `resource.Resource` and are also injected as metric attributes for + compatibility with Prometheus queries. +- **Attribute hygiene** – A single attribute filter (`sdkmetric.WithView`) enforces + the allow-list of label keys to prevent accidental high-cardinality emission. +- **Runtime metrics** – Go runtime instrumentation is enabled automatically through + `runtime.Start`. +- **Configuration via environment** – `telemetry.FromEnv` honours `OTEL_*` variables + alongside `NEWT_*` overrides so operators can configure exporters without code + changes. +- **Shutdown handling** – `Setup.Shutdown` iterates exporters in reverse order to + flush buffers before process exit. + +## Adjustments & improvements + +The review identified a few actionable adjustments: + +1. **Record registration failures** – `newt_site_registrations_total` is currently + incremented only on success.
Emit `result="failure"` samples whenever Pangolin + rejects a registration or credential exchange so operators can alert on churn. +2. **Surface config reload failures** – `telemetry.IncConfigReload` is invoked with + `result="success"` only. Callers should record a failure result when blueprint + parsing or application aborts before success counters are incremented. +3. **Harmonise restart count behaviour** – `newt_restart_count_total` increments only + when build metadata is provided. Consider moving the increment out of + `RegisterBuildInfo` so the counter advances even for ad-hoc builds without version + strings. +4. **Propagate contexts where available** – Many emitters call metric helpers with + `context.Background()`. Passing real contexts (when inexpensive) would allow future + exporters to correlate spans and metrics. +5. **Extend tracing coverage** – Instrument critical flows such as blueprint fetches, + WireGuard reconfiguration, proxy accept loops, and Docker discovery to provide end + to end visibility when OTLP tracing is enabled. + +## Metrics to add before release + +Prioritised additions that would close visibility gaps: + +1. **WebSocket disconnect outcomes** – A counter (e.g., `newt_websocket_disconnects_total`) + partitioned by `reason` would complement the existing connect latency histogram and + explain reconnect storms. +2. **Keepalive/heartbeat failures** – Counting ping timeouts or heartbeat misses would + make `newt_site_last_heartbeat_seconds` actionable by providing discrete events. +3. **Proxy connection lifecycle** – Add counters/histograms for proxy accept events and + connection durations to correlate drops with load and backlog metrics. +4. **Blueprint/config pull latency** – Measuring Pangolin blueprint fetch durations and + HTTP status distribution would expose slow control-plane operations. +5. 
**Certificate rotation attempts** – Complement `newt_cert_rotation_total` with a + duration histogram to observe slow PKI updates and detect stuck rotations. + +These metrics rely on data that is already available in the code paths mentioned +above and would round out operational dashboards. + +## Tracing wishlist + +To benefit from tracing when OTLP is active, add spans around: + +- Pangolin REST calls (wrap the HTTP client with `otelhttp.NewTransport`). +- Docker discovery cycles and target registration callbacks. +- WireGuard reconfiguration (interface bring-up, peer updates). +- Proxy dial/accept loops for both TCP and UDP targets. + +Capturing these stages will let operators correlate latency spikes with reconnects +and proxy drops using distributed traces in addition to the metric signals. From d21f4951e98318631069a2fd69453683bf96fa42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 19:15:33 +0200 Subject: [PATCH 71/72] Add WebSocket and proxy lifecycle metrics --- docs/METRICS_RECOMMENDATIONS.md | 19 ++- docs/observability.md | 14 ++- docs/otel-review.md | 49 ++++---- internal/telemetry/metrics.go | 116 +++++++++++++++--- internal/telemetry/metrics_test_helper.go | 59 +++++++++ internal/telemetry/state_view.go | 4 +- internal/telemetry/telemetry.go | 26 +++- .../telemetry/telemetry_attrfilter_test.go | 21 ++-- internal/telemetry/telemetry_golden_test.go | 54 +++++--- internal/telemetry/telemetry_smoke_test.go | 37 ++++-- .../testdata/expected_contains.golden | 4 + proxy/manager.go | 5 + scripts/smoke-metrics.sh | 5 +- websocket/client.go | 6 + 14 files changed, 323 insertions(+), 96 deletions(-) create mode 100644 internal/telemetry/metrics_test_helper.go diff --git a/docs/METRICS_RECOMMENDATIONS.md b/docs/METRICS_RECOMMENDATIONS.md index c085e06..968b0a9 100644 --- a/docs/METRICS_RECOMMENDATIONS.md +++ b/docs/METRICS_RECOMMENDATIONS.md @@ -6,20 +6,20 @@ This document captures the current state of Newt metrics, 
prioritized fixes, and - Export: Prometheus exposition (default), optional OTLP (gRPC) - Existing instruments: - - Sites: newt_site_registrations_total, newt_site_online (0/1), newt_site_last_heartbeat_seconds + - Sites: newt_site_registrations_total, newt_site_online (0/1), newt_site_last_heartbeat_timestamp_seconds - Tunnel/Traffic: newt_tunnel_sessions, newt_tunnel_bytes_total, newt_tunnel_latency_seconds, newt_tunnel_reconnects_total - Connection lifecycle: newt_connection_attempts_total, newt_connection_errors_total - - Operations: newt_config_reloads_total, newt_restart_count_total, newt_build_info - - Operations: newt_config_reloads_total, newt_restart_count_total, newt_config_apply_seconds, newt_cert_rotation_total + - Operations: newt_config_reloads_total, process_start_time_seconds, newt_build_info + - Operations: newt_config_reloads_total, process_start_time_seconds, newt_config_apply_seconds, newt_cert_rotation_total - Build metadata: newt_build_info - - Control plane: newt_websocket_connect_latency_seconds, newt_websocket_messages_total - - Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_async_backlog_bytes, newt_proxy_drops_total + - Control plane: newt_websocket_connect_latency_seconds, newt_websocket_messages_total, newt_websocket_connected, newt_websocket_reconnects_total + - Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_async_backlog_bytes, newt_proxy_drops_total, newt_proxy_accept_total, newt_proxy_connection_duration_seconds, newt_proxy_connections_total - Go runtime: GC, heap, goroutines via runtime instrumentation 2) Main issues addressed now - Attribute filter (allow-list) extended to include site_id and region in addition to existing keys (tunnel_id, transport, protocol, direction, result, reason, error_type, version, commit). - - site_id and region propagation: site_id is now attached as a metric label across newt_*; region is added as a metric label when set. 
Both remain resource attributes for consistency with OTEL. + - site_id and region propagation: site_id/region remain resource attributes. Metric labels mirror them on per-site gauges and counters by default; set `NEWT_METRICS_INCLUDE_SITE_LABELS=false` to drop them for multi-tenant scrapes. - Label semantics clarified: - transport: control-plane mechanism (e.g., websocket, wireguard) - protocol: L4 payload type (tcp, udp) @@ -29,10 +29,9 @@ This document captures the current state of Newt metrics, prioritized fixes, and 3) Remaining gaps and deviations - Some call sites still need initiator label on reconnect outcomes (client vs server). This is planned. - - WebSocket and Proxy metrics (connect latency, messages, active connections, buffer/drops, async backlog) are planned additions. - Config apply duration and cert rotation counters are planned. - Registration and config reload failures are not yet emitted; add failure code paths so result labels expose churn. - - Restart counter increments only when build metadata is provided; consider decoupling to count all boots. + - Document using `process_start_time_seconds` (and `time()` in PromQL) to derive uptime; no explicit restart counter is needed. - Metric helpers often use `context.Background()`. Where lightweight contexts exist (e.g., HTTP handlers), propagate them to ease future correlation. - Tracing coverage is limited to admin HTTP and WebSocket connect spans; extend to blueprint fetches, proxy accept loops, and WireGuard updates when OTLP is enabled. 
@@ -44,8 +43,6 @@ This document captures the current state of Newt metrics, prioritized fixes, and - Correct label semantics (transport vs protocol); fix sessions transport labelling - Documentation alignment - Phase 2 (next) - - WebSocket: newt_websocket_connect_latency_seconds; newt_websocket_messages_total{direction,msg_type} - - Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_drops_total, newt_proxy_async_backlog_bytes - Reconnect: add initiator label (client/server) - Config & PKI: newt_config_apply_seconds{phase,result}; newt_cert_rotation_total{result} - WebSocket disconnect and keepalive failure counters @@ -66,7 +63,7 @@ This document captures the current state of Newt metrics, prioritized fixes, and - Sustained connection errors: - rate(newt_connection_errors_total[5m]) by (site_id,transport,error_type) - Heartbeat gaps: - - max_over_time(newt_site_last_heartbeat_seconds[15m]) by (site_id) + - time() - max_over_time(newt_site_last_heartbeat_timestamp_seconds[15m]) - Proxy drops: - increase(newt_proxy_drops_total[5m]) by (site_id,protocol) - WebSocket connect p95 (when added): diff --git a/docs/observability.md b/docs/observability.md index ba19aac..42f0cb1 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -27,6 +27,7 @@ Enable exporters via environment variables (no code changes required) - OTEL_RESOURCE_ATTRIBUTES=service.instance.id=,site_id= - OTEL_METRIC_EXPORT_INTERVAL=15s (default) - NEWT_ADMIN_ADDR=127.0.0.1:2112 (default admin HTTP with /metrics) +- NEWT_METRICS_INCLUDE_SITE_LABELS=true|false (default: true; disable to drop site_id/region as metric labels and rely on resource attributes only) Runtime behavior @@ -36,12 +37,14 @@ Runtime behavior Metric catalog (current) +Unless otherwise noted, `site_id` and `region` are available via resource attributes and, by default, as metric labels.
Set `NEWT_METRICS_INCLUDE_SITE_LABELS=false` to drop them from counter/histogram label sets in high-cardinality environments. + | Metric | Instrument | Key attributes | Purpose | Example | | --- | --- | --- | --- | --- | -| `newt_build_info` | Observable gauge (Int64) | `version`, `commit`, `site_id`, `region` (optional) | Emits build metadata with value `1` for scrape-time verification. | `newt_build_info{version="1.5.0",site_id="acme-edge-1"} 1` | +| `newt_build_info` | Observable gauge (Int64) | `version`, `commit`, `site_id`, `region` (optional when site labels enabled) | Emits build metadata with value `1` for scrape-time verification. | `newt_build_info{version="1.5.0"} 1` | | `newt_site_registrations_total` | Counter (Int64) | `result` (`success`/`failure`), `site_id`, `region` (optional) | Counts Pangolin registration attempts. | `newt_site_registrations_total{result="success",site_id="acme-edge-1"} 1` | | `newt_site_online` | Observable gauge (Int64) | `site_id` | Reports whether the site is currently connected (`1`) or offline (`0`). | `newt_site_online{site_id="acme-edge-1"} 1` | -| `newt_site_last_heartbeat_seconds` | Observable gauge (Float64) | `site_id` | Time since the most recent Pangolin heartbeat. | `newt_site_last_heartbeat_seconds{site_id="acme-edge-1"} 2.4` | +| `newt_site_last_heartbeat_timestamp_seconds` | Observable gauge (Float64) | `site_id` | Unix timestamp of the most recent Pangolin heartbeat (derive age via `time() - metric`). | `newt_site_last_heartbeat_timestamp_seconds{site_id="acme-edge-1"} 1.728e+09` | | `newt_tunnel_sessions` | Observable gauge (Int64) | `site_id`, `tunnel_id` (when enabled) | Counts active tunnel sessions per peer; collapses to per-site when tunnel IDs are disabled. 
| `newt_tunnel_sessions{site_id="acme-edge-1",tunnel_id="wgpub..."} 3` | | `newt_tunnel_bytes_total` | Counter (Int64) | `direction` (`ingress`/`egress`), `protocol` (`tcp`/`udp`), `tunnel_id` (optional), `site_id`, `region` (optional) | Measures proxied traffic volume across tunnels. | `newt_tunnel_bytes_total{direction="ingress",protocol="tcp",site_id="acme-edge-1"} 4096` | | `newt_tunnel_latency_seconds` | Histogram (Float64) | `transport` (e.g., `wireguard`), `tunnel_id` (optional), `site_id`, `region` (optional) | Captures RTT or configuration-driven latency samples. | `newt_tunnel_latency_seconds_bucket{transport="wireguard",le="0.5"} 42` | @@ -49,15 +52,18 @@ Metric catalog (current) | `newt_connection_attempts_total` | Counter (Int64) | `transport` (`auth`/`websocket`), `result`, `site_id`, `region` (optional) | Measures control-plane dial attempts and their outcomes. | `newt_connection_attempts_total{transport="websocket",result="success",site_id="acme-edge-1"} 8` | | `newt_connection_errors_total` | Counter (Int64) | `transport`, `error_type`, `site_id`, `region` (optional) | Buckets connection failures by normalized error class. | `newt_connection_errors_total{transport="websocket",error_type="tls_handshake",site_id="acme-edge-1"} 1` | | `newt_config_reloads_total` | Counter (Int64) | `result`, `site_id`, `region` (optional) | Counts remote blueprint/config reloads. | `newt_config_reloads_total{result="success",site_id="acme-edge-1"} 3` | -| `newt_restart_count_total` | Counter (Int64) | `site_id`, `region` (optional) | Increments once per process boot to detect restarts. | `newt_restart_count_total{site_id="acme-edge-1"} 1` | +| `process_start_time_seconds` | Observable gauge (Float64) | — | Unix timestamp of the Newt process start time (use `time() - process_start_time_seconds` for uptime). 
| `process_start_time_seconds 1.728e+09` | | `newt_config_apply_seconds` | Histogram (Float64) | `phase` (`interface`/`peer`), `result`, `site_id`, `region` (optional) | Measures time spent applying WireGuard configuration phases. | `newt_config_apply_seconds_sum{phase="peer",result="success",site_id="acme-edge-1"} 0.48` | | `newt_cert_rotation_total` | Counter (Int64) | `result`, `site_id`, `region` (optional) | Tracks client certificate rotation attempts. | `newt_cert_rotation_total{result="success",site_id="acme-edge-1"} 2` | | `newt_websocket_connect_latency_seconds` | Histogram (Float64) | `transport="websocket"`, `result`, `error_type` (on failure), `site_id`, `region` (optional) | Measures WebSocket dial latency and exposes failure buckets. | `newt_websocket_connect_latency_seconds_bucket{result="success",le="0.5",site_id="acme-edge-1"} 9` | | `newt_websocket_messages_total` | Counter (Int64) | `direction` (`in`/`out`), `msg_type` (`text`/`ping`/`pong`), `site_id`, `region` (optional) | Accounts for control WebSocket traffic volume by type. | `newt_websocket_messages_total{direction="out",msg_type="ping",site_id="acme-edge-1"} 12` | +| `newt_websocket_connected` | Observable gauge (Int64) | `site_id`, `region` (optional) | Reports current WebSocket connectivity (`1` when connected). | `newt_websocket_connected{site_id="acme-edge-1"} 1` | +| `newt_websocket_reconnects_total` | Counter (Int64) | `reason` (`tls_handshake`, `dial_timeout`, `io_error`, `ping_write`, `timeout`, etc.), `site_id`, `region` (optional) | Counts reconnect attempts with normalized reasons for failure analysis. | `newt_websocket_reconnects_total{reason="timeout",site_id="acme-edge-1"} 3` | | `newt_proxy_active_connections` | Observable gauge (Int64) | `protocol` (`tcp`/`udp`), `direction` (`ingress`/`egress`), `tunnel_id` (optional), `site_id`, `region` (optional) | Current proxy connections per tunnel and protocol. 
| `newt_proxy_active_connections{protocol="tcp",direction="egress",site_id="acme-edge-1"} 4` | | `newt_proxy_buffer_bytes` | Observable gauge (Int64) | `protocol`, `direction`, `tunnel_id` (optional), `site_id`, `region` (optional) | Volume of buffered data awaiting flush in proxy queues. | `newt_proxy_buffer_bytes{protocol="udp",direction="egress",site_id="acme-edge-1"} 2048` | | `newt_proxy_async_backlog_bytes` | Observable gauge (Int64) | `protocol`, `direction`, `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks async write backlog when deferred flushing is enabled. | `newt_proxy_async_backlog_bytes{protocol="tcp",direction="egress",site_id="acme-edge-1"} 512` | | `newt_proxy_drops_total` | Counter (Int64) | `protocol`, `tunnel_id` (optional), `site_id`, `region` (optional) | Counts proxy drop events caused by downstream write errors. | `newt_proxy_drops_total{protocol="udp",site_id="acme-edge-1"} 1` | +| `newt_proxy_connections_total` | Counter (Int64) | `event` (`opened`/`closed`), `protocol`, `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks proxy connection lifecycle events for rate/SLO calculations. | `newt_proxy_connections_total{event="opened",protocol="tcp",site_id="acme-edge-1"} 10` | Conventions @@ -174,7 +180,7 @@ sum(newt_tunnel_sessions) Compatibility notes - Gauges do not use the _total suffix (e.g., newt_tunnel_sessions). -- site_id is emitted as both resource attribute and metric label on all newt_* series; region is included as a metric label only when set. tunnel_id is a metric label (WireGuard public key). Never expose secrets in labels. +- site_id/region remain resource attributes. Metric labels for these fields appear on per-site gauges (e.g., `newt_site_online`) and, by default, on counters/histograms; disable them with `NEWT_METRICS_INCLUDE_SITE_LABELS=false` if needed. `tunnel_id` is a metric label (WireGuard public key). Never expose secrets in labels. 
- NEWT_METRICS_INCLUDE_TUNNEL_ID (default: true) toggles whether tunnel_id is included as a label on bytes/sessions/proxy/reconnect metrics. Disable in high-cardinality environments. - Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both. - Prometheus does not accept remote_write; use Mimir/Cortex/VM/Thanos-Receive for remote_write. diff --git a/docs/otel-review.md b/docs/otel-review.md index ba824e7..1d49d02 100644 --- a/docs/otel-review.md +++ b/docs/otel-review.md @@ -16,7 +16,7 @@ A global attribute filter (see `buildMeterProvider`) constrains exposed label ke exports stay bounded. - **Site lifecycle**: `newt_site_registrations_total`, `newt_site_online`, and - `newt_site_last_heartbeat_seconds` capture registration attempts and liveness. They + `newt_site_last_heartbeat_timestamp_seconds` capture registration attempts and liveness. They are fed either manually (`IncSiteRegistration`) or via the `TelemetryView` state callback that publishes observable gauges for the active site. - **Tunnel health and usage**: Counters and histograms track bytes, latency, reconnects, @@ -27,17 +27,20 @@ exports stay bounded. `newt_connection_errors_total` are emitted throughout the WebSocket client to classify authentication, dial, and transport failures. - **Operations/configuration**: `newt_config_reloads_total`, - `newt_restart_count_total`, `newt_config_apply_seconds`, and + `process_start_time_seconds`, `newt_config_apply_seconds`, and `newt_cert_rotation_total` provide visibility into blueprint reloads, process boots, configuration timings, and certificate rotation outcomes. - **Build metadata**: `newt_build_info` records the binary version/commit together - with a monotonic restart counter when build information is supplied at startup. -- **WebSocket control-plane**: `newt_websocket_connect_latency_seconds` and - `newt_websocket_messages_total` report connect latency and ping/pong/text activity. 
+ with optional site metadata when build information is supplied at startup. +- **WebSocket control-plane**: `newt_websocket_connect_latency_seconds`, + `newt_websocket_messages_total`, `newt_websocket_connected`, and + `newt_websocket_reconnects_total` report connect latency, ping/pong/text activity, + connection state, and reconnect reasons. - **Proxy data-plane**: Observable gauges (`newt_proxy_active_connections`, - `newt_proxy_buffer_bytes`, `newt_proxy_async_backlog_bytes`) and the - `newt_proxy_drops_total` counter are fed from the proxy manager to monitor backlog - and drop behaviour alongside per-protocol byte counters. + `newt_proxy_buffer_bytes`, `newt_proxy_async_backlog_bytes`) plus counters for + drops, accepts, connection lifecycle events (`newt_proxy_connections_total`), and + duration histograms (`newt_proxy_connection_duration_seconds`) surface backlog, + drop behaviour, and churn alongside per-protocol byte counters. Refer to `docs/observability.md` for a tabular catalogue with instrument types, attributes, and sample exposition lines. @@ -61,8 +64,9 @@ The implementation adheres to most OTel Go recommendations: suffixes for counters and `_seconds`/`_bytes` unit conventions. Histograms are registered with explicit second-based buckets. - **Resource attributes** – Service name/version and optional `site_id`/`region` - populate the `resource.Resource` and are also injected as metric attributes for - compatibility with Prometheus queries. + populate the `resource.Resource`. Metric labels mirror these by default (and on + per-site gauges) but can be disabled with `NEWT_METRICS_INCLUDE_SITE_LABELS=false` + to avoid unnecessary cardinality growth. - **Attribute hygiene** – A single attribute filter (`sdkmetric.WithView`) enforces the allow-list of label keys to prevent accidental high-cardinality emission. 
- **Runtime metrics** – Go runtime instrumentation is enabled automatically through @@ -83,10 +87,9 @@ The review identified a few actionable adjustments: 2. **Surface config reload failures** – `telemetry.IncConfigReload` is invoked with `result="success"` only. Callers should record a failure result when blueprint parsing or application aborts before success counters are incremented. -3. **Harmonise restart count behaviour** – `newt_restart_count_total` increments only - when build metadata is provided. Consider moving the increment out of - `RegisterBuildInfo` so the counter advances even for ad-hoc builds without version - strings. +3. **Expose robust uptime** – Document using `time() - process_start_time_seconds` + to derive uptime now that the restart counter has been replaced with a timestamp + gauge. 4. **Propagate contexts where available** – Many emitters call metric helpers with `context.Background()`. Passing real contexts (when inexpensive) would allow future exporters to correlate spans and metrics. @@ -98,17 +101,17 @@ The review identified a few actionable adjustments: Prioritised additions that would close visibility gaps: -1. **WebSocket disconnect outcomes** – A counter (e.g., `newt_websocket_disconnects_total`) - partitioned by `reason` would complement the existing connect latency histogram and - explain reconnect storms. -2. **Keepalive/heartbeat failures** – Counting ping timeouts or heartbeat misses would - make `newt_site_last_heartbeat_seconds` actionable by providing discrete events. -3. **Proxy connection lifecycle** – Add counters/histograms for proxy accept events and - connection durations to correlate drops with load and backlog metrics. +1. **Config reload error taxonomy** – Split reload attempts into a dedicated + `newt_config_reload_errors_total{phase}` counter to make blueprint validation failures + visible alongside the existing success counter. +2. 
**Config source visibility** – Export `newt_config_source_info{source,version}` so + operators can audit the active blueprint origin/commit during incidents. +3. **Certificate expiry** – Emit `newt_cert_expiry_timestamp_seconds` (per cert) to + enable proactive alerts before mTLS credentials lapse. 4. **Blueprint/config pull latency** – Measuring Pangolin blueprint fetch durations and HTTP status distribution would expose slow control-plane operations. -5. **Certificate rotation attempts** – Complement `newt_cert_rotation_total` with a - duration histogram to observe slow PKI updates and detect stuck rotations. +5. **Tunnel setup latency** – Histograms for DNS resolution and tunnel handshakes would + help correlate connect latency spikes with network dependencies. These metrics rely on data that is already available in the code paths mentioned above and would round out operational dashboards. diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index 5403e43..6c34724 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -3,6 +3,8 @@ package telemetry import ( "context" "sync" + "sync/atomic" + "time" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -37,9 +39,9 @@ var ( // Config/Restart mConfigReloads metric.Int64Counter - mRestartCount metric.Int64Counter mConfigApply metric.Float64Histogram mCertRotationTotal metric.Int64Counter + mProcessStartTime metric.Float64ObservableGauge // Build info mBuildInfo metric.Int64ObservableGauge @@ -50,6 +52,8 @@ var ( mWSDisconnects metric.Int64Counter mWSKeepaliveFailure metric.Int64Counter mWSSessionDuration metric.Float64Histogram + mWSConnected metric.Int64ObservableGauge + mWSReconnects metric.Int64Counter // Proxy mProxyActiveConns metric.Int64ObservableGauge @@ -58,16 +62,28 @@ var ( mProxyDropsTotal metric.Int64Counter mProxyAcceptsTotal metric.Int64Counter mProxyConnDuration metric.Float64Histogram + mProxyConnectionsTotal metric.Int64Counter - 
buildVersion string - buildCommit string + buildVersion string + buildCommit string + processStartUnix = float64(time.Now().UnixNano()) / 1e9 + wsConnectedState atomic.Int64 ) -// attrsWithSite appends global site/region labels when present. +// Proxy connection lifecycle events. +const ( + ProxyConnectionOpened = "opened" + ProxyConnectionClosed = "closed" +) + +// attrsWithSite appends site/region labels only when explicitly enabled to keep +// label cardinality low by default. func attrsWithSite(extra ...attribute.KeyValue) []attribute.KeyValue { - attrs := make([]attribute.KeyValue, 0, len(extra)+2) - attrs = append(attrs, extra...) - attrs = append(attrs, siteAttrs()...) + attrs := make([]attribute.KeyValue, len(extra)) + copy(attrs, extra) + if ShouldIncludeSiteLabels() { + attrs = append(attrs, siteAttrs()...) + } return attrs } @@ -111,8 +127,9 @@ func registerSiteInstruments() error { if err != nil { return err } - mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_seconds", - metric.WithDescription("Seconds since last site heartbeat")) + mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_timestamp_seconds", + metric.WithDescription("Unix timestamp of the last site heartbeat"), + metric.WithUnit("s")) if err != nil { return err } @@ -164,13 +181,22 @@ func registerConnInstruments() error { func registerConfigInstruments() error { mConfigReloads, _ = meter.Int64Counter("newt_config_reloads_total", metric.WithDescription("Configuration reloads")) - mRestartCount, _ = meter.Int64Counter("newt_restart_count_total", - metric.WithDescription("Process restart count (incremented on start)")) mConfigApply, _ = meter.Float64Histogram("newt_config_apply_seconds", metric.WithDescription("Configuration apply duration in seconds"), metric.WithUnit("s")) mCertRotationTotal, _ = meter.Int64Counter("newt_cert_rotation_total", metric.WithDescription("Certificate rotation events (success/failure)")) + 
mProcessStartTime, _ = meter.Float64ObservableGauge("process_start_time_seconds", + metric.WithDescription("Unix timestamp of the process start time"), + metric.WithUnit("s")) + if mProcessStartTime != nil { + if _, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + o.ObserveFloat64(mProcessStartTime, processStartUnix) + return nil + }, mProcessStartTime); err != nil { + otel.Handle(err) + } + } return nil } @@ -191,6 +217,10 @@ func registerBuildWSProxyInstruments() error { mWSSessionDuration, _ = meter.Float64Histogram("newt_websocket_session_duration_seconds", metric.WithDescription("Duration of established WebSocket sessions"), metric.WithUnit("s")) + mWSConnected, _ = meter.Int64ObservableGauge("newt_websocket_connected", + metric.WithDescription("WebSocket connection state (1=connected, 0=disconnected)")) + mWSReconnects, _ = meter.Int64Counter("newt_websocket_reconnects_total", + metric.WithDescription("WebSocket reconnect attempts by reason")) // Proxy mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections", metric.WithDescription("Proxy active connections per tunnel and protocol")) @@ -207,6 +237,8 @@ func registerBuildWSProxyInstruments() error { mProxyConnDuration, _ = meter.Float64Histogram("newt_proxy_connection_duration_seconds", metric.WithDescription("Duration of completed proxy connections"), metric.WithUnit("s")) + mProxyConnectionsTotal, _ = meter.Int64Counter("newt_proxy_connections_total", + metric.WithDescription("Proxy connection lifecycle events by protocol")) // Register a default callback for build info if version/commit set reg, e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { if buildVersion == "" && buildCommit == "" { @@ -219,7 +251,9 @@ func registerBuildWSProxyInstruments() error { if buildCommit != "" { attrs = append(attrs, attribute.String("commit", buildCommit)) } - attrs = append(attrs, siteAttrs()...) 
+ if ShouldIncludeSiteLabels() { + attrs = append(attrs, siteAttrs()...) + } o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...)) return nil }, mBuildInfo) @@ -229,6 +263,17 @@ func registerBuildWSProxyInstruments() error { // Provide a functional stopper that unregisters the callback obsStopper = func() { _ = reg.Unregister() } } + if mWSConnected != nil { + if regConn, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + val := wsConnectedState.Load() + o.ObserveInt64(mWSConnected, val, metric.WithAttributes(attrsWithSite()...)) + return nil + }, mWSConnected); err != nil { + otel.Handle(err) + } else { + wsConnStopper = func() { _ = regConn.Unregister() } + } + } return nil } @@ -237,10 +282,11 @@ func registerBuildWSProxyInstruments() error { // heartbeat seconds, and active sessions. var ( - obsOnce sync.Once - obsStopper func() - proxyObsOnce sync.Once - proxyStopper func() + obsOnce sync.Once + obsStopper func() + proxyObsOnce sync.Once + proxyStopper func() + wsConnStopper func() ) // SetObservableCallback registers a single callback that will be invoked @@ -251,7 +297,7 @@ var ( // // telemetry.SetObservableCallback(func(ctx context.Context, o metric.Observer) error { // o.ObserveInt64(mSiteOnline, 1) -// o.ObserveFloat64(mSiteLastHeartbeat, time.Since(lastHB).Seconds()) +// o.ObserveFloat64(mSiteLastHeartbeat, float64(lastHB.Unix())) // o.ObserveInt64(mTunnelSessions, int64(len(activeSessions))) // return nil // }) @@ -290,8 +336,6 @@ func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) func RegisterBuildInfo(version, commit string) { buildVersion = version buildCommit = commit - // Increment restart count on boot - mRestartCount.Add(context.Background(), 1) } // Config reloads @@ -358,6 +402,25 @@ func IncWSKeepaliveFailure(ctx context.Context, reason string) { )...)) } +// SetWSConnectionState updates the backing gauge for the WebSocket connected state. 
+func SetWSConnectionState(connected bool) { + if connected { + wsConnectedState.Store(1) + } else { + wsConnectedState.Store(0) + } +} + +// IncWSReconnect increments the WebSocket reconnect counter with a bounded reason label. +func IncWSReconnect(ctx context.Context, reason string) { + if reason == "" { + reason = "unknown" + } + mWSReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite( + attribute.String("reason", reason), + )...)) +} + func ObserveWSSessionDuration(ctx context.Context, seconds float64, result string) { mWSSessionDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite( attribute.String("result", result), @@ -413,6 +476,21 @@ func ObserveProxyConnectionDuration(ctx context.Context, tunnelID, protocol, res mProxyConnDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...)) } +// IncProxyConnectionEvent records proxy connection lifecycle events (opened/closed). +func IncProxyConnectionEvent(ctx context.Context, tunnelID, protocol, event string) { + if event == "" { + event = "unknown" + } + attrs := []attribute.KeyValue{ + attribute.String("protocol", protocol), + attribute.String("event", event), + } + if ShouldIncludeTunnelID() && tunnelID != "" { + attrs = append(attrs, attribute.String("tunnel_id", tunnelID)) + } + mProxyConnectionsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...)) +} + // --- Config/PKI helpers --- func ObserveConfigApply(ctx context.Context, phase, result string, seconds float64) { diff --git a/internal/telemetry/metrics_test_helper.go b/internal/telemetry/metrics_test_helper.go new file mode 100644 index 0000000..16aa1a3 --- /dev/null +++ b/internal/telemetry/metrics_test_helper.go @@ -0,0 +1,59 @@ +package telemetry + +import ( + "sync" + "time" +) + +func resetMetricsForTest() { + initOnce = sync.Once{} + obsOnce = sync.Once{} + proxyObsOnce = sync.Once{} + obsStopper = nil + proxyStopper = nil + if wsConnStopper != nil { + wsConnStopper() + } + wsConnStopper = nil + 
meter = nil + + mSiteRegistrations = nil + mSiteOnline = nil + mSiteLastHeartbeat = nil + + mTunnelSessions = nil + mTunnelBytes = nil + mTunnelLatency = nil + mReconnects = nil + + mConnAttempts = nil + mConnErrors = nil + + mConfigReloads = nil + mConfigApply = nil + mCertRotationTotal = nil + mProcessStartTime = nil + + mBuildInfo = nil + + mWSConnectLatency = nil + mWSMessages = nil + mWSDisconnects = nil + mWSKeepaliveFailure = nil + mWSSessionDuration = nil + mWSConnected = nil + mWSReconnects = nil + + mProxyActiveConns = nil + mProxyBufferBytes = nil + mProxyAsyncBacklogByte = nil + mProxyDropsTotal = nil + mProxyAcceptsTotal = nil + mProxyConnDuration = nil + mProxyConnectionsTotal = nil + + processStartUnix = float64(time.Now().UnixNano()) / 1e9 + wsConnectedState.Store(0) + includeTunnelIDVal.Store(false) + includeSiteLabelVal.Store(false) +} diff --git a/internal/telemetry/state_view.go b/internal/telemetry/state_view.go index 071a405..6c6b6de 100644 --- a/internal/telemetry/state_view.go +++ b/internal/telemetry/state_view.go @@ -62,8 +62,8 @@ func observeSiteOnlineFor(o metric.Observer, sv StateView, siteID string) { func observeLastHeartbeatFor(o metric.Observer, sv StateView, siteID string) { if t, ok := sv.LastHeartbeat(siteID); ok { - secs := time.Since(t).Seconds() - o.ObserveFloat64(mSiteLastHeartbeat, secs, metric.WithAttributes( + ts := float64(t.UnixNano()) / 1e9 + o.ObserveFloat64(mSiteLastHeartbeat, ts, metric.WithAttributes( attribute.String("site_id", siteID), )) } diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 14100ec..bd435ce 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -118,6 +118,11 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) { } else { includeTunnelIDVal.Store(false) } + if getenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") == "true" { + includeSiteLabelVal.Store(true) + } else { + includeSiteLabelVal.Store(false) + } res := 
buildResource(ctx, cfg) UpdateSiteInfo(cfg.SiteID, cfg.Region) @@ -294,7 +299,10 @@ func parseResourceAttributes(s string) map[string]string { // Global site/region used to enrich metric labels. var siteIDVal atomic.Value var regionVal atomic.Value -var includeTunnelIDVal atomic.Value // bool; default true +var ( + includeTunnelIDVal atomic.Value // bool; default true + includeSiteLabelVal atomic.Value // bool; default false +) // UpdateSiteInfo updates the global site_id and region used for metric labels. // Thread-safe via atomic.Value: subsequent metric emissions will include @@ -335,7 +343,12 @@ func siteAttrs() []attribute.KeyValue { } // SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager). -func SiteLabelKVs() []attribute.KeyValue { return siteAttrs() } +func SiteLabelKVs() []attribute.KeyValue { + if !ShouldIncludeSiteLabels() { + return nil + } + return siteAttrs() +} // ShouldIncludeTunnelID returns whether tunnel_id labels should be emitted. func ShouldIncludeTunnelID() bool { @@ -345,6 +358,15 @@ func ShouldIncludeTunnelID() bool { return true } +// ShouldIncludeSiteLabels returns whether site_id/region should be emitted as +// metric labels in addition to resource attributes. +func ShouldIncludeSiteLabels() bool { + if v, ok := includeSiteLabelVal.Load().(bool); ok { + return v + } + return false +} + func getenv(k, d string) string { if v := os.Getenv(k); v != "" { return v diff --git a/internal/telemetry/telemetry_attrfilter_test.go b/internal/telemetry/telemetry_attrfilter_test.go index 6c54afe..ebbb3c2 100644 --- a/internal/telemetry/telemetry_attrfilter_test.go +++ b/internal/telemetry/telemetry_attrfilter_test.go @@ -14,17 +14,23 @@ import ( // Test that disallowed attributes are filtered from the exposition. 
func TestAttributeFilterDropsUnknownKeys(t *testing.T) { - ctx := context.Background() -cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"} + ctx := context.Background() + resetMetricsForTest() + t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") + cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"} tel, err := Init(ctx, cfg) - if err != nil { t.Fatalf("init: %v", err) } + if err != nil { + t.Fatalf("init: %v", err) + } defer func() { _ = tel.Shutdown(context.Background()) }() - if tel.PrometheusHandler == nil { t.Fatalf("prom handler nil") } + if tel.PrometheusHandler == nil { + t.Fatalf("prom handler nil") + } ts := httptest.NewServer(tel.PrometheusHandler) defer ts.Close() -// Add samples with disallowed attribute keys + // Add samples with disallowed attribute keys for _, k := range []string{"forbidden", "site_id", "host"} { set := attribute.NewSet(attribute.String(k, "x")) AddTunnelBytesSet(ctx, 123, set) @@ -32,7 +38,9 @@ cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"} time.Sleep(50 * time.Millisecond) resp, err := http.Get(ts.URL) - if err != nil { t.Fatalf("GET: %v", err) } + if err != nil { + t.Fatalf("GET: %v", err) + } defer resp.Body.Close() b, _ := io.ReadAll(resp.Body) body := string(b) @@ -43,4 +51,3 @@ cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"} t.Fatalf("expected allowed attribute site_id to be present in metrics, got: %s", body) } } - diff --git a/internal/telemetry/telemetry_golden_test.go b/internal/telemetry/telemetry_golden_test.go index 3e6f896..62f41b8 100644 --- a/internal/telemetry/telemetry_golden_test.go +++ b/internal/telemetry/telemetry_golden_test.go @@ -7,6 +7,7 @@ import ( "net/http" "net/http/httptest" "os" + "path/filepath" "strings" "testing" "time" @@ -15,36 +16,61 @@ import ( // Golden test that /metrics contains expected metric names. 
func TestMetricsGoldenContains(t *testing.T) { ctx := context.Background() -cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0", BuildVersion: "test"} + resetMetricsForTest() + t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") + cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0", BuildVersion: "test"} tel, err := Init(ctx, cfg) - if err != nil { t.Fatalf("telemetry init error: %v", err) } + if err != nil { + t.Fatalf("telemetry init error: %v", err) + } defer func() { _ = tel.Shutdown(context.Background()) }() - if tel.PrometheusHandler == nil { t.Fatalf("prom handler nil") } + if tel.PrometheusHandler == nil { + t.Fatalf("prom handler nil") + } ts := httptest.NewServer(tel.PrometheusHandler) defer ts.Close() - // Trigger a counter + // Trigger counters to ensure they appear in the scrape IncConnAttempt(ctx, "websocket", "success") + IncWSReconnect(ctx, "io_error") + IncProxyConnectionEvent(ctx, "", "tcp", ProxyConnectionOpened) + if tel.MeterProvider != nil { + _ = tel.MeterProvider.ForceFlush(ctx) + } time.Sleep(100 * time.Millisecond) - resp, err := http.Get(ts.URL) - if err != nil { t.Fatalf("GET metrics failed: %v", err) } - defer resp.Body.Close() - b, _ := io.ReadAll(resp.Body) - body := string(b) + var body string + for i := 0; i < 5; i++ { + resp, err := http.Get(ts.URL) + if err != nil { + t.Fatalf("GET metrics failed: %v", err) + } + b, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + body = string(b) + if strings.Contains(body, "newt_connection_attempts_total") { + break + } + time.Sleep(100 * time.Millisecond) + } - f, err := os.Open("internal/telemetry/testdata/expected_contains.golden") - if err != nil { t.Fatalf("read golden: %v", err) } + f, err := os.Open(filepath.Join("testdata", "expected_contains.golden")) + if err != nil { + t.Fatalf("read golden: %v", err) + } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { needle := strings.TrimSpace(s.Text()) - if needle == "" { continue 
} + if needle == "" { + continue + } if !strings.Contains(body, needle) { t.Fatalf("expected metrics body to contain %q. body=\n%s", needle, body) } } - if err := s.Err(); err != nil { t.Fatalf("scan golden: %v", err) } + if err := s.Err(); err != nil { + t.Fatalf("scan golden: %v", err) + } } - diff --git a/internal/telemetry/telemetry_smoke_test.go b/internal/telemetry/telemetry_smoke_test.go index d51ea8e..b736ca5 100644 --- a/internal/telemetry/telemetry_smoke_test.go +++ b/internal/telemetry/telemetry_smoke_test.go @@ -13,13 +13,15 @@ import ( // Smoke test that /metrics contains at least one newt_* metric when Prom exporter is enabled. func TestMetricsSmoke(t *testing.T) { ctx := context.Background() + resetMetricsForTest() + t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") cfg := Config{ - ServiceName: "newt", - PromEnabled: true, - OTLPEnabled: false, - AdminAddr: "127.0.0.1:0", - BuildVersion: "test", - BuildCommit: "deadbeef", + ServiceName: "newt", + PromEnabled: true, + OTLPEnabled: false, + AdminAddr: "127.0.0.1:0", + BuildVersion: "test", + BuildCommit: "deadbeef", MetricExportInterval: 5 * time.Second, } tel, err := Init(ctx, cfg) @@ -37,18 +39,27 @@ func TestMetricsSmoke(t *testing.T) { // Record a simple metric and then fetch /metrics IncConnAttempt(ctx, "websocket", "success") + if tel.MeterProvider != nil { + _ = tel.MeterProvider.ForceFlush(ctx) + } // Give the exporter a tick to collect time.Sleep(100 * time.Millisecond) - resp, err := http.Get(ts.URL) - if err != nil { - t.Fatalf("GET /metrics failed: %v", err) + var body string + for i := 0; i < 5; i++ { + resp, err := http.Get(ts.URL) + if err != nil { + t.Fatalf("GET /metrics failed: %v", err) + } + b, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + body = string(b) + if strings.Contains(body, "newt_connection_attempts_total") { + break + } + time.Sleep(100 * time.Millisecond) } - defer resp.Body.Close() - b, _ := io.ReadAll(resp.Body) - body := string(b) if !strings.Contains(body, 
"newt_connection_attempts_total") { t.Fatalf("expected newt_connection_attempts_total in metrics, got:\n%s", body) } } - diff --git a/internal/telemetry/testdata/expected_contains.golden b/internal/telemetry/testdata/expected_contains.golden index 48123dd..50d3892 100644 --- a/internal/telemetry/testdata/expected_contains.golden +++ b/internal/telemetry/testdata/expected_contains.golden @@ -1,3 +1,7 @@ newt_connection_attempts_total +newt_websocket_connected +newt_websocket_reconnects_total +newt_proxy_connections_total newt_build_info +process_start_time_seconds diff --git a/proxy/manager.go b/proxy/manager.go index 31e7788..cef5fa6 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -509,6 +509,7 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) tunnelID := pm.currentTunnelID telemetry.IncProxyAccept(context.Background(), tunnelID, "tcp", "success", "") + telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "tcp", telemetry.ProxyConnectionOpened) if tunnelID != "" { state.Global().IncSessions(tunnelID) if e := pm.getEntry(tunnelID); e != nil { @@ -523,6 +524,7 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) logger.Error("Error connecting to target: %v", err) accepted.Close() telemetry.IncProxyAccept(context.Background(), tunnelID, "tcp", "failure", classifyProxyError(err)) + telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "tcp", telemetry.ProxyConnectionClosed) telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "tcp", "failure", time.Since(connStart).Seconds()) return } @@ -556,6 +558,7 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) } } telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "tcp", "success", time.Since(connStart).Seconds()) + telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "tcp", telemetry.ProxyConnectionClosed) }(tunnelID, conn) } } @@ -631,6 +634,7 
@@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { } tunnelID := pm.currentTunnelID telemetry.IncProxyAccept(context.Background(), tunnelID, "udp", "success", "") + telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "udp", telemetry.ProxyConnectionOpened) // Only increment activeUDP after a successful DialUDP if e := pm.getEntry(tunnelID); e != nil { e.activeUDP.Add(1) @@ -655,6 +659,7 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) { } clientsMutex.Unlock() telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "udp", result, time.Since(start).Seconds()) + telemetry.IncProxyConnectionEvent(context.Background(), tunnelID, "udp", telemetry.ProxyConnectionClosed) }() buffer := make([]byte, 65507) diff --git a/scripts/smoke-metrics.sh b/scripts/smoke-metrics.sh index d2eb11f..27dd02e 100644 --- a/scripts/smoke-metrics.sh +++ b/scripts/smoke-metrics.sh @@ -20,7 +20,7 @@ probe "newt_* presence" "^newt_" || true # Site gauges with site_id probe "site_online with site_id" "^newt_site_online\{.*site_id=\"[^\"]+\"" || true -probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_seconds\{.*site_id=\"[^\"]+\"" || true +probe "last_heartbeat with site_id" "^newt_site_last_heartbeat_timestamp_seconds\{.*site_id=\"[^\"]+\"" || true # Bytes with direction ingress/egress and protocol probe "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\"" || true @@ -39,11 +39,14 @@ fi # WebSocket metrics (when OTLP/WS used) probe "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket" || true probe "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)=" || true +probe "websocket connected gauge" "^newt_websocket_connected" || true +probe "websocket reconnects total" "^newt_websocket_reconnects_total\{" || true # Proxy metrics (when proxy active) probe "proxy active connections" 
"^newt_proxy_active_connections\{" || true probe "proxy buffer bytes" "^newt_proxy_buffer_bytes\{" || true probe "proxy drops total" "^newt_proxy_drops_total\{" || true +probe "proxy connections total" "^newt_proxy_connections_total\{" || true # Config apply probe "config apply seconds buckets" "^newt_config_apply_seconds_bucket\{" || true diff --git a/websocket/client.go b/websocket/client.go index ba39202..8af3be9 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -167,6 +167,7 @@ func (c *Client) Close() error { // Set connection status to false c.setConnected(false) + telemetry.SetWSConnectionState(false) // Close the WebSocket connection gracefully if c.conn != nil { @@ -508,6 +509,7 @@ func (c *Client) establishConnection() error { } else { telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonError) } + telemetry.IncWSReconnect(ctx, etype) return fmt.Errorf("failed to connect to WebSocket: %w", err) } @@ -515,6 +517,7 @@ func (c *Client) establishConnection() error { telemetry.ObserveWSConnectLatency(ctx, lat, "success", "") c.conn = conn c.setConnected(true) + telemetry.SetWSConnectionState(true) c.setMetricsContext(ctx) sessionStart := time.Now() // Wire up pong handler for metrics @@ -632,6 +635,7 @@ func (c *Client) pingMonitor() { default: logger.Error("Ping failed: %v", err) telemetry.IncWSKeepaliveFailure(c.metricsContext(), "ping_write") + telemetry.IncWSReconnect(c.metricsContext(), "ping_write") c.reconnect() return } @@ -660,6 +664,7 @@ func (c *Client) readPumpWithDisconnectDetection(started time.Time) { // Shutting down, don't reconnect return default: + telemetry.IncWSReconnect(ctx, disconnectReason) c.reconnect() } }() @@ -710,6 +715,7 @@ func (c *Client) readPumpWithDisconnectDetection(started time.Time) { func (c *Client) reconnect() { c.setConnected(false) + telemetry.SetWSConnectionState(false) if c.conn != nil { c.conn.Close() c.conn = nil From 186b51e000292d8c0ef1b22f1bcdec67d680bbff Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Marc=20Sch=C3=A4fer?= Date: Fri, 10 Oct 2025 19:17:02 +0200 Subject: [PATCH 72/72] refactor(telemetry): update OpenTelemetry SDK imports and types for metrics and tracing --- internal/telemetry/telemetry.go | 40 ++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 14100ec..1de220e 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -17,9 +17,9 @@ import ( "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" "go.opentelemetry.io/otel/exporters/prometheus" - sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" - sdktrace "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.26.0" "google.golang.org/grpc/credentials" ) @@ -98,8 +98,8 @@ func FromEnv() Config { // Setup holds initialized telemetry providers and (optionally) a /metrics handler. // Call Shutdown when the process terminates to flush exporters. 
type Setup struct { - MeterProvider *sdkmetric.MeterProvider - TracerProvider *sdktrace.TracerProvider + MeterProvider *metric.MeterProvider + TracerProvider *trace.TracerProvider PrometheusHandler http.Handler // nil if Prometheus exporter disabled @@ -171,8 +171,8 @@ func buildResource(ctx context.Context, cfg Config) *resource.Resource { return res } -func setupMetricExport(ctx context.Context, cfg Config, _ *resource.Resource) ([]sdkmetric.Reader, http.Handler, []func(context.Context) error, error) { - var readers []sdkmetric.Reader +func setupMetricExport(ctx context.Context, cfg Config, _ *resource.Resource) ([]metric.Reader, http.Handler, []func(context.Context) error, error) { + var readers []metric.Reader var shutdowns []func(context.Context) error var promHandler http.Handler if cfg.PromEnabled { @@ -200,25 +200,25 @@ func setupMetricExport(ctx context.Context, cfg Config, _ *resource.Resource) ([ if err != nil { return nil, nil, nil, err } - readers = append(readers, sdkmetric.NewPeriodicReader(mexp, sdkmetric.WithInterval(cfg.MetricExportInterval))) + readers = append(readers, metric.NewPeriodicReader(mexp, metric.WithInterval(cfg.MetricExportInterval))) shutdowns = append(shutdowns, mexp.Shutdown) } return readers, promHandler, shutdowns, nil } -func buildMeterProvider(res *resource.Resource, readers []sdkmetric.Reader) *sdkmetric.MeterProvider { - var mpOpts []sdkmetric.Option - mpOpts = append(mpOpts, sdkmetric.WithResource(res)) +func buildMeterProvider(res *resource.Resource, readers []metric.Reader) *metric.MeterProvider { + var mpOpts []metric.Option + mpOpts = append(mpOpts, metric.WithResource(res)) for _, r := range readers { - mpOpts = append(mpOpts, sdkmetric.WithReader(r)) + mpOpts = append(mpOpts, metric.WithReader(r)) } - mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView( - sdkmetric.Instrument{Name: "newt_*_latency_seconds"}, - sdkmetric.Stream{Aggregation: sdkmetric.AggregationExplicitBucketHistogram{Boundaries: 
[]float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}}}, + mpOpts = append(mpOpts, metric.WithView(metric.NewView( + metric.Instrument{Name: "newt_*_latency_seconds"}, + metric.Stream{Aggregation: metric.AggregationExplicitBucketHistogram{Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}}}, ))) - mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView( - sdkmetric.Instrument{Name: "newt_*"}, - sdkmetric.Stream{AttributeFilter: func(kv attribute.KeyValue) bool { + mpOpts = append(mpOpts, metric.WithView(metric.NewView( + metric.Instrument{Name: "newt_*"}, + metric.Stream{AttributeFilter: func(kv attribute.KeyValue) bool { k := string(kv.Key) switch k { case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "initiator", "error_type", "msg_type", "phase", "version", "commit", "site_id", "region": @@ -228,10 +228,10 @@ func buildMeterProvider(res *resource.Resource, readers []sdkmetric.Reader) *sdk } }}, ))) - return sdkmetric.NewMeterProvider(mpOpts...) + return metric.NewMeterProvider(mpOpts...) } -func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*sdktrace.TracerProvider, func(context.Context) error) { +func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*trace.TracerProvider, func(context.Context) error) { topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)} if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { topts = append(topts, otlptracegrpc.WithHeaders(hdrs)) @@ -247,7 +247,7 @@ func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*sdk if err != nil { return nil, nil } - tp := sdktrace.NewTracerProvider(sdktrace.WithBatcher(exp), sdktrace.WithResource(res)) + tp := trace.NewTracerProvider(trace.WithBatcher(exp), trace.WithResource(res)) return tp, exp.Shutdown }