mirror of
https://github.com/fosrl/newt.git
synced 2026-03-26 20:46:41 +00:00
Enhance telemetry metrics and context propagation
This commit is contained in:
64
docs/otel-review.md
Normal file
64
docs/otel-review.md
Normal file
@@ -0,0 +1,64 @@
|
||||
# OpenTelemetry Review
|
||||
|
||||
## Metric inventory
|
||||
The table below lists every instrument registered by `internal/telemetry/metrics.go`, the helper that emits it, and an example time-series. Attribute sets automatically add `site_id` (and optionally `region`) via `attrsWithSite` unless the observable callback overrides them. 【F:internal/telemetry/metrics.go†L23-L205】【F:internal/telemetry/metrics.go†L289-L403】
|
||||
|
||||
| Metric | Instrument & unit | Purpose | Emission path | Example series |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| `newt_site_registrations_total` | Counter | Counts Pangolin registration attempts keyed by result (`success`, `failure`). | `telemetry.IncSiteRegistration` (called after registration completes). | `newt_site_registrations_total{result="success",site_id="abc"} 1` |
|
||||
| `newt_site_online` | Observable gauge | 0/1 heartbeat for the active site, driven by the registered `StateView`. | `telemetry.SetObservableCallback` via `state.TelemetryView`. | `newt_site_online{site_id="self"} 1` |
|
||||
| `newt_site_last_heartbeat_seconds` | Observable gauge | Seconds since the last Pangolin heartbeat. | Same callback as above using `state.TelemetryView.TouchHeartbeat`. | `newt_site_last_heartbeat_seconds{site_id="self"} 3.2` |
|
||||
| `newt_tunnel_sessions` | Observable gauge | Active sessions per tunnel; collapses to site total when `tunnel_id` emission is disabled. | `state.TelemetryView.SessionsByTunnel` via `RegisterStateView`. | `newt_tunnel_sessions{site_id="self",tunnel_id="wgpub"} 2` |
|
||||
| `newt_tunnel_bytes_total` | Counter (`By`) | Traffic accounting per tunnel, direction (`ingress`/`egress`), protocol (`tcp`/`udp`). | Proxy manager counting writers (`AddTunnelBytes`/`AddTunnelBytesSet`). | `newt_tunnel_bytes_total{direction="egress",protocol="tcp",site_id="self",tunnel_id="wgpub"} 8192` |
|
||||
| `newt_tunnel_latency_seconds` | Histogram (`s`) | RTT samples from WireGuard stack and health pings per tunnel/transport. | `telemetry.ObserveTunnelLatency` from tunnel health checks. | `newt_tunnel_latency_seconds_bucket{transport="wireguard",le="0.05",tunnel_id="wgpub"} 4` |
|
||||
| `newt_tunnel_reconnects_total` | Counter | Reconnect attempts bucketed by initiator (`client`/`server`) and reason enums. | `telemetry.IncReconnect` across websocket, WG, and utility flows. | `newt_tunnel_reconnects_total{initiator="client",reason="timeout",tunnel_id="wgpub"} 3` |
|
||||
| `newt_connection_attempts_total` | Counter | Auth and WebSocket attempt counts by transport (`auth`, `websocket`) and result (`success`/`failure`). | `telemetry.IncConnAttempt` in auth/token and dial paths. | `newt_connection_attempts_total{transport="websocket",result="failure",site_id="self"} 2` |
|
||||
| `newt_connection_errors_total` | Counter | Connection error tally keyed by transport and canonical error type (`dial_timeout`, `tls_handshake`, `auth_failed`, `io_error`). | `telemetry.IncConnError` in auth/websocket flows. | `newt_connection_errors_total{transport="auth",error_type="auth_failed",site_id="self"} 1` |
|
||||
| `newt_config_reloads_total` | Counter | Successful/failed config reload attempts. | `telemetry.IncConfigReload` during WireGuard config reloads. | `newt_config_reloads_total{result="success",site_id="self"} 1` |
|
||||
| `newt_restart_count_total` | Counter | Bumps to 1 at process boot for build info scrapers. | `telemetry.RegisterBuildInfo` called from `Init`. | `newt_restart_count_total{site_id="self"} 1` |
|
||||
| `newt_config_apply_seconds` | Histogram (`s`) | Measures interface/peer apply duration per phase and result. | `telemetry.ObserveConfigApply` around config updates. | `newt_config_apply_seconds_bucket{phase="peer",result="success",le="0.1"} 5` |
|
||||
| `newt_cert_rotation_total` | Counter | Certificate rotation events tagged by result. | `telemetry.IncCertRotation` during PKI updates. | `newt_cert_rotation_total{result="success",site_id="self"} 1` |
|
||||
| `newt_build_info` | Observable gauge | Constant 1 with `version`/`commit` attributes to expose build metadata. | Callback registered in `registerBuildWSProxyInstruments`. | `newt_build_info{version="1.2.3",commit="abc123",site_id="self"} 1` |
|
||||
| `newt_websocket_connect_latency_seconds` | Histogram (`s`) | Dial latency for Pangolin WebSocket connects annotated with result/error_type. | `telemetry.ObserveWSConnectLatency` inside `Client.establishConnection`. | `newt_websocket_connect_latency_seconds_bucket{result="success",transport="websocket",le="0.5"} 1` |
|
||||
| `newt_websocket_messages_total` | Counter | Counts inbound/outbound WebSocket messages by direction and logical message type. | `telemetry.IncWSMessage` for ping/pong/text events. | `newt_websocket_messages_total{direction="out",msg_type="ping",site_id="self"} 4` |
|
||||
| `newt_websocket_disconnects_total` | Counter | Tracks WebSocket disconnects grouped by `reason` (`shutdown`, `unexpected_close`, etc.) and `result`. | Emitted from `Client.readPumpWithDisconnectDetection` defer block. | `newt_websocket_disconnects_total{reason="unexpected_close",result="error",site_id="self"} 1` |
|
||||
| `newt_websocket_keepalive_failures_total` | Counter | Failed WebSocket ping/pong keepalive attempts by reason. | Incremented in `Client.pingMonitor` when `WriteControl` fails. | `newt_websocket_keepalive_failures_total{reason="ping_write",site_id="self"} 1` |
|
||||
| `newt_websocket_session_duration_seconds` | Histogram (`s`) | Duration of WebSocket sessions by outcome (`result`). | Observed when the read pump exits. | `newt_websocket_session_duration_seconds_sum{result="success",site_id="self"} 120` |
|
||||
| `newt_proxy_active_connections` | Observable gauge | Active TCP/UDP proxy connections per tunnel and protocol. | Proxy manager callback via `SetProxyObservableCallback`. | `newt_proxy_active_connections{protocol="tcp",tunnel_id="wgpub"} 3` |
|
||||
| `newt_proxy_buffer_bytes` | Observable gauge (`By`) | Size of proxy buffer pools (synchronous path) per tunnel/protocol. | Same proxy callback as above. | `newt_proxy_buffer_bytes{protocol="tcp",tunnel_id="wgpub"} 10240` |
|
||||
| `newt_proxy_async_backlog_bytes` | Observable gauge (`By`) | Unflushed async byte backlog when deferred accounting is enabled. | Proxy callback when async accounting is turned on. | `newt_proxy_async_backlog_bytes{protocol="udp",tunnel_id="wgpub"} 4096` |
|
||||
| `newt_proxy_drops_total` | Counter | Proxy write-drop events per protocol/tunnel. | `telemetry.IncProxyDrops` on UDP drop paths. | `newt_proxy_drops_total{protocol="udp",tunnel_id="wgpub"} 2` |
|
||||
| `newt_proxy_accept_total` | Counter | Proxy accept attempts labelled by protocol, result, and reason. | `telemetry.IncProxyAccept` in TCP accept loop and UDP dial paths. | `newt_proxy_accept_total{protocol="tcp",result="failure",reason="timeout",site_id="self"} 1` |
|
||||
| `newt_proxy_connection_duration_seconds` | Histogram (`s`) | Lifecycle duration for proxied TCP/UDP connections by result. | `telemetry.ObserveProxyConnectionDuration` when TCP/UDP handlers complete. | `newt_proxy_connection_duration_seconds_sum{protocol="udp",result="success",site_id="self"} 30` |
|
||||
|
||||
In addition, Go runtime metrics are automatically exported when telemetry is initialised. 【F:internal/telemetry/telemetry.go†L147-L155】
|
||||
|
||||
## Tracing footprint
|
||||
* Tracing is enabled only when OTLP export is turned on; `telemetry.Init` wires a batch `TracerProvider` and sets it globally. 【F:internal/telemetry/telemetry.go†L135-L155】
|
||||
* The admin HTTP mux (`/metrics`, `/healthz`) is wrapped with `otelhttp.NewHandler`, so any inbound admin requests produce spans. 【F:main.go†L373-L387】
|
||||
* WebSocket dials create a `ws.connect` span around the outbound handshake, but subsequent control-plane HTTP requests (token fetch, blueprint sync) use plain `http.Client` without propagation. 【F:websocket/client.go†L417-L459】
|
||||
|
||||
Overall span coverage is limited to the WebSocket connect loop and admin server; tunnel setup, Docker discovery, config application, and health pings currently emit only metrics.
|
||||
|
||||
## Guideline & best-practice adherence
|
||||
* **Resource & exporter configuration:** `telemetry.FromEnv` honours OTEL env-vars, sets service name/version, and promotes `site_id`/`region` resource attributes before building the provider. Exporters default to Prometheus with optional OTLP, aligning with OTel deployment guidance. 【F:internal/telemetry/telemetry.go†L56-L206】
|
||||
* **Low-cardinality enforcement:** A view-level attribute allow-list retains only approved keys (`tunnel_id`, `transport`, `protocol`, etc.), protecting Prometheus cardinality while still surfacing `site_id`/`region`. 【F:internal/telemetry/telemetry.go†L209-L231】
|
||||
* **Units and naming:** Instrument helpers enforce `_total` suffixes for counters, `_seconds` for durations, and attach `metric.WithUnit("By"|"s")` for size/time metrics, matching OTel semantic conventions. 【F:internal/telemetry/metrics.go†L23-L192】
|
||||
* **Runtime metrics & shutdown:** The runtime instrumentation is enabled, and `Setup.Shutdown` drains exporters in reverse order to avoid data loss. 【F:internal/telemetry/telemetry.go†L147-L261】
|
||||
* **Site-aware observables:** `state.TelemetryView` provides thread-safe snapshots to feed `newt_site_online`/`_last_heartbeat_seconds`/`_tunnel_sessions`, ensuring gauges report cohesive per-site data even when `tunnel_id` labels are disabled. 【F:internal/state/telemetry_view.go†L11-L79】
|
||||
|
||||
## Gaps & recommended improvements
|
||||
1. **Tracing coverage:** Instrument the Pangolin REST calls (`getToken`, blueprint downloads) with `otelhttp.NewTransport` or explicit spans, and consider spans for WireGuard handshake/config apply to enable end-to-end traces when OTLP is on. 【F:websocket/client.go†L240-L360】
|
||||
2. **Histogram coverage:** Introduce `newt_site_registration_latency_seconds` (bootstrap) and `newt_ping_roundtrip_seconds` (heartbeat) to capture SLO-critical latencies before release. Existing latency buckets (`0.005s` → `30s`) can be reused. 【F:internal/telemetry/telemetry.go†L209-L218】
|
||||
3. **Control-plane throughput:** Add `newt_websocket_payload_bytes_total` (direction/msg_type) or reuse the tunnel counter with a `transport="websocket"` label to quantify command traffic volume and detect back-pressure.
|
||||
4. **Docker discovery metrics:** If Docker auto-discovery is enabled, expose counters for container add/remove events and failures so operators can trace missing backends to discovery issues.
|
||||
|
||||
## Pre-release metric backlog
|
||||
Prior to GA, we recommend landing the following high-value instruments:
|
||||
* **Bootstrap latency:** `newt_site_registration_latency_seconds` histogram emitted around the initial Pangolin registration HTTP call to detect slow control-plane responses.
|
||||
* **Session duration:** `newt_websocket_session_duration_seconds` histogram recorded when a WebSocket closes (result + reason) to quantify stability.
|
||||
* **Heartbeat lag:** `newt_ping_roundtrip_seconds` histogram from ping/pong monitors to capture tunnel health, complementing the heartbeat gauge.
|
||||
* **Proxy accept errors:** `newt_proxy_accept_errors_total` counter keyed by protocol/reason to surface listener pressure distinct from data-plane drops.
|
||||
* **Discovery events:** `newt_discovery_events_total` counter with `action` (`add`, `remove`, `error`) and `source` (`docker`, `file`) to audit service inventory churn.
|
||||
|
||||
Implementing the above will round out visibility into control-plane responsiveness, connection stability, and discovery health while preserving the existing low-cardinality discipline.
|
||||
@@ -47,12 +47,17 @@ var (
|
||||
// WebSocket
|
||||
mWSConnectLatency metric.Float64Histogram
|
||||
mWSMessages metric.Int64Counter
|
||||
mWSDisconnects metric.Int64Counter
|
||||
mWSKeepaliveFailure metric.Int64Counter
|
||||
mWSSessionDuration metric.Float64Histogram
|
||||
|
||||
// Proxy
|
||||
mProxyActiveConns metric.Int64ObservableGauge
|
||||
mProxyBufferBytes metric.Int64ObservableGauge
|
||||
mProxyAsyncBacklogByte metric.Int64ObservableGauge
|
||||
mProxyDropsTotal metric.Int64Counter
|
||||
mProxyAcceptsTotal metric.Int64Counter
|
||||
mProxyConnDuration metric.Float64Histogram
|
||||
|
||||
buildVersion string
|
||||
buildCommit string
|
||||
@@ -179,6 +184,13 @@ func registerBuildWSProxyInstruments() error {
|
||||
metric.WithUnit("s"))
|
||||
mWSMessages, _ = meter.Int64Counter("newt_websocket_messages_total",
|
||||
metric.WithDescription("WebSocket messages by direction and type"))
|
||||
mWSDisconnects, _ = meter.Int64Counter("newt_websocket_disconnects_total",
|
||||
metric.WithDescription("WebSocket disconnects by reason/result"))
|
||||
mWSKeepaliveFailure, _ = meter.Int64Counter("newt_websocket_keepalive_failures_total",
|
||||
metric.WithDescription("WebSocket keepalive (ping/pong) failures"))
|
||||
mWSSessionDuration, _ = meter.Float64Histogram("newt_websocket_session_duration_seconds",
|
||||
metric.WithDescription("Duration of established WebSocket sessions"),
|
||||
metric.WithUnit("s"))
|
||||
// Proxy
|
||||
mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections",
|
||||
metric.WithDescription("Proxy active connections per tunnel and protocol"))
|
||||
@@ -190,6 +202,11 @@ func registerBuildWSProxyInstruments() error {
|
||||
metric.WithUnit("By"))
|
||||
mProxyDropsTotal, _ = meter.Int64Counter("newt_proxy_drops_total",
|
||||
metric.WithDescription("Proxy drops due to write errors"))
|
||||
mProxyAcceptsTotal, _ = meter.Int64Counter("newt_proxy_accept_total",
|
||||
metric.WithDescription("Proxy connection accepts by protocol and result"))
|
||||
mProxyConnDuration, _ = meter.Float64Histogram("newt_proxy_connection_duration_seconds",
|
||||
metric.WithDescription("Duration of completed proxy connections"),
|
||||
metric.WithUnit("s"))
|
||||
// Register a default callback for build info if version/commit set
|
||||
reg, e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
|
||||
if buildVersion == "" && buildCommit == "" {
|
||||
@@ -328,6 +345,25 @@ func IncWSMessage(ctx context.Context, direction, msgType string) {
|
||||
)...))
|
||||
}
|
||||
|
||||
func IncWSDisconnect(ctx context.Context, reason, result string) {
|
||||
mWSDisconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("reason", reason),
|
||||
attribute.String("result", result),
|
||||
)...))
|
||||
}
|
||||
|
||||
func IncWSKeepaliveFailure(ctx context.Context, reason string) {
|
||||
mWSKeepaliveFailure.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("reason", reason),
|
||||
)...))
|
||||
}
|
||||
|
||||
func ObserveWSSessionDuration(ctx context.Context, seconds float64, result string) {
|
||||
mWSSessionDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(
|
||||
attribute.String("result", result),
|
||||
)...))
|
||||
}
|
||||
|
||||
// --- Proxy helpers ---
|
||||
|
||||
func ObserveProxyActiveConnsObs(o metric.Observer, value int64, attrs []attribute.KeyValue) {
|
||||
@@ -352,6 +388,31 @@ func IncProxyDrops(ctx context.Context, tunnelID, protocol string) {
|
||||
mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
func IncProxyAccept(ctx context.Context, tunnelID, protocol, result, reason string) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("protocol", protocol),
|
||||
attribute.String("result", result),
|
||||
}
|
||||
if reason != "" {
|
||||
attrs = append(attrs, attribute.String("reason", reason))
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mProxyAcceptsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
func ObserveProxyConnectionDuration(ctx context.Context, tunnelID, protocol, result string, seconds float64) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("protocol", protocol),
|
||||
attribute.String("result", result),
|
||||
}
|
||||
if ShouldIncludeTunnelID() && tunnelID != "" {
|
||||
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
|
||||
}
|
||||
mProxyConnDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
|
||||
}
|
||||
|
||||
// --- Config/PKI helpers ---
|
||||
|
||||
func ObserveConfigApply(ctx context.Context, phase, result string, seconds float64) {
|
||||
|
||||
25
main.go
25
main.go
@@ -586,6 +586,10 @@ func main() {
|
||||
// Register handlers for different message types
|
||||
client.RegisterHandler("newt/wg/connect", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received registration message")
|
||||
regResult := "success"
|
||||
defer func() {
|
||||
telemetry.IncSiteRegistration(ctx, regResult)
|
||||
}()
|
||||
if stopFunc != nil {
|
||||
stopFunc() // stop the ws from sending more requests
|
||||
stopFunc = nil // reset stopFunc to nil to avoid double stopping
|
||||
@@ -605,11 +609,13 @@ func main() {
|
||||
jsonData, err := json.Marshal(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info(fmtErrMarshaling, err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(jsonData, &wgData); err != nil {
|
||||
logger.Info("Error unmarshaling target data: %v", err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
@@ -620,6 +626,7 @@ func main() {
|
||||
mtuInt)
|
||||
if err != nil {
|
||||
logger.Error("Failed to create TUN device: %v", err)
|
||||
regResult = "failure"
|
||||
}
|
||||
|
||||
setDownstreamTNetstack(tnet)
|
||||
@@ -633,6 +640,7 @@ func main() {
|
||||
host, _, err := net.SplitHostPort(wgData.Endpoint)
|
||||
if err != nil {
|
||||
logger.Error("Failed to split endpoint: %v", err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
@@ -641,6 +649,7 @@ func main() {
|
||||
endpoint, err := resolveDomain(wgData.Endpoint)
|
||||
if err != nil {
|
||||
logger.Error("Failed to resolve endpoint: %v", err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
@@ -656,12 +665,14 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
err = dev.IpcSet(config)
|
||||
if err != nil {
|
||||
logger.Error("Failed to configure WireGuard device: %v", err)
|
||||
regResult = "failure"
|
||||
}
|
||||
|
||||
// Bring up the device
|
||||
err = dev.Up()
|
||||
if err != nil {
|
||||
logger.Error("Failed to bring up WireGuard device: %v", err)
|
||||
regResult = "failure"
|
||||
}
|
||||
|
||||
logger.Debug("WireGuard device created. Lets ping the server now...")
|
||||
@@ -676,10 +687,11 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
logger.Debug("Testing initial connection with reliable ping...")
|
||||
lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
if err == nil && wgData.PublicKey != "" {
|
||||
telemetry.ObserveTunnelLatency(context.Background(), wgData.PublicKey, "wireguard", lat.Seconds())
|
||||
telemetry.ObserveTunnelLatency(ctx, wgData.PublicKey, "wireguard", lat.Seconds())
|
||||
}
|
||||
if err != nil {
|
||||
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
|
||||
regResult = "failure"
|
||||
} else {
|
||||
logger.Info("Initial connection test successful")
|
||||
}
|
||||
@@ -701,9 +713,6 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
connected = true
|
||||
|
||||
// telemetry: record a successful site registration (omit region unless available)
|
||||
telemetry.IncSiteRegistration(context.Background(), "success")
|
||||
|
||||
// add the targets if there are any
|
||||
if len(wgData.Targets.TCP) > 0 {
|
||||
updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP})
|
||||
@@ -738,7 +747,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received reconnect message")
|
||||
if wgData.PublicKey != "" {
|
||||
telemetry.IncReconnect(context.Background(), wgData.PublicKey, "server", telemetry.ReasonServerRequest)
|
||||
telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest)
|
||||
}
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
@@ -767,7 +776,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received termination message")
|
||||
if wgData.PublicKey != "" {
|
||||
telemetry.IncReconnect(context.Background(), wgData.PublicKey, "server", telemetry.ReasonServerRequest)
|
||||
telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest)
|
||||
}
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
@@ -837,7 +846,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
},
|
||||
}
|
||||
|
||||
stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
|
||||
stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
|
||||
"publicKey": publicKey.String(),
|
||||
"pingResults": pingResults,
|
||||
"newtVersion": newtVersion,
|
||||
@@ -940,7 +949,7 @@ stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
|
||||
}
|
||||
|
||||
// Send the ping results to the cloud for selection
|
||||
stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
|
||||
stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
|
||||
"publicKey": publicKey.String(),
|
||||
"pingResults": pingResults,
|
||||
"newtVersion": newtVersion,
|
||||
|
||||
103
proxy/manager.go
103
proxy/manager.go
@@ -2,6 +2,7 @@ package proxy
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
@@ -97,6 +98,32 @@ func (cw *countingWriter) Write(p []byte) (int, error) {
|
||||
return n, err
|
||||
}
|
||||
|
||||
func classifyProxyError(err error) string {
|
||||
if err == nil {
|
||||
return ""
|
||||
}
|
||||
if errors.Is(err, net.ErrClosed) {
|
||||
return "closed"
|
||||
}
|
||||
if ne, ok := err.(net.Error); ok {
|
||||
if ne.Timeout() {
|
||||
return "timeout"
|
||||
}
|
||||
if ne.Temporary() {
|
||||
return "temporary"
|
||||
}
|
||||
}
|
||||
msg := strings.ToLower(err.Error())
|
||||
switch {
|
||||
case strings.Contains(msg, "refused"):
|
||||
return "refused"
|
||||
case strings.Contains(msg, "reset"):
|
||||
return "reset"
|
||||
default:
|
||||
return "io_error"
|
||||
}
|
||||
}
|
||||
|
||||
// NewProxyManager creates a new proxy manager instance
|
||||
func NewProxyManager(tnet *netstack.Net) *ProxyManager {
|
||||
return &ProxyManager{
|
||||
@@ -467,72 +494,69 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
|
||||
for {
|
||||
conn, err := listener.Accept()
|
||||
if err != nil {
|
||||
// Check if we're shutting down or the listener was closed
|
||||
telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "tcp", "failure", classifyProxyError(err))
|
||||
if !pm.running {
|
||||
return
|
||||
}
|
||||
|
||||
// Check for specific network errors that indicate the listener is closed
|
||||
if ne, ok := err.(net.Error); ok && !ne.Temporary() {
|
||||
logger.Info("TCP listener closed, stopping proxy handler for %v", listener.Addr())
|
||||
return
|
||||
}
|
||||
|
||||
logger.Error("Error accepting TCP connection: %v", err)
|
||||
// Don't hammer the CPU if we hit a temporary error
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
continue
|
||||
}
|
||||
|
||||
// Count sessions only once per accepted TCP connection
|
||||
if pm.currentTunnelID != "" {
|
||||
state.Global().IncSessions(pm.currentTunnelID)
|
||||
if e := pm.getEntry(pm.currentTunnelID); e != nil {
|
||||
tunnelID := pm.currentTunnelID
|
||||
telemetry.IncProxyAccept(context.Background(), tunnelID, "tcp", "success", "")
|
||||
if tunnelID != "" {
|
||||
state.Global().IncSessions(tunnelID)
|
||||
if e := pm.getEntry(tunnelID); e != nil {
|
||||
e.activeTCP.Add(1)
|
||||
}
|
||||
}
|
||||
|
||||
go func() {
|
||||
go func(tunnelID string, accepted net.Conn) {
|
||||
connStart := time.Now()
|
||||
target, err := net.Dial("tcp", targetAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error connecting to target: %v", err)
|
||||
conn.Close()
|
||||
accepted.Close()
|
||||
telemetry.IncProxyAccept(context.Background(), tunnelID, "tcp", "failure", classifyProxyError(err))
|
||||
telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "tcp", "failure", time.Since(connStart).Seconds())
|
||||
return
|
||||
}
|
||||
|
||||
// already incremented on accept
|
||||
|
||||
// Create a WaitGroup to ensure both copy operations complete
|
||||
entry := pm.getEntry(tunnelID)
|
||||
if entry == nil {
|
||||
entry = &tunnelEntry{}
|
||||
}
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
|
||||
// client -> target (direction=in)
|
||||
go func() {
|
||||
go func(ent *tunnelEntry) {
|
||||
defer wg.Done()
|
||||
e := pm.getEntry(pm.currentTunnelID)
|
||||
cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"}
|
||||
_, _ = io.Copy(cw, conn)
|
||||
cw := &countingWriter{ctx: context.Background(), w: target, set: ent.attrInTCP, pm: pm, ent: ent, out: false, proto: "tcp"}
|
||||
_, _ = io.Copy(cw, accepted)
|
||||
_ = target.Close()
|
||||
}()
|
||||
}(entry)
|
||||
|
||||
// target -> client (direction=out)
|
||||
go func() {
|
||||
go func(ent *tunnelEntry) {
|
||||
defer wg.Done()
|
||||
e := pm.getEntry(pm.currentTunnelID)
|
||||
cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"}
|
||||
cw := &countingWriter{ctx: context.Background(), w: accepted, set: ent.attrOutTCP, pm: pm, ent: ent, out: true, proto: "tcp"}
|
||||
_, _ = io.Copy(cw, target)
|
||||
_ = conn.Close()
|
||||
}()
|
||||
_ = accepted.Close()
|
||||
}(entry)
|
||||
|
||||
// Wait for both copies to complete then session -1
|
||||
wg.Wait()
|
||||
if pm.currentTunnelID != "" {
|
||||
state.Global().DecSessions(pm.currentTunnelID)
|
||||
if e := pm.getEntry(pm.currentTunnelID); e != nil {
|
||||
if tunnelID != "" {
|
||||
state.Global().DecSessions(tunnelID)
|
||||
if e := pm.getEntry(tunnelID); e != nil {
|
||||
e.activeTCP.Add(-1)
|
||||
}
|
||||
}
|
||||
}()
|
||||
telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "tcp", "success", time.Since(connStart).Seconds())
|
||||
}(tunnelID, conn)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -595,16 +619,20 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
targetUDPAddr, err := net.ResolveUDPAddr("udp", targetAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error resolving target address: %v", err)
|
||||
telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "udp", "failure", "resolve")
|
||||
continue
|
||||
}
|
||||
|
||||
targetConn, err = net.DialUDP("udp", nil, targetUDPAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error connecting to target: %v", err)
|
||||
telemetry.IncProxyAccept(context.Background(), pm.currentTunnelID, "udp", "failure", classifyProxyError(err))
|
||||
continue
|
||||
}
|
||||
tunnelID := pm.currentTunnelID
|
||||
telemetry.IncProxyAccept(context.Background(), tunnelID, "udp", "success", "")
|
||||
// Only increment activeUDP after a successful DialUDP
|
||||
if e := pm.getEntry(pm.currentTunnelID); e != nil {
|
||||
if e := pm.getEntry(tunnelID); e != nil {
|
||||
e.activeUDP.Add(1)
|
||||
}
|
||||
|
||||
@@ -612,18 +640,21 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
clientConns[clientKey] = targetConn
|
||||
clientsMutex.Unlock()
|
||||
|
||||
go func(clientKey string, targetConn *net.UDPConn, remoteAddr net.Addr) {
|
||||
go func(clientKey string, targetConn *net.UDPConn, remoteAddr net.Addr, tunnelID string) {
|
||||
start := time.Now()
|
||||
result := "success"
|
||||
defer func() {
|
||||
// Always clean up when this goroutine exits
|
||||
clientsMutex.Lock()
|
||||
if storedConn, exists := clientConns[clientKey]; exists && storedConn == targetConn {
|
||||
delete(clientConns, clientKey)
|
||||
targetConn.Close()
|
||||
if e := pm.getEntry(pm.currentTunnelID); e != nil {
|
||||
if e := pm.getEntry(tunnelID); e != nil {
|
||||
e.activeUDP.Add(-1)
|
||||
}
|
||||
}
|
||||
clientsMutex.Unlock()
|
||||
telemetry.ObserveProxyConnectionDuration(context.Background(), tunnelID, "udp", result, time.Since(start).Seconds())
|
||||
}()
|
||||
|
||||
buffer := make([]byte, 65507)
|
||||
@@ -631,6 +662,7 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
n, _, err := targetConn.ReadFromUDP(buffer)
|
||||
if err != nil {
|
||||
logger.Error("Error reading from target: %v", err)
|
||||
result = "failure"
|
||||
return // defer will handle cleanup
|
||||
}
|
||||
|
||||
@@ -651,10 +683,11 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
if err != nil {
|
||||
logger.Error("Error writing to client: %v", err)
|
||||
telemetry.IncProxyDrops(context.Background(), pm.currentTunnelID, "udp")
|
||||
result = "failure"
|
||||
return // defer will handle cleanup
|
||||
}
|
||||
}
|
||||
}(clientKey, targetConn, remoteAddr)
|
||||
}(clientKey, targetConn, remoteAddr, tunnelID)
|
||||
}
|
||||
|
||||
written, err := targetConn.Write(buffer[:n])
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
@@ -42,6 +43,8 @@ type Client struct {
|
||||
writeMux sync.Mutex
|
||||
clientType string // Type of client (e.g., "newt", "olm")
|
||||
tlsConfig TLSConfig
|
||||
metricsCtxMu sync.RWMutex
|
||||
metricsCtx context.Context
|
||||
}
|
||||
|
||||
type ClientOption func(*Client)
|
||||
@@ -85,6 +88,26 @@ func (c *Client) OnTokenUpdate(callback func(token string)) {
|
||||
c.onTokenUpdate = callback
|
||||
}
|
||||
|
||||
func (c *Client) metricsContext() context.Context {
|
||||
c.metricsCtxMu.RLock()
|
||||
defer c.metricsCtxMu.RUnlock()
|
||||
if c.metricsCtx != nil {
|
||||
return c.metricsCtx
|
||||
}
|
||||
return context.Background()
|
||||
}
|
||||
|
||||
func (c *Client) setMetricsContext(ctx context.Context) {
|
||||
c.metricsCtxMu.Lock()
|
||||
c.metricsCtx = ctx
|
||||
c.metricsCtxMu.Unlock()
|
||||
}
|
||||
|
||||
// MetricsContext exposes the context used for telemetry emission when a
// connection is active. It never returns nil: before a connection installs a
// context, the unexported metricsContext helper falls back to
// context.Background().
func (c *Client) MetricsContext() context.Context {
	return c.metricsContext()
}
|
||||
|
||||
// NewClient creates a new websocket client
|
||||
func NewClient(clientType string, ID, secret string, endpoint string, pingInterval time.Duration, pingTimeout time.Duration, opts ...ClientOption) (*Client, error) {
|
||||
config := &Config{
|
||||
@@ -177,7 +200,7 @@ func (c *Client) SendMessage(messageType string, data interface{}) error {
|
||||
if err := c.conn.WriteJSON(msg); err != nil {
|
||||
return err
|
||||
}
|
||||
telemetry.IncWSMessage(context.Background(), "out", "text")
|
||||
telemetry.IncWSMessage(c.metricsContext(), "out", "text")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -273,8 +296,12 @@ func (c *Client) getToken() (string, error) {
|
||||
return "", fmt.Errorf("failed to marshal token request data: %w", err)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Create a new request
|
||||
req, err := http.NewRequest(
|
||||
req, err := http.NewRequestWithContext(
|
||||
ctx,
|
||||
"POST",
|
||||
baseEndpoint+"/api/v1/auth/"+c.clientType+"/get-token",
|
||||
bytes.NewBuffer(jsonData),
|
||||
@@ -296,7 +323,8 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
telemetry.IncConnError(context.Background(), "auth", classifyConnError(err))
|
||||
telemetry.IncConnAttempt(ctx, "auth", "failure")
|
||||
telemetry.IncConnError(ctx, "auth", classifyConnError(err))
|
||||
return "", fmt.Errorf("failed to request new token: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
@@ -304,15 +332,15 @@ func (c *Client) getToken() (string, error) {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
telemetry.IncConnAttempt(context.Background(), "auth", "failure")
|
||||
telemetry.IncConnAttempt(ctx, "auth", "failure")
|
||||
etype := "io_error"
|
||||
if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
etype = "auth_failed"
|
||||
}
|
||||
telemetry.IncConnError(context.Background(), "auth", etype)
|
||||
telemetry.IncConnError(ctx, "auth", etype)
|
||||
// Reconnect reason mapping for auth failures
|
||||
if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonAuthError)
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonAuthError)
|
||||
}
|
||||
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
@@ -332,7 +360,7 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
|
||||
logger.Debug("Received token: %s", tokenResp.Data.Token)
|
||||
telemetry.IncConnAttempt(context.Background(), "auth", "success")
|
||||
telemetry.IncConnAttempt(ctx, "auth", "success")
|
||||
|
||||
return tokenResp.Data.Token, nil
|
||||
}
|
||||
@@ -357,6 +385,30 @@ func classifyConnError(err error) string {
|
||||
}
|
||||
}
|
||||
|
||||
func classifyWSDisconnect(err error) (result, reason string) {
|
||||
if err == nil {
|
||||
return "success", "normal"
|
||||
}
|
||||
if websocket.IsCloseError(err, websocket.CloseNormalClosure) {
|
||||
return "success", "normal"
|
||||
}
|
||||
if ne, ok := err.(net.Error); ok && ne.Timeout() {
|
||||
return "error", "timeout"
|
||||
}
|
||||
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) {
|
||||
return "error", "unexpected_close"
|
||||
}
|
||||
msg := strings.ToLower(err.Error())
|
||||
switch {
|
||||
case strings.Contains(msg, "eof"):
|
||||
return "error", "eof"
|
||||
case strings.Contains(msg, "reset"):
|
||||
return "error", "connection_reset"
|
||||
default:
|
||||
return "error", "read_error"
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) connectWithRetry() {
|
||||
for {
|
||||
select {
|
||||
@@ -375,13 +427,13 @@ func (c *Client) connectWithRetry() {
|
||||
}
|
||||
|
||||
func (c *Client) establishConnection() error {
|
||||
ctx := context.Background()
|
||||
|
||||
// Get token for authentication
|
||||
token, err := c.getToken()
|
||||
if err != nil {
|
||||
// telemetry: connection attempt failed before dialing
|
||||
// site_id isn't globally available here; use client ID as site_id (low cardinality)
|
||||
telemetry.IncConnAttempt(context.Background(), "websocket", "failure")
|
||||
telemetry.IncConnError(context.Background(), "websocket", classifyConnError(err))
|
||||
telemetry.IncConnAttempt(ctx, "websocket", "failure")
|
||||
telemetry.IncConnError(ctx, "websocket", classifyConnError(err))
|
||||
return fmt.Errorf("failed to get token: %w", err)
|
||||
}
|
||||
|
||||
@@ -416,7 +468,7 @@ func (c *Client) establishConnection() error {
|
||||
|
||||
// Connect to WebSocket (optional span)
|
||||
tr := otel.Tracer("newt")
|
||||
spanCtx, span := tr.Start(context.Background(), "ws.connect")
|
||||
ctx, span := tr.Start(ctx, "ws.connect")
|
||||
defer span.End()
|
||||
|
||||
start := time.Now()
|
||||
@@ -441,38 +493,40 @@ func (c *Client) establishConnection() error {
|
||||
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
|
||||
}
|
||||
|
||||
conn, _, err := dialer.DialContext(spanCtx, u.String(), nil)
|
||||
conn, _, err := dialer.DialContext(ctx, u.String(), nil)
|
||||
lat := time.Since(start).Seconds()
|
||||
if err != nil {
|
||||
telemetry.IncConnAttempt(context.Background(), "websocket", "failure")
|
||||
telemetry.IncConnAttempt(ctx, "websocket", "failure")
|
||||
etype := classifyConnError(err)
|
||||
telemetry.IncConnError(context.Background(), "websocket", etype)
|
||||
telemetry.ObserveWSConnectLatency(context.Background(), lat, "failure", etype)
|
||||
telemetry.IncConnError(ctx, "websocket", etype)
|
||||
telemetry.ObserveWSConnectLatency(ctx, lat, "failure", etype)
|
||||
// Map handshake-related errors to reconnect reasons where appropriate
|
||||
if etype == "tls_handshake" {
|
||||
telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonHandshakeError)
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonHandshakeError)
|
||||
} else if etype == "dial_timeout" {
|
||||
telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonTimeout)
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonTimeout)
|
||||
} else {
|
||||
telemetry.IncReconnect(context.Background(), c.config.ID, "client", telemetry.ReasonError)
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonError)
|
||||
}
|
||||
return fmt.Errorf("failed to connect to WebSocket: %w", err)
|
||||
}
|
||||
|
||||
telemetry.IncConnAttempt(context.Background(), "websocket", "success")
|
||||
telemetry.ObserveWSConnectLatency(context.Background(), lat, "success", "")
|
||||
telemetry.IncConnAttempt(ctx, "websocket", "success")
|
||||
telemetry.ObserveWSConnectLatency(ctx, lat, "success", "")
|
||||
c.conn = conn
|
||||
c.setConnected(true)
|
||||
c.setMetricsContext(ctx)
|
||||
sessionStart := time.Now()
|
||||
// Wire up pong handler for metrics
|
||||
c.conn.SetPongHandler(func(appData string) error {
|
||||
telemetry.IncWSMessage(context.Background(), "in", "pong")
|
||||
telemetry.IncWSMessage(c.metricsContext(), "in", "pong")
|
||||
return nil
|
||||
})
|
||||
|
||||
// Start the ping monitor
|
||||
go c.pingMonitor()
|
||||
// Start the read pump with disconnect detection
|
||||
go c.readPumpWithDisconnectDetection()
|
||||
go c.readPumpWithDisconnectDetection(sessionStart)
|
||||
|
||||
if c.onConnect != nil {
|
||||
err := c.saveConfig()
|
||||
@@ -566,7 +620,7 @@ func (c *Client) pingMonitor() {
|
||||
c.writeMux.Lock()
|
||||
err := c.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(c.pingTimeout))
|
||||
if err == nil {
|
||||
telemetry.IncWSMessage(context.Background(), "out", "ping")
|
||||
telemetry.IncWSMessage(c.metricsContext(), "out", "ping")
|
||||
}
|
||||
c.writeMux.Unlock()
|
||||
if err != nil {
|
||||
@@ -577,6 +631,7 @@ func (c *Client) pingMonitor() {
|
||||
return
|
||||
default:
|
||||
logger.Error("Ping failed: %v", err)
|
||||
telemetry.IncWSKeepaliveFailure(c.metricsContext(), "ping_write")
|
||||
c.reconnect()
|
||||
return
|
||||
}
|
||||
@@ -586,11 +641,19 @@ func (c *Client) pingMonitor() {
|
||||
}
|
||||
|
||||
// readPumpWithDisconnectDetection reads messages and triggers reconnect on error
|
||||
func (c *Client) readPumpWithDisconnectDetection() {
|
||||
func (c *Client) readPumpWithDisconnectDetection(started time.Time) {
|
||||
ctx := c.metricsContext()
|
||||
disconnectReason := "shutdown"
|
||||
disconnectResult := "success"
|
||||
|
||||
defer func() {
|
||||
if c.conn != nil {
|
||||
c.conn.Close()
|
||||
}
|
||||
if !started.IsZero() {
|
||||
telemetry.ObserveWSSessionDuration(ctx, time.Since(started).Seconds(), disconnectResult)
|
||||
}
|
||||
telemetry.IncWSDisconnect(ctx, disconnectReason, disconnectResult)
|
||||
// Only attempt reconnect if we're not shutting down
|
||||
select {
|
||||
case <-c.done:
|
||||
@@ -604,12 +667,14 @@ func (c *Client) readPumpWithDisconnectDetection() {
|
||||
for {
|
||||
select {
|
||||
case <-c.done:
|
||||
disconnectReason = "shutdown"
|
||||
disconnectResult = "success"
|
||||
return
|
||||
default:
|
||||
var msg WSMessage
|
||||
err := c.conn.ReadJSON(&msg)
|
||||
if err == nil {
|
||||
telemetry.IncWSMessage(context.Background(), "in", "text")
|
||||
telemetry.IncWSMessage(c.metricsContext(), "in", "text")
|
||||
}
|
||||
if err != nil {
|
||||
// Check if we're shutting down before logging error
|
||||
@@ -617,14 +682,19 @@ func (c *Client) readPumpWithDisconnectDetection() {
|
||||
case <-c.done:
|
||||
// Expected during shutdown, don't log as error
|
||||
logger.Debug("WebSocket connection closed during shutdown")
|
||||
disconnectReason = "shutdown"
|
||||
disconnectResult = "success"
|
||||
return
|
||||
default:
|
||||
// Unexpected error during normal operation
|
||||
disconnectResult, disconnectReason = classifyWSDisconnect(err)
|
||||
if disconnectResult == "error" {
|
||||
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure, websocket.CloseNormalClosure) {
|
||||
logger.Error("WebSocket read error: %v", err)
|
||||
} else {
|
||||
logger.Debug("WebSocket connection closed: %v", err)
|
||||
}
|
||||
}
|
||||
return // triggers reconnect via defer
|
||||
}
|
||||
}
|
||||
|
||||
24
wg/wg.go
24
wg/wg.go
@@ -280,6 +280,15 @@ func (s *WireGuardService) LoadRemoteConfig() error {
|
||||
}
|
||||
|
||||
func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
ctx := context.Background()
|
||||
if s.client != nil {
|
||||
ctx = s.client.MetricsContext()
|
||||
}
|
||||
result := "success"
|
||||
defer func() {
|
||||
telemetry.IncConfigReload(ctx, result)
|
||||
}()
|
||||
|
||||
var config WgConfig
|
||||
|
||||
logger.Debug("Received message: %v", msg)
|
||||
@@ -288,11 +297,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
jsonData, err := json.Marshal(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error marshaling data: %v", err)
|
||||
result = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(jsonData, &config); err != nil {
|
||||
logger.Info("Error unmarshaling target data: %v", err)
|
||||
result = "failure"
|
||||
return
|
||||
}
|
||||
s.config = config
|
||||
@@ -303,27 +314,28 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
}
|
||||
|
||||
// telemetry: config reload success
|
||||
telemetry.IncConfigReload(context.Background(), "success")
|
||||
// Optional reconnect reason mapping: config change
|
||||
if s.serverPubKey != "" {
|
||||
telemetry.IncReconnect(context.Background(), s.serverPubKey, "client", telemetry.ReasonConfigChange)
|
||||
telemetry.IncReconnect(ctx, s.serverPubKey, "client", telemetry.ReasonConfigChange)
|
||||
}
|
||||
|
||||
// Ensure the WireGuard interface and peers are configured
|
||||
start := time.Now()
|
||||
if err := s.ensureWireguardInterface(config); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard interface: %v", err)
|
||||
telemetry.ObserveConfigApply(context.Background(), "interface", "failure", time.Since(start).Seconds())
|
||||
telemetry.ObserveConfigApply(ctx, "interface", "failure", time.Since(start).Seconds())
|
||||
result = "failure"
|
||||
} else {
|
||||
telemetry.ObserveConfigApply(context.Background(), "interface", "success", time.Since(start).Seconds())
|
||||
telemetry.ObserveConfigApply(ctx, "interface", "success", time.Since(start).Seconds())
|
||||
}
|
||||
|
||||
startPeers := time.Now()
|
||||
if err := s.ensureWireguardPeers(config.Peers); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard peers: %v", err)
|
||||
telemetry.ObserveConfigApply(context.Background(), "peer", "failure", time.Since(startPeers).Seconds())
|
||||
telemetry.ObserveConfigApply(ctx, "peer", "failure", time.Since(startPeers).Seconds())
|
||||
result = "failure"
|
||||
} else {
|
||||
telemetry.ObserveConfigApply(context.Background(), "peer", "success", time.Since(startPeers).Seconds())
|
||||
telemetry.ObserveConfigApply(ctx, "peer", "success", time.Since(startPeers).Seconds())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user