Adding OpenTelemetry Metrics and Tracing

This commit is contained in:
Marc Schäfer
2025-10-11 18:19:51 +02:00
parent b383cec0b0
commit c086e69dd0
30 changed files with 3457 additions and 158 deletions

206
main.go
View File

@@ -1,7 +1,9 @@
package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"net"
@@ -22,6 +24,9 @@ import (
"github.com/fosrl/newt/updates"
"github.com/fosrl/newt/websocket"
"github.com/fosrl/newt/internal/state"
"github.com/fosrl/newt/internal/telemetry"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
"golang.zx2c4.com/wireguard/conn"
"golang.zx2c4.com/wireguard/device"
"golang.zx2c4.com/wireguard/tun"
@@ -91,6 +96,14 @@ func (s *stringSlice) Set(value string) error {
return nil
}
const (
fmtErrMarshaling = "Error marshaling data: %v"
fmtReceivedMsg = "Received: %+v"
topicWGRegister = "newt/wg/register"
msgNoTunnelOrProxy = "No tunnel IP or proxy manager available"
fmtErrParsingTargetData = "Error parsing target data: %v"
)
var (
endpoint string
id string
@@ -120,8 +133,17 @@ var (
preferEndpoint string
healthMonitor *healthcheck.Monitor
enforceHealthcheckCert bool
blueprintFile string
noCloud bool
// Build/version (can be overridden via -ldflags "-X main.newtVersion=...")
newtVersion = "version_replaceme"
// Observability/metrics flags
metricsEnabled bool
otlpEnabled bool
adminAddr string
region string
metricsAsyncBytes bool
blueprintFile string
noCloud bool
// New mTLS configuration variables
tlsClientCert string
@@ -133,6 +155,10 @@ var (
)
func main() {
// Prepare context for graceful shutdown and signal handling
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop()
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
id = os.Getenv("NEWT_ID")
@@ -143,6 +169,14 @@ func main() {
updownScript = os.Getenv("UPDOWN_SCRIPT")
interfaceName = os.Getenv("INTERFACE")
generateAndSaveKeyTo = os.Getenv("GENERATE_AND_SAVE_KEY_TO")
// Metrics/observability env mirrors
metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
regionEnv := os.Getenv("NEWT_REGION")
asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
keepInterfaceEnv := os.Getenv("KEEP_INTERFACE")
keepInterface = keepInterfaceEnv == "true"
acceptClientsEnv := os.Getenv("ACCEPT_CLIENTS")
@@ -286,6 +320,43 @@ func main() {
flag.BoolVar(&noCloud, "no-cloud", false, "Disable cloud failover")
}
// Metrics/observability flags (mirror ENV if unset)
if metricsEnabledEnv == "" {
flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
} else {
if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil {
metricsEnabled = v
} else {
metricsEnabled = true
}
}
if otlpEnabledEnv == "" {
flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
} else {
if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil {
otlpEnabled = v
}
}
if adminAddrEnv == "" {
flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
} else {
adminAddr = adminAddrEnv
}
// Async bytes toggle
if asyncBytesEnv == "" {
flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
} else {
if v, err := strconv.ParseBool(asyncBytesEnv); err == nil {
metricsAsyncBytes = v
}
}
// Optional region flag (resource attribute)
if regionEnv == "" {
flag.StringVar(&region, "region", "", "Optional region resource attribute (also NEWT_REGION)")
} else {
region = regionEnv
}
// do a --version check
version := flag.Bool("version", false, "Print the version")
@@ -300,12 +371,57 @@ func main() {
loggerLevel := parseLogLevel(logLevel)
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
newtVersion := "version_replaceme"
// Initialize telemetry after flags are parsed (so flags override env)
tcfg := telemetry.FromEnv()
tcfg.PromEnabled = metricsEnabled
tcfg.OTLPEnabled = otlpEnabled
if adminAddr != "" {
tcfg.AdminAddr = adminAddr
}
// Resource attributes (if available)
tcfg.SiteID = id
tcfg.Region = region
// Build info
tcfg.BuildVersion = newtVersion
tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
tel, telErr := telemetry.Init(ctx, tcfg)
if telErr != nil {
logger.Warn("Telemetry init failed: %v", telErr)
}
if tel != nil {
// Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
mux := http.NewServeMux()
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
if tel.PrometheusHandler != nil {
mux.Handle("/metrics", tel.PrometheusHandler)
}
admin := &http.Server{
Addr: tcfg.AdminAddr,
Handler: otelhttp.NewHandler(mux, "newt-admin"),
ReadTimeout: 5 * time.Second,
WriteTimeout: 10 * time.Second,
ReadHeaderTimeout: 5 * time.Second,
IdleTimeout: 30 * time.Second,
}
go func() {
if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
logger.Warn("admin http error: %v", err)
}
}()
defer func() {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
_ = admin.Shutdown(ctx)
}()
defer func() { _ = tel.Shutdown(context.Background()) }()
}
if *version {
fmt.Println("Newt version " + newtVersion)
os.Exit(0)
} else {
logger.Info("Newt version " + newtVersion)
logger.Info("Newt version %s", newtVersion)
}
if err := updates.CheckForUpdate("fosrl", "newt", newtVersion); err != nil {
@@ -376,6 +492,8 @@ func main() {
}
endpoint = client.GetConfig().Endpoint // Update endpoint from config
id = client.GetConfig().ID // Update ID from config
// Update site labels for metrics with the resolved ID
telemetry.UpdateSiteInfo(id, region)
// output env var values if set
logger.Debug("Endpoint: %v", endpoint)
@@ -484,6 +602,10 @@ func main() {
// Register handlers for different message types
client.RegisterHandler("newt/wg/connect", func(msg websocket.WSMessage) {
logger.Debug("Received registration message")
regResult := "success"
defer func() {
telemetry.IncSiteRegistration(ctx, regResult)
}()
if stopFunc != nil {
stopFunc() // stop the ws from sending more requests
stopFunc = nil // reset stopFunc to nil to avoid double stopping
@@ -502,22 +624,25 @@ func main() {
jsonData, err := json.Marshal(msg.Data)
if err != nil {
logger.Info("Error marshaling data: %v", err)
logger.Info(fmtErrMarshaling, err)
regResult = "failure"
return
}
if err := json.Unmarshal(jsonData, &wgData); err != nil {
logger.Info("Error unmarshaling target data: %v", err)
regResult = "failure"
return
}
logger.Debug("Received: %+v", msg)
logger.Debug(fmtReceivedMsg, msg)
tun, tnet, err = netstack.CreateNetTUN(
[]netip.Addr{netip.MustParseAddr(wgData.TunnelIP)},
[]netip.Addr{netip.MustParseAddr(dns)},
mtuInt)
if err != nil {
logger.Error("Failed to create TUN device: %v", err)
regResult = "failure"
}
setDownstreamTNetstack(tnet)
@@ -531,6 +656,7 @@ func main() {
host, _, err := net.SplitHostPort(wgData.Endpoint)
if err != nil {
logger.Error("Failed to split endpoint: %v", err)
regResult = "failure"
return
}
@@ -539,6 +665,7 @@ func main() {
endpoint, err := resolveDomain(wgData.Endpoint)
if err != nil {
logger.Error("Failed to resolve endpoint: %v", err)
regResult = "failure"
return
}
@@ -554,12 +681,14 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
err = dev.IpcSet(config)
if err != nil {
logger.Error("Failed to configure WireGuard device: %v", err)
regResult = "failure"
}
// Bring up the device
err = dev.Up()
if err != nil {
logger.Error("Failed to bring up WireGuard device: %v", err)
regResult = "failure"
}
logger.Debug("WireGuard device created. Lets ping the server now...")
@@ -572,9 +701,13 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
}
// Use reliable ping for initial connection test
logger.Debug("Testing initial connection with reliable ping...")
_, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
if err == nil && wgData.PublicKey != "" {
telemetry.ObserveTunnelLatency(ctx, wgData.PublicKey, "wireguard", lat.Seconds())
}
if err != nil {
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
regResult = "failure"
} else {
logger.Debug("Initial connection test successful")
}
@@ -585,11 +718,14 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
// as the pings will continue in the background
if !connected {
logger.Debug("Starting ping check")
pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
}
// Create proxy manager
pm = proxy.NewProxyManager(tnet)
pm.SetAsyncBytes(metricsAsyncBytes)
// Set tunnel_id for metrics (WireGuard peer public key)
pm.SetTunnelID(wgData.PublicKey)
connected = true
@@ -626,10 +762,19 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
logger.Info("Received reconnect message")
if wgData.PublicKey != "" {
telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest)
}
// Close the WireGuard device and TUN
closeWgTunnel()
// Clear metrics attrs and sessions for the tunnel
if pm != nil {
pm.ClearTunnelID()
state.Global().ClearTunnel(wgData.PublicKey)
}
// Mark as disconnected
connected = false
@@ -648,6 +793,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
logger.Info("Received termination message")
if wgData.PublicKey != "" {
telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest)
}
// Close the WireGuard device and TUN
closeWgTunnel()
@@ -675,7 +823,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
jsonData, err := json.Marshal(msg.Data)
if err != nil {
logger.Info("Error marshaling data: %v", err)
logger.Info(fmtErrMarshaling, err)
return
}
if err := json.Unmarshal(jsonData, &exitNodeData); err != nil {
@@ -716,7 +864,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
},
}
stopFunc = client.SendMessageInterval("newt/wg/register", map[string]interface{}{
stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
"publicKey": publicKey.String(),
"pingResults": pingResults,
"newtVersion": newtVersion,
@@ -819,7 +967,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
}
// Send the ping results to the cloud for selection
stopFunc = client.SendMessageInterval("newt/wg/register", map[string]interface{}{
stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
"publicKey": publicKey.String(),
"pingResults": pingResults,
"newtVersion": newtVersion,
@@ -829,17 +977,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
})
client.RegisterHandler("newt/tcp/add", func(msg websocket.WSMessage) {
logger.Debug("Received: %+v", msg)
logger.Debug(fmtReceivedMsg, msg)
// if there is no wgData or pm, we can't add targets
if wgData.TunnelIP == "" || pm == nil {
logger.Info("No tunnel IP or proxy manager available")
logger.Info(msgNoTunnelOrProxy)
return
}
targetData, err := parseTargetData(msg.Data)
if err != nil {
logger.Info("Error parsing target data: %v", err)
logger.Info(fmtErrParsingTargetData, err)
return
}
@@ -854,17 +1002,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
})
client.RegisterHandler("newt/udp/add", func(msg websocket.WSMessage) {
logger.Info("Received: %+v", msg)
logger.Info(fmtReceivedMsg, msg)
// if there is no wgData or pm, we can't add targets
if wgData.TunnelIP == "" || pm == nil {
logger.Info("No tunnel IP or proxy manager available")
logger.Info(msgNoTunnelOrProxy)
return
}
targetData, err := parseTargetData(msg.Data)
if err != nil {
logger.Info("Error parsing target data: %v", err)
logger.Info(fmtErrParsingTargetData, err)
return
}
@@ -879,17 +1027,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
})
client.RegisterHandler("newt/udp/remove", func(msg websocket.WSMessage) {
logger.Info("Received: %+v", msg)
logger.Info(fmtReceivedMsg, msg)
// if there is no wgData or pm, we can't add targets
if wgData.TunnelIP == "" || pm == nil {
logger.Info("No tunnel IP or proxy manager available")
logger.Info(msgNoTunnelOrProxy)
return
}
targetData, err := parseTargetData(msg.Data)
if err != nil {
logger.Info("Error parsing target data: %v", err)
logger.Info(fmtErrParsingTargetData, err)
return
}
@@ -904,17 +1052,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
})
client.RegisterHandler("newt/tcp/remove", func(msg websocket.WSMessage) {
logger.Info("Received: %+v", msg)
logger.Info(fmtReceivedMsg, msg)
// if there is no wgData or pm, we can't add targets
if wgData.TunnelIP == "" || pm == nil {
logger.Info("No tunnel IP or proxy manager available")
logger.Info(msgNoTunnelOrProxy)
return
}
targetData, err := parseTargetData(msg.Data)
if err != nil {
logger.Info("Error parsing target data: %v", err)
logger.Info(fmtErrParsingTargetData, err)
return
}
@@ -998,7 +1146,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
jsonData, err := json.Marshal(msg.Data)
if err != nil {
logger.Info("Error marshaling data: %v", err)
logger.Info(fmtErrMarshaling, err)
return
}
if err := json.Unmarshal(jsonData, &sshPublicKeyData); err != nil {
@@ -1155,9 +1303,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
}
if err := healthMonitor.EnableTarget(requestData.ID); err != nil {
logger.Error("Failed to enable health check target %s: %v", requestData.ID, err)
logger.Error("Failed to enable health check target %d: %v", requestData.ID, err)
} else {
logger.Info("Enabled health check target: %s", requestData.ID)
logger.Info("Enabled health check target: %d", requestData.ID)
}
})
@@ -1180,9 +1328,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
}
if err := healthMonitor.DisableTarget(requestData.ID); err != nil {
logger.Error("Failed to disable health check target %s: %v", requestData.ID, err)
logger.Error("Failed to disable health check target %d: %v", requestData.ID, err)
} else {
logger.Info("Disabled health check target: %s", requestData.ID)
logger.Info("Disabled health check target: %d", requestData.ID)
}
})
@@ -1252,7 +1400,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
}
// Send registration message to the server for backward compatibility
err := client.SendMessage("newt/wg/register", map[string]interface{}{
err := client.SendMessage(topicWGRegister, map[string]interface{}{
"publicKey": publicKey.String(),
"newtVersion": newtVersion,
"backwardsCompatible": true,