mirror of
https://github.com/fosrl/newt.git
synced 2026-03-02 08:46:42 +00:00
Adding OpenTelemetry Metrics and Tracing
This commit is contained in:
206
main.go
206
main.go
@@ -1,7 +1,9 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net"
|
||||
@@ -22,6 +24,9 @@ import (
|
||||
"github.com/fosrl/newt/updates"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
|
||||
"github.com/fosrl/newt/internal/state"
|
||||
"github.com/fosrl/newt/internal/telemetry"
|
||||
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
@@ -91,6 +96,14 @@ func (s *stringSlice) Set(value string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
const (
|
||||
fmtErrMarshaling = "Error marshaling data: %v"
|
||||
fmtReceivedMsg = "Received: %+v"
|
||||
topicWGRegister = "newt/wg/register"
|
||||
msgNoTunnelOrProxy = "No tunnel IP or proxy manager available"
|
||||
fmtErrParsingTargetData = "Error parsing target data: %v"
|
||||
)
|
||||
|
||||
var (
|
||||
endpoint string
|
||||
id string
|
||||
@@ -120,8 +133,17 @@ var (
|
||||
preferEndpoint string
|
||||
healthMonitor *healthcheck.Monitor
|
||||
enforceHealthcheckCert bool
|
||||
blueprintFile string
|
||||
noCloud bool
|
||||
// Build/version (can be overridden via -ldflags "-X main.newtVersion=...")
|
||||
newtVersion = "version_replaceme"
|
||||
|
||||
// Observability/metrics flags
|
||||
metricsEnabled bool
|
||||
otlpEnabled bool
|
||||
adminAddr string
|
||||
region string
|
||||
metricsAsyncBytes bool
|
||||
blueprintFile string
|
||||
noCloud bool
|
||||
|
||||
// New mTLS configuration variables
|
||||
tlsClientCert string
|
||||
@@ -133,6 +155,10 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Prepare context for graceful shutdown and signal handling
|
||||
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
defer stop()
|
||||
|
||||
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
|
||||
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
|
||||
id = os.Getenv("NEWT_ID")
|
||||
@@ -143,6 +169,14 @@ func main() {
|
||||
updownScript = os.Getenv("UPDOWN_SCRIPT")
|
||||
interfaceName = os.Getenv("INTERFACE")
|
||||
generateAndSaveKeyTo = os.Getenv("GENERATE_AND_SAVE_KEY_TO")
|
||||
|
||||
// Metrics/observability env mirrors
|
||||
metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
|
||||
otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
|
||||
adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
|
||||
regionEnv := os.Getenv("NEWT_REGION")
|
||||
asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
|
||||
|
||||
keepInterfaceEnv := os.Getenv("KEEP_INTERFACE")
|
||||
keepInterface = keepInterfaceEnv == "true"
|
||||
acceptClientsEnv := os.Getenv("ACCEPT_CLIENTS")
|
||||
@@ -286,6 +320,43 @@ func main() {
|
||||
flag.BoolVar(&noCloud, "no-cloud", false, "Disable cloud failover")
|
||||
}
|
||||
|
||||
// Metrics/observability flags (mirror ENV if unset)
|
||||
if metricsEnabledEnv == "" {
|
||||
flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
|
||||
} else {
|
||||
if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil {
|
||||
metricsEnabled = v
|
||||
} else {
|
||||
metricsEnabled = true
|
||||
}
|
||||
}
|
||||
if otlpEnabledEnv == "" {
|
||||
flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
} else {
|
||||
if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil {
|
||||
otlpEnabled = v
|
||||
}
|
||||
}
|
||||
if adminAddrEnv == "" {
|
||||
flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
|
||||
} else {
|
||||
adminAddr = adminAddrEnv
|
||||
}
|
||||
// Async bytes toggle
|
||||
if asyncBytesEnv == "" {
|
||||
flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
|
||||
} else {
|
||||
if v, err := strconv.ParseBool(asyncBytesEnv); err == nil {
|
||||
metricsAsyncBytes = v
|
||||
}
|
||||
}
|
||||
// Optional region flag (resource attribute)
|
||||
if regionEnv == "" {
|
||||
flag.StringVar(®ion, "region", "", "Optional region resource attribute (also NEWT_REGION)")
|
||||
} else {
|
||||
region = regionEnv
|
||||
}
|
||||
|
||||
// do a --version check
|
||||
version := flag.Bool("version", false, "Print the version")
|
||||
|
||||
@@ -300,12 +371,57 @@ func main() {
|
||||
loggerLevel := parseLogLevel(logLevel)
|
||||
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
|
||||
|
||||
newtVersion := "version_replaceme"
|
||||
// Initialize telemetry after flags are parsed (so flags override env)
|
||||
tcfg := telemetry.FromEnv()
|
||||
tcfg.PromEnabled = metricsEnabled
|
||||
tcfg.OTLPEnabled = otlpEnabled
|
||||
if adminAddr != "" {
|
||||
tcfg.AdminAddr = adminAddr
|
||||
}
|
||||
// Resource attributes (if available)
|
||||
tcfg.SiteID = id
|
||||
tcfg.Region = region
|
||||
// Build info
|
||||
tcfg.BuildVersion = newtVersion
|
||||
tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
|
||||
|
||||
tel, telErr := telemetry.Init(ctx, tcfg)
|
||||
if telErr != nil {
|
||||
logger.Warn("Telemetry init failed: %v", telErr)
|
||||
}
|
||||
if tel != nil {
|
||||
// Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
|
||||
if tel.PrometheusHandler != nil {
|
||||
mux.Handle("/metrics", tel.PrometheusHandler)
|
||||
}
|
||||
admin := &http.Server{
|
||||
Addr: tcfg.AdminAddr,
|
||||
Handler: otelhttp.NewHandler(mux, "newt-admin"),
|
||||
ReadTimeout: 5 * time.Second,
|
||||
WriteTimeout: 10 * time.Second,
|
||||
ReadHeaderTimeout: 5 * time.Second,
|
||||
IdleTimeout: 30 * time.Second,
|
||||
}
|
||||
go func() {
|
||||
if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
logger.Warn("admin http error: %v", err)
|
||||
}
|
||||
}()
|
||||
defer func() {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
_ = admin.Shutdown(ctx)
|
||||
}()
|
||||
defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
}
|
||||
|
||||
if *version {
|
||||
fmt.Println("Newt version " + newtVersion)
|
||||
os.Exit(0)
|
||||
} else {
|
||||
logger.Info("Newt version " + newtVersion)
|
||||
logger.Info("Newt version %s", newtVersion)
|
||||
}
|
||||
|
||||
if err := updates.CheckForUpdate("fosrl", "newt", newtVersion); err != nil {
|
||||
@@ -376,6 +492,8 @@ func main() {
|
||||
}
|
||||
endpoint = client.GetConfig().Endpoint // Update endpoint from config
|
||||
id = client.GetConfig().ID // Update ID from config
|
||||
// Update site labels for metrics with the resolved ID
|
||||
telemetry.UpdateSiteInfo(id, region)
|
||||
|
||||
// output env var values if set
|
||||
logger.Debug("Endpoint: %v", endpoint)
|
||||
@@ -484,6 +602,10 @@ func main() {
|
||||
// Register handlers for different message types
|
||||
client.RegisterHandler("newt/wg/connect", func(msg websocket.WSMessage) {
|
||||
logger.Debug("Received registration message")
|
||||
regResult := "success"
|
||||
defer func() {
|
||||
telemetry.IncSiteRegistration(ctx, regResult)
|
||||
}()
|
||||
if stopFunc != nil {
|
||||
stopFunc() // stop the ws from sending more requests
|
||||
stopFunc = nil // reset stopFunc to nil to avoid double stopping
|
||||
@@ -502,22 +624,25 @@ func main() {
|
||||
|
||||
jsonData, err := json.Marshal(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error marshaling data: %v", err)
|
||||
logger.Info(fmtErrMarshaling, err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(jsonData, &wgData); err != nil {
|
||||
logger.Info("Error unmarshaling target data: %v", err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
logger.Debug("Received: %+v", msg)
|
||||
logger.Debug(fmtReceivedMsg, msg)
|
||||
tun, tnet, err = netstack.CreateNetTUN(
|
||||
[]netip.Addr{netip.MustParseAddr(wgData.TunnelIP)},
|
||||
[]netip.Addr{netip.MustParseAddr(dns)},
|
||||
mtuInt)
|
||||
if err != nil {
|
||||
logger.Error("Failed to create TUN device: %v", err)
|
||||
regResult = "failure"
|
||||
}
|
||||
|
||||
setDownstreamTNetstack(tnet)
|
||||
@@ -531,6 +656,7 @@ func main() {
|
||||
host, _, err := net.SplitHostPort(wgData.Endpoint)
|
||||
if err != nil {
|
||||
logger.Error("Failed to split endpoint: %v", err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
@@ -539,6 +665,7 @@ func main() {
|
||||
endpoint, err := resolveDomain(wgData.Endpoint)
|
||||
if err != nil {
|
||||
logger.Error("Failed to resolve endpoint: %v", err)
|
||||
regResult = "failure"
|
||||
return
|
||||
}
|
||||
|
||||
@@ -554,12 +681,14 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
err = dev.IpcSet(config)
|
||||
if err != nil {
|
||||
logger.Error("Failed to configure WireGuard device: %v", err)
|
||||
regResult = "failure"
|
||||
}
|
||||
|
||||
// Bring up the device
|
||||
err = dev.Up()
|
||||
if err != nil {
|
||||
logger.Error("Failed to bring up WireGuard device: %v", err)
|
||||
regResult = "failure"
|
||||
}
|
||||
|
||||
logger.Debug("WireGuard device created. Lets ping the server now...")
|
||||
@@ -572,9 +701,13 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
// Use reliable ping for initial connection test
|
||||
logger.Debug("Testing initial connection with reliable ping...")
|
||||
_, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
if err == nil && wgData.PublicKey != "" {
|
||||
telemetry.ObserveTunnelLatency(ctx, wgData.PublicKey, "wireguard", lat.Seconds())
|
||||
}
|
||||
if err != nil {
|
||||
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
|
||||
regResult = "failure"
|
||||
} else {
|
||||
logger.Debug("Initial connection test successful")
|
||||
}
|
||||
@@ -585,11 +718,14 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
// as the pings will continue in the background
|
||||
if !connected {
|
||||
logger.Debug("Starting ping check")
|
||||
pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
|
||||
pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
|
||||
}
|
||||
|
||||
// Create proxy manager
|
||||
pm = proxy.NewProxyManager(tnet)
|
||||
pm.SetAsyncBytes(metricsAsyncBytes)
|
||||
// Set tunnel_id for metrics (WireGuard peer public key)
|
||||
pm.SetTunnelID(wgData.PublicKey)
|
||||
|
||||
connected = true
|
||||
|
||||
@@ -626,10 +762,19 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received reconnect message")
|
||||
if wgData.PublicKey != "" {
|
||||
telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest)
|
||||
}
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
|
||||
// Clear metrics attrs and sessions for the tunnel
|
||||
if pm != nil {
|
||||
pm.ClearTunnelID()
|
||||
state.Global().ClearTunnel(wgData.PublicKey)
|
||||
}
|
||||
|
||||
// Mark as disconnected
|
||||
connected = false
|
||||
|
||||
@@ -648,6 +793,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received termination message")
|
||||
if wgData.PublicKey != "" {
|
||||
telemetry.IncReconnect(ctx, wgData.PublicKey, "server", telemetry.ReasonServerRequest)
|
||||
}
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
@@ -675,7 +823,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
jsonData, err := json.Marshal(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error marshaling data: %v", err)
|
||||
logger.Info(fmtErrMarshaling, err)
|
||||
return
|
||||
}
|
||||
if err := json.Unmarshal(jsonData, &exitNodeData); err != nil {
|
||||
@@ -716,7 +864,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
},
|
||||
}
|
||||
|
||||
stopFunc = client.SendMessageInterval("newt/wg/register", map[string]interface{}{
|
||||
stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
|
||||
"publicKey": publicKey.String(),
|
||||
"pingResults": pingResults,
|
||||
"newtVersion": newtVersion,
|
||||
@@ -819,7 +967,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
|
||||
// Send the ping results to the cloud for selection
|
||||
stopFunc = client.SendMessageInterval("newt/wg/register", map[string]interface{}{
|
||||
stopFunc = client.SendMessageInterval(topicWGRegister, map[string]interface{}{
|
||||
"publicKey": publicKey.String(),
|
||||
"pingResults": pingResults,
|
||||
"newtVersion": newtVersion,
|
||||
@@ -829,17 +977,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
})
|
||||
|
||||
client.RegisterHandler("newt/tcp/add", func(msg websocket.WSMessage) {
|
||||
logger.Debug("Received: %+v", msg)
|
||||
logger.Debug(fmtReceivedMsg, msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if wgData.TunnelIP == "" || pm == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
logger.Info(msgNoTunnelOrProxy)
|
||||
return
|
||||
}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error parsing target data: %v", err)
|
||||
logger.Info(fmtErrParsingTargetData, err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -854,17 +1002,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
})
|
||||
|
||||
client.RegisterHandler("newt/udp/add", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received: %+v", msg)
|
||||
logger.Info(fmtReceivedMsg, msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if wgData.TunnelIP == "" || pm == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
logger.Info(msgNoTunnelOrProxy)
|
||||
return
|
||||
}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error parsing target data: %v", err)
|
||||
logger.Info(fmtErrParsingTargetData, err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -879,17 +1027,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
})
|
||||
|
||||
client.RegisterHandler("newt/udp/remove", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received: %+v", msg)
|
||||
logger.Info(fmtReceivedMsg, msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if wgData.TunnelIP == "" || pm == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
logger.Info(msgNoTunnelOrProxy)
|
||||
return
|
||||
}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error parsing target data: %v", err)
|
||||
logger.Info(fmtErrParsingTargetData, err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -904,17 +1052,17 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
})
|
||||
|
||||
client.RegisterHandler("newt/tcp/remove", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received: %+v", msg)
|
||||
logger.Info(fmtReceivedMsg, msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if wgData.TunnelIP == "" || pm == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
logger.Info(msgNoTunnelOrProxy)
|
||||
return
|
||||
}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error parsing target data: %v", err)
|
||||
logger.Info(fmtErrParsingTargetData, err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -998,7 +1146,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
jsonData, err := json.Marshal(msg.Data)
|
||||
if err != nil {
|
||||
logger.Info("Error marshaling data: %v", err)
|
||||
logger.Info(fmtErrMarshaling, err)
|
||||
return
|
||||
}
|
||||
if err := json.Unmarshal(jsonData, &sshPublicKeyData); err != nil {
|
||||
@@ -1155,9 +1303,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
|
||||
if err := healthMonitor.EnableTarget(requestData.ID); err != nil {
|
||||
logger.Error("Failed to enable health check target %s: %v", requestData.ID, err)
|
||||
logger.Error("Failed to enable health check target %d: %v", requestData.ID, err)
|
||||
} else {
|
||||
logger.Info("Enabled health check target: %s", requestData.ID)
|
||||
logger.Info("Enabled health check target: %d", requestData.ID)
|
||||
}
|
||||
})
|
||||
|
||||
@@ -1180,9 +1328,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
|
||||
if err := healthMonitor.DisableTarget(requestData.ID); err != nil {
|
||||
logger.Error("Failed to disable health check target %s: %v", requestData.ID, err)
|
||||
logger.Error("Failed to disable health check target %d: %v", requestData.ID, err)
|
||||
} else {
|
||||
logger.Info("Disabled health check target: %s", requestData.ID)
|
||||
logger.Info("Disabled health check target: %d", requestData.ID)
|
||||
}
|
||||
})
|
||||
|
||||
@@ -1252,7 +1400,7 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
|
||||
// Send registration message to the server for backward compatibility
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
err := client.SendMessage(topicWGRegister, map[string]interface{}{
|
||||
"publicKey": publicKey.String(),
|
||||
"newtVersion": newtVersion,
|
||||
"backwardsCompatible": true,
|
||||
|
||||
Reference in New Issue
Block a user