Adding OpenTelemetry Metrics and Tracing

This commit is contained in:
Marc Schäfer
2025-10-11 18:19:51 +02:00
parent b383cec0b0
commit c086e69dd0
30 changed files with 3457 additions and 158 deletions

View File

@@ -0,0 +1,80 @@
package state
import (
"sync"
"sync/atomic"
"time"
"github.com/fosrl/newt/internal/telemetry"
)
// TelemetryView is a minimal, thread-safe implementation to feed observables.
// Since one Newt process represents one site, we expose a single logical site.
// site_id is a resource attribute, so we do not emit per-site labels here.
type TelemetryView struct {
online atomic.Bool
lastHBUnix atomic.Int64 // unix seconds
// per-tunnel sessions
sessMu sync.RWMutex
sessions map[string]*atomic.Int64
}
var (
globalView atomic.Pointer[TelemetryView]
)
// Global returns a singleton TelemetryView.
func Global() *TelemetryView {
if v := globalView.Load(); v != nil { return v }
v := &TelemetryView{ sessions: make(map[string]*atomic.Int64) }
globalView.Store(v)
telemetry.RegisterStateView(v)
return v
}
// Instrumentation helpers
func (v *TelemetryView) IncSessions(tunnelID string) {
v.sessMu.Lock(); defer v.sessMu.Unlock()
c := v.sessions[tunnelID]
if c == nil { c = &atomic.Int64{}; v.sessions[tunnelID] = c }
c.Add(1)
}
func (v *TelemetryView) DecSessions(tunnelID string) {
v.sessMu.Lock(); defer v.sessMu.Unlock()
if c := v.sessions[tunnelID]; c != nil {
c.Add(-1)
if c.Load() <= 0 { delete(v.sessions, tunnelID) }
}
}
func (v *TelemetryView) ClearTunnel(tunnelID string) {
v.sessMu.Lock(); defer v.sessMu.Unlock()
delete(v.sessions, tunnelID)
}
func (v *TelemetryView) SetOnline(b bool) { v.online.Store(b) }
func (v *TelemetryView) TouchHeartbeat() { v.lastHBUnix.Store(time.Now().Unix()) }
// --- telemetry.StateView interface ---
func (v *TelemetryView) ListSites() []string { return []string{"self"} }
func (v *TelemetryView) Online(_ string) (bool, bool) { return v.online.Load(), true }
func (v *TelemetryView) LastHeartbeat(_ string) (time.Time, bool) {
sec := v.lastHBUnix.Load()
if sec == 0 { return time.Time{}, false }
return time.Unix(sec, 0), true
}
func (v *TelemetryView) ActiveSessions(_ string) (int64, bool) {
// aggregated sessions (not used for per-tunnel gauge)
v.sessMu.RLock(); defer v.sessMu.RUnlock()
var sum int64
for _, c := range v.sessions { if c != nil { sum += c.Load() } }
return sum, true
}
// Extended accessor used by telemetry callback to publish per-tunnel samples.
func (v *TelemetryView) SessionsByTunnel() map[string]int64 {
v.sessMu.RLock(); defer v.sessMu.RUnlock()
out := make(map[string]int64, len(v.sessions))
for id, c := range v.sessions { if c != nil && c.Load() > 0 { out[id] = c.Load() } }
return out
}

View File

@@ -0,0 +1,19 @@
package telemetry
// Protocol labels (low-cardinality)
const (
ProtocolTCP = "tcp"
ProtocolUDP = "udp"
)
// Reconnect reason bins (fixed, low-cardinality)
const (
ReasonServerRequest = "server_request"
ReasonTimeout = "timeout"
ReasonPeerClose = "peer_close"
ReasonNetworkChange = "network_change"
ReasonAuthError = "auth_error"
ReasonHandshakeError = "handshake_error"
ReasonConfigChange = "config_change"
ReasonError = "error"
)

View File

@@ -0,0 +1,32 @@
package telemetry
import "testing"
func TestAllowedConstants(t *testing.T) {
allowedReasons := map[string]struct{}{
ReasonServerRequest: {},
ReasonTimeout: {},
ReasonPeerClose: {},
ReasonNetworkChange: {},
ReasonAuthError: {},
ReasonHandshakeError: {},
ReasonConfigChange: {},
ReasonError: {},
}
for k := range allowedReasons {
if k == "" {
t.Fatalf("empty reason constant")
}
}
allowedProtocols := map[string]struct{}{
ProtocolTCP: {},
ProtocolUDP: {},
}
for k := range allowedProtocols {
if k == "" {
t.Fatalf("empty protocol constant")
}
}
}

View File

@@ -0,0 +1,542 @@
package telemetry
import (
"context"
"sync"
"sync/atomic"
"time"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
)
// Instruments and helpers for Newt metrics following the naming, units, and
// low-cardinality label guidance from the issue description.
//
// Counters end with _total, durations are in seconds, sizes in bytes.
// Only low-cardinality stable labels are supported: tunnel_id,
// transport, direction, result, reason, error_type.
var (
initOnce sync.Once
meter metric.Meter
// Site / Registration
mSiteRegistrations metric.Int64Counter
mSiteOnline metric.Int64ObservableGauge
mSiteLastHeartbeat metric.Float64ObservableGauge
// Tunnel / Sessions
mTunnelSessions metric.Int64ObservableGauge
mTunnelBytes metric.Int64Counter
mTunnelLatency metric.Float64Histogram
mReconnects metric.Int64Counter
// Connection / NAT
mConnAttempts metric.Int64Counter
mConnErrors metric.Int64Counter
// Config/Restart
mConfigReloads metric.Int64Counter
mConfigApply metric.Float64Histogram
mCertRotationTotal metric.Int64Counter
mProcessStartTime metric.Float64ObservableGauge
// Build info
mBuildInfo metric.Int64ObservableGauge
// WebSocket
mWSConnectLatency metric.Float64Histogram
mWSMessages metric.Int64Counter
mWSDisconnects metric.Int64Counter
mWSKeepaliveFailure metric.Int64Counter
mWSSessionDuration metric.Float64Histogram
mWSConnected metric.Int64ObservableGauge
mWSReconnects metric.Int64Counter
// Proxy
mProxyActiveConns metric.Int64ObservableGauge
mProxyBufferBytes metric.Int64ObservableGauge
mProxyAsyncBacklogByte metric.Int64ObservableGauge
mProxyDropsTotal metric.Int64Counter
mProxyAcceptsTotal metric.Int64Counter
mProxyConnDuration metric.Float64Histogram
mProxyConnectionsTotal metric.Int64Counter
buildVersion string
buildCommit string
processStartUnix = float64(time.Now().UnixNano()) / 1e9
wsConnectedState atomic.Int64
)
// Proxy connection lifecycle events.
const (
ProxyConnectionOpened = "opened"
ProxyConnectionClosed = "closed"
)
// attrsWithSite appends site/region labels only when explicitly enabled to keep
// label cardinality low by default.
func attrsWithSite(extra ...attribute.KeyValue) []attribute.KeyValue {
attrs := make([]attribute.KeyValue, len(extra))
copy(attrs, extra)
if ShouldIncludeSiteLabels() {
attrs = append(attrs, siteAttrs()...)
}
return attrs
}
func registerInstruments() error {
var err error
initOnce.Do(func() {
meter = otel.Meter("newt")
if e := registerSiteInstruments(); e != nil {
err = e
return
}
if e := registerTunnelInstruments(); e != nil {
err = e
return
}
if e := registerConnInstruments(); e != nil {
err = e
return
}
if e := registerConfigInstruments(); e != nil {
err = e
return
}
if e := registerBuildWSProxyInstruments(); e != nil {
err = e
return
}
})
return err
}
func registerSiteInstruments() error {
var err error
mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total",
metric.WithDescription("Total site registration attempts"))
if err != nil {
return err
}
mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online",
metric.WithDescription("Site online (0/1)"))
if err != nil {
return err
}
mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_timestamp_seconds",
metric.WithDescription("Unix timestamp of the last site heartbeat"),
metric.WithUnit("s"))
if err != nil {
return err
}
return nil
}
func registerTunnelInstruments() error {
var err error
mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions",
metric.WithDescription("Active tunnel sessions"))
if err != nil {
return err
}
mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total",
metric.WithDescription("Tunnel bytes ingress/egress"),
metric.WithUnit("By"))
if err != nil {
return err
}
mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds",
metric.WithDescription("Per-tunnel latency in seconds"),
metric.WithUnit("s"))
if err != nil {
return err
}
mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total",
metric.WithDescription("Tunnel reconnect events"))
if err != nil {
return err
}
return nil
}
func registerConnInstruments() error {
var err error
mConnAttempts, err = meter.Int64Counter("newt_connection_attempts_total",
metric.WithDescription("Connection attempts"))
if err != nil {
return err
}
mConnErrors, err = meter.Int64Counter("newt_connection_errors_total",
metric.WithDescription("Connection errors by type"))
if err != nil {
return err
}
return nil
}
func registerConfigInstruments() error {
mConfigReloads, _ = meter.Int64Counter("newt_config_reloads_total",
metric.WithDescription("Configuration reloads"))
mConfigApply, _ = meter.Float64Histogram("newt_config_apply_seconds",
metric.WithDescription("Configuration apply duration in seconds"),
metric.WithUnit("s"))
mCertRotationTotal, _ = meter.Int64Counter("newt_cert_rotation_total",
metric.WithDescription("Certificate rotation events (success/failure)"))
mProcessStartTime, _ = meter.Float64ObservableGauge("process_start_time_seconds",
metric.WithDescription("Unix timestamp of the process start time"),
metric.WithUnit("s"))
if mProcessStartTime != nil {
if _, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
o.ObserveFloat64(mProcessStartTime, processStartUnix)
return nil
}, mProcessStartTime); err != nil {
otel.Handle(err)
}
}
return nil
}
func registerBuildWSProxyInstruments() error {
// Build info gauge (value 1 with version/commit attributes)
mBuildInfo, _ = meter.Int64ObservableGauge("newt_build_info",
metric.WithDescription("Newt build information (value is always 1)"))
// WebSocket
mWSConnectLatency, _ = meter.Float64Histogram("newt_websocket_connect_latency_seconds",
metric.WithDescription("WebSocket connect latency in seconds"),
metric.WithUnit("s"))
mWSMessages, _ = meter.Int64Counter("newt_websocket_messages_total",
metric.WithDescription("WebSocket messages by direction and type"))
mWSDisconnects, _ = meter.Int64Counter("newt_websocket_disconnects_total",
metric.WithDescription("WebSocket disconnects by reason/result"))
mWSKeepaliveFailure, _ = meter.Int64Counter("newt_websocket_keepalive_failures_total",
metric.WithDescription("WebSocket keepalive (ping/pong) failures"))
mWSSessionDuration, _ = meter.Float64Histogram("newt_websocket_session_duration_seconds",
metric.WithDescription("Duration of established WebSocket sessions"),
metric.WithUnit("s"))
mWSConnected, _ = meter.Int64ObservableGauge("newt_websocket_connected",
metric.WithDescription("WebSocket connection state (1=connected, 0=disconnected)"))
mWSReconnects, _ = meter.Int64Counter("newt_websocket_reconnects_total",
metric.WithDescription("WebSocket reconnect attempts by reason"))
// Proxy
mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections",
metric.WithDescription("Proxy active connections per tunnel and protocol"))
mProxyBufferBytes, _ = meter.Int64ObservableGauge("newt_proxy_buffer_bytes",
metric.WithDescription("Proxy buffer bytes (may approximate async backlog)"),
metric.WithUnit("By"))
mProxyAsyncBacklogByte, _ = meter.Int64ObservableGauge("newt_proxy_async_backlog_bytes",
metric.WithDescription("Unflushed async byte backlog per tunnel and protocol"),
metric.WithUnit("By"))
mProxyDropsTotal, _ = meter.Int64Counter("newt_proxy_drops_total",
metric.WithDescription("Proxy drops due to write errors"))
mProxyAcceptsTotal, _ = meter.Int64Counter("newt_proxy_accept_total",
metric.WithDescription("Proxy connection accepts by protocol and result"))
mProxyConnDuration, _ = meter.Float64Histogram("newt_proxy_connection_duration_seconds",
metric.WithDescription("Duration of completed proxy connections"),
metric.WithUnit("s"))
mProxyConnectionsTotal, _ = meter.Int64Counter("newt_proxy_connections_total",
metric.WithDescription("Proxy connection lifecycle events by protocol"))
// Register a default callback for build info if version/commit set
reg, e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
if buildVersion == "" && buildCommit == "" {
return nil
}
attrs := []attribute.KeyValue{}
if buildVersion != "" {
attrs = append(attrs, attribute.String("version", buildVersion))
}
if buildCommit != "" {
attrs = append(attrs, attribute.String("commit", buildCommit))
}
if ShouldIncludeSiteLabels() {
attrs = append(attrs, siteAttrs()...)
}
o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...))
return nil
}, mBuildInfo)
if e != nil {
otel.Handle(e)
} else {
// Provide a functional stopper that unregisters the callback
obsStopper = func() { _ = reg.Unregister() }
}
if mWSConnected != nil {
if regConn, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
val := wsConnectedState.Load()
o.ObserveInt64(mWSConnected, val, metric.WithAttributes(attrsWithSite()...))
return nil
}, mWSConnected); err != nil {
otel.Handle(err)
} else {
wsConnStopper = func() { _ = regConn.Unregister() }
}
}
return nil
}
// Observable registration: Newt can register a callback to report gauges.
// Call SetObservableCallback once to start observing online status, last
// heartbeat seconds, and active sessions.
var (
obsOnce sync.Once
obsStopper func()
proxyObsOnce sync.Once
proxyStopper func()
wsConnStopper func()
)
// SetObservableCallback registers a single callback that will be invoked
// on collection. Use the provided observer to emit values for the observable
// gauges defined here.
//
// Example inside your code (where you have access to current state):
//
// telemetry.SetObservableCallback(func(ctx context.Context, o metric.Observer) error {
// o.ObserveInt64(mSiteOnline, 1)
// o.ObserveFloat64(mSiteLastHeartbeat, float64(lastHB.Unix()))
// o.ObserveInt64(mTunnelSessions, int64(len(activeSessions)))
// return nil
// })
func SetObservableCallback(cb func(context.Context, metric.Observer) error) {
obsOnce.Do(func() {
reg, e := meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions)
if e != nil {
otel.Handle(e)
obsStopper = func() {
// no-op: registration failed; keep stopper callable
}
return
}
// Provide a functional stopper mirroring proxy/build-info behavior
obsStopper = func() { _ = reg.Unregister() }
})
}
// SetProxyObservableCallback registers a callback to observe proxy gauges.
func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) {
proxyObsOnce.Do(func() {
reg, e := meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte)
if e != nil {
otel.Handle(e)
proxyStopper = func() {
// no-op: registration failed; keep stopper callable
}
return
}
// Provide a functional stopper to unregister later if needed
proxyStopper = func() { _ = reg.Unregister() }
})
}
// Build info registration
func RegisterBuildInfo(version, commit string) {
buildVersion = version
buildCommit = commit
}
// Config reloads
func IncConfigReload(ctx context.Context, result string) {
mConfigReloads.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
attribute.String("result", result),
)...))
}
// Helpers for counters/histograms
func IncSiteRegistration(ctx context.Context, result string) {
attrs := []attribute.KeyValue{
attribute.String("result", result),
}
mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}
func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) {
attrs := []attribute.KeyValue{
attribute.String("direction", direction),
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(attrs...)...))
}
// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations.
func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) {
mTunnelBytes.Add(ctx, n, metric.WithAttributeSet(attrs))
}
// --- WebSocket helpers ---
func ObserveWSConnectLatency(ctx context.Context, seconds float64, result, errorType string) {
attrs := []attribute.KeyValue{
attribute.String("transport", "websocket"),
attribute.String("result", result),
}
if errorType != "" {
attrs = append(attrs, attribute.String("error_type", errorType))
}
mWSConnectLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
}
func IncWSMessage(ctx context.Context, direction, msgType string) {
mWSMessages.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
attribute.String("direction", direction),
attribute.String("msg_type", msgType),
)...))
}
func IncWSDisconnect(ctx context.Context, reason, result string) {
mWSDisconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
attribute.String("reason", reason),
attribute.String("result", result),
)...))
}
func IncWSKeepaliveFailure(ctx context.Context, reason string) {
mWSKeepaliveFailure.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
attribute.String("reason", reason),
)...))
}
// SetWSConnectionState updates the backing gauge for the WebSocket connected state.
func SetWSConnectionState(connected bool) {
if connected {
wsConnectedState.Store(1)
} else {
wsConnectedState.Store(0)
}
}
// IncWSReconnect increments the WebSocket reconnect counter with a bounded reason label.
func IncWSReconnect(ctx context.Context, reason string) {
if reason == "" {
reason = "unknown"
}
mWSReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
attribute.String("reason", reason),
)...))
}
func ObserveWSSessionDuration(ctx context.Context, seconds float64, result string) {
mWSSessionDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(
attribute.String("result", result),
)...))
}
// --- Proxy helpers ---
func ObserveProxyActiveConnsObs(o metric.Observer, value int64, attrs []attribute.KeyValue) {
o.ObserveInt64(mProxyActiveConns, value, metric.WithAttributes(attrs...))
}
func ObserveProxyBufferBytesObs(o metric.Observer, value int64, attrs []attribute.KeyValue) {
o.ObserveInt64(mProxyBufferBytes, value, metric.WithAttributes(attrs...))
}
func ObserveProxyAsyncBacklogObs(o metric.Observer, value int64, attrs []attribute.KeyValue) {
o.ObserveInt64(mProxyAsyncBacklogByte, value, metric.WithAttributes(attrs...))
}
func IncProxyDrops(ctx context.Context, tunnelID, protocol string) {
attrs := []attribute.KeyValue{
attribute.String("protocol", protocol),
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}
func IncProxyAccept(ctx context.Context, tunnelID, protocol, result, reason string) {
attrs := []attribute.KeyValue{
attribute.String("protocol", protocol),
attribute.String("result", result),
}
if reason != "" {
attrs = append(attrs, attribute.String("reason", reason))
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mProxyAcceptsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}
func ObserveProxyConnectionDuration(ctx context.Context, tunnelID, protocol, result string, seconds float64) {
attrs := []attribute.KeyValue{
attribute.String("protocol", protocol),
attribute.String("result", result),
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mProxyConnDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
}
// IncProxyConnectionEvent records proxy connection lifecycle events (opened/closed).
func IncProxyConnectionEvent(ctx context.Context, tunnelID, protocol, event string) {
if event == "" {
event = "unknown"
}
attrs := []attribute.KeyValue{
attribute.String("protocol", protocol),
attribute.String("event", event),
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mProxyConnectionsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}
// --- Config/PKI helpers ---
func ObserveConfigApply(ctx context.Context, phase, result string, seconds float64) {
mConfigApply.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(
attribute.String("phase", phase),
attribute.String("result", result),
)...))
}
func IncCertRotation(ctx context.Context, result string) {
mCertRotationTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
attribute.String("result", result),
)...))
}
func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) {
attrs := []attribute.KeyValue{
attribute.String("transport", transport),
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
}
func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) {
attrs := []attribute.KeyValue{
attribute.String("initiator", initiator),
attribute.String("reason", reason),
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}
func IncConnAttempt(ctx context.Context, transport, result string) {
mConnAttempts.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
attribute.String("transport", transport),
attribute.String("result", result),
)...))
}
func IncConnError(ctx context.Context, transport, typ string) {
mConnErrors.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
attribute.String("transport", transport),
attribute.String("error_type", typ),
)...))
}

View File

@@ -0,0 +1,59 @@
package telemetry
import (
"sync"
"time"
)
func resetMetricsForTest() {
initOnce = sync.Once{}
obsOnce = sync.Once{}
proxyObsOnce = sync.Once{}
obsStopper = nil
proxyStopper = nil
if wsConnStopper != nil {
wsConnStopper()
}
wsConnStopper = nil
meter = nil
mSiteRegistrations = nil
mSiteOnline = nil
mSiteLastHeartbeat = nil
mTunnelSessions = nil
mTunnelBytes = nil
mTunnelLatency = nil
mReconnects = nil
mConnAttempts = nil
mConnErrors = nil
mConfigReloads = nil
mConfigApply = nil
mCertRotationTotal = nil
mProcessStartTime = nil
mBuildInfo = nil
mWSConnectLatency = nil
mWSMessages = nil
mWSDisconnects = nil
mWSKeepaliveFailure = nil
mWSSessionDuration = nil
mWSConnected = nil
mWSReconnects = nil
mProxyActiveConns = nil
mProxyBufferBytes = nil
mProxyAsyncBacklogByte = nil
mProxyDropsTotal = nil
mProxyAcceptsTotal = nil
mProxyConnDuration = nil
mProxyConnectionsTotal = nil
processStartUnix = float64(time.Now().UnixNano()) / 1e9
wsConnectedState.Store(0)
includeTunnelIDVal.Store(false)
includeSiteLabelVal.Store(false)
}

View File

@@ -0,0 +1,106 @@
package telemetry
import (
"context"
"sync/atomic"
"time"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
)
// StateView provides a read-only view for observable gauges.
// Implementations must be concurrency-safe and avoid blocking operations.
// All methods should be fast and use RLocks where applicable.
type StateView interface {
// ListSites returns a stable, low-cardinality list of site IDs to expose.
ListSites() []string
// Online returns whether the site is online.
Online(siteID string) (online bool, ok bool)
// LastHeartbeat returns the last heartbeat time for a site.
LastHeartbeat(siteID string) (t time.Time, ok bool)
// ActiveSessions returns the current number of active sessions for a site (across tunnels),
// or scoped to site if your model is site-scoped.
ActiveSessions(siteID string) (n int64, ok bool)
}
var (
stateView atomic.Value // of type StateView
)
// RegisterStateView sets the global StateView used by the default observable callback.
func RegisterStateView(v StateView) {
stateView.Store(v)
// If instruments are registered, ensure a callback exists.
if v != nil {
SetObservableCallback(func(ctx context.Context, o metric.Observer) error {
if any := stateView.Load(); any != nil {
if sv, ok := any.(StateView); ok {
for _, siteID := range sv.ListSites() {
observeSiteOnlineFor(o, sv, siteID)
observeLastHeartbeatFor(o, sv, siteID)
observeSessionsFor(o, siteID, sv)
}
}
}
return nil
})
}
}
func observeSiteOnlineFor(o metric.Observer, sv StateView, siteID string) {
if online, ok := sv.Online(siteID); ok {
val := int64(0)
if online {
val = 1
}
o.ObserveInt64(mSiteOnline, val, metric.WithAttributes(
attribute.String("site_id", siteID),
))
}
}
func observeLastHeartbeatFor(o metric.Observer, sv StateView, siteID string) {
if t, ok := sv.LastHeartbeat(siteID); ok {
ts := float64(t.UnixNano()) / 1e9
o.ObserveFloat64(mSiteLastHeartbeat, ts, metric.WithAttributes(
attribute.String("site_id", siteID),
))
}
}
func observeSessionsFor(o metric.Observer, siteID string, any interface{}) {
if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok {
sessions := tm.SessionsByTunnel()
// If tunnel_id labels are enabled, preserve existing per-tunnel observations
if ShouldIncludeTunnelID() {
for tid, n := range sessions {
attrs := []attribute.KeyValue{
attribute.String("site_id", siteID),
}
if tid != "" {
attrs = append(attrs, attribute.String("tunnel_id", tid))
}
o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attrs...))
}
return
}
// When tunnel_id is disabled, collapse per-tunnel counts into a single site-level value
var total int64
for _, n := range sessions {
total += n
}
// If there are no per-tunnel entries, fall back to ActiveSessions() if available
if total == 0 {
if svAny := stateView.Load(); svAny != nil {
if sv, ok := svAny.(StateView); ok {
if n, ok2 := sv.ActiveSessions(siteID); ok2 {
total = n
}
}
}
}
o.ObserveInt64(mTunnelSessions, total, metric.WithAttributes(attribute.String("site_id", siteID)))
return
}
}

View File

@@ -0,0 +1,384 @@
package telemetry
import (
"context"
"errors"
"net/http"
"os"
"strings"
"sync/atomic"
"time"
promclient "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.opentelemetry.io/contrib/instrumentation/runtime"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/prometheus"
"go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
"go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
"google.golang.org/grpc/credentials"
)
// Config controls telemetry initialization via env flags.
//
// Defaults align with the issue requirements:
// - Prometheus exporter enabled by default (/metrics)
// - OTLP exporter disabled by default
// - Durations in seconds, bytes in raw bytes
// - Admin HTTP server address configurable (for mounting /metrics)
type Config struct {
ServiceName string
ServiceVersion string
// Optional resource attributes
SiteID string
Region string
PromEnabled bool
OTLPEnabled bool
OTLPEndpoint string // host:port
OTLPInsecure bool
MetricExportInterval time.Duration
AdminAddr string // e.g.: ":2112"
// Optional build info for newt_build_info metric
BuildVersion string
BuildCommit string
}
// FromEnv reads configuration from environment variables.
//
// NEWT_METRICS_PROMETHEUS_ENABLED (default: true)
// NEWT_METRICS_OTLP_ENABLED (default: false)
// OTEL_EXPORTER_OTLP_ENDPOINT (default: "localhost:4317")
// OTEL_EXPORTER_OTLP_INSECURE (default: true)
// OTEL_METRIC_EXPORT_INTERVAL (default: 15s)
// OTEL_SERVICE_NAME (default: "newt")
// OTEL_SERVICE_VERSION (default: "")
// NEWT_ADMIN_ADDR (default: ":2112")
func FromEnv() Config {
// Prefer explicit NEWT_* env vars, then fall back to OTEL_RESOURCE_ATTRIBUTES
site := getenv("NEWT_SITE_ID", "")
if site == "" {
site = getenv("NEWT_ID", "")
}
region := os.Getenv("NEWT_REGION")
if site == "" || region == "" {
if ra := os.Getenv("OTEL_RESOURCE_ATTRIBUTES"); ra != "" {
m := parseResourceAttributes(ra)
if site == "" {
site = m["site_id"]
}
if region == "" {
region = m["region"]
}
}
}
return Config{
ServiceName: getenv("OTEL_SERVICE_NAME", "newt"),
ServiceVersion: os.Getenv("OTEL_SERVICE_VERSION"),
SiteID: site,
Region: region,
PromEnabled: getenv("NEWT_METRICS_PROMETHEUS_ENABLED", "true") == "true",
OTLPEnabled: getenv("NEWT_METRICS_OTLP_ENABLED", "false") == "true",
OTLPEndpoint: getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"),
OTLPInsecure: getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true",
MetricExportInterval: getdur("OTEL_METRIC_EXPORT_INTERVAL", 15*time.Second),
AdminAddr: getenv("NEWT_ADMIN_ADDR", ":2112"),
}
}
// Setup holds initialized telemetry providers and (optionally) a /metrics handler.
// Call Shutdown when the process terminates to flush exporters.
type Setup struct {
MeterProvider *metric.MeterProvider
TracerProvider *trace.TracerProvider
PrometheusHandler http.Handler // nil if Prometheus exporter disabled
shutdowns []func(context.Context) error
}
// Init configures OpenTelemetry metrics and (optionally) tracing.
//
// It sets a global MeterProvider and TracerProvider, registers runtime instrumentation,
// installs recommended histogram views for *_latency_seconds, and returns a Setup with
// a Shutdown method to flush exporters.
func Init(ctx context.Context, cfg Config) (*Setup, error) {
// Configure tunnel_id label inclusion from env (default true)
if getenv("NEWT_METRICS_INCLUDE_TUNNEL_ID", "true") == "true" {
includeTunnelIDVal.Store(true)
} else {
includeTunnelIDVal.Store(false)
}
if getenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") == "true" {
includeSiteLabelVal.Store(true)
} else {
includeSiteLabelVal.Store(false)
}
res := buildResource(ctx, cfg)
UpdateSiteInfo(cfg.SiteID, cfg.Region)
s := &Setup{}
readers, promHandler, shutdowns, err := setupMetricExport(ctx, cfg, res)
if err != nil {
return nil, err
}
s.PrometheusHandler = promHandler
// Build provider
mp := buildMeterProvider(res, readers)
otel.SetMeterProvider(mp)
s.MeterProvider = mp
s.shutdowns = append(s.shutdowns, mp.Shutdown)
// Optional tracing
if cfg.OTLPEnabled {
if tp, shutdown := setupTracing(ctx, cfg, res); tp != nil {
otel.SetTracerProvider(tp)
s.TracerProvider = tp
s.shutdowns = append(s.shutdowns, func(c context.Context) error {
return errors.Join(shutdown(c), tp.Shutdown(c))
})
}
}
// Add metric exporter shutdowns
s.shutdowns = append(s.shutdowns, shutdowns...)
// Runtime metrics
_ = runtime.Start(runtime.WithMeterProvider(mp))
// Instruments
if err := registerInstruments(); err != nil {
return nil, err
}
if cfg.BuildVersion != "" || cfg.BuildCommit != "" {
RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit)
}
return s, nil
}
func buildResource(ctx context.Context, cfg Config) *resource.Resource {
attrs := []attribute.KeyValue{
semconv.ServiceName(cfg.ServiceName),
semconv.ServiceVersion(cfg.ServiceVersion),
}
if cfg.SiteID != "" {
attrs = append(attrs, attribute.String("site_id", cfg.SiteID))
}
if cfg.Region != "" {
attrs = append(attrs, attribute.String("region", cfg.Region))
}
res, _ := resource.New(ctx, resource.WithFromEnv(), resource.WithHost(), resource.WithAttributes(attrs...))
return res
}
func setupMetricExport(ctx context.Context, cfg Config, _ *resource.Resource) ([]metric.Reader, http.Handler, []func(context.Context) error, error) {
var readers []metric.Reader
var shutdowns []func(context.Context) error
var promHandler http.Handler
if cfg.PromEnabled {
reg := promclient.NewRegistry()
exp, err := prometheus.New(prometheus.WithRegisterer(reg))
if err != nil {
return nil, nil, nil, err
}
readers = append(readers, exp)
promHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{})
}
if cfg.OTLPEnabled {
mopts := []otlpmetricgrpc.Option{otlpmetricgrpc.WithEndpoint(cfg.OTLPEndpoint)}
if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 {
mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs))
}
if cfg.OTLPInsecure {
mopts = append(mopts, otlpmetricgrpc.WithInsecure())
} else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" {
if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil {
mopts = append(mopts, otlpmetricgrpc.WithTLSCredentials(creds))
}
}
mexp, err := otlpmetricgrpc.New(ctx, mopts...)
if err != nil {
return nil, nil, nil, err
}
readers = append(readers, metric.NewPeriodicReader(mexp, metric.WithInterval(cfg.MetricExportInterval)))
shutdowns = append(shutdowns, mexp.Shutdown)
}
return readers, promHandler, shutdowns, nil
}
func buildMeterProvider(res *resource.Resource, readers []metric.Reader) *metric.MeterProvider {
var mpOpts []metric.Option
mpOpts = append(mpOpts, metric.WithResource(res))
for _, r := range readers {
mpOpts = append(mpOpts, metric.WithReader(r))
}
mpOpts = append(mpOpts, metric.WithView(metric.NewView(
metric.Instrument{Name: "newt_*_latency_seconds"},
metric.Stream{Aggregation: metric.AggregationExplicitBucketHistogram{Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}}},
)))
mpOpts = append(mpOpts, metric.WithView(metric.NewView(
metric.Instrument{Name: "newt_*"},
metric.Stream{AttributeFilter: func(kv attribute.KeyValue) bool {
k := string(kv.Key)
switch k {
case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "initiator", "error_type", "msg_type", "phase", "version", "commit", "site_id", "region":
return true
default:
return false
}
}},
)))
return metric.NewMeterProvider(mpOpts...)
}
func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*trace.TracerProvider, func(context.Context) error) {
topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)}
if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 {
topts = append(topts, otlptracegrpc.WithHeaders(hdrs))
}
if cfg.OTLPInsecure {
topts = append(topts, otlptracegrpc.WithInsecure())
} else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" {
if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil {
topts = append(topts, otlptracegrpc.WithTLSCredentials(creds))
}
}
exp, err := otlptracegrpc.New(ctx, topts...)
if err != nil {
return nil, nil
}
tp := trace.NewTracerProvider(trace.WithBatcher(exp), trace.WithResource(res))
return tp, exp.Shutdown
}
// Shutdown flushes exporters and providers in reverse init order.
func (s *Setup) Shutdown(ctx context.Context) error {
var err error
for i := len(s.shutdowns) - 1; i >= 0; i-- {
err = errors.Join(err, s.shutdowns[i](ctx))
}
return err
}
func parseOTLPHeaders(h string) map[string]string {
m := map[string]string{}
if h == "" {
return m
}
pairs := strings.Split(h, ",")
for _, p := range pairs {
kv := strings.SplitN(strings.TrimSpace(p), "=", 2)
if len(kv) == 2 {
m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
}
}
return m
}
// parseResourceAttributes parses OTEL_RESOURCE_ATTRIBUTES formatted as k=v,k2=v2
func parseResourceAttributes(s string) map[string]string {
m := map[string]string{}
if s == "" {
return m
}
parts := strings.Split(s, ",")
for _, p := range parts {
kv := strings.SplitN(strings.TrimSpace(p), "=", 2)
if len(kv) == 2 {
m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
}
}
return m
}
// Global site/region used to enrich metric labels.
var siteIDVal atomic.Value
var regionVal atomic.Value
var (
includeTunnelIDVal atomic.Value // bool; default true
includeSiteLabelVal atomic.Value // bool; default false
)
// UpdateSiteInfo updates the global site_id and region used for metric labels.
// Thread-safe via atomic.Value: subsequent metric emissions will include
// the new labels, prior emissions remain unchanged.
func UpdateSiteInfo(siteID, region string) {
if siteID != "" {
siteIDVal.Store(siteID)
}
if region != "" {
regionVal.Store(region)
}
}
func getSiteID() string {
if v, ok := siteIDVal.Load().(string); ok {
return v
}
return ""
}
func getRegion() string {
if v, ok := regionVal.Load().(string); ok {
return v
}
return ""
}
// siteAttrs returns label KVs for site_id and region (if set).
func siteAttrs() []attribute.KeyValue {
var out []attribute.KeyValue
if s := getSiteID(); s != "" {
out = append(out, attribute.String("site_id", s))
}
if r := getRegion(); r != "" {
out = append(out, attribute.String("region", r))
}
return out
}
// SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager).
func SiteLabelKVs() []attribute.KeyValue {
if !ShouldIncludeSiteLabels() {
return nil
}
return siteAttrs()
}
// ShouldIncludeTunnelID returns whether tunnel_id labels should be emitted.
func ShouldIncludeTunnelID() bool {
if v, ok := includeTunnelIDVal.Load().(bool); ok {
return v
}
return true
}
// ShouldIncludeSiteLabels returns whether site_id/region should be emitted as
// metric labels in addition to resource attributes.
func ShouldIncludeSiteLabels() bool {
if v, ok := includeSiteLabelVal.Load().(bool); ok {
return v
}
return false
}
func getenv(k, d string) string {
if v := os.Getenv(k); v != "" {
return v
}
return d
}
func getdur(k string, d time.Duration) time.Duration {
if v := os.Getenv(k); v != "" {
if p, e := time.ParseDuration(v); e == nil {
return p
}
}
return d
}

View File

@@ -0,0 +1,53 @@
package telemetry
import (
"context"
"io"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"go.opentelemetry.io/otel/attribute"
)
// Test that disallowed attributes are filtered from the exposition.
func TestAttributeFilterDropsUnknownKeys(t *testing.T) {
ctx := context.Background()
resetMetricsForTest()
t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true")
cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"}
tel, err := Init(ctx, cfg)
if err != nil {
t.Fatalf("init: %v", err)
}
defer func() { _ = tel.Shutdown(context.Background()) }()
if tel.PrometheusHandler == nil {
t.Fatalf("prom handler nil")
}
ts := httptest.NewServer(tel.PrometheusHandler)
defer ts.Close()
// Add samples with disallowed attribute keys
for _, k := range []string{"forbidden", "site_id", "host"} {
set := attribute.NewSet(attribute.String(k, "x"))
AddTunnelBytesSet(ctx, 123, set)
}
time.Sleep(50 * time.Millisecond)
resp, err := http.Get(ts.URL)
if err != nil {
t.Fatalf("GET: %v", err)
}
defer resp.Body.Close()
b, _ := io.ReadAll(resp.Body)
body := string(b)
if strings.Contains(body, "forbidden=") {
t.Fatalf("unexpected forbidden attribute leaked into metrics: %s", body)
}
if !strings.Contains(body, "site_id=\"x\"") {
t.Fatalf("expected allowed attribute site_id to be present in metrics, got: %s", body)
}
}

View File

@@ -0,0 +1,76 @@
package telemetry
import (
"bufio"
"context"
"io"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
)
// Golden test that /metrics contains expected metric names.
func TestMetricsGoldenContains(t *testing.T) {
ctx := context.Background()
resetMetricsForTest()
t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true")
cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0", BuildVersion: "test"}
tel, err := Init(ctx, cfg)
if err != nil {
t.Fatalf("telemetry init error: %v", err)
}
defer func() { _ = tel.Shutdown(context.Background()) }()
if tel.PrometheusHandler == nil {
t.Fatalf("prom handler nil")
}
ts := httptest.NewServer(tel.PrometheusHandler)
defer ts.Close()
// Trigger counters to ensure they appear in the scrape
IncConnAttempt(ctx, "websocket", "success")
IncWSReconnect(ctx, "io_error")
IncProxyConnectionEvent(ctx, "", "tcp", ProxyConnectionOpened)
if tel.MeterProvider != nil {
_ = tel.MeterProvider.ForceFlush(ctx)
}
time.Sleep(100 * time.Millisecond)
var body string
for i := 0; i < 5; i++ {
resp, err := http.Get(ts.URL)
if err != nil {
t.Fatalf("GET metrics failed: %v", err)
}
b, _ := io.ReadAll(resp.Body)
_ = resp.Body.Close()
body = string(b)
if strings.Contains(body, "newt_connection_attempts_total") {
break
}
time.Sleep(100 * time.Millisecond)
}
f, err := os.Open(filepath.Join("testdata", "expected_contains.golden"))
if err != nil {
t.Fatalf("read golden: %v", err)
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
needle := strings.TrimSpace(s.Text())
if needle == "" {
continue
}
if !strings.Contains(body, needle) {
t.Fatalf("expected metrics body to contain %q. body=\n%s", needle, body)
}
}
if err := s.Err(); err != nil {
t.Fatalf("scan golden: %v", err)
}
}

View File

@@ -0,0 +1,65 @@
package telemetry
import (
"context"
"io"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
)
// Smoke test that /metrics contains at least one newt_* metric when Prom exporter is enabled.
func TestMetricsSmoke(t *testing.T) {
ctx := context.Background()
resetMetricsForTest()
t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true")
cfg := Config{
ServiceName: "newt",
PromEnabled: true,
OTLPEnabled: false,
AdminAddr: "127.0.0.1:0",
BuildVersion: "test",
BuildCommit: "deadbeef",
MetricExportInterval: 5 * time.Second,
}
tel, err := Init(ctx, cfg)
if err != nil {
t.Fatalf("telemetry init error: %v", err)
}
defer func() { _ = tel.Shutdown(context.Background()) }()
// Serve the Prom handler on a test server
if tel.PrometheusHandler == nil {
t.Fatalf("Prometheus handler nil; PromEnabled should enable it")
}
ts := httptest.NewServer(tel.PrometheusHandler)
defer ts.Close()
// Record a simple metric and then fetch /metrics
IncConnAttempt(ctx, "websocket", "success")
if tel.MeterProvider != nil {
_ = tel.MeterProvider.ForceFlush(ctx)
}
// Give the exporter a tick to collect
time.Sleep(100 * time.Millisecond)
var body string
for i := 0; i < 5; i++ {
resp, err := http.Get(ts.URL)
if err != nil {
t.Fatalf("GET /metrics failed: %v", err)
}
b, _ := io.ReadAll(resp.Body)
_ = resp.Body.Close()
body = string(b)
if strings.Contains(body, "newt_connection_attempts_total") {
break
}
time.Sleep(100 * time.Millisecond)
}
if !strings.Contains(body, "newt_connection_attempts_total") {
t.Fatalf("expected newt_connection_attempts_total in metrics, got:\n%s", body)
}
}