Mirror of https://github.com/fosrl/newt.git (synced 2026-02-08 05:56:40 +00:00)

Commit: Adding OpenTelemetry Metrics and Tracing
internal/state/telemetry_view.go (new file, 80 lines)
@@ -0,0 +1,80 @@
package state

import (
    "sync"
    "sync/atomic"
    "time"

    "github.com/fosrl/newt/internal/telemetry"
)

// TelemetryView is a minimal, thread-safe implementation to feed observables.
// Since one Newt process represents one site, we expose a single logical site.
// site_id is a resource attribute, so we do not emit per-site labels here.
type TelemetryView struct {
    online     atomic.Bool
    lastHBUnix atomic.Int64 // unix seconds
    // per-tunnel sessions
    sessMu   sync.RWMutex
    sessions map[string]*atomic.Int64
}

var (
    globalView atomic.Pointer[TelemetryView]
)

// Global returns a singleton TelemetryView.
func Global() *TelemetryView {
    if v := globalView.Load(); v != nil { return v }
    v := &TelemetryView{ sessions: make(map[string]*atomic.Int64) }
    globalView.Store(v)
    telemetry.RegisterStateView(v)
    return v
}

// Instrumentation helpers
func (v *TelemetryView) IncSessions(tunnelID string) {
    v.sessMu.Lock(); defer v.sessMu.Unlock()
    c := v.sessions[tunnelID]
    if c == nil { c = &atomic.Int64{}; v.sessions[tunnelID] = c }
    c.Add(1)
}
func (v *TelemetryView) DecSessions(tunnelID string) {
    v.sessMu.Lock(); defer v.sessMu.Unlock()
    if c := v.sessions[tunnelID]; c != nil {
        c.Add(-1)
        if c.Load() <= 0 { delete(v.sessions, tunnelID) }
    }
}
func (v *TelemetryView) ClearTunnel(tunnelID string) {
    v.sessMu.Lock(); defer v.sessMu.Unlock()
    delete(v.sessions, tunnelID)
}
func (v *TelemetryView) SetOnline(b bool) { v.online.Store(b) }
func (v *TelemetryView) TouchHeartbeat() { v.lastHBUnix.Store(time.Now().Unix()) }

// --- telemetry.StateView interface ---

func (v *TelemetryView) ListSites() []string { return []string{"self"} }
func (v *TelemetryView) Online(_ string) (bool, bool) { return v.online.Load(), true }
func (v *TelemetryView) LastHeartbeat(_ string) (time.Time, bool) {
    sec := v.lastHBUnix.Load()
    if sec == 0 { return time.Time{}, false }
    return time.Unix(sec, 0), true
}
func (v *TelemetryView) ActiveSessions(_ string) (int64, bool) {
    // aggregated sessions (not used for per-tunnel gauge)
    v.sessMu.RLock(); defer v.sessMu.RUnlock()
    var sum int64
    for _, c := range v.sessions { if c != nil { sum += c.Load() } }
    return sum, true
}

// Extended accessor used by telemetry callback to publish per-tunnel samples.
func (v *TelemetryView) SessionsByTunnel() map[string]int64 {
    v.sessMu.RLock(); defer v.sessMu.RUnlock()
    out := make(map[string]int64, len(v.sessions))
    for id, c := range v.sessions { if c != nil && c.Load() > 0 { out[id] = c.Load() } }
    return out
}

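For orientation, a minimal sketch of how tunnel/session code might feed this view. The hook names below (onTunnelOpen, onTunnelClose, onHeartbeat) are hypothetical; the real call sites in newt are not shown in this diff.

    package tunnel // hypothetical integration point, not part of this commit

    import "github.com/fosrl/newt/internal/state"

    // Increment/decrement per-tunnel session counts from the tunnel lifecycle.
    func onTunnelOpen(tunnelID string)  { state.Global().IncSessions(tunnelID) }
    func onTunnelClose(tunnelID string) { state.Global().DecSessions(tunnelID) }

    // Mark the site online and record the heartbeat timestamp.
    func onHeartbeat(ok bool) {
        v := state.Global()
        v.SetOnline(ok)
        if ok {
            v.TouchHeartbeat()
        }
    }
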
internal/telemetry/constants.go (new file, 19 lines)
@@ -0,0 +1,19 @@
package telemetry

// Protocol labels (low-cardinality)
const (
    ProtocolTCP = "tcp"
    ProtocolUDP = "udp"
)

// Reconnect reason bins (fixed, low-cardinality)
const (
    ReasonServerRequest  = "server_request"
    ReasonTimeout        = "timeout"
    ReasonPeerClose      = "peer_close"
    ReasonNetworkChange  = "network_change"
    ReasonAuthError      = "auth_error"
    ReasonHandshakeError = "handshake_error"
    ReasonConfigChange   = "config_change"
    ReasonError          = "error"
)

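These bins exist so that arbitrary errors are collapsed into a fixed reason before they ever become a metric label. A sketch of that pattern, with a hypothetical classification helper and call site (the error-to-bin mapping and the "client" initiator are assumptions, not code from this commit):

    package reconnect // hypothetical, not part of this commit

    import (
        "context"
        "errors"
        "os"

        "github.com/fosrl/newt/internal/telemetry"
    )

    // classifyReason folds arbitrary errors into the fixed reason bins so the
    // "reason" label stays low-cardinality; the mapping here is illustrative.
    func classifyReason(err error) string {
        switch {
        case err == nil:
            return telemetry.ReasonServerRequest
        case errors.Is(err, os.ErrDeadlineExceeded):
            return telemetry.ReasonTimeout
        default:
            return telemetry.ReasonError // catch-all bin
        }
    }

    // Hypothetical call site: record a reconnect with a bounded reason label.
    func recordReconnect(ctx context.Context, tunnelID string, err error) {
        telemetry.IncReconnect(ctx, tunnelID, "client", classifyReason(err))
    }
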
internal/telemetry/constants_test.go (new file, 32 lines)
@@ -0,0 +1,32 @@
package telemetry

import "testing"

func TestAllowedConstants(t *testing.T) {
    allowedReasons := map[string]struct{}{
        ReasonServerRequest:  {},
        ReasonTimeout:        {},
        ReasonPeerClose:      {},
        ReasonNetworkChange:  {},
        ReasonAuthError:      {},
        ReasonHandshakeError: {},
        ReasonConfigChange:   {},
        ReasonError:          {},
    }
    for k := range allowedReasons {
        if k == "" {
            t.Fatalf("empty reason constant")
        }
    }

    allowedProtocols := map[string]struct{}{
        ProtocolTCP: {},
        ProtocolUDP: {},
    }
    for k := range allowedProtocols {
        if k == "" {
            t.Fatalf("empty protocol constant")
        }
    }
}

internal/telemetry/metrics.go (new file, 542 lines)
@@ -0,0 +1,542 @@
package telemetry

import (
    "context"
    "sync"
    "sync/atomic"
    "time"

    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/metric"
)

// Instruments and helpers for Newt metrics following the naming, units, and
// low-cardinality label guidance from the issue description.
//
// Counters end with _total, durations are in seconds, sizes in bytes.
// Only low-cardinality stable labels are supported: tunnel_id,
// transport, direction, result, reason, error_type.
var (
    initOnce sync.Once

    meter metric.Meter

    // Site / Registration
    mSiteRegistrations metric.Int64Counter
    mSiteOnline        metric.Int64ObservableGauge
    mSiteLastHeartbeat metric.Float64ObservableGauge

    // Tunnel / Sessions
    mTunnelSessions metric.Int64ObservableGauge
    mTunnelBytes    metric.Int64Counter
    mTunnelLatency  metric.Float64Histogram
    mReconnects     metric.Int64Counter

    // Connection / NAT
    mConnAttempts metric.Int64Counter
    mConnErrors   metric.Int64Counter

    // Config/Restart
    mConfigReloads     metric.Int64Counter
    mConfigApply       metric.Float64Histogram
    mCertRotationTotal metric.Int64Counter
    mProcessStartTime  metric.Float64ObservableGauge

    // Build info
    mBuildInfo metric.Int64ObservableGauge

    // WebSocket
    mWSConnectLatency   metric.Float64Histogram
    mWSMessages         metric.Int64Counter
    mWSDisconnects      metric.Int64Counter
    mWSKeepaliveFailure metric.Int64Counter
    mWSSessionDuration  metric.Float64Histogram
    mWSConnected        metric.Int64ObservableGauge
    mWSReconnects       metric.Int64Counter

    // Proxy
    mProxyActiveConns      metric.Int64ObservableGauge
    mProxyBufferBytes      metric.Int64ObservableGauge
    mProxyAsyncBacklogByte metric.Int64ObservableGauge
    mProxyDropsTotal       metric.Int64Counter
    mProxyAcceptsTotal     metric.Int64Counter
    mProxyConnDuration     metric.Float64Histogram
    mProxyConnectionsTotal metric.Int64Counter

    buildVersion     string
    buildCommit      string
    processStartUnix = float64(time.Now().UnixNano()) / 1e9
    wsConnectedState atomic.Int64
)

// Proxy connection lifecycle events.
const (
    ProxyConnectionOpened = "opened"
    ProxyConnectionClosed = "closed"
)

// attrsWithSite appends site/region labels only when explicitly enabled to keep
// label cardinality low by default.
func attrsWithSite(extra ...attribute.KeyValue) []attribute.KeyValue {
    attrs := make([]attribute.KeyValue, len(extra))
    copy(attrs, extra)
    if ShouldIncludeSiteLabels() {
        attrs = append(attrs, siteAttrs()...)
    }
    return attrs
}

func registerInstruments() error {
    var err error
    initOnce.Do(func() {
        meter = otel.Meter("newt")
        if e := registerSiteInstruments(); e != nil {
            err = e
            return
        }
        if e := registerTunnelInstruments(); e != nil {
            err = e
            return
        }
        if e := registerConnInstruments(); e != nil {
            err = e
            return
        }
        if e := registerConfigInstruments(); e != nil {
            err = e
            return
        }
        if e := registerBuildWSProxyInstruments(); e != nil {
            err = e
            return
        }
    })
    return err
}

func registerSiteInstruments() error {
    var err error
    mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total",
        metric.WithDescription("Total site registration attempts"))
    if err != nil {
        return err
    }
    mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online",
        metric.WithDescription("Site online (0/1)"))
    if err != nil {
        return err
    }
    mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_timestamp_seconds",
        metric.WithDescription("Unix timestamp of the last site heartbeat"),
        metric.WithUnit("s"))
    if err != nil {
        return err
    }
    return nil
}

func registerTunnelInstruments() error {
    var err error
    mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions",
        metric.WithDescription("Active tunnel sessions"))
    if err != nil {
        return err
    }
    mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total",
        metric.WithDescription("Tunnel bytes ingress/egress"),
        metric.WithUnit("By"))
    if err != nil {
        return err
    }
    mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds",
        metric.WithDescription("Per-tunnel latency in seconds"),
        metric.WithUnit("s"))
    if err != nil {
        return err
    }
    mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total",
        metric.WithDescription("Tunnel reconnect events"))
    if err != nil {
        return err
    }
    return nil
}

func registerConnInstruments() error {
    var err error
    mConnAttempts, err = meter.Int64Counter("newt_connection_attempts_total",
        metric.WithDescription("Connection attempts"))
    if err != nil {
        return err
    }
    mConnErrors, err = meter.Int64Counter("newt_connection_errors_total",
        metric.WithDescription("Connection errors by type"))
    if err != nil {
        return err
    }
    return nil
}

func registerConfigInstruments() error {
    mConfigReloads, _ = meter.Int64Counter("newt_config_reloads_total",
        metric.WithDescription("Configuration reloads"))
    mConfigApply, _ = meter.Float64Histogram("newt_config_apply_seconds",
        metric.WithDescription("Configuration apply duration in seconds"),
        metric.WithUnit("s"))
    mCertRotationTotal, _ = meter.Int64Counter("newt_cert_rotation_total",
        metric.WithDescription("Certificate rotation events (success/failure)"))
    mProcessStartTime, _ = meter.Float64ObservableGauge("process_start_time_seconds",
        metric.WithDescription("Unix timestamp of the process start time"),
        metric.WithUnit("s"))
    if mProcessStartTime != nil {
        if _, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
            o.ObserveFloat64(mProcessStartTime, processStartUnix)
            return nil
        }, mProcessStartTime); err != nil {
            otel.Handle(err)
        }
    }
    return nil
}

func registerBuildWSProxyInstruments() error {
    // Build info gauge (value 1 with version/commit attributes)
    mBuildInfo, _ = meter.Int64ObservableGauge("newt_build_info",
        metric.WithDescription("Newt build information (value is always 1)"))
    // WebSocket
    mWSConnectLatency, _ = meter.Float64Histogram("newt_websocket_connect_latency_seconds",
        metric.WithDescription("WebSocket connect latency in seconds"),
        metric.WithUnit("s"))
    mWSMessages, _ = meter.Int64Counter("newt_websocket_messages_total",
        metric.WithDescription("WebSocket messages by direction and type"))
    mWSDisconnects, _ = meter.Int64Counter("newt_websocket_disconnects_total",
        metric.WithDescription("WebSocket disconnects by reason/result"))
    mWSKeepaliveFailure, _ = meter.Int64Counter("newt_websocket_keepalive_failures_total",
        metric.WithDescription("WebSocket keepalive (ping/pong) failures"))
    mWSSessionDuration, _ = meter.Float64Histogram("newt_websocket_session_duration_seconds",
        metric.WithDescription("Duration of established WebSocket sessions"),
        metric.WithUnit("s"))
    mWSConnected, _ = meter.Int64ObservableGauge("newt_websocket_connected",
        metric.WithDescription("WebSocket connection state (1=connected, 0=disconnected)"))
    mWSReconnects, _ = meter.Int64Counter("newt_websocket_reconnects_total",
        metric.WithDescription("WebSocket reconnect attempts by reason"))
    // Proxy
    mProxyActiveConns, _ = meter.Int64ObservableGauge("newt_proxy_active_connections",
        metric.WithDescription("Proxy active connections per tunnel and protocol"))
    mProxyBufferBytes, _ = meter.Int64ObservableGauge("newt_proxy_buffer_bytes",
        metric.WithDescription("Proxy buffer bytes (may approximate async backlog)"),
        metric.WithUnit("By"))
    mProxyAsyncBacklogByte, _ = meter.Int64ObservableGauge("newt_proxy_async_backlog_bytes",
        metric.WithDescription("Unflushed async byte backlog per tunnel and protocol"),
        metric.WithUnit("By"))
    mProxyDropsTotal, _ = meter.Int64Counter("newt_proxy_drops_total",
        metric.WithDescription("Proxy drops due to write errors"))
    mProxyAcceptsTotal, _ = meter.Int64Counter("newt_proxy_accept_total",
        metric.WithDescription("Proxy connection accepts by protocol and result"))
    mProxyConnDuration, _ = meter.Float64Histogram("newt_proxy_connection_duration_seconds",
        metric.WithDescription("Duration of completed proxy connections"),
        metric.WithUnit("s"))
    mProxyConnectionsTotal, _ = meter.Int64Counter("newt_proxy_connections_total",
        metric.WithDescription("Proxy connection lifecycle events by protocol"))
    // Register a default callback for build info if version/commit set
    reg, e := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
        if buildVersion == "" && buildCommit == "" {
            return nil
        }
        attrs := []attribute.KeyValue{}
        if buildVersion != "" {
            attrs = append(attrs, attribute.String("version", buildVersion))
        }
        if buildCommit != "" {
            attrs = append(attrs, attribute.String("commit", buildCommit))
        }
        if ShouldIncludeSiteLabels() {
            attrs = append(attrs, siteAttrs()...)
        }
        o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...))
        return nil
    }, mBuildInfo)
    if e != nil {
        otel.Handle(e)
    } else {
        // Provide a functional stopper that unregisters the callback
        obsStopper = func() { _ = reg.Unregister() }
    }
    if mWSConnected != nil {
        if regConn, err := meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
            val := wsConnectedState.Load()
            o.ObserveInt64(mWSConnected, val, metric.WithAttributes(attrsWithSite()...))
            return nil
        }, mWSConnected); err != nil {
            otel.Handle(err)
        } else {
            wsConnStopper = func() { _ = regConn.Unregister() }
        }
    }
    return nil
}

// Observable registration: Newt can register a callback to report gauges.
// Call SetObservableCallback once to start observing online status, last
// heartbeat seconds, and active sessions.

var (
    obsOnce      sync.Once
    obsStopper   func()
    proxyObsOnce sync.Once
    proxyStopper func()
    wsConnStopper func()
)

// SetObservableCallback registers a single callback that will be invoked
// on collection. Use the provided observer to emit values for the observable
// gauges defined here.
//
// Example inside your code (where you have access to current state):
//
//  telemetry.SetObservableCallback(func(ctx context.Context, o metric.Observer) error {
//      o.ObserveInt64(mSiteOnline, 1)
//      o.ObserveFloat64(mSiteLastHeartbeat, float64(lastHB.Unix()))
//      o.ObserveInt64(mTunnelSessions, int64(len(activeSessions)))
//      return nil
//  })
func SetObservableCallback(cb func(context.Context, metric.Observer) error) {
    obsOnce.Do(func() {
        reg, e := meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions)
        if e != nil {
            otel.Handle(e)
            obsStopper = func() {
                // no-op: registration failed; keep stopper callable
            }
            return
        }
        // Provide a functional stopper mirroring proxy/build-info behavior
        obsStopper = func() { _ = reg.Unregister() }
    })
}

// SetProxyObservableCallback registers a callback to observe proxy gauges.
func SetProxyObservableCallback(cb func(context.Context, metric.Observer) error) {
    proxyObsOnce.Do(func() {
        reg, e := meter.RegisterCallback(cb, mProxyActiveConns, mProxyBufferBytes, mProxyAsyncBacklogByte)
        if e != nil {
            otel.Handle(e)
            proxyStopper = func() {
                // no-op: registration failed; keep stopper callable
            }
            return
        }
        // Provide a functional stopper to unregister later if needed
        proxyStopper = func() { _ = reg.Unregister() }
    })
}

// Build info registration
func RegisterBuildInfo(version, commit string) {
    buildVersion = version
    buildCommit = commit
}

// Config reloads
func IncConfigReload(ctx context.Context, result string) {
    mConfigReloads.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
        attribute.String("result", result),
    )...))
}

// Helpers for counters/histograms

func IncSiteRegistration(ctx context.Context, result string) {
    attrs := []attribute.KeyValue{
        attribute.String("result", result),
    }
    mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}

func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) {
    attrs := []attribute.KeyValue{
        attribute.String("direction", direction),
    }
    if ShouldIncludeTunnelID() && tunnelID != "" {
        attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
    }
    mTunnelBytes.Add(ctx, n, metric.WithAttributes(attrsWithSite(attrs...)...))
}

// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations.
func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) {
    mTunnelBytes.Add(ctx, n, metric.WithAttributeSet(attrs))
}

// --- WebSocket helpers ---

func ObserveWSConnectLatency(ctx context.Context, seconds float64, result, errorType string) {
    attrs := []attribute.KeyValue{
        attribute.String("transport", "websocket"),
        attribute.String("result", result),
    }
    if errorType != "" {
        attrs = append(attrs, attribute.String("error_type", errorType))
    }
    mWSConnectLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
}

func IncWSMessage(ctx context.Context, direction, msgType string) {
    mWSMessages.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
        attribute.String("direction", direction),
        attribute.String("msg_type", msgType),
    )...))
}

func IncWSDisconnect(ctx context.Context, reason, result string) {
    mWSDisconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
        attribute.String("reason", reason),
        attribute.String("result", result),
    )...))
}

func IncWSKeepaliveFailure(ctx context.Context, reason string) {
    mWSKeepaliveFailure.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
        attribute.String("reason", reason),
    )...))
}

// SetWSConnectionState updates the backing gauge for the WebSocket connected state.
func SetWSConnectionState(connected bool) {
    if connected {
        wsConnectedState.Store(1)
    } else {
        wsConnectedState.Store(0)
    }
}

// IncWSReconnect increments the WebSocket reconnect counter with a bounded reason label.
func IncWSReconnect(ctx context.Context, reason string) {
    if reason == "" {
        reason = "unknown"
    }
    mWSReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
        attribute.String("reason", reason),
    )...))
}

func ObserveWSSessionDuration(ctx context.Context, seconds float64, result string) {
    mWSSessionDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(
        attribute.String("result", result),
    )...))
}

// --- Proxy helpers ---

func ObserveProxyActiveConnsObs(o metric.Observer, value int64, attrs []attribute.KeyValue) {
    o.ObserveInt64(mProxyActiveConns, value, metric.WithAttributes(attrs...))
}

func ObserveProxyBufferBytesObs(o metric.Observer, value int64, attrs []attribute.KeyValue) {
    o.ObserveInt64(mProxyBufferBytes, value, metric.WithAttributes(attrs...))
}

func ObserveProxyAsyncBacklogObs(o metric.Observer, value int64, attrs []attribute.KeyValue) {
    o.ObserveInt64(mProxyAsyncBacklogByte, value, metric.WithAttributes(attrs...))
}

func IncProxyDrops(ctx context.Context, tunnelID, protocol string) {
    attrs := []attribute.KeyValue{
        attribute.String("protocol", protocol),
    }
    if ShouldIncludeTunnelID() && tunnelID != "" {
        attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
    }
    mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}

func IncProxyAccept(ctx context.Context, tunnelID, protocol, result, reason string) {
    attrs := []attribute.KeyValue{
        attribute.String("protocol", protocol),
        attribute.String("result", result),
    }
    if reason != "" {
        attrs = append(attrs, attribute.String("reason", reason))
    }
    if ShouldIncludeTunnelID() && tunnelID != "" {
        attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
    }
    mProxyAcceptsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}

func ObserveProxyConnectionDuration(ctx context.Context, tunnelID, protocol, result string, seconds float64) {
    attrs := []attribute.KeyValue{
        attribute.String("protocol", protocol),
        attribute.String("result", result),
    }
    if ShouldIncludeTunnelID() && tunnelID != "" {
        attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
    }
    mProxyConnDuration.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
}

// IncProxyConnectionEvent records proxy connection lifecycle events (opened/closed).
func IncProxyConnectionEvent(ctx context.Context, tunnelID, protocol, event string) {
    if event == "" {
        event = "unknown"
    }
    attrs := []attribute.KeyValue{
        attribute.String("protocol", protocol),
        attribute.String("event", event),
    }
    if ShouldIncludeTunnelID() && tunnelID != "" {
        attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
    }
    mProxyConnectionsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}

// --- Config/PKI helpers ---

func ObserveConfigApply(ctx context.Context, phase, result string, seconds float64) {
    mConfigApply.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(
        attribute.String("phase", phase),
        attribute.String("result", result),
    )...))
}

func IncCertRotation(ctx context.Context, result string) {
    mCertRotationTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
        attribute.String("result", result),
    )...))
}

func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) {
    attrs := []attribute.KeyValue{
        attribute.String("transport", transport),
    }
    if ShouldIncludeTunnelID() && tunnelID != "" {
        attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
    }
    mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(attrsWithSite(attrs...)...))
}

func IncReconnect(ctx context.Context, tunnelID, initiator, reason string) {
    attrs := []attribute.KeyValue{
        attribute.String("initiator", initiator),
        attribute.String("reason", reason),
    }
    if ShouldIncludeTunnelID() && tunnelID != "" {
        attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
    }
    mReconnects.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}

func IncConnAttempt(ctx context.Context, transport, result string) {
    mConnAttempts.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
        attribute.String("transport", transport),
        attribute.String("result", result),
    )...))
}

func IncConnError(ctx context.Context, transport, typ string) {
    mConnErrors.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
        attribute.String("transport", transport),
        attribute.String("error_type", typ),
    )...))
}

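The proxy gauges above are only ever emitted through a registered callback. A sketch of how a proxy manager might wire that up, after telemetry.Init has run; connCount and backlogBytes are placeholders for whatever per-tunnel state the real proxy keeps (hypothetical names, not code from this commit):

    package proxy // hypothetical wiring, not part of this commit

    import (
        "context"

        "go.opentelemetry.io/otel/attribute"
        "go.opentelemetry.io/otel/metric"

        "github.com/fosrl/newt/internal/telemetry"
    )

    // registerProxyGauges shows one possible use of SetProxyObservableCallback.
    // Call it after telemetry.Init so the instruments exist.
    func registerProxyGauges(connCount, backlogBytes func() int64) {
        telemetry.SetProxyObservableCallback(func(ctx context.Context, o metric.Observer) error {
            attrs := append([]attribute.KeyValue{
                attribute.String("protocol", telemetry.ProtocolTCP),
            }, telemetry.SiteLabelKVs()...)
            telemetry.ObserveProxyActiveConnsObs(o, connCount(), attrs)
            telemetry.ObserveProxyAsyncBacklogObs(o, backlogBytes(), attrs)
            return nil
        })
    }
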
internal/telemetry/metrics_test_helper.go (new file, 59 lines)
@@ -0,0 +1,59 @@
package telemetry

import (
    "sync"
    "time"
)

func resetMetricsForTest() {
    initOnce = sync.Once{}
    obsOnce = sync.Once{}
    proxyObsOnce = sync.Once{}
    obsStopper = nil
    proxyStopper = nil
    if wsConnStopper != nil {
        wsConnStopper()
    }
    wsConnStopper = nil
    meter = nil

    mSiteRegistrations = nil
    mSiteOnline = nil
    mSiteLastHeartbeat = nil

    mTunnelSessions = nil
    mTunnelBytes = nil
    mTunnelLatency = nil
    mReconnects = nil

    mConnAttempts = nil
    mConnErrors = nil

    mConfigReloads = nil
    mConfigApply = nil
    mCertRotationTotal = nil
    mProcessStartTime = nil

    mBuildInfo = nil

    mWSConnectLatency = nil
    mWSMessages = nil
    mWSDisconnects = nil
    mWSKeepaliveFailure = nil
    mWSSessionDuration = nil
    mWSConnected = nil
    mWSReconnects = nil

    mProxyActiveConns = nil
    mProxyBufferBytes = nil
    mProxyAsyncBacklogByte = nil
    mProxyDropsTotal = nil
    mProxyAcceptsTotal = nil
    mProxyConnDuration = nil
    mProxyConnectionsTotal = nil

    processStartUnix = float64(time.Now().UnixNano()) / 1e9
    wsConnectedState.Store(0)
    includeTunnelIDVal.Store(false)
    includeSiteLabelVal.Store(false)
}

internal/telemetry/state_view.go (new file, 106 lines)
@@ -0,0 +1,106 @@
package telemetry

import (
    "context"
    "sync/atomic"
    "time"

    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/metric"
)

// StateView provides a read-only view for observable gauges.
// Implementations must be concurrency-safe and avoid blocking operations.
// All methods should be fast and use RLocks where applicable.
type StateView interface {
    // ListSites returns a stable, low-cardinality list of site IDs to expose.
    ListSites() []string
    // Online returns whether the site is online.
    Online(siteID string) (online bool, ok bool)
    // LastHeartbeat returns the last heartbeat time for a site.
    LastHeartbeat(siteID string) (t time.Time, ok bool)
    // ActiveSessions returns the current number of active sessions for a site (across tunnels),
    // or scoped to site if your model is site-scoped.
    ActiveSessions(siteID string) (n int64, ok bool)
}

var (
    stateView atomic.Value // of type StateView
)

// RegisterStateView sets the global StateView used by the default observable callback.
func RegisterStateView(v StateView) {
    stateView.Store(v)
    // If instruments are registered, ensure a callback exists.
    if v != nil {
        SetObservableCallback(func(ctx context.Context, o metric.Observer) error {
            if any := stateView.Load(); any != nil {
                if sv, ok := any.(StateView); ok {
                    for _, siteID := range sv.ListSites() {
                        observeSiteOnlineFor(o, sv, siteID)
                        observeLastHeartbeatFor(o, sv, siteID)
                        observeSessionsFor(o, siteID, sv)
                    }
                }
            }
            return nil
        })
    }
}

func observeSiteOnlineFor(o metric.Observer, sv StateView, siteID string) {
    if online, ok := sv.Online(siteID); ok {
        val := int64(0)
        if online {
            val = 1
        }
        o.ObserveInt64(mSiteOnline, val, metric.WithAttributes(
            attribute.String("site_id", siteID),
        ))
    }
}

func observeLastHeartbeatFor(o metric.Observer, sv StateView, siteID string) {
    if t, ok := sv.LastHeartbeat(siteID); ok {
        ts := float64(t.UnixNano()) / 1e9
        o.ObserveFloat64(mSiteLastHeartbeat, ts, metric.WithAttributes(
            attribute.String("site_id", siteID),
        ))
    }
}

func observeSessionsFor(o metric.Observer, siteID string, any interface{}) {
    if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok {
        sessions := tm.SessionsByTunnel()
        // If tunnel_id labels are enabled, preserve existing per-tunnel observations
        if ShouldIncludeTunnelID() {
            for tid, n := range sessions {
                attrs := []attribute.KeyValue{
                    attribute.String("site_id", siteID),
                }
                if tid != "" {
                    attrs = append(attrs, attribute.String("tunnel_id", tid))
                }
                o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attrs...))
            }
            return
        }
        // When tunnel_id is disabled, collapse per-tunnel counts into a single site-level value
        var total int64
        for _, n := range sessions {
            total += n
        }
        // If there are no per-tunnel entries, fall back to ActiveSessions() if available
        if total == 0 {
            if svAny := stateView.Load(); svAny != nil {
                if sv, ok := svAny.(StateView); ok {
                    if n, ok2 := sv.ActiveSessions(siteID); ok2 {
                        total = n
                    }
                }
            }
        }
        o.ObserveInt64(mTunnelSessions, total, metric.WithAttributes(attribute.String("site_id", siteID)))
        return
    }
}

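The only StateView implementation in this diff is state.TelemetryView (the first file above), which telemetry.RegisterStateView wires into the default observable callback. If one wanted to pin that contract at compile time, a one-line assertion would do; this is a sketch, not part of the commit:

    package state // hypothetical addition, not in this commit

    import "github.com/fosrl/newt/internal/telemetry"

    // Compile-time check that *TelemetryView satisfies telemetry.StateView.
    var _ telemetry.StateView = (*TelemetryView)(nil)
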
internal/telemetry/telemetry.go (new file, 384 lines)
@@ -0,0 +1,384 @@
package telemetry

import (
    "context"
    "errors"
    "net/http"
    "os"
    "strings"
    "sync/atomic"
    "time"

    promclient "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
    "go.opentelemetry.io/contrib/instrumentation/runtime"
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
    "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
    "go.opentelemetry.io/otel/exporters/prometheus"
    "go.opentelemetry.io/otel/sdk/metric"
    "go.opentelemetry.io/otel/sdk/resource"
    "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
    "google.golang.org/grpc/credentials"
)

// Config controls telemetry initialization via env flags.
//
// Defaults align with the issue requirements:
//   - Prometheus exporter enabled by default (/metrics)
//   - OTLP exporter disabled by default
//   - Durations in seconds, bytes in raw bytes
//   - Admin HTTP server address configurable (for mounting /metrics)
type Config struct {
    ServiceName    string
    ServiceVersion string

    // Optional resource attributes
    SiteID string
    Region string

    PromEnabled bool
    OTLPEnabled bool

    OTLPEndpoint string // host:port
    OTLPInsecure bool

    MetricExportInterval time.Duration
    AdminAddr            string // e.g.: ":2112"

    // Optional build info for newt_build_info metric
    BuildVersion string
    BuildCommit  string
}

// FromEnv reads configuration from environment variables.
//
// NEWT_METRICS_PROMETHEUS_ENABLED (default: true)
// NEWT_METRICS_OTLP_ENABLED (default: false)
// OTEL_EXPORTER_OTLP_ENDPOINT (default: "localhost:4317")
// OTEL_EXPORTER_OTLP_INSECURE (default: true)
// OTEL_METRIC_EXPORT_INTERVAL (default: 15s)
// OTEL_SERVICE_NAME (default: "newt")
// OTEL_SERVICE_VERSION (default: "")
// NEWT_ADMIN_ADDR (default: ":2112")
func FromEnv() Config {
    // Prefer explicit NEWT_* env vars, then fall back to OTEL_RESOURCE_ATTRIBUTES
    site := getenv("NEWT_SITE_ID", "")
    if site == "" {
        site = getenv("NEWT_ID", "")
    }
    region := os.Getenv("NEWT_REGION")
    if site == "" || region == "" {
        if ra := os.Getenv("OTEL_RESOURCE_ATTRIBUTES"); ra != "" {
            m := parseResourceAttributes(ra)
            if site == "" {
                site = m["site_id"]
            }
            if region == "" {
                region = m["region"]
            }
        }
    }
    return Config{
        ServiceName: getenv("OTEL_SERVICE_NAME", "newt"),
        ServiceVersion: os.Getenv("OTEL_SERVICE_VERSION"),
        SiteID: site,
        Region: region,
        PromEnabled: getenv("NEWT_METRICS_PROMETHEUS_ENABLED", "true") == "true",
        OTLPEnabled: getenv("NEWT_METRICS_OTLP_ENABLED", "false") == "true",
        OTLPEndpoint: getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"),
        OTLPInsecure: getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true",
        MetricExportInterval: getdur("OTEL_METRIC_EXPORT_INTERVAL", 15*time.Second),
        AdminAddr: getenv("NEWT_ADMIN_ADDR", ":2112"),
    }
}

// Setup holds initialized telemetry providers and (optionally) a /metrics handler.
// Call Shutdown when the process terminates to flush exporters.
type Setup struct {
    MeterProvider  *metric.MeterProvider
    TracerProvider *trace.TracerProvider

    PrometheusHandler http.Handler // nil if Prometheus exporter disabled

    shutdowns []func(context.Context) error
}

// Init configures OpenTelemetry metrics and (optionally) tracing.
//
// It sets a global MeterProvider and TracerProvider, registers runtime instrumentation,
// installs recommended histogram views for *_latency_seconds, and returns a Setup with
// a Shutdown method to flush exporters.
func Init(ctx context.Context, cfg Config) (*Setup, error) {
    // Configure tunnel_id label inclusion from env (default true)
    if getenv("NEWT_METRICS_INCLUDE_TUNNEL_ID", "true") == "true" {
        includeTunnelIDVal.Store(true)
    } else {
        includeTunnelIDVal.Store(false)
    }
    if getenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true") == "true" {
        includeSiteLabelVal.Store(true)
    } else {
        includeSiteLabelVal.Store(false)
    }
    res := buildResource(ctx, cfg)
    UpdateSiteInfo(cfg.SiteID, cfg.Region)

    s := &Setup{}
    readers, promHandler, shutdowns, err := setupMetricExport(ctx, cfg, res)
    if err != nil {
        return nil, err
    }
    s.PrometheusHandler = promHandler
    // Build provider
    mp := buildMeterProvider(res, readers)
    otel.SetMeterProvider(mp)
    s.MeterProvider = mp
    s.shutdowns = append(s.shutdowns, mp.Shutdown)
    // Optional tracing
    if cfg.OTLPEnabled {
        if tp, shutdown := setupTracing(ctx, cfg, res); tp != nil {
            otel.SetTracerProvider(tp)
            s.TracerProvider = tp
            s.shutdowns = append(s.shutdowns, func(c context.Context) error {
                return errors.Join(shutdown(c), tp.Shutdown(c))
            })
        }
    }
    // Add metric exporter shutdowns
    s.shutdowns = append(s.shutdowns, shutdowns...)
    // Runtime metrics
    _ = runtime.Start(runtime.WithMeterProvider(mp))
    // Instruments
    if err := registerInstruments(); err != nil {
        return nil, err
    }
    if cfg.BuildVersion != "" || cfg.BuildCommit != "" {
        RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit)
    }
    return s, nil
}

func buildResource(ctx context.Context, cfg Config) *resource.Resource {
    attrs := []attribute.KeyValue{
        semconv.ServiceName(cfg.ServiceName),
        semconv.ServiceVersion(cfg.ServiceVersion),
    }
    if cfg.SiteID != "" {
        attrs = append(attrs, attribute.String("site_id", cfg.SiteID))
    }
    if cfg.Region != "" {
        attrs = append(attrs, attribute.String("region", cfg.Region))
    }
    res, _ := resource.New(ctx, resource.WithFromEnv(), resource.WithHost(), resource.WithAttributes(attrs...))
    return res
}

func setupMetricExport(ctx context.Context, cfg Config, _ *resource.Resource) ([]metric.Reader, http.Handler, []func(context.Context) error, error) {
    var readers []metric.Reader
    var shutdowns []func(context.Context) error
    var promHandler http.Handler
    if cfg.PromEnabled {
        reg := promclient.NewRegistry()
        exp, err := prometheus.New(prometheus.WithRegisterer(reg))
        if err != nil {
            return nil, nil, nil, err
        }
        readers = append(readers, exp)
        promHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{})
    }
    if cfg.OTLPEnabled {
        mopts := []otlpmetricgrpc.Option{otlpmetricgrpc.WithEndpoint(cfg.OTLPEndpoint)}
        if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 {
            mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs))
        }
        if cfg.OTLPInsecure {
            mopts = append(mopts, otlpmetricgrpc.WithInsecure())
        } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" {
            if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil {
                mopts = append(mopts, otlpmetricgrpc.WithTLSCredentials(creds))
            }
        }
        mexp, err := otlpmetricgrpc.New(ctx, mopts...)
        if err != nil {
            return nil, nil, nil, err
        }
        readers = append(readers, metric.NewPeriodicReader(mexp, metric.WithInterval(cfg.MetricExportInterval)))
        shutdowns = append(shutdowns, mexp.Shutdown)
    }
    return readers, promHandler, shutdowns, nil
}

func buildMeterProvider(res *resource.Resource, readers []metric.Reader) *metric.MeterProvider {
    var mpOpts []metric.Option
    mpOpts = append(mpOpts, metric.WithResource(res))
    for _, r := range readers {
        mpOpts = append(mpOpts, metric.WithReader(r))
    }
    mpOpts = append(mpOpts, metric.WithView(metric.NewView(
        metric.Instrument{Name: "newt_*_latency_seconds"},
        metric.Stream{Aggregation: metric.AggregationExplicitBucketHistogram{Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}}},
    )))
    mpOpts = append(mpOpts, metric.WithView(metric.NewView(
        metric.Instrument{Name: "newt_*"},
        metric.Stream{AttributeFilter: func(kv attribute.KeyValue) bool {
            k := string(kv.Key)
            switch k {
            case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "initiator", "error_type", "msg_type", "phase", "version", "commit", "site_id", "region":
                return true
            default:
                return false
            }
        }},
    )))
    return metric.NewMeterProvider(mpOpts...)
}

func setupTracing(ctx context.Context, cfg Config, res *resource.Resource) (*trace.TracerProvider, func(context.Context) error) {
    topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)}
    if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 {
        topts = append(topts, otlptracegrpc.WithHeaders(hdrs))
    }
    if cfg.OTLPInsecure {
        topts = append(topts, otlptracegrpc.WithInsecure())
    } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" {
        if creds, cerr := credentials.NewClientTLSFromFile(certFile, ""); cerr == nil {
            topts = append(topts, otlptracegrpc.WithTLSCredentials(creds))
        }
    }
    exp, err := otlptracegrpc.New(ctx, topts...)
    if err != nil {
        return nil, nil
    }
    tp := trace.NewTracerProvider(trace.WithBatcher(exp), trace.WithResource(res))
    return tp, exp.Shutdown
}

// Shutdown flushes exporters and providers in reverse init order.
func (s *Setup) Shutdown(ctx context.Context) error {
    var err error
    for i := len(s.shutdowns) - 1; i >= 0; i-- {
        err = errors.Join(err, s.shutdowns[i](ctx))
    }
    return err
}

func parseOTLPHeaders(h string) map[string]string {
    m := map[string]string{}
    if h == "" {
        return m
    }
    pairs := strings.Split(h, ",")
    for _, p := range pairs {
        kv := strings.SplitN(strings.TrimSpace(p), "=", 2)
        if len(kv) == 2 {
            m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
        }
    }
    return m
}

// parseResourceAttributes parses OTEL_RESOURCE_ATTRIBUTES formatted as k=v,k2=v2
func parseResourceAttributes(s string) map[string]string {
    m := map[string]string{}
    if s == "" {
        return m
    }
    parts := strings.Split(s, ",")
    for _, p := range parts {
        kv := strings.SplitN(strings.TrimSpace(p), "=", 2)
        if len(kv) == 2 {
            m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
        }
    }
    return m
}

// Global site/region used to enrich metric labels.
var siteIDVal atomic.Value
var regionVal atomic.Value
var (
    includeTunnelIDVal  atomic.Value // bool; default true
    includeSiteLabelVal atomic.Value // bool; default false
)

// UpdateSiteInfo updates the global site_id and region used for metric labels.
// Thread-safe via atomic.Value: subsequent metric emissions will include
// the new labels, prior emissions remain unchanged.
func UpdateSiteInfo(siteID, region string) {
    if siteID != "" {
        siteIDVal.Store(siteID)
    }
    if region != "" {
        regionVal.Store(region)
    }
}

func getSiteID() string {
    if v, ok := siteIDVal.Load().(string); ok {
        return v
    }
    return ""
}

func getRegion() string {
    if v, ok := regionVal.Load().(string); ok {
        return v
    }
    return ""
}

// siteAttrs returns label KVs for site_id and region (if set).
func siteAttrs() []attribute.KeyValue {
    var out []attribute.KeyValue
    if s := getSiteID(); s != "" {
        out = append(out, attribute.String("site_id", s))
    }
    if r := getRegion(); r != "" {
        out = append(out, attribute.String("region", r))
    }
    return out
}

// SiteLabelKVs exposes site label KVs for other packages (e.g., proxy manager).
func SiteLabelKVs() []attribute.KeyValue {
    if !ShouldIncludeSiteLabels() {
        return nil
    }
    return siteAttrs()
}

// ShouldIncludeTunnelID returns whether tunnel_id labels should be emitted.
func ShouldIncludeTunnelID() bool {
    if v, ok := includeTunnelIDVal.Load().(bool); ok {
        return v
    }
    return true
}

// ShouldIncludeSiteLabels returns whether site_id/region should be emitted as
// metric labels in addition to resource attributes.
func ShouldIncludeSiteLabels() bool {
    if v, ok := includeSiteLabelVal.Load().(bool); ok {
        return v
    }
    return false
}

func getenv(k, d string) string {
    if v := os.Getenv(k); v != "" {
        return v
    }
    return d
}

func getdur(k string, d time.Duration) time.Duration {
    if v := os.Getenv(k); v != "" {
        if p, e := time.ParseDuration(v); e == nil {
            return p
        }
    }
    return d
}

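A sketch of how an entrypoint might tie this together. The /metrics mount on AdminAddr is an assumption (this diff does not include the HTTP server wiring), and the function is illustrative rather than newt's actual main:

    package main // illustrative wiring, not part of this commit

    import (
        "context"
        "log"
        "net/http"

        "github.com/fosrl/newt/internal/telemetry"
    )

    func main() {
        ctx := context.Background()
        cfg := telemetry.FromEnv()
        cfg.BuildVersion = "dev" // placeholder; real build info is injected elsewhere

        tel, err := telemetry.Init(ctx, cfg)
        if err != nil {
            log.Fatalf("telemetry init: %v", err)
        }
        defer func() { _ = tel.Shutdown(context.Background()) }()

        // Assumption: expose the Prometheus handler on the admin address.
        if tel.PrometheusHandler != nil {
            mux := http.NewServeMux()
            mux.Handle("/metrics", tel.PrometheusHandler)
            go func() { _ = http.ListenAndServe(cfg.AdminAddr, mux) }()
        }

        // ... rest of newt startup ...
    }
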
internal/telemetry/telemetry_attrfilter_test.go (new file, 53 lines)
@@ -0,0 +1,53 @@
package telemetry

import (
    "context"
    "io"
    "net/http"
    "net/http/httptest"
    "strings"
    "testing"
    "time"

    "go.opentelemetry.io/otel/attribute"
)

// Test that disallowed attributes are filtered from the exposition.
func TestAttributeFilterDropsUnknownKeys(t *testing.T) {
    ctx := context.Background()
    resetMetricsForTest()
    t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true")
    cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"}
    tel, err := Init(ctx, cfg)
    if err != nil {
        t.Fatalf("init: %v", err)
    }
    defer func() { _ = tel.Shutdown(context.Background()) }()

    if tel.PrometheusHandler == nil {
        t.Fatalf("prom handler nil")
    }
    ts := httptest.NewServer(tel.PrometheusHandler)
    defer ts.Close()

    // Add samples with disallowed attribute keys
    for _, k := range []string{"forbidden", "site_id", "host"} {
        set := attribute.NewSet(attribute.String(k, "x"))
        AddTunnelBytesSet(ctx, 123, set)
    }
    time.Sleep(50 * time.Millisecond)

    resp, err := http.Get(ts.URL)
    if err != nil {
        t.Fatalf("GET: %v", err)
    }
    defer resp.Body.Close()
    b, _ := io.ReadAll(resp.Body)
    body := string(b)
    if strings.Contains(body, "forbidden=") {
        t.Fatalf("unexpected forbidden attribute leaked into metrics: %s", body)
    }
    if !strings.Contains(body, "site_id=\"x\"") {
        t.Fatalf("expected allowed attribute site_id to be present in metrics, got: %s", body)
    }
}

internal/telemetry/telemetry_golden_test.go (new file, 76 lines)
@@ -0,0 +1,76 @@
package telemetry

import (
    "bufio"
    "context"
    "io"
    "net/http"
    "net/http/httptest"
    "os"
    "path/filepath"
    "strings"
    "testing"
    "time"
)

// Golden test that /metrics contains expected metric names.
func TestMetricsGoldenContains(t *testing.T) {
    ctx := context.Background()
    resetMetricsForTest()
    t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true")
    cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0", BuildVersion: "test"}
    tel, err := Init(ctx, cfg)
    if err != nil {
        t.Fatalf("telemetry init error: %v", err)
    }
    defer func() { _ = tel.Shutdown(context.Background()) }()

    if tel.PrometheusHandler == nil {
        t.Fatalf("prom handler nil")
    }
    ts := httptest.NewServer(tel.PrometheusHandler)
    defer ts.Close()

    // Trigger counters to ensure they appear in the scrape
    IncConnAttempt(ctx, "websocket", "success")
    IncWSReconnect(ctx, "io_error")
    IncProxyConnectionEvent(ctx, "", "tcp", ProxyConnectionOpened)
    if tel.MeterProvider != nil {
        _ = tel.MeterProvider.ForceFlush(ctx)
    }
    time.Sleep(100 * time.Millisecond)

    var body string
    for i := 0; i < 5; i++ {
        resp, err := http.Get(ts.URL)
        if err != nil {
            t.Fatalf("GET metrics failed: %v", err)
        }
        b, _ := io.ReadAll(resp.Body)
        _ = resp.Body.Close()
        body = string(b)
        if strings.Contains(body, "newt_connection_attempts_total") {
            break
        }
        time.Sleep(100 * time.Millisecond)
    }

    f, err := os.Open(filepath.Join("testdata", "expected_contains.golden"))
    if err != nil {
        t.Fatalf("read golden: %v", err)
    }
    defer f.Close()
    s := bufio.NewScanner(f)
    for s.Scan() {
        needle := strings.TrimSpace(s.Text())
        if needle == "" {
            continue
        }
        if !strings.Contains(body, needle) {
            t.Fatalf("expected metrics body to contain %q. body=\n%s", needle, body)
        }
    }
    if err := s.Err(); err != nil {
        t.Fatalf("scan golden: %v", err)
    }
}

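The referenced testdata/expected_contains.golden file is not included in this diff. From the test logic, it is read line by line, blank lines are skipped, and each remaining line must appear as a substring of the /metrics body. A plausible example of its contents (an assumption based on the counters the test triggers, not the actual file):

    newt_connection_attempts_total
    newt_websocket_reconnects_total
    newt_proxy_connections_total
    newt_build_info
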
internal/telemetry/telemetry_smoke_test.go (new file, 65 lines)
@@ -0,0 +1,65 @@
package telemetry

import (
    "context"
    "io"
    "net/http"
    "net/http/httptest"
    "strings"
    "testing"
    "time"
)

// Smoke test that /metrics contains at least one newt_* metric when Prom exporter is enabled.
func TestMetricsSmoke(t *testing.T) {
    ctx := context.Background()
    resetMetricsForTest()
    t.Setenv("NEWT_METRICS_INCLUDE_SITE_LABELS", "true")
    cfg := Config{
        ServiceName: "newt",
        PromEnabled: true,
        OTLPEnabled: false,
        AdminAddr: "127.0.0.1:0",
        BuildVersion: "test",
        BuildCommit: "deadbeef",
        MetricExportInterval: 5 * time.Second,
    }
    tel, err := Init(ctx, cfg)
    if err != nil {
        t.Fatalf("telemetry init error: %v", err)
    }
    defer func() { _ = tel.Shutdown(context.Background()) }()

    // Serve the Prom handler on a test server
    if tel.PrometheusHandler == nil {
        t.Fatalf("Prometheus handler nil; PromEnabled should enable it")
    }
    ts := httptest.NewServer(tel.PrometheusHandler)
    defer ts.Close()

    // Record a simple metric and then fetch /metrics
    IncConnAttempt(ctx, "websocket", "success")
    if tel.MeterProvider != nil {
        _ = tel.MeterProvider.ForceFlush(ctx)
    }
    // Give the exporter a tick to collect
    time.Sleep(100 * time.Millisecond)

    var body string
    for i := 0; i < 5; i++ {
        resp, err := http.Get(ts.URL)
        if err != nil {
            t.Fatalf("GET /metrics failed: %v", err)
        }
        b, _ := io.ReadAll(resp.Body)
        _ = resp.Body.Close()
        body = string(b)
        if strings.Contains(body, "newt_connection_attempts_total") {
            break
        }
        time.Sleep(100 * time.Millisecond)
    }
    if !strings.Contains(body, "newt_connection_attempts_total") {
        t.Fatalf("expected newt_connection_attempts_total in metrics, got:\n%s", body)
    }
}