mirror of
https://github.com/fosrl/newt.git
synced 2026-03-26 20:46:41 +00:00
Add telemetry metrics and constants for improved observability
This commit is contained in:
19
internal/telemetry/constants.go
Normal file
19
internal/telemetry/constants.go
Normal file
@@ -0,0 +1,19 @@
|
||||
package telemetry
|
||||
|
||||
// Protocol labels (low-cardinality)
|
||||
const (
|
||||
ProtocolTCP = "tcp"
|
||||
ProtocolUDP = "udp"
|
||||
)
|
||||
|
||||
// Reconnect reason bins (fixed, low-cardinality)
|
||||
const (
|
||||
ReasonServerRequest = "server_request"
|
||||
ReasonTimeout = "timeout"
|
||||
ReasonPeerClose = "peer_close"
|
||||
ReasonNetworkChange = "network_change"
|
||||
ReasonAuthError = "auth_error"
|
||||
ReasonHandshakeError = "handshake_error"
|
||||
ReasonConfigChange = "config_change"
|
||||
ReasonError = "error"
|
||||
)
|
||||
32
internal/telemetry/constants_test.go
Normal file
32
internal/telemetry/constants_test.go
Normal file
@@ -0,0 +1,32 @@
|
||||
package telemetry
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestAllowedConstants(t *testing.T) {
|
||||
allowedReasons := map[string]struct{}{
|
||||
ReasonServerRequest: {},
|
||||
ReasonTimeout: {},
|
||||
ReasonPeerClose: {},
|
||||
ReasonNetworkChange: {},
|
||||
ReasonAuthError: {},
|
||||
ReasonHandshakeError: {},
|
||||
ReasonConfigChange: {},
|
||||
ReasonError: {},
|
||||
}
|
||||
for k := range allowedReasons {
|
||||
if k == "" {
|
||||
t.Fatalf("empty reason constant")
|
||||
}
|
||||
}
|
||||
|
||||
allowedProtocols := map[string]struct{}{
|
||||
ProtocolTCP: {},
|
||||
ProtocolUDP: {},
|
||||
}
|
||||
for k := range allowedProtocols {
|
||||
if k == "" {
|
||||
t.Fatalf("empty protocol constant")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
231
internal/telemetry/metrics.go
Normal file
231
internal/telemetry/metrics.go
Normal file
@@ -0,0 +1,231 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
)
|
||||
|
||||
// Instruments and helpers for Newt metrics following the naming, units, and
|
||||
// low-cardinality label guidance from the issue description.
|
||||
//
|
||||
// Counters end with _total, durations are in seconds, sizes in bytes.
|
||||
// Only low-cardinality stable labels are supported: site_id, tunnel_id,
|
||||
// transport, direction, result, reason, error_type, region.
|
||||
var (
|
||||
initOnce sync.Once
|
||||
|
||||
meter metric.Meter
|
||||
|
||||
// Site / Registration
|
||||
mSiteRegistrations metric.Int64Counter
|
||||
mSiteOnline metric.Int64ObservableGauge
|
||||
mSiteLastHeartbeat metric.Float64ObservableGauge
|
||||
|
||||
// Tunnel / Sessions
|
||||
mTunnelSessions metric.Int64ObservableGauge
|
||||
mTunnelBytes metric.Int64Counter
|
||||
mTunnelLatency metric.Float64Histogram
|
||||
mReconnects metric.Int64Counter
|
||||
|
||||
// Connection / NAT
|
||||
mConnAttempts metric.Int64Counter
|
||||
mConnErrors metric.Int64Counter
|
||||
|
||||
// Config/Restart
|
||||
mConfigReloads metric.Int64Counter
|
||||
mRestartCount metric.Int64Counter
|
||||
|
||||
// Build info
|
||||
mBuildInfo metric.Int64ObservableGauge
|
||||
|
||||
buildVersion string
|
||||
buildCommit string
|
||||
)
|
||||
|
||||
func registerInstruments() error {
|
||||
var err error
|
||||
initOnce.Do(func() {
|
||||
meter = otel.Meter("newt")
|
||||
|
||||
// Site / Registration
|
||||
mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total",
|
||||
metric.WithDescription("Total site registration attempts"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online",
|
||||
metric.WithDescription("Site online (0/1)"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_seconds",
|
||||
metric.WithDescription("Seconds since last site heartbeat"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Tunnel / Sessions
|
||||
mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions",
|
||||
metric.WithDescription("Active tunnel sessions"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total",
|
||||
metric.WithDescription("Tunnel bytes in/out"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds",
|
||||
metric.WithDescription("Per-tunnel latency in seconds"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total",
|
||||
metric.WithDescription("Tunnel reconnect events"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Connection / NAT
|
||||
mConnAttempts, err = meter.Int64Counter("newt_connection_attempts_total",
|
||||
metric.WithDescription("Connection attempts"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
mConnErrors, err = meter.Int64Counter("newt_connection_errors_total",
|
||||
metric.WithDescription("Connection errors by type"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Config/Restart
|
||||
mConfigReloads, _ = meter.Int64Counter("newt_config_reloads_total",
|
||||
metric.WithDescription("Configuration reloads"))
|
||||
mRestartCount, _ = meter.Int64Counter("newt_restart_count_total",
|
||||
metric.WithDescription("Process restart count (incremented on start)"))
|
||||
|
||||
// Build info gauge (value 1 with version/commit attributes)
|
||||
mBuildInfo, _ = meter.Int64ObservableGauge("newt_build_info",
|
||||
metric.WithDescription("Newt build information (value is always 1)"))
|
||||
|
||||
// Register a default callback for build info if version/commit set
|
||||
meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error {
|
||||
if buildVersion == "" && buildCommit == "" {
|
||||
return nil
|
||||
}
|
||||
attrs := []attribute.KeyValue{}
|
||||
if buildVersion != "" {
|
||||
attrs = append(attrs, attribute.String("version", buildVersion))
|
||||
}
|
||||
if buildCommit != "" {
|
||||
attrs = append(attrs, attribute.String("commit", buildCommit))
|
||||
}
|
||||
o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...))
|
||||
return nil
|
||||
}, mBuildInfo)
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
// Observable registration: Newt can register a callback to report gauges.
|
||||
// Call SetObservableCallback once to start observing online status, last
|
||||
// heartbeat seconds, and active sessions.
|
||||
|
||||
var (
|
||||
obsOnce sync.Once
|
||||
obsStopper func()
|
||||
)
|
||||
|
||||
// SetObservableCallback registers a single callback that will be invoked
|
||||
// on collection. Use the provided observer to emit values for the observable
|
||||
// gauges defined here.
|
||||
//
|
||||
// Example inside your code (where you have access to current state):
|
||||
//
|
||||
// telemetry.SetObservableCallback(func(ctx context.Context, o metric.Observer) error {
|
||||
// o.ObserveInt64(mSiteOnline, 1, attribute.String("site_id", siteID))
|
||||
// o.ObserveFloat64(mSiteLastHeartbeat, time.Since(lastHB).Seconds(), attribute.String("site_id", siteID))
|
||||
// o.ObserveInt64(mTunnelSessions, int64(len(activeSessions)), attribute.String("site_id", siteID))
|
||||
// return nil
|
||||
// })
|
||||
func SetObservableCallback(cb func(context.Context, metric.Observer) error) {
|
||||
obsOnce.Do(func() {
|
||||
meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions)
|
||||
obsStopper = func() { /* no-op; otel callbacks are unregistered when provider shuts down */ }
|
||||
})
|
||||
}
|
||||
|
||||
// Build info registration
|
||||
func RegisterBuildInfo(version, commit string) {
|
||||
buildVersion = version
|
||||
buildCommit = commit
|
||||
// Increment restart count on boot
|
||||
mRestartCount.Add(context.Background(), 1)
|
||||
}
|
||||
|
||||
// Config reloads
|
||||
func IncConfigReload(ctx context.Context, result string) {
|
||||
mConfigReloads.Add(ctx, 1, metric.WithAttributes(attribute.String("result", result)))
|
||||
}
|
||||
|
||||
// Helpers for counters/histograms
|
||||
|
||||
func IncSiteRegistration(ctx context.Context, siteID, region, result string) {
|
||||
attrs := []attribute.KeyValue{
|
||||
attribute.String("site_id", siteID),
|
||||
attribute.String("result", result),
|
||||
}
|
||||
if region != "" {
|
||||
attrs = append(attrs, attribute.String("region", region))
|
||||
}
|
||||
mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrs...))
|
||||
}
|
||||
|
||||
func AddTunnelBytes(ctx context.Context, siteID, tunnelID, direction string, n int64) {
|
||||
mTunnelBytes.Add(ctx, n, metric.WithAttributes(
|
||||
attribute.String("site_id", siteID),
|
||||
attribute.String("tunnel_id", tunnelID),
|
||||
attribute.String("direction", direction),
|
||||
))
|
||||
}
|
||||
|
||||
// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations.
|
||||
func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) {
|
||||
mTunnelBytes.Add(ctx, n, metric.WithAttributeSet(attrs))
|
||||
}
|
||||
|
||||
func ObserveTunnelLatency(ctx context.Context, siteID, tunnelID, transport string, seconds float64) {
|
||||
mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(
|
||||
attribute.String("site_id", siteID),
|
||||
attribute.String("tunnel_id", tunnelID),
|
||||
attribute.String("transport", transport),
|
||||
))
|
||||
}
|
||||
|
||||
func IncReconnect(ctx context.Context, siteID, tunnelID, reason string) {
|
||||
mReconnects.Add(ctx, 1, metric.WithAttributes(
|
||||
attribute.String("site_id", siteID),
|
||||
attribute.String("tunnel_id", tunnelID),
|
||||
attribute.String("reason", reason),
|
||||
))
|
||||
}
|
||||
|
||||
func IncConnAttempt(ctx context.Context, siteID, transport, result string) {
|
||||
mConnAttempts.Add(ctx, 1, metric.WithAttributes(
|
||||
attribute.String("site_id", siteID),
|
||||
attribute.String("transport", transport),
|
||||
attribute.String("result", result),
|
||||
))
|
||||
}
|
||||
|
||||
func IncConnError(ctx context.Context, siteID, transport, typ string) {
|
||||
mConnErrors.Add(ctx, 1, metric.WithAttributes(
|
||||
attribute.String("site_id", siteID),
|
||||
attribute.String("transport", transport),
|
||||
attribute.String("error_type", typ),
|
||||
))
|
||||
}
|
||||
63
internal/telemetry/state_view.go
Normal file
63
internal/telemetry/state_view.go
Normal file
@@ -0,0 +1,63 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/metric"
|
||||
)
|
||||
|
||||
// StateView provides a read-only view for observable gauges.
|
||||
// Implementations must be concurrency-safe and avoid blocking operations.
|
||||
// All methods should be fast and use RLocks where applicable.
|
||||
type StateView interface {
|
||||
// ListSites returns a stable, low-cardinality list of site IDs to expose.
|
||||
ListSites() []string
|
||||
// Online returns whether the site is online.
|
||||
Online(siteID string) (online bool, ok bool)
|
||||
// LastHeartbeat returns the last heartbeat time for a site.
|
||||
LastHeartbeat(siteID string) (t time.Time, ok bool)
|
||||
// ActiveSessions returns the current number of active sessions for a site (across tunnels),
|
||||
// or scoped to site if your model is site-scoped.
|
||||
ActiveSessions(siteID string) (n int64, ok bool)
|
||||
}
|
||||
|
||||
var (
|
||||
stateView atomic.Value // of type StateView
|
||||
)
|
||||
|
||||
// RegisterStateView sets the global StateView used by the default observable callback.
|
||||
func RegisterStateView(v StateView) {
|
||||
stateView.Store(v)
|
||||
// If instruments are registered, ensure a callback exists.
|
||||
if v != nil {
|
||||
SetObservableCallback(func(ctx context.Context, o metric.Observer) error {
|
||||
if any := stateView.Load(); any != nil {
|
||||
if sv, ok := any.(StateView); ok {
|
||||
for _, siteID := range sv.ListSites() {
|
||||
if online, ok := sv.Online(siteID); ok {
|
||||
val := int64(0)
|
||||
if online {
|
||||
val = 1
|
||||
}
|
||||
o.ObserveInt64(mSiteOnline, val, metric.WithAttributes(attribute.String("site_id", siteID)))
|
||||
}
|
||||
if t, ok := sv.LastHeartbeat(siteID); ok {
|
||||
secs := time.Since(t).Seconds()
|
||||
o.ObserveFloat64(mSiteLastHeartbeat, secs, metric.WithAttributes(attribute.String("site_id", siteID)))
|
||||
}
|
||||
// If the view supports per-tunnel sessions, report them labeled by tunnel_id.
|
||||
if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok {
|
||||
for tid, n := range tm.SessionsByTunnel() {
|
||||
o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attribute.String("tunnel_id", tid)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
}
|
||||
265
internal/telemetry/telemetry.go
Normal file
265
internal/telemetry/telemetry.go
Normal file
@@ -0,0 +1,265 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
promclient "github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"go.opentelemetry.io/contrib/instrumentation/runtime"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
|
||||
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
|
||||
"go.opentelemetry.io/otel/exporters/prometheus"
|
||||
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
sdktrace "go.opentelemetry.io/otel/sdk/trace"
|
||||
semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
|
||||
"google.golang.org/grpc/credentials"
|
||||
)
|
||||
|
||||
// Config controls telemetry initialization via env flags.
|
||||
//
|
||||
// Defaults align with the issue requirements:
|
||||
// - Prometheus exporter enabled by default (/metrics)
|
||||
// - OTLP exporter disabled by default
|
||||
// - Durations in seconds, bytes in raw bytes
|
||||
// - Admin HTTP server address configurable (for mounting /metrics)
|
||||
type Config struct {
|
||||
ServiceName string
|
||||
ServiceVersion string
|
||||
|
||||
// Optional resource attributes
|
||||
SiteID string
|
||||
Region string
|
||||
|
||||
PromEnabled bool
|
||||
OTLPEnabled bool
|
||||
|
||||
OTLPEndpoint string // host:port
|
||||
OTLPInsecure bool
|
||||
|
||||
MetricExportInterval time.Duration
|
||||
AdminAddr string // e.g.: ":2112"
|
||||
|
||||
// Optional build info for newt_build_info metric
|
||||
BuildVersion string
|
||||
BuildCommit string
|
||||
}
|
||||
|
||||
// FromEnv reads configuration from environment variables.
|
||||
//
|
||||
// NEWT_METRICS_PROMETHEUS_ENABLED (default: true)
|
||||
// NEWT_METRICS_OTLP_ENABLED (default: false)
|
||||
// OTEL_EXPORTER_OTLP_ENDPOINT (default: "localhost:4317")
|
||||
// OTEL_EXPORTER_OTLP_INSECURE (default: true)
|
||||
// OTEL_METRIC_EXPORT_INTERVAL (default: 15s)
|
||||
// OTEL_SERVICE_NAME (default: "newt")
|
||||
// OTEL_SERVICE_VERSION (default: "")
|
||||
// NEWT_ADMIN_ADDR (default: ":2112")
|
||||
func FromEnv() Config {
|
||||
return Config{
|
||||
ServiceName: getenv("OTEL_SERVICE_NAME", "newt"),
|
||||
ServiceVersion: os.Getenv("OTEL_SERVICE_VERSION"),
|
||||
Region: os.Getenv("NEWT_REGION"),
|
||||
PromEnabled: getenv("NEWT_METRICS_PROMETHEUS_ENABLED", "true") == "true",
|
||||
OTLPEnabled: getenv("NEWT_METRICS_OTLP_ENABLED", "false") == "true",
|
||||
OTLPEndpoint: getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"),
|
||||
OTLPInsecure: getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true",
|
||||
MetricExportInterval: getdur("OTEL_METRIC_EXPORT_INTERVAL", 15*time.Second),
|
||||
AdminAddr: getenv("NEWT_ADMIN_ADDR", "127.0.0.1:2112"),
|
||||
}
|
||||
}
|
||||
|
||||
// Setup holds initialized telemetry providers and (optionally) a /metrics handler.
|
||||
// Call Shutdown when the process terminates to flush exporters.
|
||||
type Setup struct {
|
||||
MeterProvider *sdkmetric.MeterProvider
|
||||
TracerProvider *sdktrace.TracerProvider
|
||||
|
||||
PrometheusHandler http.Handler // nil if Prometheus exporter disabled
|
||||
|
||||
shutdowns []func(context.Context) error
|
||||
}
|
||||
|
||||
// Init configures OpenTelemetry metrics and (optionally) tracing.
|
||||
//
|
||||
// It sets a global MeterProvider and TracerProvider, registers runtime instrumentation,
|
||||
// installs recommended histogram views for *_latency_seconds, and returns a Setup with
|
||||
// a Shutdown method to flush exporters.
|
||||
func Init(ctx context.Context, cfg Config) (*Setup, error) {
|
||||
res, _ := resource.New(ctx,
|
||||
resource.WithFromEnv(),
|
||||
resource.WithHost(),
|
||||
resource.WithAttributes(
|
||||
semconv.ServiceName(cfg.ServiceName),
|
||||
semconv.ServiceVersion(cfg.ServiceVersion),
|
||||
// Optional resource attributes
|
||||
attribute.String("site_id", cfg.SiteID),
|
||||
attribute.String("region", cfg.Region),
|
||||
),
|
||||
)
|
||||
|
||||
s := &Setup{}
|
||||
|
||||
// Build metric readers/exporters
|
||||
var readers []sdkmetric.Reader
|
||||
|
||||
// Prometheus exporter exposes a native /metrics handler for scraping
|
||||
if cfg.PromEnabled {
|
||||
reg := promclient.NewRegistry()
|
||||
exp, err := prometheus.New(prometheus.WithRegisterer(reg))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
readers = append(readers, exp)
|
||||
s.PrometheusHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{})
|
||||
}
|
||||
|
||||
// Optional OTLP metric exporter (gRPC)
|
||||
if cfg.OTLPEnabled {
|
||||
mopts := []otlpmetricgrpc.Option{otlpmetricgrpc.WithEndpoint(cfg.OTLPEndpoint)}
|
||||
// Headers support via OTEL_EXPORTER_OTLP_HEADERS (k=v,k2=v2)
|
||||
if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 {
|
||||
mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs))
|
||||
}
|
||||
if cfg.OTLPInsecure {
|
||||
mopts = append(mopts, otlpmetricgrpc.WithInsecure())
|
||||
} else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" {
|
||||
creds, cerr := credentials.NewClientTLSFromFile(certFile, "")
|
||||
if cerr == nil {
|
||||
mopts = append(mopts, otlpmetricgrpc.WithTLSCredentials(creds))
|
||||
}
|
||||
}
|
||||
mexp, err := otlpmetricgrpc.New(ctx, mopts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
readers = append(readers, sdkmetric.NewPeriodicReader(mexp, sdkmetric.WithInterval(cfg.MetricExportInterval)))
|
||||
s.shutdowns = append(s.shutdowns, mexp.Shutdown)
|
||||
}
|
||||
|
||||
// Build provider options iteratively (WithReader is not variadic)
|
||||
var mpOpts []sdkmetric.Option
|
||||
mpOpts = append(mpOpts, sdkmetric.WithResource(res))
|
||||
for _, r := range readers {
|
||||
mpOpts = append(mpOpts, sdkmetric.WithReader(r))
|
||||
}
|
||||
// Default view for latency histograms in seconds.
|
||||
mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView(
|
||||
sdkmetric.Instrument{
|
||||
Name: "newt_*_latency_seconds",
|
||||
},
|
||||
sdkmetric.Stream{
|
||||
Aggregation: sdkmetric.AggregationExplicitBucketHistogram{
|
||||
Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30},
|
||||
},
|
||||
},
|
||||
)))
|
||||
// Attribute whitelist: only allow expected low-cardinality keys on newt_* instruments.
|
||||
mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView(
|
||||
sdkmetric.Instrument{Name: "newt_*"},
|
||||
sdkmetric.Stream{
|
||||
AttributeFilter: func(kv attribute.KeyValue) bool {
|
||||
k := string(kv.Key)
|
||||
switch k {
|
||||
case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
},
|
||||
},
|
||||
)))
|
||||
mp := sdkmetric.NewMeterProvider(mpOpts...)
|
||||
otel.SetMeterProvider(mp)
|
||||
s.MeterProvider = mp
|
||||
s.shutdowns = append(s.shutdowns, mp.Shutdown)
|
||||
|
||||
// Optional tracing (OTLP over gRPC)
|
||||
if cfg.OTLPEnabled {
|
||||
topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)}
|
||||
if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 {
|
||||
topts = append(topts, otlptracegrpc.WithHeaders(hdrs))
|
||||
}
|
||||
if cfg.OTLPInsecure {
|
||||
topts = append(topts, otlptracegrpc.WithInsecure())
|
||||
} else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" {
|
||||
creds, cerr := credentials.NewClientTLSFromFile(certFile, "")
|
||||
if cerr == nil {
|
||||
topts = append(topts, otlptracegrpc.WithTLSCredentials(creds))
|
||||
}
|
||||
}
|
||||
exp, err := otlptracegrpc.New(ctx, topts...)
|
||||
if err == nil {
|
||||
tp := sdktrace.NewTracerProvider(
|
||||
sdktrace.WithBatcher(exp),
|
||||
sdktrace.WithResource(res),
|
||||
)
|
||||
otel.SetTracerProvider(tp)
|
||||
s.TracerProvider = tp
|
||||
s.shutdowns = append(s.shutdowns, func(ctx context.Context) error {
|
||||
return errors.Join(exp.Shutdown(ctx), tp.Shutdown(ctx))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Export Go runtime metrics (goroutines, GC, mem, etc.)
|
||||
_ = runtime.Start(runtime.WithMeterProvider(mp))
|
||||
|
||||
// Register instruments after provider is set
|
||||
if err := registerInstruments(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// Optional build info metric
|
||||
if cfg.BuildVersion != "" || cfg.BuildCommit != "" {
|
||||
RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit)
|
||||
}
|
||||
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Shutdown flushes exporters and providers in reverse init order.
|
||||
func (s *Setup) Shutdown(ctx context.Context) error {
|
||||
var err error
|
||||
for i := len(s.shutdowns) - 1; i >= 0; i-- {
|
||||
err = errors.Join(err, s.shutdowns[i](ctx))
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func parseOTLPHeaders(h string) map[string]string {
|
||||
m := map[string]string{}
|
||||
if h == "" {
|
||||
return m
|
||||
}
|
||||
pairs := strings.Split(h, ",")
|
||||
for _, p := range pairs {
|
||||
kv := strings.SplitN(strings.TrimSpace(p), "=", 2)
|
||||
if len(kv) == 2 {
|
||||
m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
|
||||
}
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
func getenv(k, d string) string {
|
||||
if v := os.Getenv(k); v != "" {
|
||||
return v
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
func getdur(k string, d time.Duration) time.Duration {
|
||||
if v := os.Getenv(k); v != "" {
|
||||
if p, e := time.ParseDuration(v); e == nil {
|
||||
return p
|
||||
}
|
||||
}
|
||||
return d
|
||||
}
|
||||
43
internal/telemetry/telemetry_attrfilter_test.go
Normal file
43
internal/telemetry/telemetry_attrfilter_test.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
)
|
||||
|
||||
// Test that disallowed attributes are filtered from the exposition.
|
||||
func TestAttributeFilterDropsUnknownKeys(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"}
|
||||
tel, err := Init(ctx, cfg)
|
||||
if err != nil { t.Fatalf("init: %v", err) }
|
||||
defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
|
||||
if tel.PrometheusHandler == nil { t.Fatalf("prom handler nil") }
|
||||
ts := httptest.NewServer(tel.PrometheusHandler)
|
||||
defer ts.Close()
|
||||
|
||||
// Add samples with disallowed attribute keys
|
||||
for _, k := range []string{"forbidden", "site_id", "host"} {
|
||||
set := attribute.NewSet(attribute.String(k, "x"))
|
||||
AddTunnelBytesSet(ctx, 123, set)
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
resp, err := http.Get(ts.URL)
|
||||
if err != nil { t.Fatalf("GET: %v", err) }
|
||||
defer resp.Body.Close()
|
||||
b, _ := io.ReadAll(resp.Body)
|
||||
body := string(b)
|
||||
if strings.Contains(body, "forbidden=") {
|
||||
t.Fatalf("unexpected forbidden attribute leaked into metrics: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
50
internal/telemetry/telemetry_golden_test.go
Normal file
50
internal/telemetry/telemetry_golden_test.go
Normal file
@@ -0,0 +1,50 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Golden test that /metrics contains expected metric names.
|
||||
func TestMetricsGoldenContains(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0", BuildVersion: "test"}
|
||||
tel, err := Init(ctx, cfg)
|
||||
if err != nil { t.Fatalf("telemetry init error: %v", err) }
|
||||
defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
|
||||
if tel.PrometheusHandler == nil { t.Fatalf("prom handler nil") }
|
||||
ts := httptest.NewServer(tel.PrometheusHandler)
|
||||
defer ts.Close()
|
||||
|
||||
// Trigger a counter
|
||||
IncConnAttempt(ctx, "ignored", "websocket", "success")
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
resp, err := http.Get(ts.URL)
|
||||
if err != nil { t.Fatalf("GET metrics failed: %v", err) }
|
||||
defer resp.Body.Close()
|
||||
b, _ := io.ReadAll(resp.Body)
|
||||
body := string(b)
|
||||
|
||||
f, err := os.Open("internal/telemetry/testdata/expected_contains.golden")
|
||||
if err != nil { t.Fatalf("read golden: %v", err) }
|
||||
defer f.Close()
|
||||
s := bufio.NewScanner(f)
|
||||
for s.Scan() {
|
||||
needle := strings.TrimSpace(s.Text())
|
||||
if needle == "" { continue }
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("expected metrics body to contain %q. body=\n%s", needle, body)
|
||||
}
|
||||
}
|
||||
if err := s.Err(); err != nil { t.Fatalf("scan golden: %v", err) }
|
||||
}
|
||||
|
||||
54
internal/telemetry/telemetry_smoke_test.go
Normal file
54
internal/telemetry/telemetry_smoke_test.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package telemetry
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Smoke test that /metrics contains at least one newt_* metric when Prom exporter is enabled.
|
||||
func TestMetricsSmoke(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
cfg := Config{
|
||||
ServiceName: "newt",
|
||||
PromEnabled: true,
|
||||
OTLPEnabled: false,
|
||||
AdminAddr: "127.0.0.1:0",
|
||||
BuildVersion: "test",
|
||||
BuildCommit: "deadbeef",
|
||||
MetricExportInterval: 5 * time.Second,
|
||||
}
|
||||
tel, err := Init(ctx, cfg)
|
||||
if err != nil {
|
||||
t.Fatalf("telemetry init error: %v", err)
|
||||
}
|
||||
defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
|
||||
// Serve the Prom handler on a test server
|
||||
if tel.PrometheusHandler == nil {
|
||||
t.Fatalf("Prometheus handler nil; PromEnabled should enable it")
|
||||
}
|
||||
ts := httptest.NewServer(tel.PrometheusHandler)
|
||||
defer ts.Close()
|
||||
|
||||
// Record a simple metric and then fetch /metrics
|
||||
IncConnAttempt(ctx, "site-1", "websocket", "success")
|
||||
// Give the exporter a tick to collect
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
resp, err := http.Get(ts.URL)
|
||||
if err != nil {
|
||||
t.Fatalf("GET /metrics failed: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
b, _ := io.ReadAll(resp.Body)
|
||||
body := string(b)
|
||||
if !strings.Contains(body, "newt_connection_attempts_total") {
|
||||
t.Fatalf("expected newt_connection_attempts_total in metrics, got:\n%s", body)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user