// Protocol labels. Deliberately a tiny fixed set so the "protocol"
// metric label stays low-cardinality.
const (
	ProtocolTCP = "tcp"
	ProtocolUDP = "udp"
)

// Reconnect reason bins. A fixed enumeration so the "reason" label can
// never explode in cardinality; callers must pick one of these rather
// than passing free-form error text.
const (
	ReasonServerRequest  = "server_request"
	ReasonTimeout        = "timeout"
	ReasonPeerClose      = "peer_close"
	ReasonNetworkChange  = "network_change"
	ReasonAuthError      = "auth_error"
	ReasonHandshakeError = "handshake_error"
	ReasonConfigChange   = "config_change"
	ReasonError          = "error"
)
+import "testing" + +func TestAllowedConstants(t *testing.T) { + allowedReasons := map[string]struct{}{ + ReasonServerRequest: {}, + ReasonTimeout: {}, + ReasonPeerClose: {}, + ReasonNetworkChange: {}, + ReasonAuthError: {}, + ReasonHandshakeError: {}, + ReasonConfigChange: {}, + ReasonError: {}, + } + for k := range allowedReasons { + if k == "" { + t.Fatalf("empty reason constant") + } + } + + allowedProtocols := map[string]struct{}{ + ProtocolTCP: {}, + ProtocolUDP: {}, + } + for k := range allowedProtocols { + if k == "" { + t.Fatalf("empty protocol constant") + } + } +} + diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go new file mode 100644 index 0000000..130fbd3 --- /dev/null +++ b/internal/telemetry/metrics.go @@ -0,0 +1,231 @@ +package telemetry + +import ( + "context" + "sync" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Instruments and helpers for Newt metrics following the naming, units, and +// low-cardinality label guidance from the issue description. +// +// Counters end with _total, durations are in seconds, sizes in bytes. +// Only low-cardinality stable labels are supported: site_id, tunnel_id, +// transport, direction, result, reason, error_type, region. 
+var ( + initOnce sync.Once + + meter metric.Meter + + // Site / Registration + mSiteRegistrations metric.Int64Counter + mSiteOnline metric.Int64ObservableGauge + mSiteLastHeartbeat metric.Float64ObservableGauge + + // Tunnel / Sessions + mTunnelSessions metric.Int64ObservableGauge + mTunnelBytes metric.Int64Counter + mTunnelLatency metric.Float64Histogram + mReconnects metric.Int64Counter + + // Connection / NAT + mConnAttempts metric.Int64Counter + mConnErrors metric.Int64Counter + + // Config/Restart + mConfigReloads metric.Int64Counter + mRestartCount metric.Int64Counter + + // Build info + mBuildInfo metric.Int64ObservableGauge + + buildVersion string + buildCommit string +) + +func registerInstruments() error { + var err error + initOnce.Do(func() { + meter = otel.Meter("newt") + + // Site / Registration + mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total", + metric.WithDescription("Total site registration attempts")) + if err != nil { + return + } + mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online", + metric.WithDescription("Site online (0/1)")) + if err != nil { + return + } + mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_seconds", + metric.WithDescription("Seconds since last site heartbeat")) + if err != nil { + return + } + + // Tunnel / Sessions + mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions", + metric.WithDescription("Active tunnel sessions")) + if err != nil { + return + } + mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total", + metric.WithDescription("Tunnel bytes in/out")) + if err != nil { + return + } + mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds", + metric.WithDescription("Per-tunnel latency in seconds")) + if err != nil { + return + } + mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total", + metric.WithDescription("Tunnel reconnect events")) + if err != nil { + return + } + + // 
Connection / NAT + mConnAttempts, err = meter.Int64Counter("newt_connection_attempts_total", + metric.WithDescription("Connection attempts")) + if err != nil { + return + } + mConnErrors, err = meter.Int64Counter("newt_connection_errors_total", + metric.WithDescription("Connection errors by type")) + if err != nil { + return + } + + // Config/Restart + mConfigReloads, _ = meter.Int64Counter("newt_config_reloads_total", + metric.WithDescription("Configuration reloads")) + mRestartCount, _ = meter.Int64Counter("newt_restart_count_total", + metric.WithDescription("Process restart count (incremented on start)")) + + // Build info gauge (value 1 with version/commit attributes) + mBuildInfo, _ = meter.Int64ObservableGauge("newt_build_info", + metric.WithDescription("Newt build information (value is always 1)")) + + // Register a default callback for build info if version/commit set + meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { + if buildVersion == "" && buildCommit == "" { + return nil + } + attrs := []attribute.KeyValue{} + if buildVersion != "" { + attrs = append(attrs, attribute.String("version", buildVersion)) + } + if buildCommit != "" { + attrs = append(attrs, attribute.String("commit", buildCommit)) + } + o.ObserveInt64(mBuildInfo, 1, metric.WithAttributes(attrs...)) + return nil + }, mBuildInfo) + }) + return err +} + +// Observable registration: Newt can register a callback to report gauges. +// Call SetObservableCallback once to start observing online status, last +// heartbeat seconds, and active sessions. + +var ( + obsOnce sync.Once + obsStopper func() +) + +// SetObservableCallback registers a single callback that will be invoked +// on collection. Use the provided observer to emit values for the observable +// gauges defined here. 
+// +// Example inside your code (where you have access to current state): +// +// telemetry.SetObservableCallback(func(ctx context.Context, o metric.Observer) error { +// o.ObserveInt64(mSiteOnline, 1, attribute.String("site_id", siteID)) +// o.ObserveFloat64(mSiteLastHeartbeat, time.Since(lastHB).Seconds(), attribute.String("site_id", siteID)) +// o.ObserveInt64(mTunnelSessions, int64(len(activeSessions)), attribute.String("site_id", siteID)) +// return nil +// }) +func SetObservableCallback(cb func(context.Context, metric.Observer) error) { + obsOnce.Do(func() { + meter.RegisterCallback(cb, mSiteOnline, mSiteLastHeartbeat, mTunnelSessions) + obsStopper = func() { /* no-op; otel callbacks are unregistered when provider shuts down */ } + }) +} + +// Build info registration +func RegisterBuildInfo(version, commit string) { + buildVersion = version + buildCommit = commit + // Increment restart count on boot + mRestartCount.Add(context.Background(), 1) +} + +// Config reloads +func IncConfigReload(ctx context.Context, result string) { + mConfigReloads.Add(ctx, 1, metric.WithAttributes(attribute.String("result", result))) +} + +// Helpers for counters/histograms + +func IncSiteRegistration(ctx context.Context, siteID, region, result string) { + attrs := []attribute.KeyValue{ + attribute.String("site_id", siteID), + attribute.String("result", result), + } + if region != "" { + attrs = append(attrs, attribute.String("region", region)) + } + mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrs...)) +} + +func AddTunnelBytes(ctx context.Context, siteID, tunnelID, direction string, n int64) { + mTunnelBytes.Add(ctx, n, metric.WithAttributes( + attribute.String("site_id", siteID), + attribute.String("tunnel_id", tunnelID), + attribute.String("direction", direction), + )) +} + +// AddTunnelBytesSet adds bytes using a pre-built attribute.Set to avoid per-call allocations. 
+func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) { + mTunnelBytes.Add(ctx, n, metric.WithAttributeSet(attrs)) +} + +func ObserveTunnelLatency(ctx context.Context, siteID, tunnelID, transport string, seconds float64) { + mTunnelLatency.Record(ctx, seconds, metric.WithAttributes( + attribute.String("site_id", siteID), + attribute.String("tunnel_id", tunnelID), + attribute.String("transport", transport), + )) +} + +func IncReconnect(ctx context.Context, siteID, tunnelID, reason string) { + mReconnects.Add(ctx, 1, metric.WithAttributes( + attribute.String("site_id", siteID), + attribute.String("tunnel_id", tunnelID), + attribute.String("reason", reason), + )) +} + +func IncConnAttempt(ctx context.Context, siteID, transport, result string) { + mConnAttempts.Add(ctx, 1, metric.WithAttributes( + attribute.String("site_id", siteID), + attribute.String("transport", transport), + attribute.String("result", result), + )) +} + +func IncConnError(ctx context.Context, siteID, transport, typ string) { + mConnErrors.Add(ctx, 1, metric.WithAttributes( + attribute.String("site_id", siteID), + attribute.String("transport", transport), + attribute.String("error_type", typ), + )) +} diff --git a/internal/telemetry/state_view.go b/internal/telemetry/state_view.go new file mode 100644 index 0000000..4c97ddf --- /dev/null +++ b/internal/telemetry/state_view.go @@ -0,0 +1,63 @@ +package telemetry + +import ( + "context" + "sync/atomic" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// StateView provides a read-only view for observable gauges. +// Implementations must be concurrency-safe and avoid blocking operations. +// All methods should be fast and use RLocks where applicable. +type StateView interface { + // ListSites returns a stable, low-cardinality list of site IDs to expose. + ListSites() []string + // Online returns whether the site is online. 
+ Online(siteID string) (online bool, ok bool) + // LastHeartbeat returns the last heartbeat time for a site. + LastHeartbeat(siteID string) (t time.Time, ok bool) + // ActiveSessions returns the current number of active sessions for a site (across tunnels), + // or scoped to site if your model is site-scoped. + ActiveSessions(siteID string) (n int64, ok bool) +} + +var ( + stateView atomic.Value // of type StateView +) + +// RegisterStateView sets the global StateView used by the default observable callback. +func RegisterStateView(v StateView) { + stateView.Store(v) + // If instruments are registered, ensure a callback exists. + if v != nil { + SetObservableCallback(func(ctx context.Context, o metric.Observer) error { + if any := stateView.Load(); any != nil { + if sv, ok := any.(StateView); ok { + for _, siteID := range sv.ListSites() { + if online, ok := sv.Online(siteID); ok { + val := int64(0) + if online { + val = 1 + } + o.ObserveInt64(mSiteOnline, val, metric.WithAttributes(attribute.String("site_id", siteID))) + } + if t, ok := sv.LastHeartbeat(siteID); ok { + secs := time.Since(t).Seconds() + o.ObserveFloat64(mSiteLastHeartbeat, secs, metric.WithAttributes(attribute.String("site_id", siteID))) + } + // If the view supports per-tunnel sessions, report them labeled by tunnel_id. 
+ if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok { + for tid, n := range tm.SessionsByTunnel() { + o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attribute.String("tunnel_id", tid))) + } + } + } + } + } + return nil + }) + } +} diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go new file mode 100644 index 0000000..20a25c0 --- /dev/null +++ b/internal/telemetry/telemetry.go @@ -0,0 +1,265 @@ +package telemetry + +import ( + "context" + "errors" + "net/http" + "os" + "strings" + "time" + + promclient "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "go.opentelemetry.io/contrib/instrumentation/runtime" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/exporters/prometheus" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" + "google.golang.org/grpc/credentials" +) + +// Config controls telemetry initialization via env flags. 
// Config controls telemetry initialization via env flags.
//
// Defaults align with the issue requirements:
//   - Prometheus exporter enabled by default (/metrics)
//   - OTLP exporter disabled by default
//   - Durations in seconds, bytes in raw bytes
//   - Admin HTTP server address configurable (for mounting /metrics)
type Config struct {
	ServiceName    string
	ServiceVersion string

	// Optional resource attributes (attached to the OTel resource by Init).
	// NOTE(review): FromEnv populates Region but never SiteID — confirm
	// whether SiteID is meant to be set programmatically only.
	SiteID string
	Region string

	PromEnabled bool // expose a Prometheus /metrics handler
	OTLPEnabled bool // also push metrics (and traces) over OTLP gRPC

	OTLPEndpoint string // host:port
	OTLPInsecure bool   // plaintext gRPC when true

	MetricExportInterval time.Duration // OTLP periodic-reader interval
	AdminAddr            string        // e.g.: "127.0.0.1:2112"

	// Optional build info for the newt_build_info metric
	BuildVersion string
	BuildCommit  string
}

// FromEnv reads configuration from environment variables.
//
// NEWT_METRICS_PROMETHEUS_ENABLED (default: true)
// NEWT_METRICS_OTLP_ENABLED       (default: false)
// NEWT_REGION                     (default: "")
// OTEL_EXPORTER_OTLP_ENDPOINT     (default: "localhost:4317")
// OTEL_EXPORTER_OTLP_INSECURE     (default: true)
// OTEL_METRIC_EXPORT_INTERVAL     (default: 15s)
// OTEL_SERVICE_NAME               (default: "newt")
// OTEL_SERVICE_VERSION            (default: "")
// NEWT_ADMIN_ADDR                 (default: "127.0.0.1:2112")
//
// Boolean flags compare against the literal string "true"; any other value
// (including "1"/"TRUE") is treated as false.
func FromEnv() Config {
	return Config{
		ServiceName:          getenv("OTEL_SERVICE_NAME", "newt"),
		ServiceVersion:       os.Getenv("OTEL_SERVICE_VERSION"),
		Region:               os.Getenv("NEWT_REGION"),
		PromEnabled:          getenv("NEWT_METRICS_PROMETHEUS_ENABLED", "true") == "true",
		OTLPEnabled:          getenv("NEWT_METRICS_OTLP_ENABLED", "false") == "true",
		OTLPEndpoint:         getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"),
		OTLPInsecure:         getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true",
		MetricExportInterval: getdur("OTEL_METRIC_EXPORT_INTERVAL", 15*time.Second),
		AdminAddr:            getenv("NEWT_ADMIN_ADDR", "127.0.0.1:2112"),
	}
}

// Setup holds initialized telemetry providers and (optionally) a /metrics handler.
// Call Shutdown when the process terminates to flush exporters.
+type Setup struct { + MeterProvider *sdkmetric.MeterProvider + TracerProvider *sdktrace.TracerProvider + + PrometheusHandler http.Handler // nil if Prometheus exporter disabled + + shutdowns []func(context.Context) error +} + +// Init configures OpenTelemetry metrics and (optionally) tracing. +// +// It sets a global MeterProvider and TracerProvider, registers runtime instrumentation, +// installs recommended histogram views for *_latency_seconds, and returns a Setup with +// a Shutdown method to flush exporters. +func Init(ctx context.Context, cfg Config) (*Setup, error) { + res, _ := resource.New(ctx, + resource.WithFromEnv(), + resource.WithHost(), + resource.WithAttributes( + semconv.ServiceName(cfg.ServiceName), + semconv.ServiceVersion(cfg.ServiceVersion), + // Optional resource attributes + attribute.String("site_id", cfg.SiteID), + attribute.String("region", cfg.Region), + ), + ) + + s := &Setup{} + + // Build metric readers/exporters + var readers []sdkmetric.Reader + + // Prometheus exporter exposes a native /metrics handler for scraping + if cfg.PromEnabled { + reg := promclient.NewRegistry() + exp, err := prometheus.New(prometheus.WithRegisterer(reg)) + if err != nil { + return nil, err + } + readers = append(readers, exp) + s.PrometheusHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{}) + } + + // Optional OTLP metric exporter (gRPC) + if cfg.OTLPEnabled { + mopts := []otlpmetricgrpc.Option{otlpmetricgrpc.WithEndpoint(cfg.OTLPEndpoint)} + // Headers support via OTEL_EXPORTER_OTLP_HEADERS (k=v,k2=v2) + if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { + mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs)) + } + if cfg.OTLPInsecure { + mopts = append(mopts, otlpmetricgrpc.WithInsecure()) + } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { + creds, cerr := credentials.NewClientTLSFromFile(certFile, "") + if cerr == nil { + mopts = append(mopts, 
otlpmetricgrpc.WithTLSCredentials(creds)) + } + } + mexp, err := otlpmetricgrpc.New(ctx, mopts...) + if err != nil { + return nil, err + } + readers = append(readers, sdkmetric.NewPeriodicReader(mexp, sdkmetric.WithInterval(cfg.MetricExportInterval))) + s.shutdowns = append(s.shutdowns, mexp.Shutdown) + } + + // Build provider options iteratively (WithReader is not variadic) + var mpOpts []sdkmetric.Option + mpOpts = append(mpOpts, sdkmetric.WithResource(res)) + for _, r := range readers { + mpOpts = append(mpOpts, sdkmetric.WithReader(r)) + } + // Default view for latency histograms in seconds. + mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView( + sdkmetric.Instrument{ + Name: "newt_*_latency_seconds", + }, + sdkmetric.Stream{ + Aggregation: sdkmetric.AggregationExplicitBucketHistogram{ + Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}, + }, + }, + ))) + // Attribute whitelist: only allow expected low-cardinality keys on newt_* instruments. + mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView( + sdkmetric.Instrument{Name: "newt_*"}, + sdkmetric.Stream{ + AttributeFilter: func(kv attribute.KeyValue) bool { + k := string(kv.Key) + switch k { + case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type": + return true + default: + return false + } + }, + }, + ))) + mp := sdkmetric.NewMeterProvider(mpOpts...) 
+ otel.SetMeterProvider(mp) + s.MeterProvider = mp + s.shutdowns = append(s.shutdowns, mp.Shutdown) + + // Optional tracing (OTLP over gRPC) + if cfg.OTLPEnabled { + topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)} + if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 { + topts = append(topts, otlptracegrpc.WithHeaders(hdrs)) + } + if cfg.OTLPInsecure { + topts = append(topts, otlptracegrpc.WithInsecure()) + } else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" { + creds, cerr := credentials.NewClientTLSFromFile(certFile, "") + if cerr == nil { + topts = append(topts, otlptracegrpc.WithTLSCredentials(creds)) + } + } + exp, err := otlptracegrpc.New(ctx, topts...) + if err == nil { + tp := sdktrace.NewTracerProvider( + sdktrace.WithBatcher(exp), + sdktrace.WithResource(res), + ) + otel.SetTracerProvider(tp) + s.TracerProvider = tp + s.shutdowns = append(s.shutdowns, func(ctx context.Context) error { + return errors.Join(exp.Shutdown(ctx), tp.Shutdown(ctx)) + }) + } + } + + // Export Go runtime metrics (goroutines, GC, mem, etc.) + _ = runtime.Start(runtime.WithMeterProvider(mp)) + + // Register instruments after provider is set + if err := registerInstruments(); err != nil { + return nil, err + } + // Optional build info metric + if cfg.BuildVersion != "" || cfg.BuildCommit != "" { + RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit) + } + + return s, nil +} + +// Shutdown flushes exporters and providers in reverse init order. 
+func (s *Setup) Shutdown(ctx context.Context) error { + var err error + for i := len(s.shutdowns) - 1; i >= 0; i-- { + err = errors.Join(err, s.shutdowns[i](ctx)) + } + return err +} + +func parseOTLPHeaders(h string) map[string]string { + m := map[string]string{} + if h == "" { + return m + } + pairs := strings.Split(h, ",") + for _, p := range pairs { + kv := strings.SplitN(strings.TrimSpace(p), "=", 2) + if len(kv) == 2 { + m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1]) + } + } + return m +} + +func getenv(k, d string) string { + if v := os.Getenv(k); v != "" { + return v + } + return d +} + +func getdur(k string, d time.Duration) time.Duration { + if v := os.Getenv(k); v != "" { + if p, e := time.ParseDuration(v); e == nil { + return p + } + } + return d +} diff --git a/internal/telemetry/telemetry_attrfilter_test.go b/internal/telemetry/telemetry_attrfilter_test.go new file mode 100644 index 0000000..461888f --- /dev/null +++ b/internal/telemetry/telemetry_attrfilter_test.go @@ -0,0 +1,43 @@ +package telemetry + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "go.opentelemetry.io/otel/attribute" +) + +// Test that disallowed attributes are filtered from the exposition. 
+func TestAttributeFilterDropsUnknownKeys(t *testing.T) { + ctx := context.Background() +cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0"} + tel, err := Init(ctx, cfg) + if err != nil { t.Fatalf("init: %v", err) } + defer func() { _ = tel.Shutdown(context.Background()) }() + + if tel.PrometheusHandler == nil { t.Fatalf("prom handler nil") } + ts := httptest.NewServer(tel.PrometheusHandler) + defer ts.Close() + +// Add samples with disallowed attribute keys + for _, k := range []string{"forbidden", "site_id", "host"} { + set := attribute.NewSet(attribute.String(k, "x")) + AddTunnelBytesSet(ctx, 123, set) + } + time.Sleep(50 * time.Millisecond) + + resp, err := http.Get(ts.URL) + if err != nil { t.Fatalf("GET: %v", err) } + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + body := string(b) + if strings.Contains(body, "forbidden=") { + t.Fatalf("unexpected forbidden attribute leaked into metrics: %s", body) + } +} + diff --git a/internal/telemetry/telemetry_golden_test.go b/internal/telemetry/telemetry_golden_test.go new file mode 100644 index 0000000..91dcbd2 --- /dev/null +++ b/internal/telemetry/telemetry_golden_test.go @@ -0,0 +1,50 @@ +package telemetry + +import ( + "bufio" + "context" + "io" + "net/http" + "net/http/httptest" + "os" + "strings" + "testing" + "time" +) + +// Golden test that /metrics contains expected metric names. 
+func TestMetricsGoldenContains(t *testing.T) { + ctx := context.Background() +cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0", BuildVersion: "test"} + tel, err := Init(ctx, cfg) + if err != nil { t.Fatalf("telemetry init error: %v", err) } + defer func() { _ = tel.Shutdown(context.Background()) }() + + if tel.PrometheusHandler == nil { t.Fatalf("prom handler nil") } + ts := httptest.NewServer(tel.PrometheusHandler) + defer ts.Close() + + // Trigger a counter + IncConnAttempt(ctx, "ignored", "websocket", "success") + time.Sleep(100 * time.Millisecond) + + resp, err := http.Get(ts.URL) + if err != nil { t.Fatalf("GET metrics failed: %v", err) } + defer resp.Body.Close() + b, _ := io.ReadAll(resp.Body) + body := string(b) + + f, err := os.Open("internal/telemetry/testdata/expected_contains.golden") + if err != nil { t.Fatalf("read golden: %v", err) } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + needle := strings.TrimSpace(s.Text()) + if needle == "" { continue } + if !strings.Contains(body, needle) { + t.Fatalf("expected metrics body to contain %q. body=\n%s", needle, body) + } + } + if err := s.Err(); err != nil { t.Fatalf("scan golden: %v", err) } +} + diff --git a/internal/telemetry/telemetry_smoke_test.go b/internal/telemetry/telemetry_smoke_test.go new file mode 100644 index 0000000..b820af1 --- /dev/null +++ b/internal/telemetry/telemetry_smoke_test.go @@ -0,0 +1,54 @@ +package telemetry + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +// Smoke test that /metrics contains at least one newt_* metric when Prom exporter is enabled. 
// TestMetricsSmoke initializes telemetry with the Prometheus exporter on,
// records one counter sample, and asserts the sample's metric name appears
// in the /metrics exposition.
func TestMetricsSmoke(t *testing.T) {
	ctx := context.Background()
	cfg := Config{
		ServiceName:          "newt",
		PromEnabled:          true,
		OTLPEnabled:          false,
		AdminAddr:            "127.0.0.1:0",
		BuildVersion:         "test",
		BuildCommit:          "deadbeef",
		MetricExportInterval: 5 * time.Second,
	}
	tel, err := Init(ctx, cfg)
	if err != nil {
		t.Fatalf("telemetry init error: %v", err)
	}
	defer func() { _ = tel.Shutdown(context.Background()) }()

	// Serve the Prom handler on a test server
	if tel.PrometheusHandler == nil {
		t.Fatalf("Prometheus handler nil; PromEnabled should enable it")
	}
	ts := httptest.NewServer(tel.PrometheusHandler)
	defer ts.Close()

	// Record a simple metric and then fetch /metrics
	IncConnAttempt(ctx, "site-1", "websocket", "success")
	// Give the exporter a tick to collect
	time.Sleep(100 * time.Millisecond)

	resp, err := http.Get(ts.URL)
	if err != nil {
		t.Fatalf("GET /metrics failed: %v", err)
	}
	defer resp.Body.Close()
	// Read error deliberately ignored: an empty body fails the Contains
	// assertion below anyway.
	b, _ := io.ReadAll(resp.Body)
	body := string(b)
	if !strings.Contains(body, "newt_connection_attempts_total") {
		t.Fatalf("expected newt_connection_attempts_total in metrics, got:\n%s", body)
	}
}