Files
newt/internal/telemetry/telemetry.go

266 lines
8.3 KiB
Go

package telemetry
import (
"context"
"errors"
"net/http"
"os"
"strings"
"time"
promclient "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.opentelemetry.io/contrib/instrumentation/runtime"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/exporters/prometheus"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
"google.golang.org/grpc/credentials"
)
// Config controls telemetry initialization via env flags.
//
// Defaults align with the issue requirements:
// - Prometheus exporter enabled by default (/metrics)
// - OTLP exporter disabled by default
// - Durations in seconds, bytes in raw bytes
// - Admin HTTP server address configurable (for mounting /metrics)
type Config struct {
ServiceName string
ServiceVersion string
// Optional resource attributes
SiteID string
Region string
PromEnabled bool
OTLPEnabled bool
OTLPEndpoint string // host:port
OTLPInsecure bool
MetricExportInterval time.Duration
AdminAddr string // e.g.: ":2112"
// Optional build info for newt_build_info metric
BuildVersion string
BuildCommit string
}
// FromEnv reads configuration from environment variables.
//
// NEWT_METRICS_PROMETHEUS_ENABLED (default: true)
// NEWT_METRICS_OTLP_ENABLED (default: false)
// OTEL_EXPORTER_OTLP_ENDPOINT (default: "localhost:4317")
// OTEL_EXPORTER_OTLP_INSECURE (default: true)
// OTEL_METRIC_EXPORT_INTERVAL (default: 15s)
// OTEL_SERVICE_NAME (default: "newt")
// OTEL_SERVICE_VERSION (default: "")
// NEWT_ADMIN_ADDR (default: ":2112")
func FromEnv() Config {
return Config{
ServiceName: getenv("OTEL_SERVICE_NAME", "newt"),
ServiceVersion: os.Getenv("OTEL_SERVICE_VERSION"),
Region: os.Getenv("NEWT_REGION"),
PromEnabled: getenv("NEWT_METRICS_PROMETHEUS_ENABLED", "true") == "true",
OTLPEnabled: getenv("NEWT_METRICS_OTLP_ENABLED", "false") == "true",
OTLPEndpoint: getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "localhost:4317"),
OTLPInsecure: getenv("OTEL_EXPORTER_OTLP_INSECURE", "true") == "true",
MetricExportInterval: getdur("OTEL_METRIC_EXPORT_INTERVAL", 15*time.Second),
AdminAddr: getenv("NEWT_ADMIN_ADDR", "127.0.0.1:2112"),
}
}
// Setup holds initialized telemetry providers and (optionally) a /metrics handler.
// Call Shutdown when the process terminates to flush exporters.
type Setup struct {
MeterProvider *sdkmetric.MeterProvider
TracerProvider *sdktrace.TracerProvider
PrometheusHandler http.Handler // nil if Prometheus exporter disabled
shutdowns []func(context.Context) error
}
// Init configures OpenTelemetry metrics and (optionally) tracing.
//
// It sets a global MeterProvider and TracerProvider, registers runtime instrumentation,
// installs recommended histogram views for *_latency_seconds, and returns a Setup with
// a Shutdown method to flush exporters.
func Init(ctx context.Context, cfg Config) (*Setup, error) {
res, _ := resource.New(ctx,
resource.WithFromEnv(),
resource.WithHost(),
resource.WithAttributes(
semconv.ServiceName(cfg.ServiceName),
semconv.ServiceVersion(cfg.ServiceVersion),
// Optional resource attributes
attribute.String("site_id", cfg.SiteID),
attribute.String("region", cfg.Region),
),
)
s := &Setup{}
// Build metric readers/exporters
var readers []sdkmetric.Reader
// Prometheus exporter exposes a native /metrics handler for scraping
if cfg.PromEnabled {
reg := promclient.NewRegistry()
exp, err := prometheus.New(prometheus.WithRegisterer(reg))
if err != nil {
return nil, err
}
readers = append(readers, exp)
s.PrometheusHandler = promhttp.HandlerFor(reg, promhttp.HandlerOpts{})
}
// Optional OTLP metric exporter (gRPC)
if cfg.OTLPEnabled {
mopts := []otlpmetricgrpc.Option{otlpmetricgrpc.WithEndpoint(cfg.OTLPEndpoint)}
// Headers support via OTEL_EXPORTER_OTLP_HEADERS (k=v,k2=v2)
if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 {
mopts = append(mopts, otlpmetricgrpc.WithHeaders(hdrs))
}
if cfg.OTLPInsecure {
mopts = append(mopts, otlpmetricgrpc.WithInsecure())
} else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" {
creds, cerr := credentials.NewClientTLSFromFile(certFile, "")
if cerr == nil {
mopts = append(mopts, otlpmetricgrpc.WithTLSCredentials(creds))
}
}
mexp, err := otlpmetricgrpc.New(ctx, mopts...)
if err != nil {
return nil, err
}
readers = append(readers, sdkmetric.NewPeriodicReader(mexp, sdkmetric.WithInterval(cfg.MetricExportInterval)))
s.shutdowns = append(s.shutdowns, mexp.Shutdown)
}
// Build provider options iteratively (WithReader is not variadic)
var mpOpts []sdkmetric.Option
mpOpts = append(mpOpts, sdkmetric.WithResource(res))
for _, r := range readers {
mpOpts = append(mpOpts, sdkmetric.WithReader(r))
}
// Default view for latency histograms in seconds.
mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView(
sdkmetric.Instrument{
Name: "newt_*_latency_seconds",
},
sdkmetric.Stream{
Aggregation: sdkmetric.AggregationExplicitBucketHistogram{
Boundaries: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30},
},
},
)))
// Attribute whitelist: only allow expected low-cardinality keys on newt_* instruments.
mpOpts = append(mpOpts, sdkmetric.WithView(sdkmetric.NewView(
sdkmetric.Instrument{Name: "newt_*"},
sdkmetric.Stream{
AttributeFilter: func(kv attribute.KeyValue) bool {
k := string(kv.Key)
switch k {
case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type":
return true
default:
return false
}
},
},
)))
mp := sdkmetric.NewMeterProvider(mpOpts...)
otel.SetMeterProvider(mp)
s.MeterProvider = mp
s.shutdowns = append(s.shutdowns, mp.Shutdown)
// Optional tracing (OTLP over gRPC)
if cfg.OTLPEnabled {
topts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(cfg.OTLPEndpoint)}
if hdrs := parseOTLPHeaders(os.Getenv("OTEL_EXPORTER_OTLP_HEADERS")); len(hdrs) > 0 {
topts = append(topts, otlptracegrpc.WithHeaders(hdrs))
}
if cfg.OTLPInsecure {
topts = append(topts, otlptracegrpc.WithInsecure())
} else if certFile := os.Getenv("OTEL_EXPORTER_OTLP_CERTIFICATE"); certFile != "" {
creds, cerr := credentials.NewClientTLSFromFile(certFile, "")
if cerr == nil {
topts = append(topts, otlptracegrpc.WithTLSCredentials(creds))
}
}
exp, err := otlptracegrpc.New(ctx, topts...)
if err == nil {
tp := sdktrace.NewTracerProvider(
sdktrace.WithBatcher(exp),
sdktrace.WithResource(res),
)
otel.SetTracerProvider(tp)
s.TracerProvider = tp
s.shutdowns = append(s.shutdowns, func(ctx context.Context) error {
return errors.Join(exp.Shutdown(ctx), tp.Shutdown(ctx))
})
}
}
// Export Go runtime metrics (goroutines, GC, mem, etc.)
_ = runtime.Start(runtime.WithMeterProvider(mp))
// Register instruments after provider is set
if err := registerInstruments(); err != nil {
return nil, err
}
// Optional build info metric
if cfg.BuildVersion != "" || cfg.BuildCommit != "" {
RegisterBuildInfo(cfg.BuildVersion, cfg.BuildCommit)
}
return s, nil
}
// Shutdown flushes exporters and providers in reverse init order.
func (s *Setup) Shutdown(ctx context.Context) error {
var err error
for i := len(s.shutdowns) - 1; i >= 0; i-- {
err = errors.Join(err, s.shutdowns[i](ctx))
}
return err
}
func parseOTLPHeaders(h string) map[string]string {
m := map[string]string{}
if h == "" {
return m
}
pairs := strings.Split(h, ",")
for _, p := range pairs {
kv := strings.SplitN(strings.TrimSpace(p), "=", 2)
if len(kv) == 2 {
m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
}
}
return m
}
func getenv(k, d string) string {
if v := os.Getenv(k); v != "" {
return v
}
return d
}
func getdur(k string, d time.Duration) time.Duration {
if v := os.Getenv(k); v != "" {
if p, e := time.ParseDuration(v); e == nil {
return p
}
}
return d
}