refactor: Simplify telemetry metrics by removing site_id and enhancing tunnel_id usage

This commit is contained in:
Marc Schäfer
2025-10-07 18:43:09 +02:00
parent f8fd8e1bc5
commit a86b14d97d
10 changed files with 53 additions and 54 deletions

View File

@@ -13,8 +13,8 @@ import (
// low-cardinality label guidance from the issue description.
//
// Counters end with _total, durations are in seconds, sizes in bytes.
// Only low-cardinality stable labels are supported: site_id, tunnel_id,
// transport, direction, result, reason, error_type, region.
// Only low-cardinality stable labels are supported: tunnel_id,
// transport, direction, result, reason, error_type.
var (
initOnce sync.Once
@@ -147,9 +147,9 @@ var (
// Example inside your code (where you have access to current state):
//
// telemetry.SetObservableCallback(func(ctx context.Context, o metric.Observer) error {
// o.ObserveInt64(mSiteOnline, 1, attribute.String("site_id", siteID))
// o.ObserveFloat64(mSiteLastHeartbeat, time.Since(lastHB).Seconds(), attribute.String("site_id", siteID))
// o.ObserveInt64(mTunnelSessions, int64(len(activeSessions)), attribute.String("site_id", siteID))
// o.ObserveInt64(mSiteOnline, 1)
// o.ObserveFloat64(mSiteLastHeartbeat, time.Since(lastHB).Seconds())
// o.ObserveInt64(mTunnelSessions, int64(len(activeSessions)))
// return nil
// })
func SetObservableCallback(cb func(context.Context, metric.Observer) error) {
@@ -174,20 +174,15 @@ func IncConfigReload(ctx context.Context, result string) {
// Helpers for counters/histograms
func IncSiteRegistration(ctx context.Context, siteID, region, result string) {
func IncSiteRegistration(ctx context.Context, result string) {
attrs := []attribute.KeyValue{
attribute.String("site_id", siteID),
attribute.String("result", result),
}
if region != "" {
attrs = append(attrs, attribute.String("region", region))
}
mSiteRegistrations.Add(ctx, 1, metric.WithAttributes(attrs...))
}
func AddTunnelBytes(ctx context.Context, siteID, tunnelID, direction string, n int64) {
func AddTunnelBytes(ctx context.Context, tunnelID, direction string, n int64) {
mTunnelBytes.Add(ctx, n, metric.WithAttributes(
attribute.String("site_id", siteID),
attribute.String("tunnel_id", tunnelID),
attribute.String("direction", direction),
))
@@ -198,33 +193,29 @@ func AddTunnelBytesSet(ctx context.Context, n int64, attrs attribute.Set) {
mTunnelBytes.Add(ctx, n, metric.WithAttributeSet(attrs))
}
func ObserveTunnelLatency(ctx context.Context, siteID, tunnelID, transport string, seconds float64) {
func ObserveTunnelLatency(ctx context.Context, tunnelID, transport string, seconds float64) {
mTunnelLatency.Record(ctx, seconds, metric.WithAttributes(
attribute.String("site_id", siteID),
attribute.String("tunnel_id", tunnelID),
attribute.String("transport", transport),
))
}
func IncReconnect(ctx context.Context, siteID, tunnelID, reason string) {
func IncReconnect(ctx context.Context, tunnelID, reason string) {
mReconnects.Add(ctx, 1, metric.WithAttributes(
attribute.String("site_id", siteID),
attribute.String("tunnel_id", tunnelID),
attribute.String("reason", reason),
))
}
func IncConnAttempt(ctx context.Context, siteID, transport, result string) {
func IncConnAttempt(ctx context.Context, transport, result string) {
mConnAttempts.Add(ctx, 1, metric.WithAttributes(
attribute.String("site_id", siteID),
attribute.String("transport", transport),
attribute.String("result", result),
))
}
func IncConnError(ctx context.Context, siteID, transport, typ string) {
func IncConnError(ctx context.Context, transport, typ string) {
mConnErrors.Add(ctx, 1, metric.WithAttributes(
attribute.String("site_id", siteID),
attribute.String("transport", transport),
attribute.String("error_type", typ),
))

View File

@@ -42,16 +42,19 @@ func RegisterStateView(v StateView) {
if online {
val = 1
}
o.ObserveInt64(mSiteOnline, val, metric.WithAttributes(attribute.String("site_id", siteID)))
o.ObserveInt64(mSiteOnline, val)
}
if t, ok := sv.LastHeartbeat(siteID); ok {
secs := time.Since(t).Seconds()
o.ObserveFloat64(mSiteLastHeartbeat, secs, metric.WithAttributes(attribute.String("site_id", siteID)))
o.ObserveFloat64(mSiteLastHeartbeat, secs)
}
// If the view supports per-tunnel sessions, report them labeled by tunnel_id (the updated call also attaches a fixed transport="tcp" label).
if tm, ok := any.(interface{ SessionsByTunnel() map[string]int64 }); ok {
for tid, n := range tm.SessionsByTunnel() {
o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(attribute.String("tunnel_id", tid)))
o.ObserveInt64(mTunnelSessions, n, metric.WithAttributes(
attribute.String("tunnel_id", tid),
attribute.String("transport", "tcp"),
))
}
}
}

View File

@@ -93,16 +93,21 @@ type Setup struct {
// installs recommended histogram views for *_latency_seconds, and returns a Setup with
// a Shutdown method to flush exporters.
func Init(ctx context.Context, cfg Config) (*Setup, error) {
// Build resource with required attributes and only include optional ones when non-empty
attrs := []attribute.KeyValue{
semconv.ServiceName(cfg.ServiceName),
semconv.ServiceVersion(cfg.ServiceVersion),
}
if cfg.SiteID != "" {
attrs = append(attrs, attribute.String("site_id", cfg.SiteID))
}
if cfg.Region != "" {
attrs = append(attrs, attribute.String("region", cfg.Region))
}
res, _ := resource.New(ctx,
resource.WithFromEnv(),
resource.WithHost(),
resource.WithAttributes(
semconv.ServiceName(cfg.ServiceName),
semconv.ServiceVersion(cfg.ServiceVersion),
// Optional resource attributes
attribute.String("site_id", cfg.SiteID),
attribute.String("region", cfg.Region),
),
resource.WithAttributes(attrs...),
)
s := &Setup{}
@@ -168,7 +173,7 @@ func Init(ctx context.Context, cfg Config) (*Setup, error) {
AttributeFilter: func(kv attribute.KeyValue) bool {
k := string(kv.Key)
switch k {
case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type":
case "tunnel_id", "transport", "direction", "protocol", "result", "reason", "error_type", "version", "commit":
return true
default:
return false

View File

@@ -25,7 +25,7 @@ cfg := Config{ServiceName: "newt", PromEnabled: true, AdminAddr: "127.0.0.1:0",
defer ts.Close()
// Trigger a counter
IncConnAttempt(ctx, "ignored", "websocket", "success")
IncConnAttempt(ctx, "websocket", "success")
time.Sleep(100 * time.Millisecond)
resp, err := http.Get(ts.URL)

View File

@@ -36,7 +36,7 @@ func TestMetricsSmoke(t *testing.T) {
defer ts.Close()
// Record a simple metric and then fetch /metrics
IncConnAttempt(ctx, "site-1", "websocket", "success")
IncConnAttempt(ctx, "websocket", "success")
// Give the exporter a tick to collect
time.Sleep(100 * time.Millisecond)