fix(docker-compose, prometheus, telemetry, proxy): standardize collector naming and improve error handling

This commit is contained in:
Marc Schäfer
2025-10-10 14:42:05 +02:00
parent 8d0e6be2c7
commit bd62da4cc9
5 changed files with 73 additions and 36 deletions

View File

@@ -1,5 +1,5 @@
services:
collector:
otel-collector:
image: otel/opentelemetry-collector:0.111.0
command: ["--config=/etc/otelcol/config.yaml"]
volumes:
@@ -15,14 +15,14 @@ services:
OTEL_SERVICE_NAME: newt
NEWT_METRICS_PROMETHEUS_ENABLED: "true"
NEWT_METRICS_OTLP_ENABLED: "true"
OTEL_EXPORTER_OTLP_ENDPOINT: "collector:4317"
OTEL_EXPORTER_OTLP_ENDPOINT: "otel-collector:4317"
OTEL_EXPORTER_OTLP_INSECURE: "true"
OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative"
NEWT_ADMIN_ADDR: "0.0.0.0:2112"
ports:
- "2112:2112"
depends_on:
- collector
- otel-collector
prometheus:
image: prom/prometheus:v2.55.0

View File

@@ -125,7 +125,7 @@ global:
scrape_configs:
- job_name: otel-collector
static_configs:
- targets: ["collector:8889"]
- targets: ["otel-collector:8889"]
```
Reason mapping (source → reason)

View File

@@ -18,4 +18,4 @@ scrape_configs:
# WARNING: Do not enable this together with the 'newt' job above or you will double-count.
# - job_name: 'otel-collector'
# static_configs:
# - targets: ['collector:8889']
# - targets: ['otel-collector:8889']

View File

@@ -70,11 +70,26 @@ func registerInstruments() error {
var err error
initOnce.Do(func() {
meter = otel.Meter("newt")
if e := registerSiteInstruments(); e != nil { err = e; return }
if e := registerTunnelInstruments(); e != nil { err = e; return }
if e := registerConnInstruments(); e != nil { err = e; return }
if e := registerConfigInstruments(); e != nil { err = e; return }
if e := registerBuildWSProxyInstruments(); e != nil { err = e; return }
if e := registerSiteInstruments(); e != nil {
err = e
return
}
if e := registerTunnelInstruments(); e != nil {
err = e
return
}
if e := registerConnInstruments(); e != nil {
err = e
return
}
if e := registerConfigInstruments(); e != nil {
err = e
return
}
if e := registerBuildWSProxyInstruments(); e != nil {
err = e
return
}
})
return err
}
@@ -83,13 +98,19 @@ func registerSiteInstruments() error {
var err error
mSiteRegistrations, err = meter.Int64Counter("newt_site_registrations_total",
metric.WithDescription("Total site registration attempts"))
if err != nil { return err }
if err != nil {
return err
}
mSiteOnline, err = meter.Int64ObservableGauge("newt_site_online",
metric.WithDescription("Site online (0/1)"))
if err != nil { return err }
if err != nil {
return err
}
mSiteLastHeartbeat, err = meter.Float64ObservableGauge("newt_site_last_heartbeat_seconds",
metric.WithDescription("Seconds since last site heartbeat"))
if err != nil { return err }
if err != nil {
return err
}
return nil
}
@@ -97,18 +118,26 @@ func registerTunnelInstruments() error {
var err error
mTunnelSessions, err = meter.Int64ObservableGauge("newt_tunnel_sessions",
metric.WithDescription("Active tunnel sessions"))
if err != nil { return err }
if err != nil {
return err
}
mTunnelBytes, err = meter.Int64Counter("newt_tunnel_bytes_total",
metric.WithDescription("Tunnel bytes ingress/egress"),
metric.WithUnit("By"))
if err != nil { return err }
if err != nil {
return err
}
mTunnelLatency, err = meter.Float64Histogram("newt_tunnel_latency_seconds",
metric.WithDescription("Per-tunnel latency in seconds"),
metric.WithUnit("s"))
if err != nil { return err }
if err != nil {
return err
}
mReconnects, err = meter.Int64Counter("newt_tunnel_reconnects_total",
metric.WithDescription("Tunnel reconnect events"))
if err != nil { return err }
if err != nil {
return err
}
return nil
}
@@ -116,10 +145,14 @@ func registerConnInstruments() error {
var err error
mConnAttempts, err = meter.Int64Counter("newt_connection_attempts_total",
metric.WithDescription("Connection attempts"))
if err != nil { return err }
if err != nil {
return err
}
mConnErrors, err = meter.Int64Counter("newt_connection_errors_total",
metric.WithDescription("Connection errors by type"))
if err != nil { return err }
if err != nil {
return err
}
return nil
}
@@ -310,10 +343,13 @@ func ObserveProxyAsyncBacklogObs(o metric.Observer, value int64, attrs []attribu
}
func IncProxyDrops(ctx context.Context, tunnelID, protocol string) {
mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(
attribute.String("tunnel_id", tunnelID),
attrs := []attribute.KeyValue{
attribute.String("protocol", protocol),
)...))
}
if ShouldIncludeTunnelID() && tunnelID != "" {
attrs = append(attrs, attribute.String("tunnel_id", tunnelID))
}
mProxyDropsTotal.Add(ctx, 1, metric.WithAttributes(attrsWithSite(attrs...)...))
}
// --- Config/PKI helpers ---

View File

@@ -275,7 +275,7 @@ func (pm *ProxyManager) Start() error {
telemetry.ObserveProxyActiveConnsObs(o, e.activeTCP.Load(), e.attrOutTCP.ToSlice())
telemetry.ObserveProxyActiveConnsObs(o, e.activeUDP.Load(), e.attrOutUDP.ToSlice())
// backlog bytes (sum of unflushed counters)
b := int64(e.bytesInTCP.Load()+e.bytesOutTCP.Load()+e.bytesInUDP.Load()+e.bytesOutUDP.Load())
b := int64(e.bytesInTCP.Load() + e.bytesOutTCP.Load() + e.bytesInUDP.Load() + e.bytesOutUDP.Load())
telemetry.ObserveProxyAsyncBacklogObs(o, b, e.attrOutTCP.ToSlice())
telemetry.ObserveProxyBufferBytesObs(o, b, e.attrOutTCP.ToSlice())
}
@@ -599,13 +599,14 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
}
targetConn, err = net.DialUDP("udp", nil, targetUDPAddr)
if e := pm.getEntry(pm.currentTunnelID); e != nil {
e.activeUDP.Add(1)
}
if err != nil {
logger.Error("Error connecting to target: %v", err)
continue
}
// Only increment activeUDP after a successful DialUDP
if e := pm.getEntry(pm.currentTunnelID); e != nil {
e.activeUDP.Add(1)
}
clientsMutex.Lock()
clientConns[clientKey] = targetConn