mirror of
https://github.com/fosrl/newt.git
synced 2026-02-22 04:46:42 +00:00
Adding OpenTelemetry Metrics and Tracing
This commit is contained in:
@@ -7,6 +7,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
@@ -18,6 +19,11 @@ import (
|
||||
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/gorilla/websocket"
|
||||
|
||||
"context"
|
||||
|
||||
"github.com/fosrl/newt/internal/telemetry"
|
||||
"go.opentelemetry.io/otel"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
@@ -37,6 +43,8 @@ type Client struct {
|
||||
writeMux sync.Mutex
|
||||
clientType string // Type of client (e.g., "newt", "olm")
|
||||
tlsConfig TLSConfig
|
||||
metricsCtxMu sync.RWMutex
|
||||
metricsCtx context.Context
|
||||
configNeedsSave bool // Flag to track if config needs to be saved
|
||||
}
|
||||
|
||||
@@ -81,6 +89,26 @@ func (c *Client) OnTokenUpdate(callback func(token string)) {
|
||||
c.onTokenUpdate = callback
|
||||
}
|
||||
|
||||
func (c *Client) metricsContext() context.Context {
|
||||
c.metricsCtxMu.RLock()
|
||||
defer c.metricsCtxMu.RUnlock()
|
||||
if c.metricsCtx != nil {
|
||||
return c.metricsCtx
|
||||
}
|
||||
return context.Background()
|
||||
}
|
||||
|
||||
func (c *Client) setMetricsContext(ctx context.Context) {
|
||||
c.metricsCtxMu.Lock()
|
||||
c.metricsCtx = ctx
|
||||
c.metricsCtxMu.Unlock()
|
||||
}
|
||||
|
||||
// MetricsContext exposes the context used for telemetry emission when a connection is active.
|
||||
func (c *Client) MetricsContext() context.Context {
|
||||
return c.metricsContext()
|
||||
}
|
||||
|
||||
// NewClient creates a new websocket client
|
||||
func NewClient(clientType string, ID, secret string, endpoint string, pingInterval time.Duration, pingTimeout time.Duration, opts ...ClientOption) (*Client, error) {
|
||||
config := &Config{
|
||||
@@ -140,6 +168,7 @@ func (c *Client) Close() error {
|
||||
|
||||
// Set connection status to false
|
||||
c.setConnected(false)
|
||||
telemetry.SetWSConnectionState(false)
|
||||
|
||||
// Close the WebSocket connection gracefully
|
||||
if c.conn != nil {
|
||||
@@ -170,7 +199,11 @@ func (c *Client) SendMessage(messageType string, data interface{}) error {
|
||||
|
||||
c.writeMux.Lock()
|
||||
defer c.writeMux.Unlock()
|
||||
return c.conn.WriteJSON(msg)
|
||||
if err := c.conn.WriteJSON(msg); err != nil {
|
||||
return err
|
||||
}
|
||||
telemetry.IncWSMessage(c.metricsContext(), "out", "text")
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Client) SendMessageInterval(messageType string, data interface{}, interval time.Duration) (stop func()) {
|
||||
@@ -265,8 +298,12 @@ func (c *Client) getToken() (string, error) {
|
||||
return "", fmt.Errorf("failed to marshal token request data: %w", err)
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Create a new request
|
||||
req, err := http.NewRequest(
|
||||
req, err := http.NewRequestWithContext(
|
||||
ctx,
|
||||
"POST",
|
||||
baseEndpoint+"/api/v1/auth/"+c.clientType+"/get-token",
|
||||
bytes.NewBuffer(jsonData),
|
||||
@@ -288,6 +325,8 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
telemetry.IncConnAttempt(ctx, "auth", "failure")
|
||||
telemetry.IncConnError(ctx, "auth", classifyConnError(err))
|
||||
return "", fmt.Errorf("failed to request new token: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
@@ -295,6 +334,16 @@ func (c *Client) getToken() (string, error) {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
telemetry.IncConnAttempt(ctx, "auth", "failure")
|
||||
etype := "io_error"
|
||||
if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
etype = "auth_failed"
|
||||
}
|
||||
telemetry.IncConnError(ctx, "auth", etype)
|
||||
// Reconnect reason mapping for auth failures
|
||||
if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonAuthError)
|
||||
}
|
||||
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
@@ -313,10 +362,55 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
|
||||
logger.Debug("Received token: %s", tokenResp.Data.Token)
|
||||
telemetry.IncConnAttempt(ctx, "auth", "success")
|
||||
|
||||
return tokenResp.Data.Token, nil
|
||||
}
|
||||
|
||||
// classifyConnError maps to fixed, low-cardinality error_type values.
|
||||
// Allowed enum: dial_timeout, tls_handshake, auth_failed, io_error
|
||||
func classifyConnError(err error) string {
|
||||
if err == nil {
|
||||
return ""
|
||||
}
|
||||
msg := strings.ToLower(err.Error())
|
||||
switch {
|
||||
case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
|
||||
return "tls_handshake"
|
||||
case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout") || strings.Contains(msg, "deadline exceeded"):
|
||||
return "dial_timeout"
|
||||
case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
|
||||
return "auth_failed"
|
||||
default:
|
||||
// Group remaining network/socket errors as io_error to avoid label explosion
|
||||
return "io_error"
|
||||
}
|
||||
}
|
||||
|
||||
func classifyWSDisconnect(err error) (result, reason string) {
|
||||
if err == nil {
|
||||
return "success", "normal"
|
||||
}
|
||||
if websocket.IsCloseError(err, websocket.CloseNormalClosure) {
|
||||
return "success", "normal"
|
||||
}
|
||||
if ne, ok := err.(net.Error); ok && ne.Timeout() {
|
||||
return "error", "timeout"
|
||||
}
|
||||
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure) {
|
||||
return "error", "unexpected_close"
|
||||
}
|
||||
msg := strings.ToLower(err.Error())
|
||||
switch {
|
||||
case strings.Contains(msg, "eof"):
|
||||
return "error", "eof"
|
||||
case strings.Contains(msg, "reset"):
|
||||
return "error", "connection_reset"
|
||||
default:
|
||||
return "error", "read_error"
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) connectWithRetry() {
|
||||
for {
|
||||
select {
|
||||
@@ -335,9 +429,13 @@ func (c *Client) connectWithRetry() {
|
||||
}
|
||||
|
||||
func (c *Client) establishConnection() error {
|
||||
ctx := context.Background()
|
||||
|
||||
// Get token for authentication
|
||||
token, err := c.getToken()
|
||||
if err != nil {
|
||||
telemetry.IncConnAttempt(ctx, "websocket", "failure")
|
||||
telemetry.IncConnError(ctx, "websocket", classifyConnError(err))
|
||||
return fmt.Errorf("failed to get token: %w", err)
|
||||
}
|
||||
|
||||
@@ -370,7 +468,12 @@ func (c *Client) establishConnection() error {
|
||||
q.Set("clientType", c.clientType)
|
||||
u.RawQuery = q.Encode()
|
||||
|
||||
// Connect to WebSocket
|
||||
// Connect to WebSocket (optional span)
|
||||
tr := otel.Tracer("newt")
|
||||
ctx, span := tr.Start(ctx, "ws.connect")
|
||||
defer span.End()
|
||||
|
||||
start := time.Now()
|
||||
dialer := websocket.DefaultDialer
|
||||
|
||||
// Use new TLS configuration method
|
||||
@@ -392,18 +495,42 @@ func (c *Client) establishConnection() error {
|
||||
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
|
||||
}
|
||||
|
||||
conn, _, err := dialer.Dial(u.String(), nil)
|
||||
conn, _, err := dialer.DialContext(ctx, u.String(), nil)
|
||||
lat := time.Since(start).Seconds()
|
||||
if err != nil {
|
||||
telemetry.IncConnAttempt(ctx, "websocket", "failure")
|
||||
etype := classifyConnError(err)
|
||||
telemetry.IncConnError(ctx, "websocket", etype)
|
||||
telemetry.ObserveWSConnectLatency(ctx, lat, "failure", etype)
|
||||
// Map handshake-related errors to reconnect reasons where appropriate
|
||||
if etype == "tls_handshake" {
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonHandshakeError)
|
||||
} else if etype == "dial_timeout" {
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonTimeout)
|
||||
} else {
|
||||
telemetry.IncReconnect(ctx, c.config.ID, "client", telemetry.ReasonError)
|
||||
}
|
||||
telemetry.IncWSReconnect(ctx, etype)
|
||||
return fmt.Errorf("failed to connect to WebSocket: %w", err)
|
||||
}
|
||||
|
||||
telemetry.IncConnAttempt(ctx, "websocket", "success")
|
||||
telemetry.ObserveWSConnectLatency(ctx, lat, "success", "")
|
||||
c.conn = conn
|
||||
c.setConnected(true)
|
||||
telemetry.SetWSConnectionState(true)
|
||||
c.setMetricsContext(ctx)
|
||||
sessionStart := time.Now()
|
||||
// Wire up pong handler for metrics
|
||||
c.conn.SetPongHandler(func(appData string) error {
|
||||
telemetry.IncWSMessage(c.metricsContext(), "in", "pong")
|
||||
return nil
|
||||
})
|
||||
|
||||
// Start the ping monitor
|
||||
go c.pingMonitor()
|
||||
// Start the read pump with disconnect detection
|
||||
go c.readPumpWithDisconnectDetection()
|
||||
go c.readPumpWithDisconnectDetection(sessionStart)
|
||||
|
||||
if c.onConnect != nil {
|
||||
err := c.saveConfig()
|
||||
@@ -496,6 +623,9 @@ func (c *Client) pingMonitor() {
|
||||
}
|
||||
c.writeMux.Lock()
|
||||
err := c.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(c.pingTimeout))
|
||||
if err == nil {
|
||||
telemetry.IncWSMessage(c.metricsContext(), "out", "ping")
|
||||
}
|
||||
c.writeMux.Unlock()
|
||||
if err != nil {
|
||||
// Check if we're shutting down before logging error and reconnecting
|
||||
@@ -505,6 +635,8 @@ func (c *Client) pingMonitor() {
|
||||
return
|
||||
default:
|
||||
logger.Error("Ping failed: %v", err)
|
||||
telemetry.IncWSKeepaliveFailure(c.metricsContext(), "ping_write")
|
||||
telemetry.IncWSReconnect(c.metricsContext(), "ping_write")
|
||||
c.reconnect()
|
||||
return
|
||||
}
|
||||
@@ -514,17 +646,26 @@ func (c *Client) pingMonitor() {
|
||||
}
|
||||
|
||||
// readPumpWithDisconnectDetection reads messages and triggers reconnect on error
|
||||
func (c *Client) readPumpWithDisconnectDetection() {
|
||||
func (c *Client) readPumpWithDisconnectDetection(started time.Time) {
|
||||
ctx := c.metricsContext()
|
||||
disconnectReason := "shutdown"
|
||||
disconnectResult := "success"
|
||||
|
||||
defer func() {
|
||||
if c.conn != nil {
|
||||
c.conn.Close()
|
||||
}
|
||||
if !started.IsZero() {
|
||||
telemetry.ObserveWSSessionDuration(ctx, time.Since(started).Seconds(), disconnectResult)
|
||||
}
|
||||
telemetry.IncWSDisconnect(ctx, disconnectReason, disconnectResult)
|
||||
// Only attempt reconnect if we're not shutting down
|
||||
select {
|
||||
case <-c.done:
|
||||
// Shutting down, don't reconnect
|
||||
return
|
||||
default:
|
||||
telemetry.IncWSReconnect(ctx, disconnectReason)
|
||||
c.reconnect()
|
||||
}
|
||||
}()
|
||||
@@ -532,23 +673,33 @@ func (c *Client) readPumpWithDisconnectDetection() {
|
||||
for {
|
||||
select {
|
||||
case <-c.done:
|
||||
disconnectReason = "shutdown"
|
||||
disconnectResult = "success"
|
||||
return
|
||||
default:
|
||||
var msg WSMessage
|
||||
err := c.conn.ReadJSON(&msg)
|
||||
if err == nil {
|
||||
telemetry.IncWSMessage(c.metricsContext(), "in", "text")
|
||||
}
|
||||
if err != nil {
|
||||
// Check if we're shutting down before logging error
|
||||
select {
|
||||
case <-c.done:
|
||||
// Expected during shutdown, don't log as error
|
||||
logger.Debug("WebSocket connection closed during shutdown")
|
||||
disconnectReason = "shutdown"
|
||||
disconnectResult = "success"
|
||||
return
|
||||
default:
|
||||
// Unexpected error during normal operation
|
||||
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure, websocket.CloseNormalClosure) {
|
||||
logger.Error("WebSocket read error: %v", err)
|
||||
} else {
|
||||
logger.Debug("WebSocket connection closed: %v", err)
|
||||
disconnectResult, disconnectReason = classifyWSDisconnect(err)
|
||||
if disconnectResult == "error" {
|
||||
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseAbnormalClosure, websocket.CloseNormalClosure) {
|
||||
logger.Error("WebSocket read error: %v", err)
|
||||
} else {
|
||||
logger.Debug("WebSocket connection closed: %v", err)
|
||||
}
|
||||
}
|
||||
return // triggers reconnect via defer
|
||||
}
|
||||
@@ -565,6 +716,7 @@ func (c *Client) readPumpWithDisconnectDetection() {
|
||||
|
||||
func (c *Client) reconnect() {
|
||||
c.setConnected(false)
|
||||
telemetry.SetWSConnectionState(false)
|
||||
if c.conn != nil {
|
||||
c.conn.Close()
|
||||
c.conn = nil
|
||||
|
||||
Reference in New Issue
Block a user