Adding OpenTelemetry Metrics and Tracing

This commit is contained in:
Marc Schäfer
2025-10-11 18:19:51 +02:00
parent b383cec0b0
commit c086e69dd0
30 changed files with 3457 additions and 158 deletions

View File

@@ -0,0 +1,898 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 0,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 500
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value_and_name"
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "go_goroutine_count",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Goroutines",
"transformations": [],
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 1,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 256
},
{
"color": "red",
"value": 512
}
]
},
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 6,
"x": 6,
"y": 0
},
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value_and_name"
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "go_memory_gc_goal_bytes / 1024 / 1024",
"format": "time_series",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "GC Target Heap (MiB)",
"transformations": [],
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 2,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 10
},
{
"color": "red",
"value": 25
}
]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 6,
"x": 12,
"y": 0
},
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value_and_name"
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(rate(http_server_request_duration_seconds_count[$__rate_interval]))",
"instant": false,
"legendFormat": "req/s",
"refId": "A"
}
],
"title": "HTTP Requests / s",
"transformations": [],
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"decimals": 3,
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "orange",
"value": 0.1
},
{
"color": "red",
"value": 0.5
}
]
},
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 7,
"w": 6,
"x": 18,
"y": 0
},
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "value_and_name"
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(rate(newt_connection_errors_total{site_id=~\"$site_id\"}[$__rate_interval]))",
"instant": false,
"legendFormat": "errors/s",
"refId": "A"
}
],
"title": "Connection Errors / s",
"transformations": [],
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [],
"unit": "bytes"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 7
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(go_memory_used_bytes)",
"legendFormat": "Used",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "go_memory_gc_goal_bytes",
"legendFormat": "GC Goal",
"refId": "B"
}
],
"title": "Go Heap Usage vs GC Goal",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {},
"decimals": 0,
"mappings": [],
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 7
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "rate(go_memory_allocations_total[$__rate_interval])",
"legendFormat": "Allocations/s",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "rate(go_memory_allocated_bytes_total[$__rate_interval])",
"legendFormat": "Allocated bytes/s",
"refId": "B"
}
],
"title": "Allocation Activity",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [],
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 16
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))",
"legendFormat": "p50",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))",
"legendFormat": "p95",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket[$__rate_interval])) by (le))",
"legendFormat": "p99",
"refId": "C"
}
],
"title": "HTTP Request Duration Quantiles",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [],
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 16
},
"id": 8,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(rate(http_server_request_duration_seconds_count[$__rate_interval])) by (http_response_status_code)",
"legendFormat": "{{http_response_status_code}}",
"refId": "A"
}
],
"title": "HTTP Requests by Status",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [],
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 25
},
"id": 9,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(rate(newt_connection_attempts_total{site_id=~\"$site_id\"}[$__rate_interval])) by (transport, result)",
"legendFormat": "{{transport}} • {{result}}",
"refId": "A"
}
],
"title": "Connection Attempts by Transport/Result",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [],
"unit": "ops"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 25
},
"id": 10,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(rate(newt_connection_errors_total{site_id=~\"$site_id\"}[$__rate_interval])) by (transport, error_type)",
"legendFormat": "{{transport}} • {{error_type}}",
"refId": "A"
}
],
"title": "Connection Errors by Type",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {},
"decimals": 3,
"mappings": [],
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 34
},
"id": 11,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "right"
},
"tooltip": {
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))",
"legendFormat": "p50",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))",
"legendFormat": "p95",
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le))",
"legendFormat": "p99",
"refId": "C"
}
],
"title": "Tunnel Latency Quantiles",
"type": "timeseries"
},
{
"cards": {},
"color": {
"cardColor": "#b4ff00",
"colorScale": "sqrt",
"colorScheme": "interpolateTurbo"
},
"dataFormat": "tsbuckets",
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [],
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 34
},
"heatmap": {},
"hideZeroBuckets": true,
"id": 12,
"legend": {
"show": false
},
"options": {
"calculate": true,
"cellGap": 2,
"cellSize": "auto",
"color": {
"exponent": 0.5
},
"exemplars": {
"color": "rgba(255,255,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": false
},
"tooltip": {
"mode": "single",
"show": true
},
"xAxis": {
"show": true
},
"yAxis": {
"decimals": 3,
"show": true
}
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(rate(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\", tunnel_id=~\"$tunnel_id\"}[$__rate_interval])) by (le)",
"format": "heatmap",
"legendFormat": "{{le}}",
"refId": "A"
}
],
"title": "Tunnel Latency Bucket Rate",
"type": "heatmap"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": [
"newt",
"otel"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "Prometheus",
"value": "prometheus"
},
"hide": 0,
"label": "Datasource",
"name": "DS_PROMETHEUS",
"options": [],
"query": "prometheus",
"refresh": 1,
"type": "datasource"
},
{
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(target_info, site_id)",
"hide": 0,
"includeAll": true,
"label": "Site",
"multi": true,
"name": "site_id",
"options": [],
"query": {
"query": "label_values(target_info, site_id)",
"refId": "SiteIdVar"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\"}, tunnel_id)",
"hide": 0,
"includeAll": true,
"label": "Tunnel",
"multi": true,
"name": "tunnel_id",
"options": [],
"query": {
"query": "label_values(newt_tunnel_latency_seconds_bucket{site_id=~\"$site_id\"}, tunnel_id)",
"refId": "TunnelVar"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Newt Overview",
"uid": "newt-overview",
"version": 1,
"weekStart": ""
}

View File

@@ -0,0 +1,9 @@
apiVersion: 1
providers:
- name: "newt"
folder: "Newt"
type: file
disableDeletion: false
allowUiUpdates: true
options:
path: /var/lib/grafana/dashboards

View File

@@ -0,0 +1,9 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
uid: prometheus
isDefault: true
editable: true

View File

@@ -0,0 +1,61 @@
# Variant A: Direct scrape of Newt (/metrics) via Prometheus (no Collector needed)
# Note: Newt already exposes labels like site_id, protocol, direction. Do not promote
# resource attributes into labels when scraping Newt directly.
#
# Example Prometheus scrape config:
# global:
# scrape_interval: 15s
# scrape_configs:
# - job_name: newt
# static_configs:
# - targets: ["newt:2112"]
#
# Variant B: Use OTEL Collector (Newt -> OTLP -> Collector -> Prometheus)
# This pipeline scrapes metrics from the Collector's Prometheus exporter.
# Labels are already on datapoints; promotion from resource is OPTIONAL and typically NOT required.
# If you enable transform/promote below, ensure you do not duplicate labels.
receivers:
otlp:
protocols:
grpc:
endpoint: ":4317"
processors:
memory_limiter:
check_interval: 5s
limit_percentage: 80
spike_limit_percentage: 25
resourcedetection:
detectors: [env, system]
timeout: 5s
batch: {}
# OPTIONAL: Only enable if you need to promote resource attributes to labels.
# WARNING: Newt already provides site_id as a label; avoid double-promotion.
# transform/promote:
# error_mode: ignore
# metric_statements:
# - context: datapoint
# statements:
# - set(attributes["service_instance_id"], resource.attributes["service.instance.id"]) where resource.attributes["service.instance.id"] != nil
# - set(attributes["site_id"], resource.attributes["site_id"]) where resource.attributes["site_id"] != nil
exporters:
prometheus:
endpoint: ":8889"
send_timestamps: true
# prometheusremotewrite:
# endpoint: http://mimir:9009/api/v1/push
debug:
verbosity: basic
service:
pipelines:
metrics:
receivers: [otlp]
processors: [memory_limiter, resourcedetection, batch] # add transform/promote if you really need it
exporters: [prometheus]
traces:
receivers: [otlp]
processors: [memory_limiter, resourcedetection, batch]
exporters: [debug]

View File

@@ -0,0 +1,16 @@
global:
scrape_interval: 15s
scrape_configs:
# IMPORTANT: Do not scrape Newt directly; scrape only the Collector!
- job_name: 'otel-collector'
static_configs:
- targets: ['otel-collector:8889']
# optional: limit metric cardinality
relabel_configs:
- action: labeldrop
regex: 'tunnel_id'
# - action: keep
# source_labels: [site_id]
# regex: '(site-a|site-b)'

21
examples/prometheus.yml Normal file
View File

@@ -0,0 +1,21 @@
global:
scrape_interval: 15s
scrape_configs:
- job_name: 'newt'
scrape_interval: 15s
static_configs:
- targets: ['newt:2112'] # /metrics
relabel_configs:
# optional: drop tunnel_id
- action: labeldrop
regex: 'tunnel_id'
# optional: allow only specific sites
- action: keep
source_labels: [site_id]
regex: '(site-a|site-b)'
# WARNING: Do not enable this together with the 'newt' job above or you will double-count.
# - job_name: 'otel-collector'
# static_configs:
# - targets: ['otel-collector:8889']