mirror of
https://github.com/fosrl/newt.git
synced 2026-03-27 13:06:38 +00:00
Compare commits
85 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e07439a366 | ||
|
|
9f43d4ce6d | ||
|
|
5888553c50 | ||
|
|
f63b1b689f | ||
|
|
7f104d1a0c | ||
|
|
9de29e7e00 | ||
|
|
cf611fe849 | ||
|
|
23e2731473 | ||
|
|
186b51e000 | ||
|
|
d21f4951e9 | ||
|
|
e04c654292 | ||
|
|
e43fbebcb8 | ||
|
|
1afed32562 | ||
|
|
46384e6242 | ||
|
|
52e4a57cc1 | ||
|
|
1a9f6c4685 | ||
|
|
b6f5458ad9 | ||
|
|
4ef9737862 | ||
|
|
b68777e83a | ||
|
|
8d26de5f4d | ||
|
|
c32828128f | ||
|
|
3cd7329d8b | ||
|
|
3490220803 | ||
|
|
bd62da4cc9 | ||
|
|
8d0e6be2c7 | ||
|
|
b62e18622e | ||
|
|
89274eb9a8 | ||
|
|
77d56596ab | ||
|
|
6ec0ab813c | ||
|
|
fef9e8c76b | ||
|
|
ae5129a7c7 | ||
|
|
ed127a2d61 | ||
|
|
20ddbb5382 | ||
|
|
5cbda35637 | ||
|
|
60196455d1 | ||
|
|
84e659acde | ||
|
|
e16881b7c8 | ||
|
|
587e829e42 | ||
|
|
ee2f8899ff | ||
|
|
744a741556 | ||
|
|
aea80200e0 | ||
|
|
b20f7a02b2 | ||
|
|
f28d90595b | ||
|
|
4a90e36a44 | ||
|
|
9ace45e71f | ||
|
|
75d5e695d6 | ||
|
|
d74065a71b | ||
|
|
f86031f458 | ||
|
|
31f70e5032 | ||
|
|
31514f26df | ||
|
|
09fcb36963 | ||
|
|
83c3ae5cf9 | ||
|
|
1e88fb86b4 | ||
|
|
62407b0c74 | ||
|
|
d91c6ef168 | ||
|
|
59e8d79404 | ||
|
|
d907ae9e84 | ||
|
|
d745aa79d4 | ||
|
|
427ab67bb5 | ||
|
|
a86b14d97d | ||
|
|
f8fd8e1bc5 | ||
|
|
0b5e662abc | ||
|
|
bd55269b39 | ||
|
|
3e9c74a65b | ||
|
|
922591b269 | ||
|
|
cfe52caa4a | ||
|
|
d31d08c1c8 | ||
|
|
9ac4cee48d | ||
|
|
b53fb70778 | ||
|
|
0f83489f11 | ||
|
|
09e9bd9493 | ||
|
|
2d4f656852 | ||
|
|
8f7f9c417c | ||
|
|
660adcc72d | ||
|
|
0d55e35784 | ||
|
|
ceef228665 | ||
|
|
496ff0734c | ||
|
|
a89f13870c | ||
|
|
85394d3255 | ||
|
|
0405aebb45 | ||
|
|
9c0f4599b8 | ||
|
|
fd6b1ae323 | ||
|
|
831ae2d9c5 | ||
|
|
a63a27e3ab | ||
|
|
34d558a5a2 |
14
.github/workflows/cicd.yml
vendored
14
.github/workflows/cicd.yml
vendored
@@ -25,7 +25,7 @@ concurrency:
|
||||
jobs:
|
||||
release:
|
||||
name: Build and Release
|
||||
runs-on: amd64-runner
|
||||
runs-on: ubuntu-latest
|
||||
# Job-level timeout to avoid runaway or stuck runs
|
||||
timeout-minutes: 120
|
||||
env:
|
||||
@@ -78,6 +78,13 @@ jobs:
|
||||
echo "Built & pushed to: ${{ env.DOCKERHUB_IMAGE }}:${TAG}"
|
||||
shell: bash
|
||||
|
||||
- name: Login in to GHCR
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Install skopeo + jq
|
||||
# skopeo: copy/inspect images between registries
|
||||
# jq: JSON parsing tool used to extract digest values
|
||||
@@ -87,11 +94,6 @@ jobs:
|
||||
skopeo --version
|
||||
shell: bash
|
||||
|
||||
- name: Login to GHCR
|
||||
run: |
|
||||
skopeo login ghcr.io -u "${{ github.actor }}" -p "${{ secrets.GITHUB_TOKEN }}"
|
||||
shell: bash
|
||||
|
||||
- name: Copy tag from Docker Hub to GHCR
|
||||
# Mirror the already-built image (all architectures) to GHCR so we can sign it
|
||||
run: |
|
||||
|
||||
11
.github/workflows/mirror.yaml
vendored
11
.github/workflows/mirror.yaml
vendored
@@ -14,7 +14,7 @@ env:
|
||||
|
||||
jobs:
|
||||
mirror-and-dual-sign:
|
||||
runs-on: amd64-runner
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Install skopeo + jq
|
||||
run: |
|
||||
@@ -36,11 +36,17 @@ jobs:
|
||||
run: |
|
||||
skopeo login ghcr.io -u "${{ github.actor }}" -p "${{ secrets.GITHUB_TOKEN }}"
|
||||
|
||||
# Auth for cosign (docker-config)
|
||||
# >>> IMPORTANT: Auth for cosign (docker-config) <<<
|
||||
- name: Docker login to GHCR (for cosign)
|
||||
run: |
|
||||
echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
|
||||
|
||||
# Optional (if Docker Hub private / tight limits)
|
||||
# - name: Login to Docker Hub (skopeo and cosign share this via docker login)
|
||||
# run: |
|
||||
# echo "${{ secrets.DOCKERHUB_TOKEN }}" | docker login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" --password-stdin
|
||||
# skopeo login docker.io -u "${{ secrets.DOCKERHUB_USERNAME }}" -p "${{ secrets.DOCKERHUB_TOKEN }}"
|
||||
|
||||
- name: List source tags
|
||||
run: |
|
||||
set -euo pipefail
|
||||
@@ -130,3 +136,4 @@ jobs:
|
||||
echo "Skipped : $skipped"
|
||||
echo "Verified OK : $v_ok"
|
||||
echo "Errors : $errs"
|
||||
|
||||
|
||||
2
.github/workflows/test.yml
vendored
2
.github/workflows/test.yml
vendored
@@ -11,7 +11,7 @@ on:
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: amd64-runner
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -6,3 +6,4 @@ nohup.out
|
||||
*.iml
|
||||
certs/
|
||||
newt_arm64
|
||||
.env
|
||||
|
||||
@@ -4,7 +4,11 @@ Contributions are welcome!
|
||||
|
||||
Please see the contribution and local development guide on the docs page before getting started:
|
||||
|
||||
https://docs.pangolin.net/development/contributing
|
||||
https://docs.fossorial.io/development
|
||||
|
||||
For ideas about what features to work on and our future plans, please see the roadmap:
|
||||
|
||||
https://docs.fossorial.io/roadmap
|
||||
|
||||
### Licensing Considerations
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#ghcr.io/marcschaeferger/newt-private:1.0.0-otel
|
||||
#tademsh/newt:1.0.0-otel
|
||||
FROM golang:1.25-alpine AS builder
|
||||
|
||||
# Install git and ca-certificates
|
||||
@@ -9,8 +11,13 @@ WORKDIR /app
|
||||
# Copy go mod and sum files
|
||||
COPY go.mod go.sum ./
|
||||
|
||||
# Coolify specific Test - set Go proxy to direct to avoid issues
|
||||
# ENV GOSUMDB=off
|
||||
ENV GOPROXY=https://goproxy.io,https://proxy.golang.org,direct
|
||||
RUN go env | grep -E 'GOPROXY|GOSUMDB|GOPRIVATE' && go mod download
|
||||
|
||||
# Download all dependencies
|
||||
RUN go mod download
|
||||
#RUN go mod download
|
||||
|
||||
# Copy the source code into the container
|
||||
COPY . .
|
||||
|
||||
136
README.md
136
README.md
@@ -1,15 +1,18 @@
|
||||
<!-- markdownlint-disable MD033 -->
|
||||
# Newt
|
||||
|
||||
[](https://pkg.go.dev/github.com/fosrl/newt)
|
||||
[](https://github.com/fosrl/newt/blob/main/LICENSE)
|
||||
[](https://goreportcard.com/report/github.com/fosrl/newt)
|
||||
|
||||
Newt is a fully user space [WireGuard](https://www.wireguard.com/) tunnel client and TCP/UDP proxy, designed to securely expose private resources controlled by Pangolin. By using Newt, you don't need to manage complex WireGuard tunnels and NATing.
|
||||
|
||||
### Installation and Documentation
|
||||
## Installation and Documentation
|
||||
|
||||
Newt is used with Pangolin and Gerbil as part of the larger system. See documentation below:
|
||||
|
||||
- [Full Documentation](https://docs.pangolin.net)
|
||||
- [Full Documentation](https://docs.fossorial.io)
|
||||
- Observability Quickstart: see `docs/observability.md` — canonical Prometheus/OTel Collector quickstart and smoke tests
|
||||
|
||||
## Preview
|
||||
|
||||
@@ -33,120 +36,72 @@ When Newt receives WireGuard control messages, it will use the information encod
|
||||
|
||||
## CLI Args
|
||||
|
||||
### Core Configuration
|
||||
|
||||
- `id`: Newt ID generated by Pangolin to identify the client.
|
||||
- `secret`: A unique secret (not shared and kept private) used to authenticate the client ID with the websocket in order to receive commands.
|
||||
- `endpoint`: The endpoint where both Gerbil and Pangolin reside in order to connect to the websocket.
|
||||
- `blueprint-file` (optional): Path to blueprint file to define Pangolin resources and configurations.
|
||||
- `no-cloud` (optional): Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false
|
||||
|
||||
- `mtu` (optional): MTU for the internal WG interface. Default: 1280
|
||||
- `dns` (optional): DNS server to use to resolve the endpoint. Default: 9.9.9.9
|
||||
- `log-level` (optional): The log level to use (DEBUG, INFO, WARN, ERROR, FATAL). Default: INFO
|
||||
|
||||
### Docker Integration
|
||||
|
||||
- `enforce-hc-cert` (optional): Enforce certificate validation for health checks. Default: false (accepts any cert)
|
||||
- `docker-socket` (optional): Set the Docker socket to use the container discovery integration
|
||||
- `ping-interval` (optional): Interval for pinging the server. Default: 3s
|
||||
- `ping-timeout` (optional): Timeout for each ping. Default: 5s
|
||||
- `updown` (optional): A script to be called when targets are added or removed.
|
||||
- `tls-client-cert` (optional): Client certificate (p12 or pfx) for mTLS. See [mTLS](#mtls)
|
||||
- `tls-client-cert` (optional): Path to client certificate (PEM format, optional if using PKCS12). See [mTLS](#mtls)
|
||||
- `tls-client-key` (optional): Path to private key for mTLS (PEM format, optional if using PKCS12)
|
||||
- `tls-ca-cert` (optional): Path to CA certificate to verify server (PEM format, optional if using PKCS12)
|
||||
- `docker-enforce-network-validation` (optional): Validate the container target is on the same network as the newt process. Default: false
|
||||
|
||||
### Accept Client Connection
|
||||
|
||||
- `health-file` (optional): Check if the connection to the WG server (Pangolin) is OK; creates the file if it is and removes it if not. Can be used with a Docker healthcheck to restart Newt
|
||||
- `accept-clients` (optional): Enable WireGuard server mode to accept incoming newt client connections. Default: false
|
||||
- `generateAndSaveKeyTo` (optional): Path to save generated private key
|
||||
- `native` (optional): Use native WireGuard interface when accepting clients (requires WireGuard kernel module and Linux, must run as root). Default: false (uses userspace netstack)
|
||||
- `interface` (optional): Name of the WireGuard interface. Default: newt
|
||||
- `keep-interface` (optional): Keep the WireGuard interface. Default: false
|
||||
|
||||
### Metrics & Observability
|
||||
|
||||
- `metrics` (optional): Enable Prometheus /metrics exporter. Default: true
|
||||
- `otlp` (optional): Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT. Default: false
|
||||
- `metrics-admin-addr` (optional): Admin/metrics bind address. Default: 127.0.0.1:2112
|
||||
- `metrics-async-bytes` (optional): Enable async bytes counting (background flush; lower hot path overhead). Default: false
|
||||
- `region` (optional): Optional region resource attribute for telemetry and metrics.
|
||||
|
||||
### Network Configuration
|
||||
|
||||
- `mtu` (optional): MTU for the internal WG interface. Default: 1280
|
||||
- `dns` (optional): DNS server to use to resolve the endpoint. Default: 9.9.9.9
|
||||
- `ping-interval` (optional): Interval for pinging the server. Default: 3s
|
||||
- `ping-timeout` (optional): Timeout for each ping. Default: 5s
|
||||
|
||||
### Security & TLS
|
||||
|
||||
- `enforce-hc-cert` (optional): Enforce certificate validation for health checks. Default: false (accepts any cert)
|
||||
- `tls-client-cert` (optional): Client certificate (p12 or pfx) for mTLS or path to client certificate (PEM format). See [mTLS](#mtls)
|
||||
- `tls-client-key` (optional): Path to private key for mTLS (PEM format, optional if using PKCS12)
|
||||
- `tls-ca-cert` (optional): Path to CA certificate to verify server (PEM format, optional if using PKCS12)
|
||||
|
||||
### Monitoring & Health
|
||||
|
||||
- `health-file` (optional): Check if the connection to the WG server (Pangolin) is OK; creates the file if it is and removes it if not. Can be used with a Docker healthcheck to restart Newt
|
||||
- `updown` (optional): A script to be called when targets are added or removed.
|
||||
- `blueprint-file` (optional): Path to blueprint file to define Pangolin resources and configurations.
|
||||
- `no-cloud` (optional): Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false
|
||||
|
||||
## Environment Variables
|
||||
|
||||
All CLI arguments can be set using environment variables as an alternative to command line flags. Environment variables are particularly useful when running Newt in containerized environments.
|
||||
|
||||
### Core Configuration
|
||||
|
||||
- `PANGOLIN_ENDPOINT`: Endpoint of your pangolin server (equivalent to `--endpoint`)
|
||||
- `NEWT_ID`: Newt ID generated by Pangolin (equivalent to `--id`)
|
||||
- `NEWT_SECRET`: Newt secret for authentication (equivalent to `--secret`)
|
||||
- `CONFIG_FILE`: Load the config json from this file instead of in the home folder.
|
||||
- `BLUEPRINT_FILE`: Path to blueprint file to define Pangolin resources and configurations. (equivalent to `--blueprint-file`)
|
||||
- `NO_CLOUD`: Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false (equivalent to `--no-cloud`)
|
||||
- `MTU`: MTU for the internal WG interface. Default: 1280 (equivalent to `--mtu`)
|
||||
- `DNS`: DNS server to use to resolve the endpoint. Default: 9.9.9.9 (equivalent to `--dns`)
|
||||
- `LOG_LEVEL`: Log level (DEBUG, INFO, WARN, ERROR, FATAL). Default: INFO (equivalent to `--log-level`)
|
||||
|
||||
### Docker Integration
|
||||
|
||||
- `DOCKER_SOCKET`: Path to Docker socket for container discovery (equivalent to `--docker-socket`)
|
||||
- `PING_INTERVAL`: Interval for pinging the server. Default: 3s (equivalent to `--ping-interval`)
|
||||
- `PING_TIMEOUT`: Timeout for each ping. Default: 5s (equivalent to `--ping-timeout`)
|
||||
- `UPDOWN_SCRIPT`: Path to updown script for target add/remove events (equivalent to `--updown`)
|
||||
- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`)
|
||||
- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`)
|
||||
- `TLS_CLIENT_KEY`: Path to private key for mTLS (equivalent to `--tls-client-key`)
|
||||
- `TLS_CA_CERT`: Path to CA certificate to verify server (equivalent to `--tls-ca-cert`)
|
||||
- `DOCKER_ENFORCE_NETWORK_VALIDATION`: Validate container targets are on same network. Default: false (equivalent to `--docker-enforce-network-validation`)
|
||||
|
||||
### Accept Client Connections
|
||||
|
||||
- `ENFORCE_HC_CERT`: Enforce certificate validation for health checks. Default: false (equivalent to `--enforce-hc-cert`)
|
||||
- `HEALTH_FILE`: Path to health file for connection monitoring (equivalent to `--health-file`)
|
||||
- `ACCEPT_CLIENTS`: Enable WireGuard server mode. Default: false (equivalent to `--accept-clients`)
|
||||
- `GENERATE_AND_SAVE_KEY_TO`: Path to save generated private key (equivalent to `--generateAndSaveKeyTo`)
|
||||
- `USE_NATIVE_INTERFACE`: Use native WireGuard interface (Linux only). Default: false (equivalent to `--native`)
|
||||
- `INTERFACE`: Name of the WireGuard interface. Default: newt (equivalent to `--interface`)
|
||||
- `KEEP_INTERFACE`: Keep the WireGuard interface after shutdown. Default: false (equivalent to `--keep-interface`)
|
||||
|
||||
### Monitoring & Health
|
||||
|
||||
- `HEALTH_FILE`: Path to health file for connection monitoring (equivalent to `--health-file`)
|
||||
- `UPDOWN_SCRIPT`: Path to updown script for target add/remove events (equivalent to `--updown`)
|
||||
|
||||
### Metrics & Observability
|
||||
|
||||
- `NEWT_METRICS_PROMETHEUS_ENABLED`: Enable Prometheus /metrics exporter. Default: true (equivalent to `--metrics`)
|
||||
- `NEWT_METRICS_OTLP_ENABLED`: Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT. Default: false (equivalent to `--otlp`)
|
||||
- `NEWT_ADMIN_ADDR`: Admin/metrics bind address. Default: 127.0.0.1:2112 (equivalent to `--metrics-admin-addr`)
|
||||
- `NEWT_METRICS_ASYNC_BYTES`: Enable async bytes counting (background flush; lower hot path overhead). Default: false (equivalent to `--metrics-async-bytes`)
|
||||
- `NEWT_REGION`: Optional region resource attribute for telemetry and metrics (equivalent to `--region`)
|
||||
|
||||
### Network Configuration
|
||||
|
||||
- `MTU`: MTU for the internal WG interface. Default: 1280 (equivalent to `--mtu`)
|
||||
- `DNS`: DNS server to use to resolve the endpoint. Default: 9.9.9.9 (equivalent to `--dns`)
|
||||
- `PING_INTERVAL`: Interval for pinging the server. Default: 3s (equivalent to `--ping-interval`)
|
||||
- `PING_TIMEOUT`: Timeout for each ping. Default: 5s (equivalent to `--ping-timeout`)
|
||||
|
||||
### Security & TLS
|
||||
|
||||
- `ENFORCE_HC_CERT`: Enforce certificate validation for health checks. Default: false (equivalent to `--enforce-hc-cert`)
|
||||
- `TLS_CLIENT_CERT`: Path to client certificate for mTLS (equivalent to `--tls-client-cert`)
|
||||
- `TLS_CLIENT_KEY`: Path to private key for mTLS (equivalent to `--tls-client-key`)
|
||||
- `TLS_CA_CERT`: Path to CA certificate to verify server (equivalent to `--tls-ca-cert`)
|
||||
- `SKIP_TLS_VERIFY`: Skip TLS verification for server connections. Default: false
|
||||
- `CONFIG_FILE`: Load the config json from this file instead of in the home folder.
|
||||
- `BLUEPRINT_FILE`: Path to blueprint file to define Pangolin resources and configurations. (equivalent to `--blueprint-file`)
|
||||
- `NO_CLOUD`: Don't fail over to the cloud when using managed nodes in Pangolin Cloud. Default: false (equivalent to `--no-cloud`)
|
||||
|
||||
## Loading secrets from files
|
||||
|
||||
You can use `CONFIG_FILE` to define a location of a config file to store the credentials between runs.
|
||||
|
||||
```
|
||||
```sh
|
||||
$ cat ~/.config/newt-client/config.json
|
||||
{
|
||||
"id": "spmzu8rbpzj1qq6",
|
||||
"secret": "f6v61mjutwme2kkydbw3fjo227zl60a2tsf5psw9r25hgae3",
|
||||
"endpoint": "https://app.pangolin.net",
|
||||
"endpoint": "https://pangolin.fossorial.io",
|
||||
"tlsClientCert": ""
|
||||
}
|
||||
```
|
||||
@@ -159,6 +114,8 @@ Default locations:
|
||||
- **Windows**: `%PROGRAMDATA%\newt\newt-client\config.json`
|
||||
- **Linux/Others**: `~/.config/newt-client/config.json`
|
||||
|
||||
<!-- Observability Quickstart moved to docs/observability.md (canonical). -->
|
||||
|
||||
## Examples
|
||||
|
||||
**Note**: When both environment variables and CLI arguments are provided, CLI arguments take precedence.
|
||||
@@ -313,7 +270,6 @@ Supported values include:
|
||||
|
||||
`ssh://user@host`
|
||||
|
||||
|
||||
```yaml
|
||||
services:
|
||||
newt:
|
||||
@@ -328,6 +284,7 @@ services:
|
||||
- NEWT_SECRET=nnisrfsdfc7prqsp9ewo1dvtvci50j5uiqotez00dgap0ii2
|
||||
- DOCKER_SOCKET=unix:///var/run/docker.sock
|
||||
```
|
||||
|
||||
>If you previously used just a path like `/var/run/docker.sock`, it still works — Newt assumes it is a UNIX socket by default.
|
||||
|
||||
#### Hostnames vs IPs
|
||||
@@ -373,12 +330,12 @@ Newt supports mutual TLS (mTLS) authentication if the server is configured to re
|
||||
|
||||
> This is the original method and still supported.
|
||||
|
||||
* File must contain:
|
||||
- File must contain:
|
||||
|
||||
* Client private key
|
||||
* Public certificate
|
||||
* CA certificate
|
||||
* Encrypted `.p12` files are **not supported**
|
||||
- Client private key
|
||||
- Public certificate
|
||||
- CA certificate
|
||||
- Encrypted `.p12` files are **not supported**
|
||||
|
||||
Example:
|
||||
|
||||
@@ -394,9 +351,9 @@ newt \
|
||||
|
||||
You can now provide separate files for:
|
||||
|
||||
* `--tls-client-cert`: client certificate (`.crt` or `.pem`)
|
||||
* `--tls-client-key`: client private key (`.key` or `.pem`)
|
||||
* `--tls-ca-cert`: CA cert to verify the server
|
||||
- `--tls-client-cert`: client certificate (`.crt` or `.pem`)
|
||||
- `--tls-client-key`: client private key (`.key` or `.pem`)
|
||||
- `--tls-ca-cert`: CA cert to verify the server
|
||||
|
||||
Example:
|
||||
|
||||
@@ -410,7 +367,6 @@ newt \
|
||||
--tls-ca-cert ./ca.crt
|
||||
```
|
||||
|
||||
|
||||
```yaml
|
||||
services:
|
||||
newt:
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
If you discover a security vulnerability, please follow the steps below to responsibly disclose it to us:
|
||||
|
||||
1. **Do not create a public GitHub issue or discussion post.** This could put the security of other users at risk.
|
||||
2. Send a detailed report to [security@pangolin.net](mailto:security@pangolin.net) or send a **private** message to a maintainer on [Discord](https://discord.gg/HCJR8Xhme4). Include:
|
||||
2. Send a detailed report to [security@fossorial.io](mailto:security@fossorial.io) or send a **private** message to a maintainer on [Discord](https://discord.gg/HCJR8Xhme4). Include:
|
||||
|
||||
- Description and location of the vulnerability.
|
||||
- Potential impact of the vulnerability.
|
||||
|
||||
@@ -12,9 +12,9 @@ resources:
|
||||
sso-roles:
|
||||
- Member
|
||||
sso-users:
|
||||
- owen@pangolin.net
|
||||
- owen@fossorial.io
|
||||
whitelist-users:
|
||||
- owen@pangolin.net
|
||||
- owen@fossorial.io
|
||||
targets:
|
||||
# - site: glossy-plains-viscacha-rat
|
||||
- hostname: localhost
|
||||
|
||||
32
docker-compose-coolify.yml
Normal file
32
docker-compose-coolify.yml
Normal file
@@ -0,0 +1,32 @@
|
||||
services:
|
||||
otel-collector:
|
||||
image: otel/opentelemetry-collector:0.111.0
|
||||
command: ["--config=/etc/otelcol/config.yaml"]
|
||||
volumes:
|
||||
- ./examples/otel-collector.yaml:/etc/otelcol/config.yaml:ro
|
||||
ports:
|
||||
- "4317:4317" # OTLP gRPC in
|
||||
- "8889:8889" # Prometheus scrape out
|
||||
|
||||
newt:
|
||||
build: .
|
||||
image: newt:dev
|
||||
environment:
|
||||
OTEL_SERVICE_NAME: newt
|
||||
NEWT_METRICS_PROMETHEUS_ENABLED: "true"
|
||||
NEWT_METRICS_OTLP_ENABLED: "true"
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: "otel-collector:4317"
|
||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||
OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE: "cumulative"
|
||||
NEWT_ADMIN_ADDR: "0.0.0.0:2112"
|
||||
ports:
|
||||
- "2112:2112"
|
||||
depends_on:
|
||||
- otel-collector
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.55.0
|
||||
volumes:
|
||||
- ./examples/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
ports:
|
||||
- "9090:9090"
|
||||
@@ -1,4 +1,3 @@
|
||||
name: Newt-Metrics
|
||||
services:
|
||||
# Recommended Variant A: Direct Prometheus scrape of Newt (/metrics)
|
||||
# Optional: You may add the Collector service and enable OTLP export, but do NOT
|
||||
|
||||
75
docs/METRICS_RECOMMENDATIONS.md
Normal file
75
docs/METRICS_RECOMMENDATIONS.md
Normal file
@@ -0,0 +1,75 @@
|
||||
# Newt Metrics: Recommendations, Gaps, and Roadmap
|
||||
|
||||
This document captures the current state of Newt metrics, prioritized fixes, and a pragmatic roadmap for near-term improvements.
|
||||
|
||||
1) Current setup (summary)
|
||||
|
||||
- Export: Prometheus exposition (default), optional OTLP (gRPC)
|
||||
- Existing instruments:
|
||||
- Sites: newt_site_registrations_total, newt_site_online (0/1), newt_site_last_heartbeat_timestamp_seconds
|
||||
- Tunnel/Traffic: newt_tunnel_sessions, newt_tunnel_bytes_total, newt_tunnel_latency_seconds, newt_tunnel_reconnects_total
|
||||
- Connection lifecycle: newt_connection_attempts_total, newt_connection_errors_total
|
||||
- Operations: newt_config_reloads_total, process_start_time_seconds, newt_build_info
|
||||
- Operations: newt_config_reloads_total, process_start_time_seconds, newt_config_apply_seconds, newt_cert_rotation_total
|
||||
- Build metadata: newt_build_info
|
||||
- Control plane: newt_websocket_connect_latency_seconds, newt_websocket_messages_total, newt_websocket_connected, newt_websocket_reconnects_total
|
||||
- Proxy: newt_proxy_active_connections, newt_proxy_buffer_bytes, newt_proxy_async_backlog_bytes, newt_proxy_drops_total, newt_proxy_accept_total, newt_proxy_connection_duration_seconds, newt_proxy_connections_total
|
||||
- Go runtime: GC, heap, goroutines via runtime instrumentation
|
||||
|
||||
2) Main issues addressed now
|
||||
|
||||
- Attribute filter (allow-list) extended to include site_id and region in addition to existing keys (tunnel_id, transport, protocol, direction, result, reason, error_type, version, commit).
|
||||
- site_id and region propagation: site_id/region remain resource attributes. Metric labels mirror them on per-site gauges and counters by default; set `NEWT_METRICS_INCLUDE_SITE_LABELS=false` to drop them for multi-tenant scrapes.
|
||||
- Label semantics clarified:
|
||||
- transport: control-plane mechanism (e.g., websocket, wireguard)
|
||||
- protocol: L4 payload type (tcp, udp)
|
||||
- newt_tunnel_bytes_total uses protocol and direction, not transport.
|
||||
- Robustness improvements: removed duplicate clear logic on reconnect; avoided empty site_id by reading NEWT_SITE_ID/NEWT_ID and OTEL_RESOURCE_ATTRIBUTES.
|
||||
|
||||
3) Remaining gaps and deviations
|
||||
|
||||
- Some call sites still need initiator label on reconnect outcomes (client vs server). This is planned.
|
||||
- Config apply duration and cert rotation counters are planned.
|
||||
- Registration and config reload failures are not yet emitted; add failure code paths so result labels expose churn.
|
||||
- Document using `process_start_time_seconds` (and `time()` in PromQL) to derive uptime; no explicit restart counter is needed.
|
||||
- Metric helpers often use `context.Background()`. Where lightweight contexts exist (e.g., HTTP handlers), propagate them to ease future correlation.
|
||||
- Tracing coverage is limited to admin HTTP and WebSocket connect spans; extend to blueprint fetches, proxy accept loops, and WireGuard updates when OTLP is enabled.
|
||||
|
||||
4) Roadmap (phased)
|
||||
|
||||
- Phase 1 (done in this iteration)
|
||||
- Fix attribute filter (site_id, region)
|
||||
- Propagate site_id (and optional region) across metrics
|
||||
- Correct label semantics (transport vs protocol); fix sessions transport labelling
|
||||
- Documentation alignment
|
||||
- Phase 2 (next)
|
||||
- Reconnect: add initiator label (client/server)
|
||||
- Config & PKI: newt_config_apply_seconds{phase,result}; newt_cert_rotation_total{result}
|
||||
- WebSocket disconnect and keepalive failure counters
|
||||
- Proxy connection lifecycle metrics (accept totals, duration histogram)
|
||||
- Pangolin blueprint/config fetch latency and status metrics
|
||||
- Certificate rotation duration histogram to complement success/failure counter
|
||||
|
||||
5) Operational guidance
|
||||
|
||||
- Do not double scrape: scrape either Newt (/metrics) or the Collector’s Prometheus exporter (not both) to avoid double-counting cumulative counters.
|
||||
- For high cardinality tunnel_id, consider relabeling or dropping per-tunnel series in Prometheus to control cardinality.
|
||||
- OTLP troubleshooting: enable TLS via OTEL_EXPORTER_OTLP_CERTIFICATE, use OTEL_EXPORTER_OTLP_HEADERS for auth; verify endpoint reachability.
|
||||
|
||||
6) Example alerts/recording rules (suggestions)
|
||||
|
||||
- Reconnect spikes:
|
||||
- sum by (site_id) (increase(newt_tunnel_reconnects_total[5m]))
|
||||
- Sustained connection errors:
|
||||
- sum by (site_id, transport, error_type) (rate(newt_connection_errors_total[5m]))
|
||||
- Heartbeat gaps:
|
||||
- max by (site_id) (max_over_time((time() - newt_site_last_heartbeat_timestamp_seconds)[15m:]))
|
||||
- Proxy drops:
|
||||
- sum by (site_id, protocol) (increase(newt_proxy_drops_total[5m]))
|
||||
- WebSocket connect p95 (when added):
|
||||
- histogram_quantile(0.95, sum(rate(newt_websocket_connect_latency_seconds_bucket[5m])) by (le,site_id))
|
||||
|
||||
7) Collector configuration
|
||||
|
||||
- Direct scrape variant requires no attribute promotion since site_id is already a metric label.
|
||||
- Transform/promote variant remains optional for environments that rely on resource-to-label promotion.
|
||||
247
docs/observability.md
Normal file
247
docs/observability.md
Normal file
@@ -0,0 +1,247 @@
|
||||
<!-- markdownlint-disable MD033 -->
|
||||
# OpenTelemetry Observability for Newt
|
||||
|
||||
This document describes how Newt exposes metrics using the OpenTelemetry (OTel) Go SDK, how to enable Prometheus scraping, and how to send data to an OpenTelemetry Collector for further export.
|
||||
|
||||
Goals
|
||||
|
||||
- Provide a /metrics endpoint in Prometheus exposition format (via OTel Prometheus exporter)
|
||||
- Keep metrics backend-agnostic; optional OTLP export to a Collector
|
||||
- Use OTel semantic conventions where applicable and enforce SI units
|
||||
- Low-cardinality, stable labels only
|
||||
|
||||
Enable via flags (ENV mirrors)
|
||||
|
||||
- --metrics (default: true) ↔ NEWT_METRICS_PROMETHEUS_ENABLED
|
||||
- --metrics-admin-addr (default: 127.0.0.1:2112) ↔ NEWT_ADMIN_ADDR
|
||||
- --otlp (default: false) ↔ NEWT_METRICS_OTLP_ENABLED
|
||||
|
||||
Enable exporters via environment variables (no code changes required)
|
||||
|
||||
- NEWT_METRICS_PROMETHEUS_ENABLED=true|false (default: true)
|
||||
- NEWT_METRICS_OTLP_ENABLED=true|false (default: false)
|
||||
- OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317
|
||||
- OTEL_EXPORTER_OTLP_INSECURE=true|false (default: true for dev)
|
||||
- OTEL_SERVICE_NAME=newt (default)
|
||||
- OTEL_SERVICE_VERSION=<version>
|
||||
- OTEL_RESOURCE_ATTRIBUTES=service.instance.id=<id>,site_id=<id>
|
||||
- OTEL_METRIC_EXPORT_INTERVAL=15s (default)
|
||||
- NEWT_ADMIN_ADDR=127.0.0.1:2112 (default admin HTTP with /metrics)
|
||||
- NEWT_METRICS_INCLUDE_SITE_LABELS=true|false (default: true; disable to drop site_id/region as metric labels and rely on resource attributes only)
|
||||
|
||||
Runtime behavior
|
||||
|
||||
- When the Prometheus exporter is enabled, Newt serves /metrics on NEWT_ADMIN_ADDR (default 127.0.0.1:2112)
|
||||
- When OTLP is enabled, metrics and traces are exported to the configured OTLP gRPC endpoint
|
||||
- Go runtime metrics (goroutines, GC, memory) are exported automatically
|
||||
|
||||
Metric catalog (current)
|
||||
|
||||
Unless otherwise noted, `site_id` and `region` are available via resource attributes and, by default, as metric labels. Set `NEWT_METRICS_INCLUDE_SITE_LABELS=false` to drop them from counter/histogram label sets in high-cardinality environments.
|
||||
|
||||
| Metric | Instrument | Key attributes | Purpose | Example |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| `newt_build_info` | Observable gauge (Int64) | `version`, `commit`, `site_id`, `region` (optional when site labels enabled) | Emits build metadata with value `1` for scrape-time verification. | `newt_build_info{version="1.5.0"} 1` |
|
||||
| `newt_site_registrations_total` | Counter (Int64) | `result` (`success`/`failure`), `site_id`, `region` (optional) | Counts Pangolin registration attempts. | `newt_site_registrations_total{result="success",site_id="acme-edge-1"} 1` |
|
||||
| `newt_site_online` | Observable gauge (Int64) | `site_id` | Reports whether the site is currently connected (`1`) or offline (`0`). | `newt_site_online{site_id="acme-edge-1"} 1` |
|
||||
| `newt_site_last_heartbeat_timestamp_seconds` | Observable gauge (Float64) | `site_id` | Unix timestamp of the most recent Pangolin heartbeat (derive age via `time() - metric`). | `newt_site_last_heartbeat_timestamp_seconds{site_id="acme-edge-1"} 1.728e+09` |
|
||||
| `newt_tunnel_sessions` | Observable gauge (Int64) | `site_id`, `tunnel_id` (when enabled) | Counts active tunnel sessions per peer; collapses to per-site when tunnel IDs are disabled. | `newt_tunnel_sessions{site_id="acme-edge-1",tunnel_id="wgpub..."} 3` |
|
||||
| `newt_tunnel_bytes_total` | Counter (Int64) | `direction` (`ingress`/`egress`), `protocol` (`tcp`/`udp`), `tunnel_id` (optional), `site_id`, `region` (optional) | Measures proxied traffic volume across tunnels. | `newt_tunnel_bytes_total{direction="ingress",protocol="tcp",site_id="acme-edge-1"} 4096` |
|
||||
| `newt_tunnel_latency_seconds` | Histogram (Float64) | `transport` (e.g., `wireguard`), `tunnel_id` (optional), `site_id`, `region` (optional) | Captures RTT or configuration-driven latency samples. | `newt_tunnel_latency_seconds_bucket{transport="wireguard",le="0.5"} 42` |
|
||||
| `newt_tunnel_reconnects_total` | Counter (Int64) | `initiator` (`client`/`server`), `reason` (enumerated), `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks reconnect causes for troubleshooting flaps. | `newt_tunnel_reconnects_total{initiator="client",reason="timeout",site_id="acme-edge-1"} 5` |
|
||||
| `newt_connection_attempts_total` | Counter (Int64) | `transport` (`auth`/`websocket`), `result`, `site_id`, `region` (optional) | Measures control-plane dial attempts and their outcomes. | `newt_connection_attempts_total{transport="websocket",result="success",site_id="acme-edge-1"} 8` |
|
||||
| `newt_connection_errors_total` | Counter (Int64) | `transport`, `error_type`, `site_id`, `region` (optional) | Buckets connection failures by normalized error class. | `newt_connection_errors_total{transport="websocket",error_type="tls_handshake",site_id="acme-edge-1"} 1` |
|
||||
| `newt_config_reloads_total` | Counter (Int64) | `result`, `site_id`, `region` (optional) | Counts remote blueprint/config reloads. | `newt_config_reloads_total{result="success",site_id="acme-edge-1"} 3` |
|
||||
| `process_start_time_seconds` | Observable gauge (Float64) | — | Unix timestamp of the Newt process start time (use `time() - process_start_time_seconds` for uptime). | `process_start_time_seconds 1.728e+09` |
|
||||
| `newt_config_apply_seconds` | Histogram (Float64) | `phase` (`interface`/`peer`), `result`, `site_id`, `region` (optional) | Measures time spent applying WireGuard configuration phases. | `newt_config_apply_seconds_sum{phase="peer",result="success",site_id="acme-edge-1"} 0.48` |
|
||||
| `newt_cert_rotation_total` | Counter (Int64) | `result`, `site_id`, `region` (optional) | Tracks client certificate rotation attempts. | `newt_cert_rotation_total{result="success",site_id="acme-edge-1"} 2` |
|
||||
| `newt_websocket_connect_latency_seconds` | Histogram (Float64) | `transport="websocket"`, `result`, `error_type` (on failure), `site_id`, `region` (optional) | Measures WebSocket dial latency and exposes failure buckets. | `newt_websocket_connect_latency_seconds_bucket{result="success",le="0.5",site_id="acme-edge-1"} 9` |
|
||||
| `newt_websocket_messages_total` | Counter (Int64) | `direction` (`in`/`out`), `msg_type` (`text`/`ping`/`pong`), `site_id`, `region` (optional) | Accounts for control WebSocket traffic volume by type. | `newt_websocket_messages_total{direction="out",msg_type="ping",site_id="acme-edge-1"} 12` |
|
||||
| `newt_websocket_connected` | Observable gauge (Int64) | `site_id`, `region` (optional) | Reports current WebSocket connectivity (`1` when connected). | `newt_websocket_connected{site_id="acme-edge-1"} 1` |
|
||||
| `newt_websocket_reconnects_total` | Counter (Int64) | `reason` (`tls_handshake`, `dial_timeout`, `io_error`, `ping_write`, `timeout`, etc.), `site_id`, `region` (optional) | Counts reconnect attempts with normalized reasons for failure analysis. | `newt_websocket_reconnects_total{reason="timeout",site_id="acme-edge-1"} 3` |
|
||||
| `newt_proxy_active_connections` | Observable gauge (Int64) | `protocol` (`tcp`/`udp`), `direction` (`ingress`/`egress`), `tunnel_id` (optional), `site_id`, `region` (optional) | Current proxy connections per tunnel and protocol. | `newt_proxy_active_connections{protocol="tcp",direction="egress",site_id="acme-edge-1"} 4` |
|
||||
| `newt_proxy_buffer_bytes` | Observable gauge (Int64) | `protocol`, `direction`, `tunnel_id` (optional), `site_id`, `region` (optional) | Volume of buffered data awaiting flush in proxy queues. | `newt_proxy_buffer_bytes{protocol="udp",direction="egress",site_id="acme-edge-1"} 2048` |
|
||||
| `newt_proxy_async_backlog_bytes` | Observable gauge (Int64) | `protocol`, `direction`, `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks async write backlog when deferred flushing is enabled. | `newt_proxy_async_backlog_bytes{protocol="tcp",direction="egress",site_id="acme-edge-1"} 512` |
|
||||
| `newt_proxy_drops_total` | Counter (Int64) | `protocol`, `tunnel_id` (optional), `site_id`, `region` (optional) | Counts proxy drop events caused by downstream write errors. | `newt_proxy_drops_total{protocol="udp",site_id="acme-edge-1"} 1` |
|
||||
| `newt_proxy_connections_total` | Counter (Int64) | `event` (`opened`/`closed`), `protocol`, `tunnel_id` (optional), `site_id`, `region` (optional) | Tracks proxy connection lifecycle events for rate/SLO calculations. | `newt_proxy_connections_total{event="opened",protocol="tcp",site_id="acme-edge-1"} 10` |
|
||||
|
||||
Conventions
|
||||
|
||||
- Durations in seconds (unit: s), names end with _seconds
|
||||
- Sizes in bytes (unit: By), names end with _bytes
|
||||
- Counters end with _total
|
||||
- Labels must be low-cardinality and stable
|
||||
|
||||
Histogram buckets
|
||||
|
||||
- Latency (seconds): 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30
|
||||
|
||||
Local quickstart
|
||||
|
||||
1) Direct Prometheus scrape (do not also scrape the Collector)
|
||||
NEWT_METRICS_PROMETHEUS_ENABLED=true \
|
||||
NEWT_METRICS_OTLP_ENABLED=false \
|
||||
NEWT_ADMIN_ADDR="127.0.0.1:2112" \
|
||||
./newt
|
||||
|
||||
curl -s <http://localhost:2112/metrics> | grep ^newt_
|
||||
|
||||
2) Using the Collector (compose-style)
|
||||
NEWT_METRICS_PROMETHEUS_ENABLED=true \
|
||||
NEWT_METRICS_OTLP_ENABLED=true \
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \
|
||||
OTEL_EXPORTER_OTLP_INSECURE=true \
|
||||
OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=cumulative \
|
||||
./newt
|
||||
|
||||
Collector config example: examples/otel-collector.yaml
|
||||
Prometheus scrape config: examples/prometheus.yml
|
||||
|
||||
Adding new metrics
|
||||
|
||||
- Use helpers in internal/telemetry/metrics.go for counters/histograms
|
||||
- Keep labels low-cardinality
|
||||
- Add observable gauges through SetObservableCallback
|
||||
|
||||
Optional tracing
|
||||
|
||||
- When --otlp is enabled, you can wrap outbound HTTP clients with otelhttp.NewTransport to create spans for HTTP requests to Pangolin. This affects traces only and does not add metric labels.
|
||||
|
||||
OTLP TLS example
|
||||
|
||||
- Enable TLS to Collector with a custom CA and headers:
|
||||
|
||||
```sh
|
||||
NEWT_METRICS_OTLP_ENABLED=true \
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=collector:4317 \
|
||||
OTEL_EXPORTER_OTLP_INSECURE=false \
|
||||
OTEL_EXPORTER_OTLP_CERTIFICATE=/etc/otel/custom-ca.pem \
|
||||
OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer abc123,tenant=acme" \
|
||||
./newt
|
||||
```
|
||||
|
||||
Prometheus scrape strategy (choose one)
|
||||
|
||||
Important: Do not scrape both Newt (2112) and the Collector’s Prometheus exporter (8889) at the same time for the same process. Doing so will double-count cumulative counters.
|
||||
|
||||
A) Scrape Newt directly:
|
||||
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
scrape_configs:
|
||||
- job_name: newt
|
||||
static_configs:
|
||||
- targets: ["newt:2112"]
|
||||
```
|
||||
|
||||
B) Scrape the Collector’s Prometheus exporter:
|
||||
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
scrape_configs:
|
||||
- job_name: otel-collector
|
||||
static_configs:
|
||||
- targets: ["otel-collector:8889"]
|
||||
```
|
||||
|
||||
Reason mapping (source → reason)
|
||||
|
||||
- Server instructs reconnect/terminate → server_request
|
||||
- Heartbeat/Ping threshold exceeded → timeout
|
||||
- Peer closed connection gracefully → peer_close
|
||||
- Route/Interface change detected → network_change
|
||||
- Auth/token failure (HTTP 401/403) → auth_error
|
||||
- TLS/WG handshake error → handshake_error
|
||||
- Config reloaded/applied (causing reconnection) → config_change
|
||||
- Other/unclassified errors → error
|
||||
|
||||
PromQL snippets
|
||||
|
||||
- Ingress throughput (5m rate):
|
||||
|
||||
```sh
|
||||
sum(rate(newt_tunnel_bytes_total{direction="ingress"}[5m]))
|
||||
```
|
||||
|
||||
- P95 latency (seconds):
|
||||
|
||||
```sh
|
||||
histogram_quantile(0.95, sum(rate(newt_tunnel_latency_seconds_bucket[5m])) by (le))
|
||||
```
|
||||
|
||||
- Active sessions:
|
||||
|
||||
```sh
|
||||
sum(newt_tunnel_sessions)
|
||||
```
|
||||
|
||||
Compatibility notes
|
||||
|
||||
- Gauges do not use the _total suffix (e.g., newt_tunnel_sessions).
|
||||
- site_id/region remain resource attributes. Metric labels for these fields appear on per-site gauges (e.g., `newt_site_online`) and, by default, on counters/histograms; disable them with `NEWT_METRICS_INCLUDE_SITE_LABELS=false` if needed. `tunnel_id` is a metric label (WireGuard public key). Never expose secrets in labels.
|
||||
- NEWT_METRICS_INCLUDE_TUNNEL_ID (default: true) toggles whether tunnel_id is included as a label on bytes/sessions/proxy/reconnect metrics. Disable in high-cardinality environments.
|
||||
- Avoid double-scraping: scrape either Newt (/metrics) or the Collector's Prometheus exporter, not both.
|
||||
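- Prometheus does not accept remote_write by default (the receiver must be enabled with --web.enable-remote-write-receiver); otherwise use Mimir/Cortex/VM/Thanos-Receive as the remote_write target.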
|
||||
- No free text in labels; use only the enumerated constants for reason, protocol (tcp|udp), and transport (e.g., websocket|wireguard).
|
||||
|
||||
Further reading
|
||||
|
||||
- See docs/METRICS_RECOMMENDATIONS.md for roadmap, label guidance (transport vs protocol), and example alerts.
|
||||
|
||||
Cardinality tips
|
||||
|
||||
- tunnel_id can grow in larger fleets. Use relabeling to drop or retain a subset, for example:
|
||||
|
||||
```yaml
|
||||
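# Keep only newt_tunnel_bytes_total and drop its tunnel_id label to reduce series (note: `keep` discards every other metric in this relabel chain)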
|
||||
- source_labels: [__name__]
|
||||
regex: newt_tunnel_bytes_total
|
||||
action: keep
|
||||
- action: labeldrop
|
||||
regex: tunnel_id
|
||||
|
||||
# Or drop whole series by tunnel_id (narrow the regex to match only high-churn tunnels; `.*` as written matches every series)
|
||||
- source_labels: [tunnel_id]
|
||||
regex: .*
|
||||
action: drop
|
||||
```
|
||||
|
||||
Quickstart: direct Prometheus scrape (recommended)
|
||||
|
||||
```sh
|
||||
# Start (direct /metrics scrape, no double-scraping)
|
||||
docker compose -f docker-compose.metrics.yml up -d
|
||||
|
||||
# Smoke checks
|
||||
./scripts/smoke-metrics.sh
|
||||
# Hide tunnel IDs (optional):
|
||||
# EXPECT_TUNNEL_ID=false NEWT_METRICS_INCLUDE_TUNNEL_ID=false ./scripts/smoke-metrics.sh
|
||||
```
|
||||
|
||||
- Prometheus UI: <http://localhost:9090>
|
||||
- Default scrape interval: 15s
|
||||
- OTLP is disabled (NEWT_METRICS_OTLP_ENABLED=false in docker-compose.metrics.yml)
|
||||
|
||||
Common PromQL quick checks
|
||||
|
||||
```yaml
|
||||
# Online status of a site over the last 5 minutes
|
||||
max_over_time(newt_site_online{site_id="$site"}[5m])
|
||||
|
||||
# TCP egress bytes per site/tunnel (10m)
|
||||
sum by (site_id, tunnel_id) (increase(newt_tunnel_bytes_total{protocol="tcp",direction="egress"}[10m]))
|
||||
|
||||
# WebSocket connect P95
|
||||
histogram_quantile(0.95, sum by (le, site_id) (rate(newt_websocket_connect_latency_seconds_bucket[5m])))
|
||||
|
||||
# Reconnects by initiator and reason
|
||||
sum by (initiator, reason) (increase(newt_tunnel_reconnects_total{site_id="$site"}[30m]))
|
||||
```
|
||||
|
||||
Troubleshooting
|
||||
|
||||
- `curl http://127.0.0.1:2112/metrics` – ensure the endpoint is reachable and includes newt_* metrics
|
||||
- Check Collector logs for OTLP connection issues
|
||||
- Verify Prometheus Targets are UP and scraping Newt or Collector
|
||||
129
docs/otel-review.md
Normal file
129
docs/otel-review.md
Normal file
@@ -0,0 +1,129 @@
|
||||
# Newt OpenTelemetry Review
|
||||
|
||||
## Overview
|
||||
|
||||
This document summarises the current OpenTelemetry (OTel) instrumentation in Newt, assesses
|
||||
compliance with OTel guidelines, and lists concrete improvements to pursue before release.
|
||||
It is based on the implementation in `internal/telemetry` and the call-sites that emit
|
||||
metrics and traces across the code base.
|
||||
|
||||
## Current metric instrumentation
|
||||
|
||||
All instruments are registered in `internal/telemetry/metrics.go`. They are grouped
|
||||
into site, tunnel, connection, configuration, build, WebSocket, and proxy domains.
|
||||
A global attribute filter (see `buildMeterProvider`) constrains exposed label keys to
|
||||
`site_id`, `region`, and a curated list of low-cardinality dimensions so that Prometheus
|
||||
exports stay bounded.
|
||||
|
||||
- **Site lifecycle**: `newt_site_registrations_total`, `newt_site_online`, and
|
||||
`newt_site_last_heartbeat_timestamp_seconds` capture registration attempts and liveness. They
|
||||
are fed either manually (`IncSiteRegistration`) or via the `TelemetryView` state
|
||||
callback that publishes observable gauges for the active site.
|
||||
- **Tunnel health and usage**: Counters and histograms track bytes, latency, reconnects,
|
||||
and active sessions per tunnel (`newt_tunnel_*` family). Attribute helpers respect
|
||||
the `NEWT_METRICS_INCLUDE_TUNNEL_ID` toggle to keep cardinality manageable on larger
|
||||
fleets.
|
||||
- **Connection attempts**: `newt_connection_attempts_total` and
|
||||
`newt_connection_errors_total` are emitted throughout the WebSocket client to classify
|
||||
authentication, dial, and transport failures.
|
||||
- **Operations/configuration**: `newt_config_reloads_total`,
|
||||
`process_start_time_seconds`, `newt_config_apply_seconds`, and
|
||||
`newt_cert_rotation_total` provide visibility into blueprint reloads, process boots,
|
||||
configuration timings, and certificate rotation outcomes.
|
||||
- **Build metadata**: `newt_build_info` records the binary version/commit together
|
||||
with optional site metadata when build information is supplied at startup.
|
||||
- **WebSocket control-plane**: `newt_websocket_connect_latency_seconds`,
|
||||
`newt_websocket_messages_total`, `newt_websocket_connected`, and
|
||||
`newt_websocket_reconnects_total` report connect latency, ping/pong/text activity,
|
||||
connection state, and reconnect reasons.
|
||||
- **Proxy data-plane**: Observable gauges (`newt_proxy_active_connections`,
|
||||
`newt_proxy_buffer_bytes`, `newt_proxy_async_backlog_bytes`) plus counters for
|
||||
drops, accepts, connection lifecycle events (`newt_proxy_connections_total`), and
|
||||
duration histograms (`newt_proxy_connection_duration_seconds`) surface backlog,
|
||||
drop behaviour, and churn alongside per-protocol byte counters.
|
||||
|
||||
Refer to `docs/observability.md` for a tabular catalogue with instrument types,
|
||||
attributes, and sample exposition lines.
|
||||
|
||||
## Tracing coverage
|
||||
|
||||
Tracing is optional and enabled only when OTLP export is configured. When active:
|
||||
|
||||
- The admin HTTP mux is wrapped with `otelhttp.NewHandler`, producing spans for
|
||||
`/metrics` and `/healthz` requests.
|
||||
- The WebSocket dial path creates a `ws.connect` span around the gRPC-based handshake.
|
||||
|
||||
No other subsystems currently create spans, so data-plane operations, blueprint fetches,
|
||||
Docker discovery, and WireGuard reconfiguration happen without trace context.
|
||||
|
||||
## Guideline & best-practice alignment
|
||||
|
||||
The implementation adheres to most OTel Go recommendations:
|
||||
|
||||
- **Naming & units** – Every instrument follows the `newt_*` prefix with `_total`
|
||||
suffixes for counters and `_seconds`/`_bytes` unit conventions. Histograms are
|
||||
registered with explicit second-based buckets.
|
||||
- **Resource attributes** – Service name/version and optional `site_id`/`region`
|
||||
populate the `resource.Resource`. Metric labels mirror these by default (and on
|
||||
per-site gauges) but can be disabled with `NEWT_METRICS_INCLUDE_SITE_LABELS=false`
|
||||
to avoid unnecessary cardinality growth.
|
||||
- **Attribute hygiene** – A single attribute filter (`sdkmetric.WithView`) enforces
|
||||
the allow-list of label keys to prevent accidental high-cardinality emission.
|
||||
- **Runtime metrics** – Go runtime instrumentation is enabled automatically through
|
||||
`runtime.Start`.
|
||||
- **Configuration via environment** – `telemetry.FromEnv` honours `OTEL_*` variables
|
||||
alongside `NEWT_*` overrides so operators can configure exporters without code
|
||||
changes.
|
||||
- **Shutdown handling** – `Setup.Shutdown` iterates exporters in reverse order to
|
||||
flush buffers before process exit.
|
||||
|
||||
## Adjustments & improvements
|
||||
|
||||
The review identified a few actionable adjustments:
|
||||
|
||||
1. **Record registration failures** – `newt_site_registrations_total` is currently
|
||||
incremented only on success. Emit `result="failure"` samples whenever Pangolin
|
||||
rejects a registration or credential exchange so operators can alert on churn.
|
||||
2. **Surface config reload failures** – `telemetry.IncConfigReload` is invoked with
|
||||
`result="success"` only. Callers should record a failure result when blueprint
|
||||
parsing or application aborts before success counters are incremented.
|
||||
3. **Expose robust uptime** – Document using `time() - process_start_time_seconds`
|
||||
to derive uptime now that the restart counter has been replaced with a timestamp
|
||||
gauge.
|
||||
4. **Propagate contexts where available** – Many emitters call metric helpers with
|
||||
`context.Background()`. Passing real contexts (when inexpensive) would allow future
|
||||
exporters to correlate spans and metrics.
|
||||
5. **Extend tracing coverage** – Instrument critical flows such as blueprint fetches,
|
||||
WireGuard reconfiguration, proxy accept loops, and Docker discovery to provide
|
||||
end-to-end visibility when OTLP tracing is enabled.
|
||||
|
||||
## Metrics to add before release
|
||||
|
||||
Prioritised additions that would close visibility gaps:
|
||||
|
||||
1. **Config reload error taxonomy** – Split reload attempts into a dedicated
|
||||
`newt_config_reload_errors_total{phase}` counter to make blueprint validation failures
|
||||
visible alongside the existing success counter.
|
||||
2. **Config source visibility** – Export `newt_config_source_info{source,version}` so
|
||||
operators can audit the active blueprint origin/commit during incidents.
|
||||
3. **Certificate expiry** – Emit `newt_cert_expiry_timestamp_seconds` (per cert) to
|
||||
enable proactive alerts before mTLS credentials lapse.
|
||||
4. **Blueprint/config pull latency** – Measuring Pangolin blueprint fetch durations and
|
||||
HTTP status distribution would expose slow control-plane operations.
|
||||
5. **Tunnel setup latency** – Histograms for DNS resolution and tunnel handshakes would
|
||||
help correlate connect latency spikes with network dependencies.
|
||||
|
||||
These metrics rely on data that is already available in the code paths mentioned
|
||||
above and would round out operational dashboards.
|
||||
|
||||
## Tracing wishlist
|
||||
|
||||
To benefit from tracing when OTLP is active, add spans around:
|
||||
|
||||
- Pangolin REST calls (wrap the HTTP client with `otelhttp.NewTransport`).
|
||||
- Docker discovery cycles and target registration callbacks.
|
||||
- WireGuard reconfiguration (interface bring-up, peer updates).
|
||||
- Proxy dial/accept loops for both TCP and UDP targets.
|
||||
|
||||
Capturing these stages will let operators correlate latency spikes with reconnects
|
||||
and proxy drops using distributed traces in addition to the metric signals.
|
||||
10
go.mod
10
go.mod
@@ -3,7 +3,7 @@ module github.com/fosrl/newt
|
||||
go 1.25
|
||||
|
||||
require (
|
||||
github.com/docker/docker v28.5.1+incompatible
|
||||
github.com/docker/docker v28.5.0+incompatible
|
||||
github.com/google/gopacket v1.1.19
|
||||
github.com/gorilla/websocket v1.5.3
|
||||
github.com/prometheus/client_golang v1.23.2
|
||||
@@ -18,12 +18,12 @@ require (
|
||||
go.opentelemetry.io/otel/sdk v1.38.0
|
||||
go.opentelemetry.io/otel/sdk/metric v1.38.0
|
||||
golang.org/x/crypto v0.43.0
|
||||
golang.org/x/net v0.45.0
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792
|
||||
golang.org/x/net v0.46.0
|
||||
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb
|
||||
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10
|
||||
google.golang.org/grpc v1.76.0
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
google.golang.org/grpc v1.76.0
|
||||
gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c
|
||||
software.sslmate.com/src/go-pkcs12 v0.6.0
|
||||
)
|
||||
@@ -75,6 +75,10 @@ require (
|
||||
golang.org/x/text v0.30.0 // indirect
|
||||
golang.org/x/time v0.12.0 // indirect
|
||||
golang.org/x/tools v0.37.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect
|
||||
go.opentelemetry.io/otel v1.37.0 // indirect
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.36.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.37.0 // indirect
|
||||
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect
|
||||
|
||||
55
go.sum
55
go.sum
@@ -1,3 +1,5 @@
|
||||
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
|
||||
github.com/Microsoft/go-winio v0.6.0 h1:slsWYD/zyx7lCXoZVlvQrj0hPTM1HI4+v1sIda2yDvg=
|
||||
github.com/Microsoft/go-winio v0.6.0/go.mod h1:cTAf44im0RAYeL23bpB+fzCyDH2MJiz2BO69KH/soAE=
|
||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||
@@ -10,14 +12,20 @@ github.com/containerd/errdefs v0.3.0 h1:FSZgGOeK4yuT/+DnF07/Olde/q4KBoMsaamhXxIM
|
||||
github.com/containerd/errdefs v0.3.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M=
|
||||
github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE=
|
||||
github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk=
|
||||
github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
|
||||
github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
|
||||
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
|
||||
github.com/docker/docker v28.5.1+incompatible h1:Bm8DchhSD2J6PsFzxC35TZo4TLGR2PdW/E69rU45NhM=
|
||||
github.com/docker/docker v28.5.1+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
|
||||
github.com/docker/docker v28.5.0+incompatible h1:ZdSQoRUE9XxhFI/B8YLvhnEFMmYN9Pp8Egd2qcaFk1E=
|
||||
github.com/docker/docker v28.5.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
|
||||
github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94=
|
||||
github.com/docker/go-connections v0.6.0/go.mod h1:AahvXYshr6JgfUJGdDCs2b5EZG/vmaMAntpSFH5BFKE=
|
||||
github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=
|
||||
github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
|
||||
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
|
||||
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
|
||||
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
|
||||
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
|
||||
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
|
||||
@@ -25,6 +33,8 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
|
||||
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
|
||||
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
|
||||
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
|
||||
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
|
||||
github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg=
|
||||
github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
@@ -41,16 +51,31 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnV
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs=
|
||||
github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA=
|
||||
github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w=
|
||||
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
|
||||
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||
github.com/mdlayher/genetlink v1.3.2 h1:KdrNKe+CTu+IbZnm/GVUMXSqBBLqcGpRDa0xkQy56gw=
|
||||
github.com/mdlayher/genetlink v1.3.2/go.mod h1:tcC3pkCrPUGIKKsCsp0B3AdaaKuHtaxoJRz3cc+528o=
|
||||
github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g=
|
||||
github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw=
|
||||
github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos=
|
||||
github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ=
|
||||
github.com/mikioh/ipaddr v0.0.0-20190404000644-d465c8ab6721 h1:RlZweED6sbSArvlE924+mUcZuXKLBHA35U7LN621Bws=
|
||||
github.com/mikioh/ipaddr v0.0.0-20190404000644-d465c8ab6721/go.mod h1:Ickgr2WtCLZ2MDGd4Gr0geeCH5HybhRJbonOgQpvSxc=
|
||||
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
|
||||
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
|
||||
github.com/moby/sys/atomicwriter v0.1.0 h1:kw5D/EqkBwsBFi0ss9v1VG3wIkVhzGvLklJ+w3A14Sw=
|
||||
github.com/moby/sys/atomicwriter v0.1.0/go.mod h1:Ul8oqv2ZMNHOceF643P6FKPXeCmYtlQMvpizfsSoaWs=
|
||||
github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU=
|
||||
github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko=
|
||||
github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ=
|
||||
github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc=
|
||||
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
|
||||
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||
@@ -60,6 +85,8 @@ github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQ
|
||||
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
|
||||
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
|
||||
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
|
||||
@@ -70,6 +97,12 @@ github.com/prometheus/otlptranslator v0.0.2 h1:+1CdeLVrRQ6Psmhnobldo0kTp96Rj80DR
|
||||
github.com/prometheus/otlptranslator v0.0.2/go.mod h1:P8AwMgdD7XEr6QRUJ2QWLpiAZTgTE2UYgjlu3svompI=
|
||||
github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0=
|
||||
github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw=
|
||||
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
|
||||
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
|
||||
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
|
||||
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/vishvananda/netlink v1.3.1 h1:3AEMt62VKqz90r0tmNhog0r/PpWKmrEShJU0wJW6bV0=
|
||||
github.com/vishvananda/netlink v1.3.1/go.mod h1:ARtKouGSTGchR8aMwmkzC0qiNPrrWO5JS/XMVl45+b4=
|
||||
github.com/vishvananda/netns v0.0.5 h1:DfiHV+j8bA32MFM7bfEunvT8IAqQ/NzSJHtcmW5zdEY=
|
||||
@@ -88,6 +121,7 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZF
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 h1:aTL7F04bJHUlztTsNGJ2l+6he8c+y/b//eR0jjjemT4=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0/go.mod h1:kldtb7jDTeol0l3ewcmd8SDvx3EmIE7lyvqbasU3QC4=
|
||||
go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo=
|
||||
go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk=
|
||||
@@ -101,21 +135,24 @@ go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJr
|
||||
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
|
||||
go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4=
|
||||
go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE=
|
||||
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
|
||||
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04=
|
||||
golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0=
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc=
|
||||
golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
|
||||
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
|
||||
golang.org/x/mod v0.28.0 h1:gQBtGhjxykdjY9YhZpSlZIsbnaE2+PgjfLWUQTnoZ1U=
|
||||
golang.org/x/mod v0.28.0/go.mod h1:yfB/L0NOf/kmEbXjzCPOx1iK1fRutOydrCMsqRhEBxI=
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792 h1:R9PFI6EUdfVKgwKjZef7QIwGcBKu86OEFpJ9nUEP2l4=
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792/go.mod h1:A+z0yzpGtvnG90cToK5n2tu8UJVP2XUATh+r+sfOOOc=
|
||||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
|
||||
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
|
||||
golang.org/x/net v0.45.0 h1:RLBg5JKixCy82FtLJpeNlVM0nrSqpCRYzVU1n8kj0tM=
|
||||
golang.org/x/net v0.45.0/go.mod h1:ECOoLqd5U3Lhyeyo/QDCEVQ4sNgYsqvCZ722XogGieY=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug=
|
||||
golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||
@@ -133,6 +170,7 @@ golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg=
|
||||
golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
|
||||
golang.org/x/tools v0.37.0 h1:DVSRzp7FwePZW356yEAChSdNcQo6Nsp+fex1SUW09lE=
|
||||
golang.org/x/tools v0.37.0/go.mod h1:MBN5QPQtLMHVdvsbtarmTNukZDdgwdwlO5qGacAzF0w=
|
||||
golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 h1:B82qJJgjvYKsXS9jeunTOisW56dUokqW/FOteYJJ/yg=
|
||||
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2/go.mod h1:deeaetjYA+DHMHg+sMSMI58GrEteJUUzzw7en6TJQcI=
|
||||
@@ -140,6 +178,8 @@ golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb h1:whnFRlWMcXI9d+Z
|
||||
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb/go.mod h1:rpwXGsirqLqN2L0JDJQlwOboGHmptD5ZD6T2VmcqhTw=
|
||||
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10 h1:3GDAcqdIg1ozBNLgPy4SLT84nfcBjr6rhGtXYtrkWLU=
|
||||
golang.zx2c4.com/wireguard/wgctrl v0.0.0-20241231184526-a9ab2273dd10/go.mod h1:T97yPqesLiNrOYxkwmhMI0ZIlJDm+p0PMR8eRVeR5tQ=
|
||||
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
|
||||
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1:BIRfGDEjiHRrk0QKZe3Xv2ieMhtgRGeLcZQ0mIVn4EY=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE=
|
||||
@@ -148,9 +188,14 @@ google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A=
|
||||
google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c=
|
||||
google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
|
||||
google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
|
||||
google.golang.org/genproto v0.0.0-20230920204549-e6e6cdab5c13 h1:vlzZttNJGVqTsRFU9AmdnrcO1Znh8Ew9kCD//yjigk0=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o=
|
||||
gotest.tools/v3 v3.4.0/go.mod h1:CtbdzLSsqVhDgMtKsx03ird5YTGB3ar27v0u/yKBW5g=
|
||||
gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c h1:m/r7OM+Y2Ty1sgBQ7Qb27VgIMBW8ZZhT4gLnUyDIhzI=
|
||||
gvisor.dev/gvisor v0.0.0-20250503011706-39ed1f5ac29c/go.mod h1:3r5CMtNQMKIvBlrmM9xWUNamjKBYPOWyXOjmg5Kts3g=
|
||||
software.sslmate.com/src/go-pkcs12 v0.6.0 h1:f3sQittAeF+pao32Vb+mkli+ZyT+VwKaD014qFGq6oU=
|
||||
|
||||
7
internal/telemetry/testdata/expected_contains.golden
vendored
Normal file
7
internal/telemetry/testdata/expected_contains.golden
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
newt_connection_attempts_total
|
||||
newt_websocket_connected
|
||||
newt_websocket_reconnects_total
|
||||
newt_proxy_connections_total
|
||||
newt_build_info
|
||||
|
||||
process_start_time_seconds
|
||||
6
main.go
6
main.go
@@ -169,6 +169,10 @@ func main() {
|
||||
updownScript = os.Getenv("UPDOWN_SCRIPT")
|
||||
interfaceName = os.Getenv("INTERFACE")
|
||||
generateAndSaveKeyTo = os.Getenv("GENERATE_AND_SAVE_KEY_TO")
|
||||
keepInterfaceEnv := os.Getenv("KEEP_INTERFACE")
|
||||
acceptClientsEnv := os.Getenv("ACCEPT_CLIENTS")
|
||||
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
|
||||
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
|
||||
|
||||
// Metrics/observability env mirrors
|
||||
metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
|
||||
@@ -177,7 +181,6 @@ func main() {
|
||||
regionEnv := os.Getenv("NEWT_REGION")
|
||||
asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
|
||||
|
||||
keepInterfaceEnv := os.Getenv("KEEP_INTERFACE")
|
||||
keepInterface = keepInterfaceEnv == "true"
|
||||
acceptClientsEnv := os.Getenv("ACCEPT_CLIENTS")
|
||||
acceptClients = acceptClientsEnv == "true"
|
||||
@@ -391,7 +394,6 @@ func main() {
|
||||
}
|
||||
if tel != nil {
|
||||
// Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
|
||||
logger.Info("Starting metrics server on %s", tcfg.AdminAddr)
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
|
||||
if tel.PrometheusHandler != nil {
|
||||
|
||||
802
patches/00_all_changes.patch
Normal file
802
patches/00_all_changes.patch
Normal file
@@ -0,0 +1,802 @@
|
||||
diff --git a/Dockerfile b/Dockerfile
|
||||
index b9c4d29..b9b6dea 100644
|
||||
--- a/Dockerfile
|
||||
+++ b/Dockerfile
|
||||
@@ -22,6 +22,9 @@ RUN apk --no-cache add ca-certificates tzdata
|
||||
COPY --from=builder /newt /usr/local/bin/
|
||||
COPY entrypoint.sh /
|
||||
|
||||
+# Admin/metrics endpoint (Prometheus scrape)
|
||||
+EXPOSE 2112
|
||||
+
|
||||
RUN chmod +x /entrypoint.sh
|
||||
ENTRYPOINT ["/entrypoint.sh"]
|
||||
-CMD ["newt"]
|
||||
\ No newline at end of file
|
||||
+CMD ["newt"]
|
||||
diff --git a/go.mod b/go.mod
|
||||
index d475835..5909955 100644
|
||||
--- a/go.mod
|
||||
+++ b/go.mod
|
||||
@@ -7,6 +7,14 @@ require (
|
||||
github.com/google/gopacket v1.1.19
|
||||
github.com/gorilla/websocket v1.5.3
|
||||
github.com/vishvananda/netlink v1.3.1
|
||||
+ go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0
|
||||
+ go.opentelemetry.io/contrib/instrumentation/runtime v0.62.0
|
||||
+ go.opentelemetry.io/otel v1.37.0
|
||||
+ go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.37.0
|
||||
+ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.37.0
|
||||
+ go.opentelemetry.io/otel/sdk/metric v1.37.0
|
||||
+ go.opentelemetry.io/otel/sdk/trace v1.37.0
|
||||
+ go.opentelemetry.io/otel/semconv v1.26.0
|
||||
golang.org/x/crypto v0.42.0
|
||||
golang.org/x/exp v0.0.0-20250718183923-645b1fa84792
|
||||
golang.org/x/net v0.44.0
|
||||
diff --git a/main.go b/main.go
|
||||
index 12849b1..c223b75 100644
|
||||
--- a/main.go
|
||||
+++ b/main.go
|
||||
@@ -1,7 +1,9 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
+ "errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net"
|
||||
@@ -22,6 +24,9 @@ import (
|
||||
"github.com/fosrl/newt/updates"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
|
||||
+ "github.com/fosrl/newt/internal/state"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
@@ -116,6 +121,13 @@ var (
|
||||
healthMonitor *healthcheck.Monitor
|
||||
enforceHealthcheckCert bool
|
||||
|
||||
+ // Observability/metrics flags
|
||||
+ metricsEnabled bool
|
||||
+ otlpEnabled bool
|
||||
+ adminAddr string
|
||||
+ region string
|
||||
+ metricsAsyncBytes bool
|
||||
+
|
||||
// New mTLS configuration variables
|
||||
tlsClientCert string
|
||||
tlsClientKey string
|
||||
@@ -126,6 +138,10 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
+ // Prepare context for graceful shutdown and signal handling
|
||||
+ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
+ defer stop()
|
||||
+
|
||||
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
|
||||
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
|
||||
id = os.Getenv("NEWT_ID")
|
||||
@@ -141,6 +157,13 @@ func main() {
|
||||
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
|
||||
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
|
||||
|
||||
+ // Metrics/observability env mirrors
|
||||
+ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
|
||||
+ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
|
||||
+ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
|
||||
+ regionEnv := os.Getenv("NEWT_REGION")
|
||||
+ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
|
||||
+
|
||||
keepInterface = keepInterfaceEnv == "true"
|
||||
acceptClients = acceptClientsEnv == "true"
|
||||
useNativeInterface = useNativeInterfaceEnv == "true"
|
||||
@@ -272,6 +295,35 @@ func main() {
|
||||
flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)")
|
||||
}
|
||||
|
||||
+ // Metrics/observability flags (mirror ENV if unset)
|
||||
+ if metricsEnabledEnv == "" {
|
||||
+ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true }
|
||||
+ }
|
||||
+ if otlpEnabledEnv == "" {
|
||||
+ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v }
|
||||
+ }
|
||||
+ if adminAddrEnv == "" {
|
||||
+ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
|
||||
+ } else {
|
||||
+ adminAddr = adminAddrEnv
|
||||
+ }
|
||||
+ // Async bytes toggle
|
||||
+ if asyncBytesEnv == "" {
|
||||
+ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v }
|
||||
+ }
|
||||
+ // Optional region flag (resource attribute)
|
||||
+ if regionEnv == "" {
|
||||
+ flag.StringVar(&region, "region", "", "Optional region resource attribute (also NEWT_REGION)")
|
||||
+ } else {
|
||||
+ region = regionEnv
|
||||
+ }
|
||||
+
|
||||
// do a --version check
|
||||
version := flag.Bool("version", false, "Print the version")
|
||||
|
||||
@@ -286,6 +338,50 @@ func main() {
|
||||
loggerLevel := parseLogLevel(logLevel)
|
||||
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
|
||||
|
||||
+ // Initialize telemetry after flags are parsed (so flags override env)
|
||||
+ tcfg := telemetry.FromEnv()
|
||||
+ tcfg.PromEnabled = metricsEnabled
|
||||
+ tcfg.OTLPEnabled = otlpEnabled
|
||||
+ if adminAddr != "" { tcfg.AdminAddr = adminAddr }
|
||||
+ // Resource attributes (if available)
|
||||
+ tcfg.SiteID = id
|
||||
+ tcfg.Region = region
|
||||
+ // Build info
|
||||
+ tcfg.BuildVersion = newtVersion
|
||||
+ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
|
||||
+
|
||||
+ tel, telErr := telemetry.Init(ctx, tcfg)
|
||||
+ if telErr != nil {
|
||||
+ logger.Warn("Telemetry init failed: %v", telErr)
|
||||
+ }
|
||||
+ if tel != nil {
|
||||
+ // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
|
||||
+ mux := http.NewServeMux()
|
||||
+ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
|
||||
+ if tel.PrometheusHandler != nil {
|
||||
+ mux.Handle("/metrics", tel.PrometheusHandler)
|
||||
+ }
|
||||
+ admin := &http.Server{
|
||||
+ Addr: tcfg.AdminAddr,
|
||||
+ Handler: otelhttp.NewHandler(mux, "newt-admin"),
|
||||
+ ReadTimeout: 5 * time.Second,
|
||||
+ WriteTimeout: 10 * time.Second,
|
||||
+ ReadHeaderTimeout: 5 * time.Second,
|
||||
+ IdleTimeout: 30 * time.Second,
|
||||
+ }
|
||||
+ go func() {
|
||||
+ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
+ logger.Warn("admin http error: %v", err)
|
||||
+ }
|
||||
+ }()
|
||||
+ defer func() {
|
||||
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
+ defer cancel()
|
||||
+ _ = admin.Shutdown(ctx)
|
||||
+ }()
|
||||
+ defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
+ }
|
||||
+
|
||||
newtVersion := "version_replaceme"
|
||||
if *version {
|
||||
fmt.Println("Newt version " + newtVersion)
|
||||
@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
// Use reliable ping for initial connection test
|
||||
logger.Debug("Testing initial connection with reliable ping...")
|
||||
- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ if err == nil && wgData.PublicKey != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds())
|
||||
+ }
|
||||
if err != nil {
|
||||
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
|
||||
} else {
|
||||
@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
// as the pings will continue in the background
|
||||
if !connected {
|
||||
logger.Debug("Starting ping check")
|
||||
- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
|
||||
+ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
|
||||
}
|
||||
|
||||
// Create proxy manager
|
||||
pm = proxy.NewProxyManager(tnet)
|
||||
+ pm.SetAsyncBytes(metricsAsyncBytes)
|
||||
+ // Set tunnel_id for metrics (WireGuard peer public key)
|
||||
+ pm.SetTunnelID(wgData.PublicKey)
|
||||
|
||||
connected = true
|
||||
|
||||
+ // telemetry: record a successful site registration (omit region unless available)
|
||||
+ telemetry.IncSiteRegistration(context.Background(), id, "", "success")
|
||||
+
|
||||
// add the targets if there are any
|
||||
if len(wgData.Targets.TCP) > 0 {
|
||||
updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP})
|
||||
@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received reconnect message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
// Mark as disconnected
|
||||
connected = false
|
||||
|
||||
@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received termination message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
diff --git a/proxy/manager.go b/proxy/manager.go
|
||||
index bf10322..86c47a8 100644
|
||||
--- a/proxy/manager.go
|
||||
+++ b/proxy/manager.go
|
||||
@@ -1,16 +1,22 @@
|
||||
package proxy
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
+ "os"
|
||||
"strings"
|
||||
"sync"
|
||||
+ "sync/atomic"
|
||||
"time"
|
||||
|
||||
+ "github.com/fosrl/newt/internal/state"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
"github.com/fosrl/newt/logger"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
|
||||
+ "go.opentelemetry.io/otel/attribute"
|
||||
)
|
||||
|
||||
// Target represents a proxy target with its address and port
|
||||
@@ -28,6 +34,52 @@ type ProxyManager struct {
|
||||
udpConns []*gonet.UDPConn
|
||||
running bool
|
||||
mutex sync.RWMutex
|
||||
+
|
||||
+ // telemetry (multi-tunnel)
|
||||
+ currentTunnelID string
|
||||
+ tunnels map[string]*tunnelEntry
|
||||
+ asyncBytes bool
|
||||
+ flushStop chan struct{}
|
||||
+}
|
||||
+
|
||||
+// tunnelEntry holds per-tunnel attributes and (optional) async counters.
|
||||
+type tunnelEntry struct {
|
||||
+ attrInTCP attribute.Set
|
||||
+ attrOutTCP attribute.Set
|
||||
+ attrInUDP attribute.Set
|
||||
+ attrOutUDP attribute.Set
|
||||
+
|
||||
+ bytesInTCP atomic.Uint64
|
||||
+ bytesOutTCP atomic.Uint64
|
||||
+ bytesInUDP atomic.Uint64
|
||||
+ bytesOutUDP atomic.Uint64
|
||||
+}
|
||||
+
|
||||
+// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set.
|
||||
+type countingWriter struct {
|
||||
+ ctx context.Context
|
||||
+ w io.Writer
|
||||
+ set attribute.Set
|
||||
+ pm *ProxyManager
|
||||
+ ent *tunnelEntry
|
||||
+ out bool // false=in, true=out
|
||||
+ proto string // "tcp" or "udp"
|
||||
+}
|
||||
+
|
||||
+func (cw *countingWriter) Write(p []byte) (int, error) {
|
||||
+ n, err := cw.w.Write(p)
|
||||
+ if n > 0 {
|
||||
+ if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil {
|
||||
+ if cw.proto == "tcp" {
|
||||
+ if cw.out { cw.ent.bytesOutTCP.Add(uint64(n)) } else { cw.ent.bytesInTCP.Add(uint64(n)) }
|
||||
+ } else if cw.proto == "udp" {
|
||||
+ if cw.out { cw.ent.bytesOutUDP.Add(uint64(n)) } else { cw.ent.bytesInUDP.Add(uint64(n)) }
|
||||
+ }
|
||||
+ } else {
|
||||
+ telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set)
|
||||
+ }
|
||||
+ }
|
||||
+ return n, err
|
||||
}
|
||||
|
||||
// NewProxyManager creates a new proxy manager instance
|
||||
@@ -38,9 +90,46 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager {
|
||||
udpTargets: make(map[string]map[int]string),
|
||||
listeners: make([]*gonet.TCPListener, 0),
|
||||
udpConns: make([]*gonet.UDPConn, 0),
|
||||
+ tunnels: make(map[string]*tunnelEntry),
|
||||
}
|
||||
}
|
||||
|
||||
+// SetTunnelID sets the WireGuard peer public key used as tunnel_id label.
|
||||
+func (pm *ProxyManager) SetTunnelID(id string) {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ pm.currentTunnelID = id
|
||||
+ if _, ok := pm.tunnels[id]; !ok {
|
||||
+ pm.tunnels[id] = &tunnelEntry{}
|
||||
+ }
|
||||
+ e := pm.tunnels[id]
|
||||
+ e.attrInTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "tcp"))
|
||||
+ e.attrOutTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "tcp"))
|
||||
+ e.attrInUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "udp"))
|
||||
+ e.attrOutUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "udp"))
|
||||
+}
|
||||
+
|
||||
+// ClearTunnelID clears cached attribute sets for the current tunnel.
|
||||
+func (pm *ProxyManager) ClearTunnelID() {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ id := pm.currentTunnelID
|
||||
+ if id == "" { return }
|
||||
+ if e, ok := pm.tunnels[id]; ok {
|
||||
+ // final flush for this tunnel
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ delete(pm.tunnels, id)
|
||||
+ }
|
||||
+ pm.currentTunnelID = ""
|
||||
+}
|
||||
+
|
||||
// init function without tnet
|
||||
func NewProxyManagerWithoutTNet() *ProxyManager {
|
||||
return &ProxyManager{
|
||||
@@ -160,6 +249,57 @@ func (pm *ProxyManager) Start() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
+func (pm *ProxyManager) SetAsyncBytes(b bool) {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ pm.asyncBytes = b
|
||||
+ if b && pm.flushStop == nil {
|
||||
+ pm.flushStop = make(chan struct{})
|
||||
+ go pm.flushLoop()
|
||||
+ }
|
||||
+}
|
||||
+func (pm *ProxyManager) flushLoop() {
|
||||
+ flushInterval := 2 * time.Second
|
||||
+ if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" {
|
||||
+ if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
||||
+ if d/2 < flushInterval { flushInterval = d / 2 }
|
||||
+ }
|
||||
+ }
|
||||
+ ticker := time.NewTicker(flushInterval)
|
||||
+ defer ticker.Stop()
|
||||
+ for {
|
||||
+ select {
|
||||
+ case <-ticker.C:
|
||||
+ pm.mutex.RLock()
|
||||
+ for _, e := range pm.tunnels {
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ }
|
||||
+ pm.mutex.RUnlock()
|
||||
+ case <-pm.flushStop:
|
||||
+ pm.mutex.RLock()
|
||||
+ for _, e := range pm.tunnels {
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ }
|
||||
+ pm.mutex.RUnlock()
|
||||
+ return
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
func (pm *ProxyManager) Stop() error {
|
||||
pm.mutex.Lock()
|
||||
defer pm.mutex.Unlock()
|
||||
@@ -236,6 +376,14 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr
|
||||
return nil
|
||||
}
|
||||
|
||||
+// getEntry returns per-tunnel entry or nil.
|
||||
+func (pm *ProxyManager) getEntry(id string) *tunnelEntry {
|
||||
+ pm.mutex.RLock()
|
||||
+ e := pm.tunnels[id]
|
||||
+ pm.mutex.RUnlock()
|
||||
+ return e
|
||||
+}
|
||||
+
|
||||
func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) {
|
||||
for {
|
||||
conn, err := listener.Accept()
|
||||
@@ -257,6 +405,9 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
|
||||
continue
|
||||
}
|
||||
|
||||
+// Count sessions only once per accepted TCP connection
|
||||
+ if pm.tunnelID != "" { state.Global().IncSessions(pm.tunnelID) }
|
||||
+
|
||||
go func() {
|
||||
target, err := net.Dial("tcp", targetAddr)
|
||||
if err != nil {
|
||||
@@ -265,24 +416,33 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
|
||||
return
|
||||
}
|
||||
|
||||
+ // already incremented on accept
|
||||
+
|
||||
// Create a WaitGroup to ensure both copy operations complete
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
|
||||
+ // client -> target (direction=in)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
- io.Copy(target, conn)
|
||||
- target.Close()
|
||||
+e := pm.getEntry(pm.currentTunnelID)
|
||||
+cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"}
|
||||
+ _, _ = io.Copy(cw, conn)
|
||||
+ _ = target.Close()
|
||||
}()
|
||||
|
||||
+ // target -> client (direction=out)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
- io.Copy(conn, target)
|
||||
- conn.Close()
|
||||
+e := pm.getEntry(pm.currentTunnelID)
|
||||
+cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"}
|
||||
+ _, _ = io.Copy(cw, target)
|
||||
+ _ = conn.Close()
|
||||
}()
|
||||
|
||||
- // Wait for both copies to complete
|
||||
+ // Wait for both copies to complete then session -1
|
||||
wg.Wait()
|
||||
+ if pm.tunnelID != "" { state.Global().DecSessions(pm.tunnelID) }
|
||||
}()
|
||||
}
|
||||
}
|
||||
@@ -326,6 +486,14 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
}
|
||||
|
||||
clientKey := remoteAddr.String()
|
||||
+ // bytes from client -> target (direction=in)
|
||||
+if pm.currentTunnelID != "" && n > 0 {
|
||||
+if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(n)) }
|
||||
+ } else {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP) }
|
||||
+ }
|
||||
+ }
|
||||
clientsMutex.RLock()
|
||||
targetConn, exists := clientConns[clientKey]
|
||||
clientsMutex.RUnlock()
|
||||
@@ -366,6 +534,15 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
return // defer will handle cleanup
|
||||
}
|
||||
|
||||
+ // bytes from target -> client (direction=out)
|
||||
+ if pm.currentTunnelID != "" && n > 0 {
|
||||
+ if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesOutUDP.Add(uint64(n)) }
|
||||
+ } else {
|
||||
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP) }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
_, err = conn.WriteTo(buffer[:n], remoteAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error writing to client: %v", err)
|
||||
@@ -375,13 +552,19 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
}(clientKey, targetConn, remoteAddr)
|
||||
}
|
||||
|
||||
- _, err = targetConn.Write(buffer[:n])
|
||||
+ written, err := targetConn.Write(buffer[:n])
|
||||
if err != nil {
|
||||
logger.Error("Error writing to target: %v", err)
|
||||
targetConn.Close()
|
||||
clientsMutex.Lock()
|
||||
delete(clientConns, clientKey)
|
||||
clientsMutex.Unlock()
|
||||
+} else if pm.currentTunnelID != "" && written > 0 {
|
||||
+ if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(written)) }
|
||||
+ } else {
|
||||
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP) }
|
||||
+ }
|
||||
}
|
||||
}
|
||||
}
|
||||
diff --git a/util.go b/util.go
|
||||
index 7d6da4f..c1f4915 100644
|
||||
--- a/util.go
|
||||
+++ b/util.go
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/fosrl/newt/proxy"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
"golang.org/x/net/icmp"
|
||||
"golang.org/x/net/ipv4"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
|
||||
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
|
||||
}
|
||||
|
||||
-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} {
|
||||
+func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
|
||||
maxInterval := 6 * time.Second
|
||||
currentInterval := pingInterval
|
||||
consecutiveFailures := 0
|
||||
@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
if !connectionLost {
|
||||
connectionLost = true
|
||||
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout)
|
||||
+ }
|
||||
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second)
|
||||
// Send registration message to the server for backward compatibility
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
} else {
|
||||
// Track recent latencies
|
||||
recentLatencies = append(recentLatencies, latency)
|
||||
+ // Record tunnel latency (limit sampling to this periodic check)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds())
|
||||
+ }
|
||||
if len(recentLatencies) > 10 {
|
||||
recentLatencies = recentLatencies[1:]
|
||||
}
|
||||
diff --git a/websocket/client.go b/websocket/client.go
|
||||
index 0c0664a..c9ac264 100644
|
||||
--- a/websocket/client.go
|
||||
+++ b/websocket/client.go
|
||||
@@ -18,6 +18,10 @@ import (
|
||||
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/gorilla/websocket"
|
||||
+
|
||||
+ "context"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/otel"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err))
|
||||
return "", fmt.Errorf("failed to request new token: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure")
|
||||
+ bin := "http_other"
|
||||
+ if resp.StatusCode >= 500 {
|
||||
+ bin = "http_5xx"
|
||||
+ } else if resp.StatusCode >= 400 {
|
||||
+ bin = "http_4xx"
|
||||
+ }
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin)
|
||||
+ // Reconnect reason mapping for auth failures
|
||||
+ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError)
|
||||
+ }
|
||||
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
|
||||
logger.Debug("Received token: %s", tokenResp.Data.Token)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success")
|
||||
|
||||
return tokenResp.Data.Token, nil
|
||||
}
|
||||
|
||||
+// classifyConnError maps common errors to low-cardinality error_type labels
|
||||
+func classifyConnError(err error) string {
|
||||
+ if err == nil {
|
||||
+ return ""
|
||||
+ }
|
||||
+ msg := strings.ToLower(err.Error())
|
||||
+ switch {
|
||||
+ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
|
||||
+ return "tls"
|
||||
+ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"):
|
||||
+ return "timeout"
|
||||
+ case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"):
|
||||
+ return "dns"
|
||||
+ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
|
||||
+ return "auth"
|
||||
+ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"):
|
||||
+ return "io"
|
||||
+ default:
|
||||
+ return "other"
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
func (c *Client) connectWithRetry() {
|
||||
for {
|
||||
select {
|
||||
@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error {
|
||||
// Get token for authentication
|
||||
token, err := c.getToken()
|
||||
if err != nil {
|
||||
+ // telemetry: connection attempt failed before dialing
|
||||
+ // site_id isn't globally available here; use client ID as site_id (low cardinality)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err))
|
||||
return fmt.Errorf("failed to get token: %w", err)
|
||||
}
|
||||
|
||||
@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error {
|
||||
q.Set("clientType", c.clientType)
|
||||
u.RawQuery = q.Encode()
|
||||
|
||||
- // Connect to WebSocket
|
||||
+ // Connect to WebSocket (optional span)
|
||||
+ tr := otel.Tracer("newt")
|
||||
+ spanCtx, span := tr.Start(context.Background(), "ws.connect")
|
||||
+ defer span.End()
|
||||
+
|
||||
dialer := websocket.DefaultDialer
|
||||
|
||||
// Use new TLS configuration method
|
||||
@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error {
|
||||
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
|
||||
}
|
||||
|
||||
- conn, _, err := dialer.Dial(u.String(), nil)
|
||||
+conn, _, err := dialer.DialContext(spanCtx, u.String(), nil)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ etype := classifyConnError(err)
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype)
|
||||
+ // Map handshake-related errors to reconnect reasons where appropriate
|
||||
+ if etype == "tls" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError)
|
||||
+ } else if etype == "timeout" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout)
|
||||
+ } else {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError)
|
||||
+ }
|
||||
return fmt.Errorf("failed to connect to WebSocket: %w", err)
|
||||
}
|
||||
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success")
|
||||
c.conn = conn
|
||||
c.setConnected(true)
|
||||
|
||||
diff --git a/wg/wg.go b/wg/wg.go
|
||||
index 3cee1a9..a765279 100644
|
||||
--- a/wg/wg.go
|
||||
+++ b/wg/wg.go
|
||||
@@ -3,6 +3,7 @@
|
||||
package wg
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -23,6 +24,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/wgctrl"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
s.stopGetConfig = nil
|
||||
}
|
||||
|
||||
+ // telemetry: config reload success
|
||||
+ telemetry.IncConfigReload(context.Background(), "success")
|
||||
+ // Optional reconnect reason mapping: config change
|
||||
+ if s.serverPubKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange)
|
||||
+ }
|
||||
+
|
||||
// Ensure the WireGuard interface and peers are configured
|
||||
if err := s.ensureWireguardInterface(config); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard interface: %v", err)
|
||||
diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go
|
||||
index 6684c40..09f160e 100644
|
||||
--- a/wgnetstack/wgnetstack.go
|
||||
+++ b/wgnetstack/wgnetstack.go
|
||||
@@ -1,6 +1,7 @@
|
||||
package wgnetstack
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
@@ -26,6 +27,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str
|
||||
return service, nil
|
||||
}
|
||||
|
||||
+// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally.
|
||||
+func (s *WireGuardService) ReportRTT(seconds float64) {
|
||||
+ if s.serverPubKey == "" { return }
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds)
|
||||
+}
|
||||
+
|
||||
func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) {
|
||||
logger.Debug("Received: %+v", msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if s.TunnelIP == "" || s.proxyManager == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
- return
|
||||
- }
|
||||
+ return
|
||||
+}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
301
patches/01_proxy_multitunnel.patch
Normal file
301
patches/01_proxy_multitunnel.patch
Normal file
@@ -0,0 +1,301 @@
|
||||
diff --git a/proxy/manager.go b/proxy/manager.go
|
||||
index bf10322..86c47a8 100644
|
||||
--- a/proxy/manager.go
|
||||
+++ b/proxy/manager.go
|
||||
@@ -1,16 +1,22 @@
|
||||
package proxy
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
+ "os"
|
||||
"strings"
|
||||
"sync"
|
||||
+ "sync/atomic"
|
||||
"time"
|
||||
|
||||
+ "github.com/fosrl/newt/internal/state"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
"github.com/fosrl/newt/logger"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
|
||||
+ "go.opentelemetry.io/otel/attribute"
|
||||
)
|
||||
|
||||
// Target represents a proxy target with its address and port
|
||||
@@ -28,6 +34,52 @@ type ProxyManager struct {
|
||||
udpConns []*gonet.UDPConn
|
||||
running bool
|
||||
mutex sync.RWMutex
|
||||
+
|
||||
+ // telemetry (multi-tunnel)
|
||||
+ currentTunnelID string
|
||||
+ tunnels map[string]*tunnelEntry
|
||||
+ asyncBytes bool
|
||||
+ flushStop chan struct{}
|
||||
+}
|
||||
+
|
||||
+// tunnelEntry holds per-tunnel attributes and (optional) async counters.
|
||||
+type tunnelEntry struct {
|
||||
+ attrInTCP attribute.Set
|
||||
+ attrOutTCP attribute.Set
|
||||
+ attrInUDP attribute.Set
|
||||
+ attrOutUDP attribute.Set
|
||||
+
|
||||
+ bytesInTCP atomic.Uint64
|
||||
+ bytesOutTCP atomic.Uint64
|
||||
+ bytesInUDP atomic.Uint64
|
||||
+ bytesOutUDP atomic.Uint64
|
||||
+}
|
||||
+
|
||||
+// countingWriter wraps an io.Writer and adds bytes to OTel counter using a pre-built attribute set.
|
||||
+type countingWriter struct {
|
||||
+ ctx context.Context
|
||||
+ w io.Writer
|
||||
+ set attribute.Set
|
||||
+ pm *ProxyManager
|
||||
+ ent *tunnelEntry
|
||||
+ out bool // false=in, true=out
|
||||
+ proto string // "tcp" or "udp"
|
||||
+}
|
||||
+
|
||||
+func (cw *countingWriter) Write(p []byte) (int, error) {
|
||||
+ n, err := cw.w.Write(p)
|
||||
+ if n > 0 {
|
||||
+ if cw.pm != nil && cw.pm.asyncBytes && cw.ent != nil {
|
||||
+ if cw.proto == "tcp" {
|
||||
+ if cw.out { cw.ent.bytesOutTCP.Add(uint64(n)) } else { cw.ent.bytesInTCP.Add(uint64(n)) }
|
||||
+ } else if cw.proto == "udp" {
|
||||
+ if cw.out { cw.ent.bytesOutUDP.Add(uint64(n)) } else { cw.ent.bytesInUDP.Add(uint64(n)) }
|
||||
+ }
|
||||
+ } else {
|
||||
+ telemetry.AddTunnelBytesSet(cw.ctx, int64(n), cw.set)
|
||||
+ }
|
||||
+ }
|
||||
+ return n, err
|
||||
}
|
||||
|
||||
// NewProxyManager creates a new proxy manager instance
|
||||
@@ -38,9 +90,46 @@ func NewProxyManager(tnet *netstack.Net) *ProxyManager {
|
||||
udpTargets: make(map[string]map[int]string),
|
||||
listeners: make([]*gonet.TCPListener, 0),
|
||||
udpConns: make([]*gonet.UDPConn, 0),
|
||||
+ tunnels: make(map[string]*tunnelEntry),
|
||||
}
|
||||
}
|
||||
|
||||
+// SetTunnelID sets the WireGuard peer public key used as tunnel_id label.
|
||||
+func (pm *ProxyManager) SetTunnelID(id string) {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ pm.currentTunnelID = id
|
||||
+ if _, ok := pm.tunnels[id]; !ok {
|
||||
+ pm.tunnels[id] = &tunnelEntry{}
|
||||
+ }
|
||||
+ e := pm.tunnels[id]
|
||||
+ e.attrInTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "tcp"))
|
||||
+ e.attrOutTCP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "tcp"))
|
||||
+ e.attrInUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "in"), attribute.String("protocol", "udp"))
|
||||
+ e.attrOutUDP = attribute.NewSet(attribute.String("tunnel_id", id), attribute.String("direction", "out"), attribute.String("protocol", "udp"))
|
||||
+}
|
||||
+
|
||||
+// ClearTunnelID clears cached attribute sets for the current tunnel.
|
||||
+func (pm *ProxyManager) ClearTunnelID() {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ id := pm.currentTunnelID
|
||||
+ if id == "" { return }
|
||||
+ if e, ok := pm.tunnels[id]; ok {
|
||||
+ // final flush for this tunnel
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ delete(pm.tunnels, id)
|
||||
+ }
|
||||
+ pm.currentTunnelID = ""
|
||||
+}
|
||||
+
|
||||
// init function without tnet
|
||||
func NewProxyManagerWithoutTNet() *ProxyManager {
|
||||
return &ProxyManager{
|
||||
@@ -160,6 +249,57 @@ func (pm *ProxyManager) Start() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
+func (pm *ProxyManager) SetAsyncBytes(b bool) {
|
||||
+ pm.mutex.Lock()
|
||||
+ defer pm.mutex.Unlock()
|
||||
+ pm.asyncBytes = b
|
||||
+ if b && pm.flushStop == nil {
|
||||
+ pm.flushStop = make(chan struct{})
|
||||
+ go pm.flushLoop()
|
||||
+ }
|
||||
+}
|
||||
+func (pm *ProxyManager) flushLoop() {
|
||||
+ flushInterval := 2 * time.Second
|
||||
+ if v := os.Getenv("OTEL_METRIC_EXPORT_INTERVAL"); v != "" {
|
||||
+ if d, err := time.ParseDuration(v); err == nil && d > 0 {
|
||||
+ if d/2 < flushInterval { flushInterval = d / 2 }
|
||||
+ }
|
||||
+ }
|
||||
+ ticker := time.NewTicker(flushInterval)
|
||||
+ defer ticker.Stop()
|
||||
+ for {
|
||||
+ select {
|
||||
+ case <-ticker.C:
|
||||
+ pm.mutex.RLock()
|
||||
+ for _, e := range pm.tunnels {
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ }
|
||||
+ pm.mutex.RUnlock()
|
||||
+ case <-pm.flushStop:
|
||||
+ pm.mutex.RLock()
|
||||
+ for _, e := range pm.tunnels {
|
||||
+ inTCP := e.bytesInTCP.Swap(0)
|
||||
+ outTCP := e.bytesOutTCP.Swap(0)
|
||||
+ inUDP := e.bytesInUDP.Swap(0)
|
||||
+ outUDP := e.bytesOutUDP.Swap(0)
|
||||
+ if inTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inTCP), e.attrInTCP) }
|
||||
+ if outTCP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outTCP), e.attrOutTCP) }
|
||||
+ if inUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(inUDP), e.attrInUDP) }
|
||||
+ if outUDP > 0 { telemetry.AddTunnelBytesSet(context.Background(), int64(outUDP), e.attrOutUDP) }
|
||||
+ }
|
||||
+ pm.mutex.RUnlock()
|
||||
+ return
|
||||
+ }
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
func (pm *ProxyManager) Stop() error {
|
||||
pm.mutex.Lock()
|
||||
defer pm.mutex.Unlock()
|
||||
@@ -236,6 +376,14 @@ func (pm *ProxyManager) startTarget(proto, listenIP string, port int, targetAddr
|
||||
return nil
|
||||
}
|
||||
|
||||
+// getEntry returns per-tunnel entry or nil.
|
||||
+func (pm *ProxyManager) getEntry(id string) *tunnelEntry {
|
||||
+ pm.mutex.RLock()
|
||||
+ e := pm.tunnels[id]
|
||||
+ pm.mutex.RUnlock()
|
||||
+ return e
|
||||
+}
|
||||
+
|
||||
func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string) {
|
||||
for {
|
||||
conn, err := listener.Accept()
|
||||
@@ -257,6 +405,9 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
|
||||
continue
|
||||
}
|
||||
|
||||
+// Count sessions only once per accepted TCP connection
|
||||
+ if pm.tunnelID != "" { state.Global().IncSessions(pm.tunnelID) }
|
||||
+
|
||||
go func() {
|
||||
target, err := net.Dial("tcp", targetAddr)
|
||||
if err != nil {
|
||||
@@ -265,24 +416,33 @@ func (pm *ProxyManager) handleTCPProxy(listener net.Listener, targetAddr string)
|
||||
return
|
||||
}
|
||||
|
||||
+ // already incremented on accept
|
||||
+
|
||||
// Create a WaitGroup to ensure both copy operations complete
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(2)
|
||||
|
||||
+ // client -> target (direction=in)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
- io.Copy(target, conn)
|
||||
- target.Close()
|
||||
+e := pm.getEntry(pm.currentTunnelID)
|
||||
+cw := &countingWriter{ctx: context.Background(), w: target, set: e.attrInTCP, pm: pm, ent: e, out: false, proto: "tcp"}
|
||||
+ _, _ = io.Copy(cw, conn)
|
||||
+ _ = target.Close()
|
||||
}()
|
||||
|
||||
+ // target -> client (direction=out)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
- io.Copy(conn, target)
|
||||
- conn.Close()
|
||||
+e := pm.getEntry(pm.currentTunnelID)
|
||||
+cw := &countingWriter{ctx: context.Background(), w: conn, set: e.attrOutTCP, pm: pm, ent: e, out: true, proto: "tcp"}
|
||||
+ _, _ = io.Copy(cw, target)
|
||||
+ _ = conn.Close()
|
||||
}()
|
||||
|
||||
- // Wait for both copies to complete
|
||||
+ // Wait for both copies to complete then session -1
|
||||
wg.Wait()
|
||||
+ if pm.tunnelID != "" { state.Global().DecSessions(pm.tunnelID) }
|
||||
}()
|
||||
}
|
||||
}
|
||||
@@ -326,6 +486,14 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
}
|
||||
|
||||
clientKey := remoteAddr.String()
|
||||
+ // bytes from client -> target (direction=in)
|
||||
+if pm.currentTunnelID != "" && n > 0 {
|
||||
+if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(n)) }
|
||||
+ } else {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrInUDP) }
|
||||
+ }
|
||||
+ }
|
||||
clientsMutex.RLock()
|
||||
targetConn, exists := clientConns[clientKey]
|
||||
clientsMutex.RUnlock()
|
||||
@@ -366,6 +534,15 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
return // defer will handle cleanup
|
||||
}
|
||||
|
||||
+ // bytes from target -> client (direction=out)
|
||||
+ if pm.currentTunnelID != "" && n > 0 {
|
||||
+ if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesOutUDP.Add(uint64(n)) }
|
||||
+ } else {
|
||||
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(n), e.attrOutUDP) }
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
_, err = conn.WriteTo(buffer[:n], remoteAddr)
|
||||
if err != nil {
|
||||
logger.Error("Error writing to client: %v", err)
|
||||
@@ -375,13 +552,19 @@ func (pm *ProxyManager) handleUDPProxy(conn *gonet.UDPConn, targetAddr string) {
|
||||
}(clientKey, targetConn, remoteAddr)
|
||||
}
|
||||
|
||||
- _, err = targetConn.Write(buffer[:n])
|
||||
+ written, err := targetConn.Write(buffer[:n])
|
||||
if err != nil {
|
||||
logger.Error("Error writing to target: %v", err)
|
||||
targetConn.Close()
|
||||
clientsMutex.Lock()
|
||||
delete(clientConns, clientKey)
|
||||
clientsMutex.Unlock()
|
||||
+} else if pm.currentTunnelID != "" && written > 0 {
|
||||
+ if pm.asyncBytes {
|
||||
+ if e := pm.getEntry(pm.currentTunnelID); e != nil { e.bytesInUDP.Add(uint64(written)) }
|
||||
+ } else {
|
||||
+if e := pm.getEntry(pm.currentTunnelID); e != nil { telemetry.AddTunnelBytesSet(context.Background(), int64(written), e.attrInUDP) }
|
||||
+ }
|
||||
}
|
||||
}
|
||||
}
|
||||
422
patches/02_reconnect_reasons.patch
Normal file
422
patches/02_reconnect_reasons.patch
Normal file
@@ -0,0 +1,422 @@
|
||||
diff --git a/main.go b/main.go
|
||||
index 12849b1..c223b75 100644
|
||||
--- a/main.go
|
||||
+++ b/main.go
|
||||
@@ -1,7 +1,9 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
+ "errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net"
|
||||
@@ -22,6 +24,9 @@ import (
|
||||
"github.com/fosrl/newt/updates"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
|
||||
+ "github.com/fosrl/newt/internal/state"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
@@ -116,6 +121,13 @@ var (
|
||||
healthMonitor *healthcheck.Monitor
|
||||
enforceHealthcheckCert bool
|
||||
|
||||
+ // Observability/metrics flags
|
||||
+ metricsEnabled bool
|
||||
+ otlpEnabled bool
|
||||
+ adminAddr string
|
||||
+ region string
|
||||
+ metricsAsyncBytes bool
|
||||
+
|
||||
// New mTLS configuration variables
|
||||
tlsClientCert string
|
||||
tlsClientKey string
|
||||
@@ -126,6 +138,10 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
+ // Prepare context for graceful shutdown and signal handling
|
||||
+ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
+ defer stop()
|
||||
+
|
||||
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
|
||||
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
|
||||
id = os.Getenv("NEWT_ID")
|
||||
@@ -141,6 +157,13 @@ func main() {
|
||||
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
|
||||
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
|
||||
|
||||
+ // Metrics/observability env mirrors
|
||||
+ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
|
||||
+ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
|
||||
+ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
|
||||
+ regionEnv := os.Getenv("NEWT_REGION")
|
||||
+ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
|
||||
+
|
||||
keepInterface = keepInterfaceEnv == "true"
|
||||
acceptClients = acceptClientsEnv == "true"
|
||||
useNativeInterface = useNativeInterfaceEnv == "true"
|
||||
@@ -272,6 +295,35 @@ func main() {
|
||||
flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)")
|
||||
}
|
||||
|
||||
+ // Metrics/observability flags (mirror ENV if unset)
|
||||
+ if metricsEnabledEnv == "" {
|
||||
+ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true }
|
||||
+ }
|
||||
+ if otlpEnabledEnv == "" {
|
||||
+ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v }
|
||||
+ }
|
||||
+ if adminAddrEnv == "" {
|
||||
+ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
|
||||
+ } else {
|
||||
+ adminAddr = adminAddrEnv
|
||||
+ }
|
||||
+ // Async bytes toggle
|
||||
+ if asyncBytesEnv == "" {
|
||||
+ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v }
|
||||
+ }
|
||||
+ // Optional region flag (resource attribute)
|
||||
+ if regionEnv == "" {
|
||||
+ flag.StringVar(&region, "region", "", "Optional region resource attribute (also NEWT_REGION)")
|
||||
+ } else {
|
||||
+ region = regionEnv
|
||||
+ }
|
||||
+
|
||||
// do a --version check
|
||||
version := flag.Bool("version", false, "Print the version")
|
||||
|
||||
@@ -286,6 +338,50 @@ func main() {
|
||||
loggerLevel := parseLogLevel(logLevel)
|
||||
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
|
||||
|
||||
+ // Initialize telemetry after flags are parsed (so flags override env)
|
||||
+ tcfg := telemetry.FromEnv()
|
||||
+ tcfg.PromEnabled = metricsEnabled
|
||||
+ tcfg.OTLPEnabled = otlpEnabled
|
||||
+ if adminAddr != "" { tcfg.AdminAddr = adminAddr }
|
||||
+ // Resource attributes (if available)
|
||||
+ tcfg.SiteID = id
|
||||
+ tcfg.Region = region
|
||||
+ // Build info
|
||||
+ tcfg.BuildVersion = newtVersion
|
||||
+ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
|
||||
+
|
||||
+ tel, telErr := telemetry.Init(ctx, tcfg)
|
||||
+ if telErr != nil {
|
||||
+ logger.Warn("Telemetry init failed: %v", telErr)
|
||||
+ }
|
||||
+ if tel != nil {
|
||||
+ // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
|
||||
+ mux := http.NewServeMux()
|
||||
+ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
|
||||
+ if tel.PrometheusHandler != nil {
|
||||
+ mux.Handle("/metrics", tel.PrometheusHandler)
|
||||
+ }
|
||||
+ admin := &http.Server{
|
||||
+ Addr: tcfg.AdminAddr,
|
||||
+ Handler: otelhttp.NewHandler(mux, "newt-admin"),
|
||||
+ ReadTimeout: 5 * time.Second,
|
||||
+ WriteTimeout: 10 * time.Second,
|
||||
+ ReadHeaderTimeout: 5 * time.Second,
|
||||
+ IdleTimeout: 30 * time.Second,
|
||||
+ }
|
||||
+ go func() {
|
||||
+ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
+ logger.Warn("admin http error: %v", err)
|
||||
+ }
|
||||
+ }()
|
||||
+ defer func() {
|
||||
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
+ defer cancel()
|
||||
+ _ = admin.Shutdown(ctx)
|
||||
+ }()
|
||||
+ defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
+ }
|
||||
+
|
||||
newtVersion := "version_replaceme"
|
||||
if *version {
|
||||
fmt.Println("Newt version " + newtVersion)
|
||||
@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
// Use reliable ping for initial connection test
|
||||
logger.Debug("Testing initial connection with reliable ping...")
|
||||
- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ if err == nil && wgData.PublicKey != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds())
|
||||
+ }
|
||||
if err != nil {
|
||||
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
|
||||
} else {
|
||||
@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
// as the pings will continue in the background
|
||||
if !connected {
|
||||
logger.Debug("Starting ping check")
|
||||
- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
|
||||
+ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
|
||||
}
|
||||
|
||||
// Create proxy manager
|
||||
pm = proxy.NewProxyManager(tnet)
|
||||
+ pm.SetAsyncBytes(metricsAsyncBytes)
|
||||
+ // Set tunnel_id for metrics (WireGuard peer public key)
|
||||
+ pm.SetTunnelID(wgData.PublicKey)
|
||||
|
||||
connected = true
|
||||
|
||||
+ // telemetry: record a successful site registration (omit region unless available)
|
||||
+ telemetry.IncSiteRegistration(context.Background(), id, "", "success")
|
||||
+
|
||||
// add the targets if there are any
|
||||
if len(wgData.Targets.TCP) > 0 {
|
||||
updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP})
|
||||
@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received reconnect message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
// Mark as disconnected
|
||||
connected = false
|
||||
|
||||
@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received termination message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
diff --git a/util.go b/util.go
|
||||
index 7d6da4f..c1f4915 100644
|
||||
--- a/util.go
|
||||
+++ b/util.go
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/fosrl/newt/proxy"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
"golang.org/x/net/icmp"
|
||||
"golang.org/x/net/ipv4"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
|
||||
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
|
||||
}
|
||||
|
||||
-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} {
|
||||
+func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
|
||||
maxInterval := 6 * time.Second
|
||||
currentInterval := pingInterval
|
||||
consecutiveFailures := 0
|
||||
@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
if !connectionLost {
|
||||
connectionLost = true
|
||||
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout)
|
||||
+ }
|
||||
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second)
|
||||
// Send registration message to the server for backward compatibility
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
} else {
|
||||
// Track recent latencies
|
||||
recentLatencies = append(recentLatencies, latency)
|
||||
+ // Record tunnel latency (limit sampling to this periodic check)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds())
|
||||
+ }
|
||||
if len(recentLatencies) > 10 {
|
||||
recentLatencies = recentLatencies[1:]
|
||||
}
|
||||
diff --git a/websocket/client.go b/websocket/client.go
|
||||
index 0c0664a..c9ac264 100644
|
||||
--- a/websocket/client.go
|
||||
+++ b/websocket/client.go
|
||||
@@ -18,6 +18,10 @@ import (
|
||||
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/gorilla/websocket"
|
||||
+
|
||||
+ "context"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/otel"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err))
|
||||
return "", fmt.Errorf("failed to request new token: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure")
|
||||
+ bin := "http_other"
|
||||
+ if resp.StatusCode >= 500 {
|
||||
+ bin = "http_5xx"
|
||||
+ } else if resp.StatusCode >= 400 {
|
||||
+ bin = "http_4xx"
|
||||
+ }
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin)
|
||||
+ // Reconnect reason mapping for auth failures
|
||||
+ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError)
|
||||
+ }
|
||||
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
|
||||
logger.Debug("Received token: %s", tokenResp.Data.Token)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success")
|
||||
|
||||
return tokenResp.Data.Token, nil
|
||||
}
|
||||
|
||||
+// classifyConnError maps common errors to low-cardinality error_type labels
|
||||
+func classifyConnError(err error) string {
|
||||
+ if err == nil {
|
||||
+ return ""
|
||||
+ }
|
||||
+ msg := strings.ToLower(err.Error())
|
||||
+ switch {
|
||||
+ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
|
||||
+ return "tls"
|
||||
+ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"):
|
||||
+ return "timeout"
|
||||
+ case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"):
|
||||
+ return "dns"
|
||||
+ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
|
||||
+ return "auth"
|
||||
+ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"):
|
||||
+ return "io"
|
||||
+ default:
|
||||
+ return "other"
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
func (c *Client) connectWithRetry() {
|
||||
for {
|
||||
select {
|
||||
@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error {
|
||||
// Get token for authentication
|
||||
token, err := c.getToken()
|
||||
if err != nil {
|
||||
+ // telemetry: connection attempt failed before dialing
|
||||
+ // site_id isn't globally available here; use client ID as site_id (low cardinality)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err))
|
||||
return fmt.Errorf("failed to get token: %w", err)
|
||||
}
|
||||
|
||||
@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error {
|
||||
q.Set("clientType", c.clientType)
|
||||
u.RawQuery = q.Encode()
|
||||
|
||||
- // Connect to WebSocket
|
||||
+ // Connect to WebSocket (optional span)
|
||||
+ tr := otel.Tracer("newt")
|
||||
+ spanCtx, span := tr.Start(context.Background(), "ws.connect")
|
||||
+ defer span.End()
|
||||
+
|
||||
dialer := websocket.DefaultDialer
|
||||
|
||||
// Use new TLS configuration method
|
||||
@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error {
|
||||
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
|
||||
}
|
||||
|
||||
- conn, _, err := dialer.Dial(u.String(), nil)
|
||||
+conn, _, err := dialer.DialContext(spanCtx, u.String(), nil)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ etype := classifyConnError(err)
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype)
|
||||
+ // Map handshake-related errors to reconnect reasons where appropriate
|
||||
+ if etype == "tls" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError)
|
||||
+ } else if etype == "timeout" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout)
|
||||
+ } else {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError)
|
||||
+ }
|
||||
return fmt.Errorf("failed to connect to WebSocket: %w", err)
|
||||
}
|
||||
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success")
|
||||
c.conn = conn
|
||||
c.setConnected(true)
|
||||
|
||||
diff --git a/wg/wg.go b/wg/wg.go
|
||||
index 3cee1a9..a765279 100644
|
||||
--- a/wg/wg.go
|
||||
+++ b/wg/wg.go
|
||||
@@ -3,6 +3,7 @@
|
||||
package wg
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -23,6 +24,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/wgctrl"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
s.stopGetConfig = nil
|
||||
}
|
||||
|
||||
+ // telemetry: config reload success
|
||||
+ telemetry.IncConfigReload(context.Background(), "success")
|
||||
+ // Optional reconnect reason mapping: config change
|
||||
+ if s.serverPubKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange)
|
||||
+ }
|
||||
+
|
||||
// Ensure the WireGuard interface and peers are configured
|
||||
if err := s.ensureWireguardInterface(config); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard interface: %v", err)
|
||||
466
patches/02_reconnect_rtt.patch
Normal file
466
patches/02_reconnect_rtt.patch
Normal file
@@ -0,0 +1,466 @@
|
||||
diff --git a/main.go b/main.go
|
||||
index 12849b1..c223b75 100644
|
||||
--- a/main.go
|
||||
+++ b/main.go
|
||||
@@ -1,7 +1,9 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
+ "errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net"
|
||||
@@ -22,6 +24,9 @@ import (
|
||||
"github.com/fosrl/newt/updates"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
|
||||
+ "github.com/fosrl/newt/internal/state"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
@@ -116,6 +121,13 @@ var (
|
||||
healthMonitor *healthcheck.Monitor
|
||||
enforceHealthcheckCert bool
|
||||
|
||||
+ // Observability/metrics flags
|
||||
+ metricsEnabled bool
|
||||
+ otlpEnabled bool
|
||||
+ adminAddr string
|
||||
+ region string
|
||||
+ metricsAsyncBytes bool
|
||||
+
|
||||
// New mTLS configuration variables
|
||||
tlsClientCert string
|
||||
tlsClientKey string
|
||||
@@ -126,6 +138,10 @@ var (
|
||||
)
|
||||
|
||||
func main() {
|
||||
+ // Prepare context for graceful shutdown and signal handling
|
||||
+ ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
+ defer stop()
|
||||
+
|
||||
// if PANGOLIN_ENDPOINT, NEWT_ID, and NEWT_SECRET are set as environment variables, they will be used as default values
|
||||
endpoint = os.Getenv("PANGOLIN_ENDPOINT")
|
||||
id = os.Getenv("NEWT_ID")
|
||||
@@ -141,6 +157,13 @@ func main() {
|
||||
useNativeInterfaceEnv := os.Getenv("USE_NATIVE_INTERFACE")
|
||||
enforceHealthcheckCertEnv := os.Getenv("ENFORCE_HC_CERT")
|
||||
|
||||
+ // Metrics/observability env mirrors
|
||||
+ metricsEnabledEnv := os.Getenv("NEWT_METRICS_PROMETHEUS_ENABLED")
|
||||
+ otlpEnabledEnv := os.Getenv("NEWT_METRICS_OTLP_ENABLED")
|
||||
+ adminAddrEnv := os.Getenv("NEWT_ADMIN_ADDR")
|
||||
+ regionEnv := os.Getenv("NEWT_REGION")
|
||||
+ asyncBytesEnv := os.Getenv("NEWT_METRICS_ASYNC_BYTES")
|
||||
+
|
||||
keepInterface = keepInterfaceEnv == "true"
|
||||
acceptClients = acceptClientsEnv == "true"
|
||||
useNativeInterface = useNativeInterfaceEnv == "true"
|
||||
@@ -272,6 +295,35 @@ func main() {
|
||||
flag.StringVar(&healthFile, "health-file", "", "Path to health file (if unset, health file won't be written)")
|
||||
}
|
||||
|
||||
+ // Metrics/observability flags (mirror ENV if unset)
|
||||
+ if metricsEnabledEnv == "" {
|
||||
+ flag.BoolVar(&metricsEnabled, "metrics", true, "Enable Prometheus /metrics exporter")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(metricsEnabledEnv); err == nil { metricsEnabled = v } else { metricsEnabled = true }
|
||||
+ }
|
||||
+ if otlpEnabledEnv == "" {
|
||||
+ flag.BoolVar(&otlpEnabled, "otlp", false, "Enable OTLP exporters (metrics/traces) to OTEL_EXPORTER_OTLP_ENDPOINT")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(otlpEnabledEnv); err == nil { otlpEnabled = v }
|
||||
+ }
|
||||
+ if adminAddrEnv == "" {
|
||||
+ flag.StringVar(&adminAddr, "metrics-admin-addr", "127.0.0.1:2112", "Admin/metrics bind address")
|
||||
+ } else {
|
||||
+ adminAddr = adminAddrEnv
|
||||
+ }
|
||||
+ // Async bytes toggle
|
||||
+ if asyncBytesEnv == "" {
|
||||
+ flag.BoolVar(&metricsAsyncBytes, "metrics-async-bytes", false, "Enable async bytes counting (background flush; lower hot path overhead)")
|
||||
+ } else {
|
||||
+ if v, err := strconv.ParseBool(asyncBytesEnv); err == nil { metricsAsyncBytes = v }
|
||||
+ }
|
||||
+ // Optional region flag (resource attribute)
|
||||
+ if regionEnv == "" {
|
||||
+ flag.StringVar(&region, "region", "", "Optional region resource attribute (also NEWT_REGION)")
|
||||
+ } else {
|
||||
+ region = regionEnv
|
||||
+ }
|
||||
+
|
||||
// do a --version check
|
||||
version := flag.Bool("version", false, "Print the version")
|
||||
|
||||
@@ -286,6 +338,50 @@ func main() {
|
||||
loggerLevel := parseLogLevel(logLevel)
|
||||
logger.GetLogger().SetLevel(parseLogLevel(logLevel))
|
||||
|
||||
+ // Initialize telemetry after flags are parsed (so flags override env)
|
||||
+ tcfg := telemetry.FromEnv()
|
||||
+ tcfg.PromEnabled = metricsEnabled
|
||||
+ tcfg.OTLPEnabled = otlpEnabled
|
||||
+ if adminAddr != "" { tcfg.AdminAddr = adminAddr }
|
||||
+ // Resource attributes (if available)
|
||||
+ tcfg.SiteID = id
|
||||
+ tcfg.Region = region
|
||||
+ // Build info
|
||||
+ tcfg.BuildVersion = newtVersion
|
||||
+ tcfg.BuildCommit = os.Getenv("NEWT_COMMIT")
|
||||
+
|
||||
+ tel, telErr := telemetry.Init(ctx, tcfg)
|
||||
+ if telErr != nil {
|
||||
+ logger.Warn("Telemetry init failed: %v", telErr)
|
||||
+ }
|
||||
+ if tel != nil {
|
||||
+ // Admin HTTP server (exposes /metrics when Prometheus exporter is enabled)
|
||||
+ mux := http.NewServeMux()
|
||||
+ mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) })
|
||||
+ if tel.PrometheusHandler != nil {
|
||||
+ mux.Handle("/metrics", tel.PrometheusHandler)
|
||||
+ }
|
||||
+ admin := &http.Server{
|
||||
+ Addr: tcfg.AdminAddr,
|
||||
+ Handler: otelhttp.NewHandler(mux, "newt-admin"),
|
||||
+ ReadTimeout: 5 * time.Second,
|
||||
+ WriteTimeout: 10 * time.Second,
|
||||
+ ReadHeaderTimeout: 5 * time.Second,
|
||||
+ IdleTimeout: 30 * time.Second,
|
||||
+ }
|
||||
+ go func() {
|
||||
+ if err := admin.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
+ logger.Warn("admin http error: %v", err)
|
||||
+ }
|
||||
+ }()
|
||||
+ defer func() {
|
||||
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
+ defer cancel()
|
||||
+ _ = admin.Shutdown(ctx)
|
||||
+ }()
|
||||
+ defer func() { _ = tel.Shutdown(context.Background()) }()
|
||||
+ }
|
||||
+
|
||||
newtVersion := "version_replaceme"
|
||||
if *version {
|
||||
fmt.Println("Newt version " + newtVersion)
|
||||
@@ -557,7 +653,10 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
}
|
||||
// Use reliable ping for initial connection test
|
||||
logger.Debug("Testing initial connection with reliable ping...")
|
||||
- _, err = reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ lat, err := reliablePing(tnet, wgData.ServerIP, pingTimeout, 5)
|
||||
+ if err == nil && wgData.PublicKey != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", wgData.PublicKey, "wireguard", lat.Seconds())
|
||||
+ }
|
||||
if err != nil {
|
||||
logger.Warn("Initial reliable ping failed, but continuing: %v", err)
|
||||
} else {
|
||||
@@ -570,14 +669,20 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
// as the pings will continue in the background
|
||||
if !connected {
|
||||
logger.Debug("Starting ping check")
|
||||
- pingStopChan = startPingCheck(tnet, wgData.ServerIP, client)
|
||||
+ pingStopChan = startPingCheck(tnet, wgData.ServerIP, client, wgData.PublicKey)
|
||||
}
|
||||
|
||||
// Create proxy manager
|
||||
pm = proxy.NewProxyManager(tnet)
|
||||
+ pm.SetAsyncBytes(metricsAsyncBytes)
|
||||
+ // Set tunnel_id for metrics (WireGuard peer public key)
|
||||
+ pm.SetTunnelID(wgData.PublicKey)
|
||||
|
||||
connected = true
|
||||
|
||||
+ // telemetry: record a successful site registration (omit region unless available)
|
||||
+ telemetry.IncSiteRegistration(context.Background(), id, "", "success")
|
||||
+
|
||||
// add the targets if there are any
|
||||
if len(wgData.Targets.TCP) > 0 {
|
||||
updateTargets(pm, "add", wgData.TunnelIP, "tcp", TargetData{Targets: wgData.Targets.TCP})
|
||||
@@ -611,10 +716,25 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/reconnect", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received reconnect message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
+ // Clear metrics attrs and sessions for the tunnel
|
||||
+ if pm != nil {
|
||||
+ pm.ClearTunnelID()
|
||||
+ state.Global().ClearTunnel(wgData.PublicKey)
|
||||
+ }
|
||||
+
|
||||
// Mark as disconnected
|
||||
connected = false
|
||||
|
||||
@@ -631,6 +751,9 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
|
||||
|
||||
client.RegisterHandler("newt/wg/terminate", func(msg websocket.WSMessage) {
|
||||
logger.Info("Received termination message")
|
||||
+ if wgData.PublicKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", wgData.PublicKey, "server_request")
|
||||
+ }
|
||||
|
||||
// Close the WireGuard device and TUN
|
||||
closeWgTunnel()
|
||||
diff --git a/util.go b/util.go
|
||||
index 7d6da4f..c1f4915 100644
|
||||
--- a/util.go
|
||||
+++ b/util.go
|
||||
@@ -17,6 +17,7 @@ import (
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/fosrl/newt/proxy"
|
||||
"github.com/fosrl/newt/websocket"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
"golang.org/x/net/icmp"
|
||||
"golang.org/x/net/ipv4"
|
||||
"golang.zx2c4.com/wireguard/device"
|
||||
@@ -229,7 +230,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
|
||||
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
|
||||
}
|
||||
|
||||
-func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client) chan struct{} {
|
||||
+func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
|
||||
maxInterval := 6 * time.Second
|
||||
currentInterval := pingInterval
|
||||
consecutiveFailures := 0
|
||||
@@ -292,6 +293,9 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
if !connectionLost {
|
||||
connectionLost = true
|
||||
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", tunnelID, telemetry.ReasonTimeout)
|
||||
+ }
|
||||
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{}, 3*time.Second)
|
||||
// Send registration message to the server for backward compatibility
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
@@ -318,6 +322,10 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
} else {
|
||||
// Track recent latencies
|
||||
recentLatencies = append(recentLatencies, latency)
|
||||
+ // Record tunnel latency (limit sampling to this periodic check)
|
||||
+ if tunnelID != "" {
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", tunnelID, "wireguard", latency.Seconds())
|
||||
+ }
|
||||
if len(recentLatencies) > 10 {
|
||||
recentLatencies = recentLatencies[1:]
|
||||
}
|
||||
diff --git a/websocket/client.go b/websocket/client.go
|
||||
index 0c0664a..c9ac264 100644
|
||||
--- a/websocket/client.go
|
||||
+++ b/websocket/client.go
|
||||
@@ -18,6 +18,10 @@ import (
|
||||
|
||||
"github.com/fosrl/newt/logger"
|
||||
"github.com/gorilla/websocket"
|
||||
+
|
||||
+ "context"
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
+ "go.opentelemetry.io/otel"
|
||||
)
|
||||
|
||||
type Client struct {
|
||||
@@ -287,6 +291,7 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", classifyConnError(err))
|
||||
return "", fmt.Errorf("failed to request new token: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
@@ -294,6 +299,18 @@ func (c *Client) getToken() (string, error) {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
logger.Error("Failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "failure")
|
||||
+ bin := "http_other"
|
||||
+ if resp.StatusCode >= 500 {
|
||||
+ bin = "http_5xx"
|
||||
+ } else if resp.StatusCode >= 400 {
|
||||
+ bin = "http_4xx"
|
||||
+ }
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "auth", bin)
|
||||
+ // Reconnect reason mapping for auth failures
|
||||
+ if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonAuthError)
|
||||
+ }
|
||||
return "", fmt.Errorf("failed to get token with status code: %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
@@ -312,10 +329,33 @@ func (c *Client) getToken() (string, error) {
|
||||
}
|
||||
|
||||
logger.Debug("Received token: %s", tokenResp.Data.Token)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "auth", "success")
|
||||
|
||||
return tokenResp.Data.Token, nil
|
||||
}
|
||||
|
||||
+// classifyConnError maps common errors to low-cardinality error_type labels
|
||||
+func classifyConnError(err error) string {
|
||||
+ if err == nil {
|
||||
+ return ""
|
||||
+ }
|
||||
+ msg := strings.ToLower(err.Error())
|
||||
+ switch {
|
||||
+ case strings.Contains(msg, "tls") || strings.Contains(msg, "certificate"):
|
||||
+ return "tls"
|
||||
+ case strings.Contains(msg, "timeout") || strings.Contains(msg, "i/o timeout"):
|
||||
+ return "timeout"
|
||||
+ case strings.Contains(msg, "no such host") || strings.Contains(msg, "dns"):
|
||||
+ return "dns"
|
||||
+ case strings.Contains(msg, "unauthorized") || strings.Contains(msg, "forbidden"):
|
||||
+ return "auth"
|
||||
+ case strings.Contains(msg, "broken pipe") || strings.Contains(msg, "connection reset") || strings.Contains(msg, "connection refused") || strings.Contains(msg, "use of closed network connection") || strings.Contains(msg, "network is unreachable"):
|
||||
+ return "io"
|
||||
+ default:
|
||||
+ return "other"
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
func (c *Client) connectWithRetry() {
|
||||
for {
|
||||
select {
|
||||
@@ -337,6 +377,10 @@ func (c *Client) establishConnection() error {
|
||||
// Get token for authentication
|
||||
token, err := c.getToken()
|
||||
if err != nil {
|
||||
+ // telemetry: connection attempt failed before dialing
|
||||
+ // site_id isn't globally available here; use client ID as site_id (low cardinality)
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", classifyConnError(err))
|
||||
return fmt.Errorf("failed to get token: %w", err)
|
||||
}
|
||||
|
||||
@@ -369,7 +413,11 @@ func (c *Client) establishConnection() error {
|
||||
q.Set("clientType", c.clientType)
|
||||
u.RawQuery = q.Encode()
|
||||
|
||||
- // Connect to WebSocket
|
||||
+ // Connect to WebSocket (optional span)
|
||||
+ tr := otel.Tracer("newt")
|
||||
+ spanCtx, span := tr.Start(context.Background(), "ws.connect")
|
||||
+ defer span.End()
|
||||
+
|
||||
dialer := websocket.DefaultDialer
|
||||
|
||||
// Use new TLS configuration method
|
||||
@@ -391,11 +439,23 @@ func (c *Client) establishConnection() error {
|
||||
logger.Debug("WebSocket TLS certificate verification disabled via SKIP_TLS_VERIFY environment variable")
|
||||
}
|
||||
|
||||
- conn, _, err := dialer.Dial(u.String(), nil)
|
||||
+conn, _, err := dialer.DialContext(spanCtx, u.String(), nil)
|
||||
if err != nil {
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "failure")
|
||||
+ etype := classifyConnError(err)
|
||||
+ telemetry.IncConnError(context.Background(), c.config.ID, "websocket", etype)
|
||||
+ // Map handshake-related errors to reconnect reasons where appropriate
|
||||
+ if etype == "tls" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonHandshakeError)
|
||||
+ } else if etype == "timeout" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonTimeout)
|
||||
+ } else {
|
||||
+ telemetry.IncReconnect(context.Background(), "", c.config.ID, telemetry.ReasonError)
|
||||
+ }
|
||||
return fmt.Errorf("failed to connect to WebSocket: %w", err)
|
||||
}
|
||||
|
||||
+ telemetry.IncConnAttempt(context.Background(), c.config.ID, "websocket", "success")
|
||||
c.conn = conn
|
||||
c.setConnected(true)
|
||||
|
||||
diff --git a/wg/wg.go b/wg/wg.go
|
||||
index 3cee1a9..a765279 100644
|
||||
--- a/wg/wg.go
|
||||
+++ b/wg/wg.go
|
||||
@@ -3,6 +3,7 @@
|
||||
package wg
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -23,6 +24,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/conn"
|
||||
"golang.zx2c4.com/wireguard/wgctrl"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -298,6 +301,13 @@ func (s *WireGuardService) handleConfig(msg websocket.WSMessage) {
|
||||
s.stopGetConfig = nil
|
||||
}
|
||||
|
||||
+ // telemetry: config reload success
|
||||
+ telemetry.IncConfigReload(context.Background(), "success")
|
||||
+ // Optional reconnect reason mapping: config change
|
||||
+ if s.serverPubKey != "" {
|
||||
+ telemetry.IncReconnect(context.Background(), "", s.serverPubKey, telemetry.ReasonConfigChange)
|
||||
+ }
|
||||
+
|
||||
// Ensure the WireGuard interface and peers are configured
|
||||
if err := s.ensureWireguardInterface(config); err != nil {
|
||||
logger.Error("Failed to ensure WireGuard interface: %v", err)
|
||||
diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go
|
||||
index 6684c40..09f160e 100644
|
||||
--- a/wgnetstack/wgnetstack.go
|
||||
+++ b/wgnetstack/wgnetstack.go
|
||||
@@ -1,6 +1,7 @@
|
||||
package wgnetstack
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
@@ -26,6 +27,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str
|
||||
return service, nil
|
||||
}
|
||||
|
||||
+// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally.
|
||||
+func (s *WireGuardService) ReportRTT(seconds float64) {
|
||||
+ if s.serverPubKey == "" { return }
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds)
|
||||
+}
|
||||
+
|
||||
func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) {
|
||||
logger.Debug("Received: %+v", msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if s.TunnelIP == "" || s.proxyManager == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
- return
|
||||
- }
|
||||
+ return
|
||||
+}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
0
patches/03_constants_docs.patch
Normal file
0
patches/03_constants_docs.patch
Normal file
44
patches/03_wg_rtt_hook.patch
Normal file
44
patches/03_wg_rtt_hook.patch
Normal file
@@ -0,0 +1,44 @@
|
||||
diff --git a/wgnetstack/wgnetstack.go b/wgnetstack/wgnetstack.go
|
||||
index 6684c40..09f160e 100644
|
||||
--- a/wgnetstack/wgnetstack.go
|
||||
+++ b/wgnetstack/wgnetstack.go
|
||||
@@ -1,6 +1,7 @@
|
||||
package wgnetstack
|
||||
|
||||
import (
|
||||
+ "context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
@@ -26,6 +27,8 @@ import (
|
||||
"golang.zx2c4.com/wireguard/tun"
|
||||
"golang.zx2c4.com/wireguard/tun/netstack"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
+
|
||||
+ "github.com/fosrl/newt/internal/telemetry"
|
||||
)
|
||||
|
||||
type WgConfig struct {
|
||||
@@ -240,14 +243,20 @@ func NewWireGuardService(interfaceName string, mtu int, generateAndSaveKeyTo str
|
||||
return service, nil
|
||||
}
|
||||
|
||||
+// ReportRTT allows reporting native RTTs to telemetry, rate-limited externally.
|
||||
+func (s *WireGuardService) ReportRTT(seconds float64) {
|
||||
+ if s.serverPubKey == "" { return }
|
||||
+ telemetry.ObserveTunnelLatency(context.Background(), "", s.serverPubKey, "wireguard", seconds)
|
||||
+}
|
||||
+
|
||||
func (s *WireGuardService) addTcpTarget(msg websocket.WSMessage) {
|
||||
logger.Debug("Received: %+v", msg)
|
||||
|
||||
// if there is no wgData or pm, we can't add targets
|
||||
if s.TunnelIP == "" || s.proxyManager == nil {
|
||||
logger.Info("No tunnel IP or proxy manager available")
|
||||
- return
|
||||
- }
|
||||
+ return
|
||||
+}
|
||||
|
||||
targetData, err := parseTargetData(msg.Data)
|
||||
if err != nil {
|
||||
0
patches/04_tests_docs.patch
Normal file
0
patches/04_tests_docs.patch
Normal file
25
patches/HOWTO-APPLY.md
Normal file
25
patches/HOWTO-APPLY.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# How to apply patches
|
||||
|
||||
These patches were generated from the working tree without commits. You can apply them in one shot or in topic order.
|
||||
|
||||
One shot (recommended during review):
|
||||
|
||||
```bash
|
||||
git apply patches/00_all_changes.patch
|
||||
```
|
||||
|
||||
Topic order:
|
||||
|
||||
```bash
|
||||
git apply patches/01_proxy_multitunnel.patch
|
||||
git apply patches/02_reconnect_rtt.patch
|
||||
git apply patches/03_constants_docs.patch
|
||||
```
|
||||
|
||||
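Rollback (restores tracked files to HEAD and deletes all untracked files and directories — this is destructive, and `git clean -fd` will also remove `patches/` itself if it is not committed):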
|
||||
|
||||
```bash
|
||||
git restore --source=HEAD --worktree --staged .
|
||||
git clean -fd
|
||||
```
|
||||
|
||||
55
scripts/smoke-metrics.sh
Normal file
55
scripts/smoke-metrics.sh
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
# Smoke checks against the newt /metrics HTTP endpoint.
#
# Environment overrides:
#   NEWTHOST  host serving the metrics endpoint (default: localhost)
#   NEWTPORT  port serving the metrics endpoint (default: 2112)

# Abort on command failure, on use of an unset variable, and when any stage
# of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Fall back to the defaults when the variables are unset or empty.
: "${NEWTHOST:=localhost}"
: "${NEWTPORT:=2112}"
METRICS_URL="http://${NEWTHOST}:${NEWTPORT}/metrics"
|
||||
|
||||
# probe NAME PATTERN
#
# Fetches ${METRICS_URL} and prints every line matching the extended regular
# expression PATTERN.
#
# Arguments:
#   $1  human-readable label used in the log lines
#   $2  extended regex (grep -E) matched against each line of the response
# Returns:
#   0 if at least one line matched, 1 if the fetch failed or nothing matched.
probe() {
  local name=$1
  local pattern=$2
  local body

  echo "[probe] ${name}"

  # Fetch first, match second: with curl inside the pipeline, an unreachable
  # endpoint or an HTTP error (-f) could not be told apart from a missing
  # metric, and was reported as "not found".
  if ! body=$(curl -sf "${METRICS_URL}"); then
    echo "[warn] ${name}: could not fetch ${METRICS_URL}"
    return 1
  fi

  grep -E "${pattern}" <<<"${body}" || {
    echo "[warn] ${name} not found"
    return 1
  }
}
|
||||
|
||||
# Soft checks, listed as (label, pattern) pairs and run in order. A failing
# probe only prints its warning; it never aborts the script.
presence_checks=(
  # Basic presence
  "newt_* presence" "^newt_"

  # Site gauges with site_id
  "site_online with site_id" "^newt_site_online\{.*site_id=\"[^\"]+\""
  "last_heartbeat with site_id" "^newt_site_last_heartbeat_timestamp_seconds\{.*site_id=\"[^\"]+\""

  # Bytes with direction ingress/egress and protocol
  "tunnel bytes ingress" "^newt_tunnel_bytes_total\{.*direction=\"ingress\".*protocol=\"(tcp|udp)\""
  "tunnel bytes egress" "^newt_tunnel_bytes_total\{.*direction=\"egress\".*protocol=\"(tcp|udp)\""
)
for ((pc = 0; pc < ${#presence_checks[@]}; pc += 2)); do
  probe "${presence_checks[pc]}" "${presence_checks[pc + 1]}" || true
done
|
||||
|
||||
# Optional: verify absence/presence of tunnel_id based on EXPECT_TUNNEL_ID (default true)
#
#   EXPECT_TUNNEL_ID=false  -> hard failure (exit 1) if any tunnel_id label is present
#   anything else           -> warning only if no tunnel_id label is found
EXPECT_TUNNEL_ID=${EXPECT_TUNNEL_ID:-true}

# Fetch the page once into a variable instead of piping curl into `grep -q`.
# grep -q exits on the first match; curl can then fail writing the rest of the
# body, and under `set -o pipefail` the pipeline is non-zero despite a match.
# That hid a real failure in the "false" branch and printed a false warning in
# the default branch.
tunnel_id_snapshot=""
if ! tunnel_id_snapshot=$(curl -sf "${METRICS_URL}"); then
  echo "[warn] could not fetch ${METRICS_URL}; tunnel_id check is inconclusive"
  tunnel_id_snapshot=""
fi

if [ "$EXPECT_TUNNEL_ID" = "false" ]; then
  echo "[probe] ensure tunnel_id label is absent when NEWT_METRICS_INCLUDE_TUNNEL_ID=false"
  if grep -q "tunnel_id=\"" <<<"${tunnel_id_snapshot}"; then
    echo "[fail] tunnel_id present but EXPECT_TUNNEL_ID=false"
    exit 1
  fi
else
  echo "[probe] ensure tunnel_id label is present (default)"
  grep -q "tunnel_id=\"" <<<"${tunnel_id_snapshot}" || { echo "[warn] tunnel_id not found (may be expected if no tunnel is active)"; }
fi
|
||||
|
||||
# Feature-dependent soft checks, listed as (label, pattern) pairs and run in
# order. A failing probe only prints its warning; it never aborts the script.
feature_checks=(
  # WebSocket metrics (when OTLP/WS used)
  "websocket connect latency buckets" "^newt_websocket_connect_latency_seconds_bucket"
  "websocket messages total" "^newt_websocket_messages_total\{.*(direction|msg_type)="
  "websocket connected gauge" "^newt_websocket_connected"
  "websocket reconnects total" "^newt_websocket_reconnects_total\{"

  # Proxy metrics (when proxy active)
  "proxy active connections" "^newt_proxy_active_connections\{"
  "proxy buffer bytes" "^newt_proxy_buffer_bytes\{"
  "proxy drops total" "^newt_proxy_drops_total\{"
  "proxy connections total" "^newt_proxy_connections_total\{"

  # Config apply
  "config apply seconds buckets" "^newt_config_apply_seconds_bucket\{"
)
for ((fc = 0; fc < ${#feature_checks[@]}; fc += 2)); do
  probe "${feature_checks[fc]}" "${feature_checks[fc + 1]}" || true
done

echo "Smoke checks completed (warnings above are acceptable if the feature isn't exercised yet)."
|
||||
|
||||
Reference in New Issue
Block a user