mirror of
https://github.com/fosrl/newt.git
synced 2026-05-12 03:09:56 +00:00
The trigger condition that decides whether to fire the data-plane
recovery flow in startPingCheck was AND-ed with `currentInterval <
maxInterval`. That clause was meant to throttle the *backoff ramp*
(don't widen the interval past 6s), but it also gated the recovery
trigger itself — a conflation that became invisibly load-bearing
once commit 8161fa6 (March 2026) bumped the default pingInterval
from 3s to 15s while leaving maxInterval at 6s. Under the new
defaults `currentInterval` starts at 15s and `15 < 6` is permanently
false, so the recovery branch never executed. Pings just kept
failing and the failure counter climbed forever, with no
"Connection to server lost" log line and no newt/ping/request
emitted on the websocket. Real-world recovery only happened when
the underlying network came back fast enough that a periodic ping
naturally succeeded again — which doesn't happen if the WireGuard
state on either end has rotated, so users were left stuck until
they restarted newt.
This is the proximate cause of the user reports in
fosrl/newt#284 (and dups #310, fosrl/pangolin#1004). Logs in
those issues all show ping-failure counters growing without ever
emitting "Connection to server lost", which is exactly the
fingerprint of this gate being false.
The fix is to extract the trigger decision into shouldFireRecovery
and remove currentInterval from it. Backoff is now computed in a
separate `if` in the caller, still gated by `currentInterval <
maxInterval` so the ramp is a no-op under default settings (which
is the existing behaviour, just no longer entangled with the
recovery trigger). Fixing the backoff ramp itself — making it
useful when pingInterval >= maxInterval — is a follow-up: the
priority is restoring recovery, not improving the dampening
schedule.
The new shouldFireRecovery helper is unit-tested. Its signature
intentionally omits currentInterval, so a future refactor that
re-introduces the interval-dependent gate would need to change
the function signature, which makes the historical bug harder
to reintroduce silently.
252 lines
6.6 KiB
Go
252 lines
6.6 KiB
Go
package main
|
|
|
|
import (
|
|
"net"
|
|
"testing"
|
|
)
|
|
|
|
func TestParseTargetString(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
input string
|
|
wantListenPort int
|
|
wantTargetAddr string
|
|
wantErr bool
|
|
}{
|
|
// IPv4 test cases
|
|
{
|
|
name: "valid IPv4 basic",
|
|
input: "3001:192.168.1.1:80",
|
|
wantListenPort: 3001,
|
|
wantTargetAddr: "192.168.1.1:80",
|
|
wantErr: false,
|
|
},
|
|
{
|
|
name: "valid IPv4 localhost",
|
|
input: "8080:127.0.0.1:3000",
|
|
wantListenPort: 8080,
|
|
wantTargetAddr: "127.0.0.1:3000",
|
|
wantErr: false,
|
|
},
|
|
{
|
|
name: "valid IPv4 same ports",
|
|
input: "443:10.0.0.1:443",
|
|
wantListenPort: 443,
|
|
wantTargetAddr: "10.0.0.1:443",
|
|
wantErr: false,
|
|
},
|
|
|
|
// IPv6 test cases
|
|
{
|
|
name: "valid IPv6 loopback",
|
|
input: "3001:[::1]:8080",
|
|
wantListenPort: 3001,
|
|
wantTargetAddr: "[::1]:8080",
|
|
wantErr: false,
|
|
},
|
|
{
|
|
name: "valid IPv6 full address",
|
|
input: "80:[fd70:1452:b736:4dd5:caca:7db9:c588:f5b3]:8080",
|
|
wantListenPort: 80,
|
|
wantTargetAddr: "[fd70:1452:b736:4dd5:caca:7db9:c588:f5b3]:8080",
|
|
wantErr: false,
|
|
},
|
|
{
|
|
name: "valid IPv6 link-local",
|
|
input: "443:[fe80::1]:443",
|
|
wantListenPort: 443,
|
|
wantTargetAddr: "[fe80::1]:443",
|
|
wantErr: false,
|
|
},
|
|
{
|
|
name: "valid IPv6 all zeros compressed",
|
|
input: "8000:[::]:9000",
|
|
wantListenPort: 8000,
|
|
wantTargetAddr: "[::]:9000",
|
|
wantErr: false,
|
|
},
|
|
{
|
|
name: "valid IPv6 mixed notation",
|
|
input: "5000:[::ffff:192.168.1.1]:6000",
|
|
wantListenPort: 5000,
|
|
wantTargetAddr: "[::ffff:192.168.1.1]:6000",
|
|
wantErr: false,
|
|
},
|
|
|
|
// Hostname test cases
|
|
{
|
|
name: "valid hostname",
|
|
input: "8080:example.com:80",
|
|
wantListenPort: 8080,
|
|
wantTargetAddr: "example.com:80",
|
|
wantErr: false,
|
|
},
|
|
{
|
|
name: "valid hostname with subdomain",
|
|
input: "443:api.example.com:8443",
|
|
wantListenPort: 443,
|
|
wantTargetAddr: "api.example.com:8443",
|
|
wantErr: false,
|
|
},
|
|
{
|
|
name: "valid localhost hostname",
|
|
input: "3000:localhost:3000",
|
|
wantListenPort: 3000,
|
|
wantTargetAddr: "localhost:3000",
|
|
wantErr: false,
|
|
},
|
|
|
|
// Error cases
|
|
{
|
|
name: "invalid - no colons",
|
|
input: "invalid",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - empty string",
|
|
input: "",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - non-numeric listen port",
|
|
input: "abc:192.168.1.1:80",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - missing target port",
|
|
input: "3001:192.168.1.1",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - IPv6 without brackets",
|
|
input: "3001:fd70:1452:b736:4dd5:caca:7db9:c588:f5b3:80",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - only listen port",
|
|
input: "3001:",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - missing host",
|
|
input: "3001::80",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - IPv6 unclosed bracket",
|
|
input: "3001:[::1:80",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - listen port zero",
|
|
input: "0:192.168.1.1:80",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - listen port negative",
|
|
input: "-1:192.168.1.1:80",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - listen port out of range",
|
|
input: "70000:192.168.1.1:80",
|
|
wantErr: true,
|
|
},
|
|
{
|
|
name: "invalid - empty target port",
|
|
input: "3001:192.168.1.1:",
|
|
wantErr: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
listenPort, targetAddr, err := parseTargetString(tt.input)
|
|
|
|
if (err != nil) != tt.wantErr {
|
|
t.Errorf("parseTargetString(%q) error = %v, wantErr %v", tt.input, err, tt.wantErr)
|
|
return
|
|
}
|
|
|
|
if tt.wantErr {
|
|
return // Don't check other values if we expected an error
|
|
}
|
|
|
|
if listenPort != tt.wantListenPort {
|
|
t.Errorf("parseTargetString(%q) listenPort = %d, want %d", tt.input, listenPort, tt.wantListenPort)
|
|
}
|
|
|
|
if targetAddr != tt.wantTargetAddr {
|
|
t.Errorf("parseTargetString(%q) targetAddr = %q, want %q", tt.input, targetAddr, tt.wantTargetAddr)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestParseTargetStringNetDialCompatibility verifies that the output is compatible with net.Dial
|
|
func TestParseTargetStringNetDialCompatibility(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
input string
|
|
}{
|
|
{"IPv4", "8080:127.0.0.1:80"},
|
|
{"IPv6 loopback", "8080:[::1]:80"},
|
|
{"IPv6 full", "8080:[2001:db8::1]:80"},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
_, targetAddr, err := parseTargetString(tt.input)
|
|
if err != nil {
|
|
t.Fatalf("parseTargetString(%q) unexpected error: %v", tt.input, err)
|
|
}
|
|
|
|
// Verify the format is valid for net.Dial by checking it can be split back
|
|
// This doesn't actually dial, just validates the format
|
|
_, _, err = net.SplitHostPort(targetAddr)
|
|
if err != nil {
|
|
t.Errorf("parseTargetString(%q) produced invalid net.Dial format %q: %v", tt.input, targetAddr, err)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestShouldFireRecovery is the regression guard for the broken trigger gate
|
|
// that prevented data-plane recovery from ever firing under default settings
|
|
// (fosrl/newt#284, #310, pangolin#1004). The pre-fix condition was
|
|
//
|
|
// consecutiveFailures >= failureThreshold && currentInterval < maxInterval
|
|
//
|
|
// which became permanently false once pingInterval's default was bumped from
|
|
// 3s to 15s in commit 8161fa6 — currentInterval starts at pingInterval=15s,
|
|
// maxInterval stayed at 6s, so 15<6 is false and the recovery branch never
|
|
// executed.
|
|
//
|
|
// The fix is to drop currentInterval from the trigger condition entirely;
|
|
// backoff is a separate concern computed in the caller. The cases below
|
|
// exercise the documented contract.
|
|
func TestShouldFireRecovery(t *testing.T) {
|
|
const threshold = 4
|
|
cases := []struct {
|
|
name string
|
|
failures int
|
|
connectionLost bool
|
|
want bool
|
|
}{
|
|
{"below threshold, fresh", 3, false, false},
|
|
{"below threshold, already lost", 3, true, false},
|
|
{"at threshold, fresh — recovery must fire", threshold, false, true},
|
|
{"at threshold, already lost — gate prevents re-fire", threshold, true, false},
|
|
{"far above threshold, fresh", 100, false, true},
|
|
{"far above threshold, already lost", 100, true, false},
|
|
}
|
|
for _, c := range cases {
|
|
t.Run(c.name, func(t *testing.T) {
|
|
if got := shouldFireRecovery(c.failures, threshold, c.connectionLost); got != c.want {
|
|
t.Errorf("shouldFireRecovery(failures=%d, threshold=%d, lost=%v) = %v, want %v",
|
|
c.failures, threshold, c.connectionLost, got, c.want)
|
|
}
|
|
})
|
|
}
|
|
}
|