fix(ping): decouple data-plane recovery trigger from backoff ramp

The trigger condition that decides whether to fire the data-plane
recovery flow in startPingCheck was AND-ed with `currentInterval <
maxInterval`. That clause was meant to throttle the *backoff ramp*
(don't widen the interval past 6s), but it also gated the recovery
trigger itself — a conflation that became invisibly load-bearing
once commit 8161fa6 (March 2026) bumped the default pingInterval
from 3s to 15s while leaving maxInterval at 6s. Under the new
defaults `currentInterval` starts at 15s and `15 < 6` is permanently
false, so the recovery branch never executed. Pings just kept
failing and the failure counter climbed forever, with no
"Connection to server lost" log line and no newt/ping/request
emitted on the websocket. Real-world recovery only happened when
the underlying network came back fast enough that a periodic ping
naturally succeeded again — which doesn't happen if the WireGuard
state on either end has rotated, so users were left stuck until
they restarted newt.
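
For illustration, a minimal standalone sketch of that arithmetic
(variable names mirror startPingCheck; this is a sketch, not the
production code):

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        // Defaults after commit 8161fa6: ping every 15s, backoff cap 6s.
        pingInterval := 15 * time.Second
        maxInterval := 6 * time.Second
        currentInterval := pingInterval // starts at pingInterval, only grows

        consecutiveFailures := 100 // pings have been failing for a while
        failureThreshold := 4

        // Pre-fix trigger condition: the backoff clause gates recovery too.
        fire := consecutiveFailures >= failureThreshold &&
            currentInterval < maxInterval
        fmt.Println(fire) // false: 15s < 6s never holds, recovery never runs
    }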

This is the proximate cause of the user reports in
fosrl/newt#284 (and duplicates #310, fosrl/pangolin#1004). Logs in
those issues all show ping-failure counters growing without ever
emitting "Connection to server lost", which is exactly the
fingerprint of this gate being false.

The fix is to extract the trigger decision into shouldFireRecovery
and remove currentInterval from it. Backoff is now computed in a
separate `if` in the caller, still gated by `currentInterval <
maxInterval` so the ramp is a no-op under default settings (which
is the existing behaviour, just no longer entangled with the
recovery trigger). Fixing the backoff ramp itself — making it
useful when pingInterval >= maxInterval — is a follow-up: the
priority is restoring recovery, not improving the dampening
schedule.
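
Condensed, the caller's new shape (full context in the diff below; the
elided body is the existing reconnection flow):

    // Recovery trigger: failure count plus the connectionLost latch.
    if shouldFireRecovery(consecutiveFailures, failureThreshold, connectionLost) {
        connectionLost = true
        // ... send newt/ping/request, re-register, remove health file ...
    }
    // Backoff ramp: a separate concern, still capped at maxInterval.
    if consecutiveFailures >= failureThreshold && currentInterval < maxInterval {
        currentInterval = time.Duration(float64(currentInterval) * 1.3)
        if currentInterval > maxInterval {
            currentInterval = maxInterval
        }
    }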

The new shouldFireRecovery helper is unit-tested. Its signature
intentionally omits currentInterval, so a future refactor that
re-introduces the interval-dependent gate would need to change
the function signature, which makes the historical bug harder
to reintroduce silently.
Daniel Snider
2026-05-07 08:32:55 -05:00
committed by Owen
parent 57aa2e2e2c
commit 1e77b09e3b
2 changed files with 90 additions and 28 deletions


@@ -220,6 +220,25 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
}
// shouldFireRecovery decides whether the data-plane recovery flow in
// startPingCheck should run on this tick. Recovery fires once when the
// consecutive-failure counter first crosses the threshold; the connectionLost
// flag prevents re-firing until a successful ping resets the state.
//
// This condition was previously inlined into startPingCheck and AND-ed with
// `currentInterval < maxInterval`, which silently broke recovery once
// pingInterval's default was bumped to 15s while maxInterval stayed at 6s
// (commit 8161fa6, March 2026): the gate became permanently false on default
// settings, so the recovery code never executed and ping failures climbed
// forever — the proximate cause of fosrl/newt#284, #310 and pangolin#1004.
//
// Recovery and backoff are independent concerns; the backoff ramp is now
// computed separately in the caller. Do not re-introduce currentInterval
// here.
func shouldFireRecovery(consecutiveFailures, failureThreshold int, connectionLost bool) bool {
return consecutiveFailures >= failureThreshold && !connectionLost
}
func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
maxInterval := 6 * time.Second
currentInterval := pingInterval
@@ -279,37 +298,41 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
// More lenient threshold for declaring connection lost under load
failureThreshold := 4
if consecutiveFailures >= failureThreshold && currentInterval < maxInterval {
if !connectionLost {
connectionLost = true
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
if tunnelID != "" {
telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
}
pingChainId := generateChainId()
pendingPingChainId = pingChainId
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
"chainId": pingChainId,
}, 3*time.Second)
// Send registration message to the server for backward compatibility
bcChainId := generateChainId()
pendingRegisterChainId = bcChainId
err := client.SendMessage("newt/wg/register", map[string]interface{}{
"publicKey": publicKey.String(),
"backwardsCompatible": true,
"chainId": bcChainId,
})
if err != nil {
logger.Error("Failed to send registration message: %v", err)
}
if healthFile != "" {
err = os.Remove(healthFile)
if err != nil {
logger.Error("Failed to remove health file: %v", err)
}
}
}
currentInterval = time.Duration(float64(currentInterval) * 1.3) // Slower increase
}
if shouldFireRecovery(consecutiveFailures, failureThreshold, connectionLost) {
connectionLost = true
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
if tunnelID != "" {
telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
}
pingChainId := generateChainId()
pendingPingChainId = pingChainId
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
"chainId": pingChainId,
}, 3*time.Second)
// Send registration message to the server for backward compatibility
bcChainId := generateChainId()
pendingRegisterChainId = bcChainId
err := client.SendMessage("newt/wg/register", map[string]interface{}{
"publicKey": publicKey.String(),
"backwardsCompatible": true,
"chainId": bcChainId,
})
if err != nil {
logger.Error("Failed to send registration message: %v", err)
}
if healthFile != "" {
err = os.Remove(healthFile)
if err != nil {
logger.Error("Failed to remove health file: %v", err)
}
}
}
// Backoff: ramp the periodic-ping interval up while we are
// past the failure threshold, capped at maxInterval. Kept
// independent of the recovery trigger above so the trigger
// fires on every outage regardless of pingInterval.
if consecutiveFailures >= failureThreshold && currentInterval < maxInterval {
currentInterval = time.Duration(float64(currentInterval) * 1.3)
if currentInterval > maxInterval {
currentInterval = maxInterval
}
}


@@ -210,3 +210,42 @@ func TestParseTargetStringNetDialCompatibility(t *testing.T) {
})
}
}
// TestShouldFireRecovery is the regression guard for the broken trigger gate
// that prevented data-plane recovery from ever firing under default settings
// (fosrl/newt#284, #310, pangolin#1004). The pre-fix condition was
//
// consecutiveFailures >= failureThreshold && currentInterval < maxInterval
//
// which became permanently false once pingInterval's default was bumped from
// 3s to 15s in commit 8161fa6 — currentInterval starts at pingInterval=15s,
// maxInterval stayed at 6s, so 15<6 is false and the recovery branch never
// executed.
//
// The fix is to drop currentInterval from the trigger condition entirely;
// backoff is a separate concern computed in the caller. The cases below
// exercise the documented contract.
func TestShouldFireRecovery(t *testing.T) {
const threshold = 4
cases := []struct {
name string
failures int
connectionLost bool
want bool
}{
{"below threshold, fresh", 3, false, false},
{"below threshold, already lost", 3, true, false},
{"at threshold, fresh — recovery must fire", threshold, false, true},
{"at threshold, already lost — gate prevents re-fire", threshold, true, false},
{"far above threshold, fresh", 100, false, true},
{"far above threshold, already lost", 100, true, false},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
if got := shouldFireRecovery(c.failures, threshold, c.connectionLost); got != c.want {
t.Errorf("shouldFireRecovery(failures=%d, threshold=%d, lost=%v) = %v, want %v",
c.failures, threshold, c.connectionLost, got, c.want)
}
})
}
}