From 1e77b09e3b99731ce91ea0d721ce02e23c441664 Mon Sep 17 00:00:00 2001
From: Daniel Snider <daniel.snider@platform.sh>
Date: Thu, 7 May 2026 08:32:55 -0500
Subject: [PATCH] fix(ping): decouple data-plane recovery trigger from backoff
 ramp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The trigger condition that decides whether to fire the data-plane
recovery flow in startPingCheck was AND-ed with `currentInterval <
maxInterval`. That clause was meant to throttle the *backoff ramp*
(don't widen the interval past 6s), but it also gated the recovery
trigger itself — a conflation that became invisibly load-bearing
once commit 8161fa6 (March 2026) bumped the default pingInterval
from 3s to 15s while leaving maxInterval at 6s. Under the new
defaults `currentInterval` starts at 15s and `15 < 6` is permanently
false, so the recovery branch never executed. Pings just kept
failing and the failure counter climbed forever, with no
"Connection to server lost" log line and no newt/ping/request
emitted on the websocket. Real-world recovery only happened when
the underlying network came back fast enough that a periodic ping
naturally succeeded again — which doesn't happen if the WireGuard
state on either end has rotated, so users were left stuck until
they restarted newt.

This is the proximate cause of the user reports in
fosrl/newt#284 (and dups #310, fosrl/pangolin#1004). Logs in
those issues all show ping-failure counters growing without ever
emitting "Connection to server lost", which is exactly the
fingerprint of this gate being false.

The fix is to extract the trigger decision into shouldFireRecovery
and remove currentInterval from it. Backoff is now computed in a
separate `if` in the caller, still gated by `currentInterval <
maxInterval` so the ramp is a no-op under default settings (which
is the existing behaviour, just no longer entangled with the
recovery trigger). Fixing the backoff ramp itself — making it
useful when pingInterval >= maxInterval — is a follow-up: the
priority is restoring recovery, not improving the dampening
schedule.

The new shouldFireRecovery helper is unit-tested. Its signature
intentionally omits currentInterval, so a future refactor that
re-introduces the interval-dependent gate would need to change
the function signature, which makes the historical bug harder
to reintroduce silently.
---
 common.go      | 79 ++++++++++++++++++++++++++++++++------------------
 common_test.go | 39 +++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 28 deletions(-)

diff --git a/common.go b/common.go
index 4e1ed00..01d6957 100644
--- a/common.go
+++ b/common.go
@@ -220,6 +220,25 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
 	return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
 }
 
+// shouldFireRecovery decides whether the data-plane recovery flow in
+// startPingCheck should run on this tick. Recovery fires once when the
+// consecutive-failure counter first crosses the threshold; the connectionLost
+// flag prevents re-firing until a successful ping resets the state.
+//
+// This condition was previously inlined into startPingCheck and AND-ed with
+// `currentInterval < maxInterval`, which silently broke recovery once
+// pingInterval's default was bumped to 15s while maxInterval stayed at 6s
+// (commit 8161fa6, March 2026): the gate became permanently false on default
+// settings, so the recovery code never executed and ping failures climbed
+// forever — the proximate cause of fosrl/newt#284, #310 and pangolin#1004.
+//
+// Recovery and backoff are independent concerns; the backoff ramp is now
+// computed separately in the caller. Do not re-introduce currentInterval
+// here.
+func shouldFireRecovery(consecutiveFailures, failureThreshold int, connectionLost bool) bool {
+	return consecutiveFailures >= failureThreshold && !connectionLost
+}
+
 func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
 	maxInterval := 6 * time.Second
 	currentInterval := pingInterval
@@ -279,37 +298,41 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
 
 					// More lenient threshold for declaring connection lost under load
 					failureThreshold := 4
-					if consecutiveFailures >= failureThreshold && currentInterval < maxInterval {
-						if !connectionLost {
-							connectionLost = true
-							logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
-							if tunnelID != "" {
-								telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
-							}
-							pingChainId := generateChainId()
-							pendingPingChainId = pingChainId
-							stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
-								"chainId": pingChainId,
-							}, 3*time.Second)
-							// Send registration message to the server for backward compatibility
-							bcChainId := generateChainId()
-							pendingRegisterChainId = bcChainId
-							err := client.SendMessage("newt/wg/register", map[string]interface{}{
-								"publicKey":           publicKey.String(),
-								"backwardsCompatible": true,
-								"chainId":             bcChainId,
-							})
+					if shouldFireRecovery(consecutiveFailures, failureThreshold, connectionLost) {
+						connectionLost = true
+						logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
+						if tunnelID != "" {
+							telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
+						}
+						pingChainId := generateChainId()
+						pendingPingChainId = pingChainId
+						stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
+							"chainId": pingChainId,
+						}, 3*time.Second)
+						// Send registration message to the server for backward compatibility
+						bcChainId := generateChainId()
+						pendingRegisterChainId = bcChainId
+						err := client.SendMessage("newt/wg/register", map[string]interface{}{
+							"publicKey":           publicKey.String(),
+							"backwardsCompatible": true,
+							"chainId":             bcChainId,
+						})
+						if err != nil {
+							logger.Error("Failed to send registration message: %v", err)
+						}
+						if healthFile != "" {
+							err = os.Remove(healthFile)
 							if err != nil {
-								logger.Error("Failed to send registration message: %v", err)
-							}
-							if healthFile != "" {
-								err = os.Remove(healthFile)
-								if err != nil {
-									logger.Error("Failed to remove health file: %v", err)
-								}
+								logger.Error("Failed to remove health file: %v", err)
 							}
 						}
-						currentInterval = time.Duration(float64(currentInterval) * 1.3) // Slower increase
+					}
+					// Backoff: ramp the periodic-ping interval up while we are
+					// past the failure threshold, capped at maxInterval. Kept
+					// independent of the recovery trigger above so the trigger
+					// fires on every outage regardless of pingInterval.
+					if consecutiveFailures >= failureThreshold && currentInterval < maxInterval {
+						currentInterval = time.Duration(float64(currentInterval) * 1.3)
 						if currentInterval > maxInterval {
 							currentInterval = maxInterval
 						}
diff --git a/common_test.go b/common_test.go
index a7e659a..67c02cf 100644
--- a/common_test.go
+++ b/common_test.go
@@ -210,3 +210,42 @@ func TestParseTargetStringNetDialCompatibility(t *testing.T) {
 		})
 	}
 }
+
+// TestShouldFireRecovery is the regression guard for the broken trigger gate
+// that prevented data-plane recovery from ever firing under default settings
+// (fosrl/newt#284, #310, pangolin#1004). The pre-fix condition was
+//
+//	consecutiveFailures >= failureThreshold && currentInterval < maxInterval
+//
+// which became permanently false once pingInterval's default was bumped from
+// 3s to 15s in commit 8161fa6 — currentInterval starts at pingInterval=15s,
+// maxInterval stayed at 6s, so 15<6 is false and the recovery branch never
+// executed.
+//
+// The fix is to drop currentInterval from the trigger condition entirely;
+// backoff is a separate concern computed in the caller. The cases below
+// exercise the documented contract.
+func TestShouldFireRecovery(t *testing.T) {
+	const threshold = 4
+	cases := []struct {
+		name           string
+		failures       int
+		connectionLost bool
+		want           bool
+	}{
+		{"below threshold, fresh", 3, false, false},
+		{"below threshold, already lost", 3, true, false},
+		{"at threshold, fresh — recovery must fire", threshold, false, true},
+		{"at threshold, already lost — gate prevents re-fire", threshold, true, false},
+		{"far above threshold, fresh", 100, false, true},
+		{"far above threshold, already lost", 100, true, false},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if got := shouldFireRecovery(c.failures, threshold, c.connectionLost); got != c.want {
+				t.Errorf("shouldFireRecovery(failures=%d, threshold=%d, lost=%v) = %v, want %v",
+					c.failures, threshold, c.connectionLost, got, c.want)
+			}
+		})
+	}
+}