From a6533b3fa0b0af4d9080c813f1b91461bc38756d Mon Sep 17 00:00:00 2001 From: Owen Date: Wed, 29 Apr 2026 21:11:07 -0700 Subject: [PATCH 1/9] Fix incorrect redirect logic --- netstack2/http_handler.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/netstack2/http_handler.go b/netstack2/http_handler.go index 5e44844..6f08d76 100644 --- a/netstack2/http_handler.go +++ b/netstack2/http_handler.go @@ -356,9 +356,9 @@ func (h *HTTPHandler) handleRequest(w http.ResponseWriter, r *http.Request) { return } - // If the rule is plain HTTP but has a TLS certificate configured, redirect - // the client to the HTTPS equivalent of the requested URL. - if rule.Protocol == "http" && rule.TLSCert != "" && rule.TLSKey != "" { + // If the rule is HTTPS and a TLS certificate is configured, but the + // incoming request arrived over plain HTTP, redirect to HTTPS. + if rule.Protocol == "https" && rule.TLSCert != "" && rule.TLSKey != "" && r.TLS == nil { host := r.Host if host == "" { host = r.URL.Host From 5090907307b6245ff7e01cdcf27b27dfbb0b717c Mon Sep 17 00:00:00 2001 From: Owen Date: Thu, 30 Apr 2026 15:55:52 -0700 Subject: [PATCH 2/9] Update status code --- netstack2/http_handler.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netstack2/http_handler.go b/netstack2/http_handler.go index 6f08d76..7ba2f63 100644 --- a/netstack2/http_handler.go +++ b/netstack2/http_handler.go @@ -365,7 +365,7 @@ func (h *HTTPHandler) handleRequest(w http.ResponseWriter, r *http.Request) { } httpsURL := "https://" + host + r.RequestURI logger.Info("HTTP handler: redirecting %s %s -> %s (TLS cert present)", r.Method, r.URL.RequestURI(), httpsURL) - http.Redirect(w, r, httpsURL, http.StatusMovedPermanently) + http.Redirect(w, r, httpsURL, http.StatusPermanentRedirect) return } From 27f7ca6bb99c7c901e367878455b9df8c25be970 Mon Sep 17 00:00:00 2001 From: Owen Date: Tue, 5 May 2026 11:40:39 -0700 Subject: [PATCH 3/9] Try to fix failover not working --- common.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/common.go b/common.go index 4e1ed00..95333d2 100644 --- a/common.go +++ b/common.go @@ -279,7 +279,7 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien // More lenient threshold for declaring connection lost under load failureThreshold := 4 - if consecutiveFailures >= failureThreshold && currentInterval < maxInterval { + if consecutiveFailures >= failureThreshold { if !connectionLost { connectionLost = true logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) @@ -309,12 +309,14 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien } } } - currentInterval = time.Duration(float64(currentInterval) * 1.3) // Slower increase - if currentInterval > maxInterval { - currentInterval = maxInterval + if currentInterval < maxInterval { + currentInterval = time.Duration(float64(currentInterval) * 1.3) // Slower increase + if currentInterval > maxInterval { + currentInterval = maxInterval + } + ticker.Reset(currentInterval) + logger.Debug("Increased ping check interval to %v due to consecutive failures", currentInterval) } - ticker.Reset(currentInterval) - logger.Debug("Increased ping check interval to %v due to consecutive failures", currentInterval) } } else { // Track recent latencies From 9ff32b8a8b37ea564761a30adb1c188c65a6111f Mon Sep 17 00:00:00 2001 From: Owen Date: Thu, 7 May 2026 16:16:47 -0700 Subject: [PATCH 4/9] Fix not logging when rewriting nat --- netstack2/proxy.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/netstack2/proxy.go b/netstack2/proxy.go index b08eea3..95fab6a 100644 --- a/netstack2/proxy.go +++ b/netstack2/proxy.go @@ -572,6 +572,18 @@ func (p *ProxyHandler) HandleIncomingPacket(packet []byte) bool { // Store destination rewrite for handler lookups p.destRewriteTable[dKey] = newDst + + // Also store the resource ID under the rewritten destination key so that + // TCP/UDP handlers can find it after DNAT (they see the post-NAT dst IP). + if matchedRule.ResourceId != 0 { + rewrittenKey := destKey{ + srcIP: srcAddr.String(), + dstIP: newDst.String(), + dstPort: dstPort, + proto: uint8(protocol), + } + p.resourceTable[rewrittenKey] = matchedRule.ResourceId + } p.natMu.Unlock() logger.Debug("New NAT entry for connection: %s -> %s", dstAddr, newDst) } From e8dc19a62bf2e6dbe88359ce5649c880070ded94 Mon Sep 17 00:00:00 2001 From: Owen Date: Thu, 7 May 2026 16:23:59 -0700 Subject: [PATCH 5/9] Attempt to fix nix issue --- common.go | 1 + 1 file changed, 1 insertion(+) diff --git a/common.go b/common.go index 95333d2..bcbd968 100644 --- a/common.go +++ b/common.go @@ -208,6 +208,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC logger.Warn(msgHealthFileWriteFailed, err) } } + return } case <-pingStopChan: // Stop the goroutine when signaled From 74fd3f3aa33b6848ec067650209f784395bb7d3b Mon Sep 17 00:00:00 2001 From: Owen Date: Thu, 7 May 2026 16:24:30 -0700 Subject: [PATCH 6/9] Bump version --- flake.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flake.nix b/flake.nix index bd760ea..08f2864 100644 --- a/flake.nix +++ b/flake.nix @@ -25,7 +25,7 @@ inherit (pkgs) lib; # Update version when releasing - version = "1.11.0"; + version = "1.12.4"; in { default = self.packages.${system}.pangolin-newt; From 1e77b09e3b99731ce91ea0d721ce02e23c441664 Mon Sep 17 00:00:00 2001 From: Daniel Snider Date: Thu, 7 May 2026 08:32:55 -0500 Subject: [PATCH 7/9] fix(ping): decouple data-plane recovery trigger from backoff ramp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The trigger condition that decides whether to fire the data-plane recovery flow in startPingCheck was AND-ed with `currentInterval < maxInterval`. That clause was meant to throttle the *backoff ramp* (don't widen the interval past 6s), but it also gated the recovery trigger itself — a conflation that became invisibly load-bearing once commit 8161fa6 (March 2026) bumped the default pingInterval from 3s to 15s while leaving maxInterval at 6s. Under the new defaults `currentInterval` starts at 15s and `15 < 6` is permanently false, so the recovery branch never executed. Pings just kept failing and the failure counter climbed forever, with no "Connection to server lost" log line and no newt/ping/request emitted on the websocket. Real-world recovery only happened when the underlying network came back fast enough that a periodic ping naturally succeeded again — which doesn't happen if the WireGuard state on either end has rotated, so users were left stuck until they restarted newt. This is the proximate cause of the user reports in fosrl/newt#284 (and dups #310, fosrl/pangolin#1004). Logs in those issues all show ping-failure counters growing without ever emitting "Connection to server lost", which is exactly the fingerprint of this gate being false. The fix is to extract the trigger decision into shouldFireRecovery and remove currentInterval from it. Backoff is now computed in a separate `if` in the caller, still gated by `currentInterval < maxInterval` so the ramp is a no-op under default settings (which is the existing behaviour, just no longer entangled with the recovery trigger). Fixing the backoff ramp itself — making it useful when pingInterval >= maxInterval — is a follow-up: the priority is restoring recovery, not improving the dampening schedule. The new shouldFireRecovery helper is unit-tested. Its signature intentionally omits currentInterval, so a future refactor that re-introduces the interval-dependent gate would need to change the function signature, which makes the historical bug harder to reintroduce silently. --- common.go | 79 ++++++++++++++++++++++++++++++++------------------ common_test.go | 39 +++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 28 deletions(-) diff --git a/common.go b/common.go index 4e1ed00..01d6957 100644 --- a/common.go +++ b/common.go @@ -220,6 +220,25 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background") } +// shouldFireRecovery decides whether the data-plane recovery flow in +// startPingCheck should run on this tick. Recovery fires once when the +// consecutive-failure counter first crosses the threshold; the connectionLost +// flag prevents re-firing until a successful ping resets the state. +// +// This condition was previously inlined into startPingCheck and AND-ed with +// `currentInterval < maxInterval`, which silently broke recovery once +// pingInterval's default was bumped to 15s while maxInterval stayed at 6s +// (commit 8161fa6, March 2026): the gate became permanently false on default +// settings, so the recovery code never executed and ping failures climbed +// forever — the proximate cause of fosrl/newt#284, #310 and pangolin#1004. +// +// Recovery and backoff are independent concerns; the backoff ramp is now +// computed separately in the caller. Do not re-introduce currentInterval +// here. +func shouldFireRecovery(consecutiveFailures, failureThreshold int, connectionLost bool) bool { + return consecutiveFailures >= failureThreshold && !connectionLost +} + func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} { maxInterval := 6 * time.Second currentInterval := pingInterval @@ -279,37 +298,41 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien // More lenient threshold for declaring connection lost under load failureThreshold := 4 - if consecutiveFailures >= failureThreshold && currentInterval < maxInterval { - if !connectionLost { - connectionLost = true - logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) - if tunnelID != "" { - telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout) - } - pingChainId := generateChainId() - pendingPingChainId = pingChainId - stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{ - "chainId": pingChainId, - }, 3*time.Second) - // Send registration message to the server for backward compatibility - bcChainId := generateChainId() - pendingRegisterChainId = bcChainId - err := client.SendMessage("newt/wg/register", map[string]interface{}{ - "publicKey": publicKey.String(), - "backwardsCompatible": true, - "chainId": bcChainId, - }) + if shouldFireRecovery(consecutiveFailures, failureThreshold, connectionLost) { + connectionLost = true + logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) + if tunnelID != "" { + telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout) + } + pingChainId := generateChainId() + pendingPingChainId = pingChainId + stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{ + "chainId": pingChainId, + }, 3*time.Second) + // Send registration message to the server for backward compatibility + bcChainId := generateChainId() + pendingRegisterChainId = bcChainId + err := client.SendMessage("newt/wg/register", map[string]interface{}{ + "publicKey": publicKey.String(), + "backwardsCompatible": true, + "chainId": bcChainId, + }) + if err != nil { + logger.Error("Failed to send registration message: %v", err) + } + if healthFile != "" { + err = os.Remove(healthFile) if err != nil { - logger.Error("Failed to send registration message: %v", err) - } - if healthFile != "" { - err = os.Remove(healthFile) - if err != nil { - logger.Error("Failed to remove health file: %v", err) - } + logger.Error("Failed to remove health file: %v", err) } } - currentInterval = time.Duration(float64(currentInterval) * 1.3) // Slower increase + } + // Backoff: ramp the periodic-ping interval up while we are + // past the failure threshold, capped at maxInterval. Kept + // independent of the recovery trigger above so the trigger + // fires on every outage regardless of pingInterval. + if consecutiveFailures >= failureThreshold && currentInterval < maxInterval { + currentInterval = time.Duration(float64(currentInterval) * 1.3) if currentInterval > maxInterval { currentInterval = maxInterval } diff --git a/common_test.go b/common_test.go index a7e659a..67c02cf 100644 --- a/common_test.go +++ b/common_test.go @@ -210,3 +210,42 @@ func TestParseTargetStringNetDialCompatibility(t *testing.T) { }) } } + +// TestShouldFireRecovery is the regression guard for the broken trigger gate +// that prevented data-plane recovery from ever firing under default settings +// (fosrl/newt#284, #310, pangolin#1004). The pre-fix condition was +// +// consecutiveFailures >= failureThreshold && currentInterval < maxInterval +// +// which became permanently false once pingInterval's default was bumped from +// 3s to 15s in commit 8161fa6 — currentInterval starts at pingInterval=15s, +// maxInterval stayed at 6s, so 15<6 is false and the recovery branch never +// executed. +// +// The fix is to drop currentInterval from the trigger condition entirely; +// backoff is a separate concern computed in the caller. The cases below +// exercise the documented contract. +func TestShouldFireRecovery(t *testing.T) { + const threshold = 4 + cases := []struct { + name string + failures int + connectionLost bool + want bool + }{ + {"below threshold, fresh", 3, false, false}, + {"below threshold, already lost", 3, true, false}, + {"at threshold, fresh — recovery must fire", threshold, false, true}, + {"at threshold, already lost — gate prevents re-fire", threshold, true, false}, + {"far above threshold, fresh", 100, false, true}, + {"far above threshold, already lost", 100, true, false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + if got := shouldFireRecovery(c.failures, threshold, c.connectionLost); got != c.want { + t.Errorf("shouldFireRecovery(failures=%d, threshold=%d, lost=%v) = %v, want %v", + c.failures, threshold, c.connectionLost, got, c.want) + } + }) + } +} From 901ec71baf1b10ebf24372b77fe48fe8e2fc6af0 Mon Sep 17 00:00:00 2001 From: Owen Date: Thu, 7 May 2026 17:25:13 -0700 Subject: [PATCH 8/9] Increase max attempts --- websocket/client.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/websocket/client.go b/websocket/client.go index 67e23ec..3d35494 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -48,7 +48,7 @@ type Client struct { metricsCtx context.Context configNeedsSave bool // Flag to track if config needs to be saved serverVersion string - configVersion int64 // Latest config version received from server + configVersion int64 // Latest config version received from server configVersionMux sync.RWMutex processingMessage bool // Flag to track if a message is currently being processed processingMux sync.RWMutex // Protects processingMessage @@ -271,7 +271,7 @@ func (c *Client) SendMessageInterval(messageType string, data interface{}, inter stopChan := make(chan struct{}) go func() { count := 0 - maxAttempts := 10 + maxAttempts := 16 err := c.SendMessage(messageType, data) // Send immediately if err != nil { @@ -836,7 +836,7 @@ func (c *Client) readPumpWithDisconnectDetection(started time.Time) { logger.Error("WebSocket failed to parse message: %v", err) continue } - + c.setConfigVersion(msg.ConfigVersion) c.handlersMux.RLock() From 663e98af608c2d0df5519326a767291266f45975 Mon Sep 17 00:00:00 2001 From: Owen Date: Thu, 7 May 2026 17:27:01 -0700 Subject: [PATCH 9/9] Retry interval while we are disconnected --- websocket/client.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/websocket/client.go b/websocket/client.go index 3d35494..5068471 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -273,11 +273,15 @@ func (c *Client) SendMessageInterval(messageType string, data interface{}, inter count := 0 maxAttempts := 16 + c.reconnectMux.RLock() + connected := c.isConnected + c.reconnectMux.RUnlock() err := c.SendMessage(messageType, data) // Send immediately if err != nil { logger.Error("Failed to send initial message: %v", err) + } else if connected { + count++ } - count++ ticker := time.NewTicker(interval) defer ticker.Stop() @@ -288,11 +292,15 @@ func (c *Client) SendMessageInterval(messageType string, data interface{}, inter logger.Info("SendMessageInterval timed out after %d attempts for message type: %s", maxAttempts, messageType) return } + c.reconnectMux.RLock() + connected = c.isConnected + c.reconnectMux.RUnlock() err = c.SendMessage(messageType, data) if err != nil { logger.Error("Failed to send message: %v", err) + } else if connected { + count++ } - count++ case <-stopChan: return }