diff --git a/common.go b/common.go index 4e1ed00..8fd89f1 100644 --- a/common.go +++ b/common.go @@ -208,6 +208,7 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC logger.Warn(msgHealthFileWriteFailed, err) } } + return } case <-pingStopChan: // Stop the goroutine when signaled @@ -220,6 +221,25 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background") } +// shouldFireRecovery decides whether the data-plane recovery flow in +// startPingCheck should run on this tick. Recovery fires once when the +// consecutive-failure counter first crosses the threshold; the connectionLost +// flag prevents re-firing until a successful ping resets the state. +// +// This condition was previously inlined into startPingCheck and AND-ed with +// `currentInterval < maxInterval`, which silently broke recovery once +// pingInterval's default was bumped to 15s while maxInterval stayed at 6s +// (commit 8161fa6, March 2026): the gate became permanently false on default +// settings, so the recovery code never executed and ping failures climbed +// forever — the proximate cause of fosrl/newt#284, #310 and pangolin#1004. +// +// Recovery and backoff are independent concerns; the backoff ramp is now +// computed separately in the caller. Do not re-introduce currentInterval +// here. 
+func shouldFireRecovery(consecutiveFailures, failureThreshold int, connectionLost bool) bool { + return consecutiveFailures >= failureThreshold && !connectionLost +} + func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} { maxInterval := 6 * time.Second currentInterval := pingInterval @@ -279,42 +299,44 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien // More lenient threshold for declaring connection lost under load failureThreshold := 4 - if consecutiveFailures >= failureThreshold && currentInterval < maxInterval { - if !connectionLost { - connectionLost = true - logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures) - if tunnelID != "" { - telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout) - } - pingChainId := generateChainId() - pendingPingChainId = pingChainId - stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{ - "chainId": pingChainId, - }, 3*time.Second) - // Send registration message to the server for backward compatibility - bcChainId := generateChainId() - pendingRegisterChainId = bcChainId - err := client.SendMessage("newt/wg/register", map[string]interface{}{ - "publicKey": publicKey.String(), - "backwardsCompatible": true, - "chainId": bcChainId, - }) + if shouldFireRecovery(consecutiveFailures, failureThreshold, connectionLost) { + connectionLost = true + logger.Warn("Connection to server lost after %d failures. 
Continuous reconnection attempts will be made.", consecutiveFailures)
+			if tunnelID != "" {
+				telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
+			}
+			pingChainId := generateChainId()
+			pendingPingChainId = pingChainId
+			stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
+				"chainId": pingChainId,
+			}, 3*time.Second)
+			// Send registration message to the server for backward compatibility
+			bcChainId := generateChainId()
+			pendingRegisterChainId = bcChainId
+			err := client.SendMessage("newt/wg/register", map[string]interface{}{
+				"publicKey":           publicKey.String(),
+				"backwardsCompatible": true,
+				"chainId":             bcChainId,
+			})
+			if err != nil {
+				logger.Error("Failed to send registration message: %v", err)
+			}
+			if healthFile != "" {
+				err = os.Remove(healthFile)
 				if err != nil {
-					logger.Error("Failed to send registration message: %v", err)
-				}
-				if healthFile != "" {
-					err = os.Remove(healthFile)
-					if err != nil {
-						logger.Error("Failed to remove health file: %v", err)
-					}
+					logger.Error("Failed to remove health file: %v", err)
 				}
 			}
-			currentInterval = time.Duration(float64(currentInterval) * 1.3) // Slower increase
+		}
+		// Backoff: ramp the periodic-ping interval while past the failure
+		// threshold (capped at maxInterval), independent of the trigger above.
+		if consecutiveFailures >= failureThreshold && currentInterval < maxInterval {
+			currentInterval = time.Duration(float64(currentInterval) * 1.3)
 			if currentInterval > maxInterval {
 				currentInterval = maxInterval
 			}
 			ticker.Reset(currentInterval)
 			logger.Debug("Increased ping check interval to %v due to consecutive failures", currentInterval)
 		}
 	} else {
 		// Track recent latencies
diff --git a/common_test.go b/common_test.go
index a7e659a..67c02cf 100644
--- a/common_test.go
+++ b/common_test.go
@@ -210,3 +210,42 @@ func TestParseTargetStringNetDialCompatibility(t *testing.T) {
 		})
 	}
 }
+
+// TestShouldFireRecovery is the regression guard for the broken trigger gate
+// that prevented data-plane recovery from ever firing under default settings
+// (fosrl/newt#284, #310, pangolin#1004). The pre-fix condition was
+//
+//	consecutiveFailures >= failureThreshold && currentInterval < maxInterval
+//
+// which became permanently false once pingInterval's default was bumped from
+// 3s to 15s in commit 8161fa6 — currentInterval starts at pingInterval=15s,
+// maxInterval stayed at 6s, so 15<6 is false and the recovery branch never
+// executed.
+//
+// The fix is to drop currentInterval from the trigger condition entirely;
+// backoff is a separate concern computed in the caller. The cases below
+// exercise the documented contract.
+func TestShouldFireRecovery(t *testing.T) { + const threshold = 4 + cases := []struct { + name string + failures int + connectionLost bool + want bool + }{ + {"below threshold, fresh", 3, false, false}, + {"below threshold, already lost", 3, true, false}, + {"at threshold, fresh — recovery must fire", threshold, false, true}, + {"at threshold, already lost — gate prevents re-fire", threshold, true, false}, + {"far above threshold, fresh", 100, false, true}, + {"far above threshold, already lost", 100, true, false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + if got := shouldFireRecovery(c.failures, threshold, c.connectionLost); got != c.want { + t.Errorf("shouldFireRecovery(failures=%d, threshold=%d, lost=%v) = %v, want %v", + c.failures, threshold, c.connectionLost, got, c.want) + } + }) + } +} diff --git a/flake.nix b/flake.nix index 148bbe7..b388cc9 100644 --- a/flake.nix +++ b/flake.nix @@ -25,7 +25,7 @@ inherit (pkgs) lib; # Update version when releasing - version = "1.11.0"; + version = "1.12.4"; in { default = self.packages.${system}.pangolin-newt; diff --git a/netstack2/http_handler.go b/netstack2/http_handler.go index 5e44844..7ba2f63 100644 --- a/netstack2/http_handler.go +++ b/netstack2/http_handler.go @@ -356,16 +356,16 @@ func (h *HTTPHandler) handleRequest(w http.ResponseWriter, r *http.Request) { return } - // If the rule is plain HTTP but has a TLS certificate configured, redirect - // the client to the HTTPS equivalent of the requested URL. - if rule.Protocol == "http" && rule.TLSCert != "" && rule.TLSKey != "" { + // If the rule is HTTPS and a TLS certificate is configured, but the + // incoming request arrived over plain HTTP, redirect to HTTPS. 
+ if rule.Protocol == "https" && rule.TLSCert != "" && rule.TLSKey != "" && r.TLS == nil { host := r.Host if host == "" { host = r.URL.Host } httpsURL := "https://" + host + r.RequestURI logger.Info("HTTP handler: redirecting %s %s -> %s (TLS cert present)", r.Method, r.URL.RequestURI(), httpsURL) - http.Redirect(w, r, httpsURL, http.StatusMovedPermanently) + http.Redirect(w, r, httpsURL, http.StatusPermanentRedirect) return } diff --git a/netstack2/proxy.go b/netstack2/proxy.go index b08eea3..95fab6a 100644 --- a/netstack2/proxy.go +++ b/netstack2/proxy.go @@ -572,6 +572,18 @@ func (p *ProxyHandler) HandleIncomingPacket(packet []byte) bool { // Store destination rewrite for handler lookups p.destRewriteTable[dKey] = newDst + + // Also store the resource ID under the rewritten destination key so that + // TCP/UDP handlers can find it after DNAT (they see the post-NAT dst IP). + if matchedRule.ResourceId != 0 { + rewrittenKey := destKey{ + srcIP: srcAddr.String(), + dstIP: newDst.String(), + dstPort: dstPort, + proto: uint8(protocol), + } + p.resourceTable[rewrittenKey] = matchedRule.ResourceId + } p.natMu.Unlock() logger.Debug("New NAT entry for connection: %s -> %s", dstAddr, newDst) } diff --git a/websocket/client.go b/websocket/client.go index 67e23ec..5068471 100644 --- a/websocket/client.go +++ b/websocket/client.go @@ -48,7 +48,7 @@ type Client struct { metricsCtx context.Context configNeedsSave bool // Flag to track if config needs to be saved serverVersion string - configVersion int64 // Latest config version received from server + configVersion int64 // Latest config version received from server configVersionMux sync.RWMutex processingMessage bool // Flag to track if a message is currently being processed processingMux sync.RWMutex // Protects processingMessage @@ -271,13 +271,17 @@ func (c *Client) SendMessageInterval(messageType string, data interface{}, inter stopChan := make(chan struct{}) go func() { count := 0 - maxAttempts := 10 + maxAttempts := 16 
+ c.reconnectMux.RLock() + connected := c.isConnected + c.reconnectMux.RUnlock() err := c.SendMessage(messageType, data) // Send immediately if err != nil { logger.Error("Failed to send initial message: %v", err) + } else if connected { + count++ } - count++ ticker := time.NewTicker(interval) defer ticker.Stop() @@ -288,11 +292,15 @@ func (c *Client) SendMessageInterval(messageType string, data interface{}, inter logger.Info("SendMessageInterval timed out after %d attempts for message type: %s", maxAttempts, messageType) return } + c.reconnectMux.RLock() + connected = c.isConnected + c.reconnectMux.RUnlock() err = c.SendMessage(messageType, data) if err != nil { logger.Error("Failed to send message: %v", err) + } else if connected { + count++ } - count++ case <-stopChan: return } @@ -836,7 +844,7 @@ func (c *Client) readPumpWithDisconnectDetection(started time.Time) { logger.Error("WebSocket failed to parse message: %v", err) continue } - + c.setConfigVersion(msg.ConfigVersion) c.handlersMux.RLock()