diff --git a/client/cmd/debug.go b/client/cmd/debug.go index 18f3547ca..d788cd0e7 100644 --- a/client/cmd/debug.go +++ b/client/cmd/debug.go @@ -388,12 +388,12 @@ func generateDebugBundle(config *profilemanager.Config, recorder *peer.Status, c } func init() { - debugBundleCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 1, "Number of rotated log files to include in debug bundle") + debugBundleCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 10, "Number of rotated log files to include in debug bundle") debugBundleCmd.Flags().BoolVarP(&systemInfoFlag, "system-info", "S", true, "Adds system information to the debug bundle") debugBundleCmd.Flags().BoolVarP(&uploadBundleFlag, "upload-bundle", "U", false, "Uploads the debug bundle to a server") debugBundleCmd.Flags().StringVar(&uploadBundleURLFlag, "upload-bundle-url", types.DefaultBundleURL, "Service URL to get an URL to upload the debug bundle") - forCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 1, "Number of rotated log files to include in debug bundle") + forCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 10, "Number of rotated log files to include in debug bundle") forCmd.Flags().BoolVarP(&systemInfoFlag, "system-info", "S", true, "Adds system information to the debug bundle") forCmd.Flags().BoolVarP(&uploadBundleFlag, "upload-bundle", "U", false, "Uploads the debug bundle to a server") forCmd.Flags().StringVar(&uploadBundleURLFlag, "upload-bundle-url", types.DefaultBundleURL, "Service URL to get an URL to upload the debug bundle") diff --git a/client/internal/dnsfwd/forwarder.go b/client/internal/dnsfwd/forwarder.go index d912919a1..86f13ad1a 100644 --- a/client/internal/dnsfwd/forwarder.go +++ b/client/internal/dnsfwd/forwarder.go @@ -46,6 +46,12 @@ type DNSForwarder struct { fwdEntries []*ForwarderEntry firewall firewaller resolver resolver + + // failure rate tracking for routed domains + failureMu sync.Mutex + failureCounts map[string]int + failureWindow 
time.Duration + lastLogPerHost map[string]time.Time } func NewDNSForwarder(listenAddress string, ttl uint32, firewall firewaller, statusRecorder *peer.Status) *DNSForwarder { @@ -56,6 +62,9 @@ func NewDNSForwarder(listenAddress string, ttl uint32, firewall firewaller, stat firewall: firewall, statusRecorder: statusRecorder, resolver: net.DefaultResolver, + failureCounts: make(map[string]int), + failureWindow: 10 * time.Second, + lastLogPerHost: make(map[string]time.Time), } } @@ -306,6 +315,11 @@ func (f *DNSForwarder) handleDNSError(ctx context.Context, w dns.ResponseWriter, if err := w.WriteMsg(resp); err != nil { log.Errorf("failed to write failure DNS response: %v", err) } + + // Track failure rate for routed domains only + if resID, _ := f.getMatchingEntries(strings.TrimSuffix(domain, ".")); resID != "" { + f.recordDomainFailure(strings.TrimSuffix(domain, ".")) + } } // addIPsToResponse adds IP addresses to the DNS response as appropriate A or AAAA records @@ -341,6 +355,27 @@ func (f *DNSForwarder) addIPsToResponse(resp *dns.Msg, domain string, ips []neti } } +// recordDomainFailure increments the failure count for the lower-cased domain and logs a warning, throttled to one log per domain per failureWindow. The logged count is cumulative: this function never resets it. +func (f *DNSForwarder) recordDomainFailure(domain string) { + domain = strings.ToLower(domain) + + f.failureMu.Lock() + defer f.failureMu.Unlock() + + f.failureCounts[domain]++ + count := f.failureCounts[domain] + + now := time.Now() + last, ok := f.lastLogPerHost[domain] + if ok && now.Sub(last) < f.failureWindow { + return + } + f.lastLogPerHost[domain] = now + + log.Warnf("[d] DNS failures observed for routed domain: domain=%s failures=%d/%s", domain, count, f.failureWindow) + +} + // getMatchingEntries retrieves the resource IDs for a given domain. // It returns the most specific match and all matching resource IDs. 
func (f *DNSForwarder) getMatchingEntries(domain string) (route.ResID, []*ForwarderEntry) { diff --git a/client/internal/peer/status.go b/client/internal/peer/status.go index 239cce7e0..d7948de04 100644 --- a/client/internal/peer/status.go +++ b/client/internal/peer/status.go @@ -21,9 +21,9 @@ import ( "github.com/netbirdio/netbird/client/internal/ingressgw" "github.com/netbirdio/netbird/client/internal/relay" "github.com/netbirdio/netbird/client/proto" + "github.com/netbirdio/netbird/route" "github.com/netbirdio/netbird/shared/management/domain" relayClient "github.com/netbirdio/netbird/shared/relay/client" - "github.com/netbirdio/netbird/route" ) const eventQueueSize = 10 @@ -201,6 +201,8 @@ type Status struct { resolvedDomainsStates map[domain.Domain]ResolvedDomainInfo lazyConnectionEnabled bool + lastDisconnectLog map[string]time.Time + // To reduce the number of notification invocation this bool will be true when need to call the notification // Some Peer actions mostly used by in a batch when the network map has been synchronized. 
In these type of events // set to true this variable and at the end of the processing we will reset it by the FinishPeerListModifications() @@ -229,6 +231,7 @@ func NewRecorder(mgmAddress string) *Status { notifier: newNotifier(), mgmAddress: mgmAddress, resolvedDomainsStates: map[domain.Domain]ResolvedDomainInfo{}, + lastDisconnectLog: make(map[string]time.Time), } } @@ -487,6 +490,9 @@ func (d *Status) UpdatePeerRelayedStateToDisconnected(receivedState State) error d.peers[receivedState.PubKey] = peerState + // warn-level log about disconnect with impacted routes (throttled) + d.logPeerDisconnectIfNeeded(receivedState.PubKey, peerState) + if hasConnStatusChanged(oldState, receivedState.ConnStatus) { d.notifyPeerListChanged() } @@ -519,6 +525,9 @@ func (d *Status) UpdatePeerICEStateToDisconnected(receivedState State) error { d.peers[receivedState.PubKey] = peerState + // warn-level log about disconnect with impacted routes (throttled) + d.logPeerDisconnectIfNeeded(receivedState.PubKey, peerState) + if hasConnStatusChanged(oldState, receivedState.ConnStatus) { d.notifyPeerListChanged() } @@ -529,6 +538,31 @@ func (d *Status) UpdatePeerICEStateToDisconnected(receivedState State) error { return nil } +// logPeerDisconnectIfNeeded logs a warning with the number of impacted routes when the given peer state is StatusIdle; +// it does not check that the peer has any routes. Throttled to once per peer (pubKey) per 10 seconds. NOTE(review): touches lastDisconnectLog without locking here; presumably callers hold d.mux - confirm. 
+func (d *Status) logPeerDisconnectIfNeeded(pubKey string, state State) { + if state.ConnStatus != StatusIdle { + return + } + + now := time.Now() + last, ok := d.lastDisconnectLog[pubKey] + if ok && now.Sub(last) < 10*time.Second { + return + } + d.lastDisconnectLog[pubKey] = now + + routes := state.GetRoutes() + numRoutes := len(routes) + + fqdn := state.FQDN + if fqdn == "" { + fqdn = pubKey + } + + log.Warnf("[d] Routing peer disconnected: peer=%s impacted_routes=%d", fqdn, numRoutes) +} + // UpdateWireGuardPeerState updates the WireGuard bits of the peer state func (d *Status) UpdateWireGuardPeerState(pubKey string, wgStats configurer.WGStats) error { d.mux.Lock() diff --git a/util/log.go b/util/log.go index a951eab87..b121702f6 100644 --- a/util/log.go +++ b/util/log.go @@ -14,7 +14,7 @@ import ( "github.com/netbirdio/netbird/formatter" ) -const defaultLogSize = 15 +const defaultLogSize = 100 const ( LogConsole = "console"