debug dns

This commit is contained in:
Hakan Sariman
2025-09-09 14:48:39 +07:00
parent 7aef0f67df
commit 5b246e0a08
4 changed files with 73 additions and 4 deletions

View File

@@ -388,12 +388,12 @@ func generateDebugBundle(config *profilemanager.Config, recorder *peer.Status, c
}
func init() {
debugBundleCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 1, "Number of rotated log files to include in debug bundle")
debugBundleCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 10, "Number of rotated log files to include in debug bundle")
debugBundleCmd.Flags().BoolVarP(&systemInfoFlag, "system-info", "S", true, "Adds system information to the debug bundle")
debugBundleCmd.Flags().BoolVarP(&uploadBundleFlag, "upload-bundle", "U", false, "Uploads the debug bundle to a server")
debugBundleCmd.Flags().StringVar(&uploadBundleURLFlag, "upload-bundle-url", types.DefaultBundleURL, "Service URL to get an URL to upload the debug bundle")
forCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 1, "Number of rotated log files to include in debug bundle")
forCmd.Flags().Uint32VarP(&logFileCount, "log-file-count", "C", 10, "Number of rotated log files to include in debug bundle")
forCmd.Flags().BoolVarP(&systemInfoFlag, "system-info", "S", true, "Adds system information to the debug bundle")
forCmd.Flags().BoolVarP(&uploadBundleFlag, "upload-bundle", "U", false, "Uploads the debug bundle to a server")
forCmd.Flags().StringVar(&uploadBundleURLFlag, "upload-bundle-url", types.DefaultBundleURL, "Service URL to get an URL to upload the debug bundle")

View File

@@ -46,6 +46,12 @@ type DNSForwarder struct {
fwdEntries []*ForwarderEntry
firewall firewaller
resolver resolver
// failure rate tracking for routed domains
failureMu sync.Mutex
failureCounts map[string]int
failureWindow time.Duration
lastLogPerHost map[string]time.Time
}
func NewDNSForwarder(listenAddress string, ttl uint32, firewall firewaller, statusRecorder *peer.Status) *DNSForwarder {
@@ -56,6 +62,9 @@ func NewDNSForwarder(listenAddress string, ttl uint32, firewall firewaller, stat
firewall: firewall,
statusRecorder: statusRecorder,
resolver: net.DefaultResolver,
failureCounts: make(map[string]int),
failureWindow: 10 * time.Second,
lastLogPerHost: make(map[string]time.Time),
}
}
@@ -306,6 +315,11 @@ func (f *DNSForwarder) handleDNSError(ctx context.Context, w dns.ResponseWriter,
if err := w.WriteMsg(resp); err != nil {
log.Errorf("failed to write failure DNS response: %v", err)
}
// Track failure rate for routed domains only
if resID, _ := f.getMatchingEntries(strings.TrimSuffix(domain, ".")); resID != "" {
f.recordDomainFailure(strings.TrimSuffix(domain, "."))
}
}
// addIPsToResponse adds IP addresses to the DNS response as appropriate A or AAAA records
@@ -341,6 +355,27 @@ func (f *DNSForwarder) addIPsToResponse(resp *dns.Msg, domain string, ips []neti
}
}
// recordDomainFailure bumps the failure counter kept for domain and emits a
// throttled warning that carries the running total.
//
// The domain is lower-cased before it is used as a map key. The counter is
// incremented on every call and is not reset in this function, so the number
// in the log line is the total recorded so far rather than a per-window
// figure. At most one warning per domain is written within failureWindow;
// calls that land inside that window only update the counter.
func (f *DNSForwarder) recordDomainFailure(domain string) {
	key := strings.ToLower(domain)

	f.failureMu.Lock()
	defer f.failureMu.Unlock()

	total := f.failureCounts[key] + 1
	f.failureCounts[key] = total

	ts := time.Now()
	if prev, seen := f.lastLogPerHost[key]; seen && ts.Sub(prev) < f.failureWindow {
		// A warning for this domain was already written within the window.
		return
	}

	f.lastLogPerHost[key] = ts
	log.Warnf("[d] DNS failures observed for routed domain: domain=%s failures=%d/%s", key, total, f.failureWindow)
}
// getMatchingEntries retrieves the resource IDs for a given domain.
// It returns the most specific match and all matching resource IDs.
func (f *DNSForwarder) getMatchingEntries(domain string) (route.ResID, []*ForwarderEntry) {

View File

@@ -21,9 +21,9 @@ import (
"github.com/netbirdio/netbird/client/internal/ingressgw"
"github.com/netbirdio/netbird/client/internal/relay"
"github.com/netbirdio/netbird/client/proto"
"github.com/netbirdio/netbird/route"
"github.com/netbirdio/netbird/shared/management/domain"
relayClient "github.com/netbirdio/netbird/shared/relay/client"
"github.com/netbirdio/netbird/route"
)
const eventQueueSize = 10
@@ -201,6 +201,8 @@ type Status struct {
resolvedDomainsStates map[domain.Domain]ResolvedDomainInfo
lazyConnectionEnabled bool
lastDisconnectLog map[string]time.Time
// To reduce the number of notification invocation this bool will be true when need to call the notification
// Some Peer actions mostly used by in a batch when the network map has been synchronized. In these type of events
// set to true this variable and at the end of the processing we will reset it by the FinishPeerListModifications()
@@ -229,6 +231,7 @@ func NewRecorder(mgmAddress string) *Status {
notifier: newNotifier(),
mgmAddress: mgmAddress,
resolvedDomainsStates: map[domain.Domain]ResolvedDomainInfo{},
lastDisconnectLog: make(map[string]time.Time),
}
}
@@ -487,6 +490,9 @@ func (d *Status) UpdatePeerRelayedStateToDisconnected(receivedState State) error
d.peers[receivedState.PubKey] = peerState
// info log about disconnect with impacted routes (throttled)
d.logPeerDisconnectIfNeeded(receivedState.PubKey, peerState)
if hasConnStatusChanged(oldState, receivedState.ConnStatus) {
d.notifyPeerListChanged()
}
@@ -519,6 +525,9 @@ func (d *Status) UpdatePeerICEStateToDisconnected(receivedState State) error {
d.peers[receivedState.PubKey] = peerState
// info log about disconnect with impacted routes (throttled)
d.logPeerDisconnectIfNeeded(receivedState.PubKey, peerState)
if hasConnStatusChanged(oldState, receivedState.ConnStatus) {
d.notifyPeerListChanged()
}
@@ -529,6 +538,31 @@ func (d *Status) UpdatePeerICEStateToDisconnected(receivedState State) error {
return nil
}
// logPeerDisconnectIfNeeded writes a warning when a routing peer has moved to
// the idle (disconnected) state, including how many routes are affected.
//
// Nothing is logged when state.ConnStatus is not StatusIdle, or when the peer
// carries no routes: such a peer is not a routing peer, and reporting it as
// one with impacted_routes=0 would be misleading. Warnings are throttled to
// one per peer public key per 10 seconds. The peer is identified by its FQDN,
// falling back to pubKey when the FQDN is empty.
//
// NOTE(review): lastDisconnectLog is read and written without locking here;
// this presumably relies on the caller already holding d.mux — confirm.
func (d *Status) logPeerDisconnectIfNeeded(pubKey string, state State) {
	// Minimum time between two disconnect warnings for the same peer.
	const logInterval = 10 * time.Second

	if state.ConnStatus != StatusIdle {
		return
	}

	// Checked before the throttle so peers without routes never occupy a
	// slot in lastDisconnectLog.
	numRoutes := len(state.GetRoutes())
	if numRoutes == 0 {
		return
	}

	now := time.Now()
	if last, ok := d.lastDisconnectLog[pubKey]; ok && now.Sub(last) < logInterval {
		return
	}
	d.lastDisconnectLog[pubKey] = now

	fqdn := state.FQDN
	if fqdn == "" {
		fqdn = pubKey
	}
	log.Warnf("[d] Routing peer disconnected: peer=%s impacted_routes=%d", fqdn, numRoutes)
}
// UpdateWireGuardPeerState updates the WireGuard bits of the peer state
func (d *Status) UpdateWireGuardPeerState(pubKey string, wgStats configurer.WGStats) error {
d.mux.Lock()

View File

@@ -14,7 +14,7 @@ import (
"github.com/netbirdio/netbird/formatter"
)
const defaultLogSize = 15
const defaultLogSize = 100
const (
LogConsole = "console"