[client] Drop DNS probes in favor of passive health projection

Viktor Liu
2026-04-23 06:39:16 +02:00
parent 801de8c68d
commit c102592735
12 changed files with 1568 additions and 703 deletions


@@ -6,11 +6,9 @@ import (
"fmt"
"net/netip"
"net/url"
"os"
"runtime"
"strconv"
"strings"
"sync"
"time"
"github.com/miekg/dns"
"github.com/mitchellh/hashstructure/v2"
@@ -25,11 +23,26 @@ import (
"github.com/netbirdio/netbird/client/internal/listener"
"github.com/netbirdio/netbird/client/internal/peer"
"github.com/netbirdio/netbird/client/internal/statemanager"
"github.com/netbirdio/netbird/client/proto"
nbdns "github.com/netbirdio/netbird/dns"
"github.com/netbirdio/netbird/route"
"github.com/netbirdio/netbird/shared/management/domain"
)
const envSkipDNSProbe = "NB_SKIP_DNS_PROBE"
const (
// healthLookback must exceed the upstream query timeout so one
// query per refresh cycle is enough to keep a group marked healthy.
healthLookback = 60 * time.Second
nsGroupHealthRefreshInterval = 10 * time.Second
// defaultWarningDelayBase is the starting grace window before a
// "Nameserver group unreachable" event fires for a group that's
// never been healthy and only has overlay upstreams with no
// Connected peer. Per-server and overridable; see warningDelay.
defaultWarningDelayBase = 30 * time.Second
// warningDelayBonusCap caps the route-count bonus added to the
// base grace window. See warningDelay.
warningDelayBonusCap = 30 * time.Second
)
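// For intuition, a hypothetical sketch of the freshness check these
// constants feed (isFresh is illustrative only; the real logic lives
// in classifyUpstreamHealth below):
//
//	isFresh := func(lastOk, now time.Time) bool {
//		return !lastOk.IsZero() && now.Sub(lastOk) <= healthLookback
//	}
//
// With nsGroupHealthRefreshInterval = 10s, a success observed at t=0
// is still fresh at the t=60s refresh tick, so one successful query
// per lookback window keeps the group Healthy.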
// ReadyListener is a notification mechanism that indicates the server is ready to handle host DNS address changes
type ReadyListener interface {
@@ -54,10 +67,9 @@ type Server interface {
UpdateDNSServer(serial uint64, update nbdns.Config) error
OnUpdatedHostDNSServer(addrs []netip.AddrPort)
SearchDomains() []string
ProbeAvailability()
UpdateServerConfig(domains dnsconfig.ServerDomains) error
PopulateManagementDomain(mgmtURL *url.URL) error
SetRouteChecker(func(netip.Addr) bool)
SetRouteSources(selected, active func() route.HAMap)
SetFirewall(Firewall)
}
@@ -66,6 +78,47 @@ type nsGroupsByDomain struct {
groups []*nbdns.NameServerGroup
}
// nsGroupID identifies a nameserver group by the tuple (server list, domain
// list) so config updates produce stable IDs across recomputations.
type nsGroupID string
// nsHealthSnapshot is the input to projectNSGroupHealth, captured under
// s.mux so projection runs lock-free.
type nsHealthSnapshot struct {
groups []*nbdns.NameServerGroup
merged map[netip.AddrPort]UpstreamHealth
selected route.HAMap
active route.HAMap
}
// nsGroupProj holds per-group state for the emission rules.
type nsGroupProj struct {
// unhealthySince is the start of the current Unhealthy streak,
// zero when the group is not currently Unhealthy.
unhealthySince time.Time
// everHealthy is sticky: once the group has been Healthy at least
// once this session, subsequent failures skip warningDelay.
everHealthy bool
// warningActive tracks whether we've already published a warning
// for the current streak, so recovery emits iff a warning did.
warningActive bool
}
// nsGroupVerdict is the outcome of evaluateNSGroupHealth.
type nsGroupVerdict int
const (
// nsVerdictUndecided means no upstream has a fresh observation
// (startup before first query, or records aged past healthLookback).
nsVerdictUndecided nsGroupVerdict = iota
// nsVerdictHealthy means at least one upstream's most-recent
// in-lookback observation is a success.
nsVerdictHealthy
// nsVerdictUnhealthy means at least one upstream has a recent
// failure and none has a fresher success.
nsVerdictUnhealthy
)
// hostManagerWithOriginalNS extends the basic hostManager interface
type hostManagerWithOriginalNS interface {
hostManager
@@ -106,20 +159,35 @@ type DefaultServer struct {
statusRecorder *peer.Status
stateManager *statemanager.Manager
routeMatch func(netip.Addr) bool
// selectedRoutes returns admin-enabled client routes.
selectedRoutes func() route.HAMap
// activeRoutes returns the subset whose peer is in StatusConnected.
activeRoutes func() route.HAMap
probeMu sync.Mutex
probeCancel context.CancelFunc
probeWg sync.WaitGroup
nsGroups []*nbdns.NameServerGroup
healthProjectMu sync.Mutex
// nsGroupProj is the per-group state used by the emission rules.
// Accessed only under healthProjectMu.
nsGroupProj map[nsGroupID]*nsGroupProj
// warningDelayBase is the base grace window for health projection.
// Set at construction, mutated only by tests. Read by the
// refresher goroutine, so never change it while one is running.
warningDelayBase time.Duration
// healthRefresh is buffered=1; writers coalesce, senders never block.
// See refreshHealth for the lock-order rationale.
healthRefresh chan struct{}
}
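// Standalone sketch of the buffered=1 coalescing pattern behind
// healthRefresh (illustrative, not the actual server wiring):
//
//	refresh := make(chan struct{}, 1)
//	poke := func() {
//		select {
//		case refresh <- struct{}{}: // queued; the next cycle will run
//		default: // a refresh is already pending; coalesce this poke
//		}
//	}
//	go func() {
//		for range refresh {
//			// one projection cycle serves however many pokes coalesced
//		}
//	}()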
type handlerWithStop interface {
dns.Handler
Stop()
ProbeAvailability(context.Context)
ID() types.HandlerID
}
type upstreamHealthReporter interface {
UpstreamHealth() map[netip.AddrPort]UpstreamHealth
}
type handlerWrapper struct {
domain string
handler handlerWithStop
@@ -230,6 +298,8 @@ func newDefaultServer(
hostManager: &noopHostConfigurator{},
mgmtCacheResolver: mgmtCacheResolver,
currentConfigHash: ^uint64(0), // Initialize to max uint64 to ensure first config is always applied
warningDelayBase: defaultWarningDelayBase,
healthRefresh: make(chan struct{}, 1),
}
// register with root zone, handler chain takes care of the routing
@@ -238,12 +308,13 @@ func newDefaultServer(
return defaultServer
}
// SetRouteChecker sets the function used by upstream resolvers to determine
// whether an IP is routed through the tunnel.
func (s *DefaultServer) SetRouteChecker(f func(netip.Addr) bool) {
// SetRouteSources wires the route-manager accessors used by health
// projection to classify each upstream for emission timing.
func (s *DefaultServer) SetRouteSources(selected, active func() route.HAMap) {
s.mux.Lock()
defer s.mux.Unlock()
s.routeMatch = f
s.selectedRoutes = selected
s.activeRoutes = active
}
// RegisterHandler registers a handler for the given domains with the given priority.
@@ -256,7 +327,6 @@ func (s *DefaultServer) RegisterHandler(domains domain.List, handler dns.Handler
// TODO: This will take over zones for non-wildcard domains, for which we might not have a handler in the chain
for _, domain := range domains {
// convert to zone with simple ref counter
s.extraDomains[toZone(domain)]++
}
if !s.batchMode {
@@ -357,6 +427,8 @@ func (s *DefaultServer) Initialize() (err error) {
s.stateManager.RegisterState(&ShutdownState{})
s.startHealthRefresher()
// Keep using noop host manager if dns off requested or running in netstack mode.
// Netstack mode currently doesn't have a way to receive DNS requests.
// TODO: Use listener on localhost in netstack mode when running as root.
@@ -394,13 +466,7 @@ func (s *DefaultServer) SetFirewall(fw Firewall) {
// Stop stops the server
func (s *DefaultServer) Stop() {
s.probeMu.Lock()
if s.probeCancel != nil {
s.probeCancel()
}
s.ctxCancel()
s.probeMu.Unlock()
s.probeWg.Wait()
s.shutdownWg.Wait()
s.mux.Lock()
@@ -411,6 +477,13 @@ func (s *DefaultServer) Stop() {
}
maps.Clear(s.extraDomains)
// Clear health projection state so a subsequent Start doesn't
// inherit sticky flags (notably everHealthy) that would bypass
// the grace window during the next peer handshake.
s.healthProjectMu.Lock()
s.nsGroupProj = nil
s.healthProjectMu.Unlock()
}
func (s *DefaultServer) disableDNS() (retErr error) {
@@ -446,7 +519,6 @@ func (s *DefaultServer) disableDNS() (retErr error) {
func (s *DefaultServer) OnUpdatedHostDNSServer(hostsDnsList []netip.AddrPort) {
s.hostsDNSHolder.set(hostsDnsList)
// Check if there's any root handler
var hasRootHandler bool
for _, handler := range s.dnsMuxMap {
if handler.domain == nbdns.RootZone {
@@ -520,69 +592,6 @@ func (s *DefaultServer) SearchDomains() []string {
return searchDomains
}
// ProbeAvailability tests each upstream group's servers for availability
// and deactivates the group if no server responds.
// If a previous probe is still running, it will be cancelled before starting a new one.
func (s *DefaultServer) ProbeAvailability() {
if val := os.Getenv(envSkipDNSProbe); val != "" {
skipProbe, err := strconv.ParseBool(val)
if err != nil {
log.Warnf("failed to parse %s: %v", envSkipDNSProbe, err)
}
if skipProbe {
log.Infof("skipping DNS probe due to %s", envSkipDNSProbe)
return
}
}
s.probeMu.Lock()
// don't start probes on a stopped server
if s.ctx.Err() != nil {
s.probeMu.Unlock()
return
}
// cancel any running probe
if s.probeCancel != nil {
s.probeCancel()
s.probeCancel = nil
}
// wait for the previous probe goroutines to finish while holding
// the mutex so no other caller can start a new probe concurrently
s.probeWg.Wait()
// start a new probe
probeCtx, probeCancel := context.WithCancel(s.ctx)
s.probeCancel = probeCancel
s.probeWg.Add(1)
defer s.probeWg.Done()
// Snapshot handlers under s.mux to avoid racing with updateMux/dnsMuxMap writers.
s.mux.Lock()
handlers := make([]handlerWithStop, 0, len(s.dnsMuxMap))
for _, mux := range s.dnsMuxMap {
handlers = append(handlers, mux.handler)
}
s.mux.Unlock()
var wg sync.WaitGroup
for _, handler := range handlers {
wg.Add(1)
go func(h handlerWithStop) {
defer wg.Done()
h.ProbeAvailability(probeCtx)
}(handler)
}
s.probeMu.Unlock()
wg.Wait()
probeCancel()
}
func (s *DefaultServer) UpdateServerConfig(domains dnsconfig.ServerDomains) error {
s.mux.Lock()
defer s.mux.Unlock()
@@ -769,25 +778,23 @@ func (s *DefaultServer) registerFallback(config HostDNSConfig) {
s.wgInterface,
s.statusRecorder,
s.hostsDNSHolder,
nbdns.RootZone,
domain.Domain(nbdns.RootZone),
)
if err != nil {
log.Errorf("failed to create upstream resolver for original nameservers: %v", err)
return
}
handler.routeMatch = s.routeMatch
handler.selectedRoutes = s.selectedRoutes
var servers []netip.AddrPort
for _, ns := range originalNameservers {
if ns == config.ServerIP {
log.Debugf("skipping original nameserver %s as it is the same as the server IP %s", ns, config.ServerIP)
continue
}
addrPort := netip.AddrPortFrom(ns, DefaultPort)
handler.upstreamServers = append(handler.upstreamServers, addrPort)
servers = append(servers, netip.AddrPortFrom(ns, DefaultPort))
}
handler.deactivate = func(error) { /* always active */ }
handler.reactivate = func() { /* always active */ }
handler.addRace(servers)
s.registerHandler([]string{nbdns.RootZone}, handler, PriorityFallback)
}
@@ -847,100 +854,77 @@ func (s *DefaultServer) buildUpstreamHandlerUpdate(nameServerGroups []*nbdns.Nam
groupedNS := groupNSGroupsByDomain(nameServerGroups)
for _, domainGroup := range groupedNS {
basePriority := PriorityUpstream
priority := PriorityUpstream
if domainGroup.domain == nbdns.RootZone {
basePriority = PriorityDefault
priority = PriorityDefault
}
updates, err := s.createHandlersForDomainGroup(domainGroup, basePriority)
update, err := s.buildMergedDomainHandler(domainGroup, priority)
if err != nil {
return nil, err
}
muxUpdates = append(muxUpdates, updates...)
if update != nil {
muxUpdates = append(muxUpdates, *update)
}
}
return muxUpdates, nil
}
func (s *DefaultServer) createHandlersForDomainGroup(domainGroup nsGroupsByDomain, basePriority int) ([]handlerWrapper, error) {
var muxUpdates []handlerWrapper
// buildMergedDomainHandler merges every nameserver group that targets the
// same domain into one handler whose inner groups are raced in parallel.
func (s *DefaultServer) buildMergedDomainHandler(domainGroup nsGroupsByDomain, priority int) (*handlerWrapper, error) {
handler, err := newUpstreamResolver(
s.ctx,
s.wgInterface,
s.statusRecorder,
s.hostsDNSHolder,
domain.Domain(domainGroup.domain),
)
if err != nil {
return nil, fmt.Errorf("create upstream resolver: %v", err)
}
handler.selectedRoutes = s.selectedRoutes
for i, nsGroup := range domainGroup.groups {
// Decrement priority by handler index (0, 1, 2, ...) to avoid conflicts
priority := basePriority - i
// Check if we're about to overlap with the next priority tier
if s.leaksPriority(domainGroup, basePriority, priority) {
break
}
log.Debugf("creating handler for domain=%s with priority=%d", domainGroup.domain, priority)
handler, err := newUpstreamResolver(
s.ctx,
s.wgInterface,
s.statusRecorder,
s.hostsDNSHolder,
domainGroup.domain,
)
if err != nil {
return nil, fmt.Errorf("create upstream resolver: %v", err)
}
handler.routeMatch = s.routeMatch
for _, ns := range nsGroup.NameServers {
if ns.NSType != nbdns.UDPNameServerType {
log.Warnf("skipping nameserver %s with type %s, this peer supports only %s",
ns.IP.String(), ns.NSType.String(), nbdns.UDPNameServerType.String())
continue
}
if ns.IP == s.service.RuntimeIP() {
log.Warnf("skipping nameserver %s as it matches our DNS server IP, preventing potential loop", ns.IP)
continue
}
handler.upstreamServers = append(handler.upstreamServers, ns.AddrPort())
}
if len(handler.upstreamServers) == 0 {
handler.Stop()
log.Errorf("received a nameserver group with an invalid nameserver list")
for _, nsGroup := range domainGroup.groups {
servers := s.filterNameServers(nsGroup.NameServers)
if len(servers) == 0 {
log.Warnf("nameserver group for domain=%s yielded no usable servers, skipping", domainGroup.domain)
continue
}
// when the upstream fails to resolve a domain several times across all
// its servers, it calls this hook to exclude itself from the configuration
// and reapply DNS settings; it does not touch the original configuration
// or serial number because the deactivation is temporary until the next try
//
// after a period defined by the upstream it tries to reactivate itself by
// calling this hook; all we need to do here is re-apply the current
// configuration because it already contains this upstream's settings
// (temporary deactivation did not remove them)
handler.deactivate, handler.reactivate = s.upstreamCallbacks(nsGroup, handler, priority)
muxUpdates = append(muxUpdates, handlerWrapper{
domain: domainGroup.domain,
handler: handler,
priority: priority,
})
handler.addRace(servers)
}
return muxUpdates, nil
if len(handler.upstreamServers) == 0 {
handler.Stop()
log.Errorf("no usable nameservers for domain=%s", domainGroup.domain)
return nil, nil
}
log.Debugf("creating merged handler for domain=%s with %d group(s) priority=%d", domainGroup.domain, len(handler.upstreamServers), priority)
return &handlerWrapper{
domain: domainGroup.domain,
handler: handler,
priority: priority,
}, nil
}
func (s *DefaultServer) leaksPriority(domainGroup nsGroupsByDomain, basePriority int, priority int) bool {
if basePriority == PriorityUpstream && priority <= PriorityDefault {
log.Warnf("too many handlers for domain=%s, would overlap with default priority tier (diff=%d). Skipping remaining handlers",
domainGroup.domain, PriorityUpstream-PriorityDefault)
return true
func (s *DefaultServer) filterNameServers(nameServers []nbdns.NameServer) []netip.AddrPort {
var out []netip.AddrPort
for _, ns := range nameServers {
if ns.NSType != nbdns.UDPNameServerType {
log.Warnf("skipping nameserver %s with type %s, this peer supports only %s",
ns.IP.String(), ns.NSType.String(), nbdns.UDPNameServerType.String())
continue
}
if ns.IP == s.service.RuntimeIP() {
log.Warnf("skipping nameserver %s as it matches our DNS server IP, preventing potential loop", ns.IP)
continue
}
out = append(out, ns.AddrPort())
}
if basePriority == PriorityDefault && priority <= PriorityFallback {
log.Warnf("too many handlers for domain=%s, would overlap with fallback priority tier (diff=%d). Skipping remaining handlers",
domainGroup.domain, PriorityDefault-PriorityFallback)
return true
}
return false
return out
}
func (s *DefaultServer) updateMux(muxUpdates []handlerWrapper) {
@@ -974,84 +958,6 @@ func (s *DefaultServer) updateMux(muxUpdates []handlerWrapper) {
s.dnsMuxMap = muxUpdateMap
}
// upstreamCallbacks returns two functions, the first one is used to deactivate
// the upstream resolver from the configuration, the second one is used to
// reactivate it. Not allowed to call reactivate before deactivate.
func (s *DefaultServer) upstreamCallbacks(
nsGroup *nbdns.NameServerGroup,
handler dns.Handler,
priority int,
) (deactivate func(error), reactivate func()) {
var removeIndex map[string]int
deactivate = func(err error) {
s.mux.Lock()
defer s.mux.Unlock()
l := log.WithField("nameservers", nsGroup.NameServers)
l.Info("Temporarily deactivating nameservers group due to timeout")
removeIndex = make(map[string]int)
for _, domain := range nsGroup.Domains {
removeIndex[domain] = -1
}
if nsGroup.Primary {
removeIndex[nbdns.RootZone] = -1
s.currentConfig.RouteAll = false
s.deregisterHandler([]string{nbdns.RootZone}, priority)
}
for i, item := range s.currentConfig.Domains {
if _, found := removeIndex[item.Domain]; found {
s.currentConfig.Domains[i].Disabled = true
s.deregisterHandler([]string{item.Domain}, priority)
removeIndex[item.Domain] = i
}
}
// Always apply host config when nameserver goes down, regardless of batch mode
s.applyHostConfig()
go func() {
if err := s.stateManager.PersistState(s.ctx); err != nil {
l.Errorf("Failed to persist dns state: %v", err)
}
}()
if runtime.GOOS == "android" && nsGroup.Primary && len(s.hostsDNSHolder.get()) > 0 {
s.addHostRootZone()
}
s.updateNSState(nsGroup, err, false)
}
reactivate = func() {
s.mux.Lock()
defer s.mux.Unlock()
for domain, i := range removeIndex {
if i == -1 || i >= len(s.currentConfig.Domains) || s.currentConfig.Domains[i].Domain != domain {
continue
}
s.currentConfig.Domains[i].Disabled = false
s.registerHandler([]string{domain}, handler, priority)
}
l := log.WithField("nameservers", nsGroup.NameServers)
l.Debug("reactivate temporarily disabled nameserver group")
if nsGroup.Primary {
s.currentConfig.RouteAll = true
s.registerHandler([]string{nbdns.RootZone}, handler, priority)
}
// Always apply host config when nameserver reactivates, regardless of batch mode
s.applyHostConfig()
s.updateNSState(nsGroup, nil, true)
}
return
}
func (s *DefaultServer) addHostRootZone() {
hostDNSServers := s.hostsDNSHolder.get()
if len(hostDNSServers) == 0 {
@@ -1070,56 +976,343 @@ func (s *DefaultServer) addHostRootZone() {
log.Errorf("unable to create a new upstream resolver, error: %v", err)
return
}
handler.routeMatch = s.routeMatch
handler.selectedRoutes = s.selectedRoutes
handler.upstreamServers = maps.Keys(hostDNSServers)
handler.deactivate = func(error) {}
handler.reactivate = func() {}
handler.addRace(maps.Keys(hostDNSServers))
s.registerHandler([]string{nbdns.RootZone}, handler, PriorityDefault)
}
// updateNSGroupStates records the new group set and pokes the refresher.
// Must hold s.mux; projection runs async (see refreshHealth for why).
func (s *DefaultServer) updateNSGroupStates(groups []*nbdns.NameServerGroup) {
var states []peer.NSGroupState
s.nsGroups = groups
select {
case s.healthRefresh <- struct{}{}:
default:
}
}
for _, group := range groups {
var servers []netip.AddrPort
for _, ns := range group.NameServers {
servers = append(servers, ns.AddrPort())
// refreshHealth runs one projection cycle. Must not be called while
// holding s.mux: the route callbacks re-enter routemanager's lock.
func (s *DefaultServer) refreshHealth() {
s.mux.Lock()
groups := s.nsGroups
merged := s.collectUpstreamHealth()
selFn := s.selectedRoutes
actFn := s.activeRoutes
s.mux.Unlock()
var selected, active route.HAMap
if selFn != nil {
selected = selFn()
}
if actFn != nil {
active = actFn()
}
s.projectNSGroupHealth(nsHealthSnapshot{
groups: groups,
merged: merged,
selected: selected,
active: active,
})
}
// projectNSGroupHealth applies the emission rules to the snapshot and
// publishes the resulting NSGroupStates. Serialized by healthProjectMu,
// lock-free wrt s.mux.
//
// Rules:
// - Healthy: emit recovery iff warningActive; set everHealthy.
// - Unhealthy: stamp unhealthySince on streak start; emit warning
// iff any of immediate / everHealthy / elapsed >= effective delay.
// - Undecided: no-op.
//
// "Immediate" means the group has at least one upstream that's public
// or overlay+Connected: no peer-startup race to wait out.
func (s *DefaultServer) projectNSGroupHealth(snap nsHealthSnapshot) {
if s.statusRecorder == nil {
return
}
s.healthProjectMu.Lock()
defer s.healthProjectMu.Unlock()
if s.nsGroupProj == nil {
s.nsGroupProj = make(map[nsGroupID]*nsGroupProj)
}
now := time.Now()
delay := s.warningDelay(len(snap.selected))
states := make([]peer.NSGroupState, 0, len(snap.groups))
seen := make(map[nsGroupID]struct{}, len(snap.groups))
for _, group := range snap.groups {
servers := nameServerAddrPorts(group.NameServers)
verdict, groupErr := evaluateNSGroupHealth(snap.merged, servers, now)
id := generateGroupKey(group)
seen[id] = struct{}{}
immediate := s.groupHasImmediateUpstream(servers, snap)
p, known := s.nsGroupProj[id]
if !known {
p = &nsGroupProj{}
s.nsGroupProj[id] = p
}
state := peer.NSGroupState{
ID: generateGroupKey(group),
enabled := true
switch verdict {
case nsVerdictHealthy:
enabled = s.projectHealthy(p, servers)
case nsVerdictUnhealthy:
enabled = s.projectUnhealthy(p, servers, immediate, now, delay)
case nsVerdictUndecided:
// Stay Available until evidence says otherwise, unless a
// warning is already active for this group.
enabled = !p.warningActive
groupErr = nil
}
states = append(states, peer.NSGroupState{
ID: string(id),
Servers: servers,
Domains: group.Domains,
// The probe will determine the state, default enabled
Enabled: true,
Error: nil,
}
states = append(states, state)
Enabled: enabled,
Error: groupErr,
})
}
s.statusRecorder.UpdateDNSStates(states)
}
func (s *DefaultServer) updateNSState(nsGroup *nbdns.NameServerGroup, err error, enabled bool) {
states := s.statusRecorder.GetDNSStates()
id := generateGroupKey(nsGroup)
for i, state := range states {
if state.ID == id {
states[i].Enabled = enabled
states[i].Error = err
break
for id := range s.nsGroupProj {
if _, ok := seen[id]; !ok {
delete(s.nsGroupProj, id)
}
}
s.statusRecorder.UpdateDNSStates(states)
}
func generateGroupKey(nsGroup *nbdns.NameServerGroup) string {
// projectHealthy records a healthy tick on p and publishes a recovery
// event iff a warning was active for the current streak. Returns the
// Enabled flag to record in NSGroupState.
func (s *DefaultServer) projectHealthy(p *nsGroupProj, servers []netip.AddrPort) bool {
p.everHealthy = true
p.unhealthySince = time.Time{}
if !p.warningActive {
return true
}
log.Debugf("DNS health: group [%s] recovered, emitting event", joinAddrPorts(servers))
s.statusRecorder.PublishEvent(
proto.SystemEvent_INFO,
proto.SystemEvent_DNS,
"Nameserver group recovered",
"DNS servers are reachable again.",
map[string]string{"upstreams": joinAddrPorts(servers)},
)
p.warningActive = false
return true
}
// projectUnhealthy records an unhealthy tick on p, publishes the
// warning when the emission rules fire, and returns the Enabled flag
// to record in NSGroupState.
func (s *DefaultServer) projectUnhealthy(p *nsGroupProj, servers []netip.AddrPort, immediate bool, now time.Time, delay time.Duration) bool {
streakStart := p.unhealthySince.IsZero()
if streakStart {
p.unhealthySince = now
}
reason := unhealthyEmitReason(immediate, p.everHealthy, now.Sub(p.unhealthySince), delay)
switch {
case reason != "" && !p.warningActive:
log.Debugf("DNS health: group [%s] unreachable, emitting event (reason=%s)", joinAddrPorts(servers), reason)
s.statusRecorder.PublishEvent(
proto.SystemEvent_WARNING,
proto.SystemEvent_DNS,
"Nameserver group unreachable",
"Unable to reach one or more DNS servers. This might affect your ability to connect to some services.",
map[string]string{"upstreams": joinAddrPorts(servers)},
)
p.warningActive = true
case streakStart && reason == "":
// One line per streak, not per tick.
log.Debugf("DNS health: group [%s] unreachable but holding warning for up to %v (overlay-routed, no connected peer)", joinAddrPorts(servers), delay)
}
return false
}
// warningDelay returns the grace window for the given selected-route
// count. Scales gently: +1s per 100 routes, capped by
// warningDelayBonusCap. Parallel handshakes mean handshake time grows
// much slower than route count, so linear scaling would overcorrect.
func (s *DefaultServer) warningDelay(routeCount int) time.Duration {
bonus := time.Duration(routeCount/100) * time.Second
if bonus > warningDelayBonusCap {
bonus = warningDelayBonusCap
}
return s.warningDelayBase + bonus
}
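// Worked values, assuming the default 30s base:
//
//	warningDelay(0)    == 30s
//	warningDelay(250)  == 30s + 2s  = 32s  (250/100 = 2)
//	warningDelay(5000) == 30s + 30s = 60s  (50s bonus capped at 30s)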
// groupHasImmediateUpstream reports whether the group has at least one
// upstream in a classification that bypasses the grace window: public
// (outside the overlay range and not routed), or overlay/routed with a
// Connected peer.
//
// TODO(ipv6): include the v6 overlay prefix once it's plumbed in.
func (s *DefaultServer) groupHasImmediateUpstream(servers []netip.AddrPort, snap nsHealthSnapshot) bool {
var overlayV4 netip.Prefix
if s.wgInterface != nil {
overlayV4 = s.wgInterface.Address().Network
}
for _, srv := range servers {
addr := srv.Addr().Unmap()
overlay := overlayV4.IsValid() && overlayV4.Contains(addr)
routed := haMapContains(snap.selected, addr)
if !overlay && !routed {
return true
}
if haMapContains(snap.active, addr) {
return true
}
}
return false
}
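// Illustrative classifications (100.64.0.0/10 as an example overlay
// prefix, not necessarily the deployment's):
//
//	1.1.1.1:53     outside overlay, not in selected routes -> immediate
//	100.64.0.5:53  overlay, peer Connected                 -> immediate
//	100.64.0.5:53  overlay, peer not connected             -> grace window
//	10.0.0.2:53    selected route covers it, peer down     -> grace window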
// collectUpstreamHealth merges health snapshots across handlers, keeping
// the most recent success and failure per upstream when an address appears
// in more than one handler.
func (s *DefaultServer) collectUpstreamHealth() map[netip.AddrPort]UpstreamHealth {
merged := make(map[netip.AddrPort]UpstreamHealth)
for _, entry := range s.dnsMuxMap {
reporter, ok := entry.handler.(upstreamHealthReporter)
if !ok {
continue
}
for addr, h := range reporter.UpstreamHealth() {
existing, have := merged[addr]
if !have {
merged[addr] = h
continue
}
if h.LastOk.After(existing.LastOk) {
existing.LastOk = h.LastOk
}
if h.LastFail.After(existing.LastFail) {
existing.LastFail = h.LastFail
existing.LastErr = h.LastErr
}
merged[addr] = existing
}
}
return merged
}
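// Merge example with the UpstreamHealth fields used above (timestamps
// illustrative):
//
//	handler A: UpstreamHealth{LastOk: 10:00:05}
//	handler B: UpstreamHealth{LastFail: 10:00:09, LastErr: "timeout"}
//	merged:    UpstreamHealth{LastOk: 10:00:05, LastFail: 10:00:09, LastErr: "timeout"}
//
// classifyUpstreamHealth then sees the newer failure and reports the
// upstream as broken.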
func (s *DefaultServer) startHealthRefresher() {
s.shutdownWg.Add(1)
go func() {
defer s.shutdownWg.Done()
ticker := time.NewTicker(nsGroupHealthRefreshInterval)
defer ticker.Stop()
for {
select {
case <-s.ctx.Done():
return
case <-ticker.C:
case <-s.healthRefresh:
}
s.refreshHealth()
}
}()
}
// evaluateNSGroupHealth decides a group's verdict from query records
// alone. Per upstream, the most-recent-in-lookback observation wins.
// Group is Healthy if any upstream is fresh-working, Unhealthy if any
// is fresh-broken with no fresh-working sibling, Undecided otherwise.
func evaluateNSGroupHealth(merged map[netip.AddrPort]UpstreamHealth, servers []netip.AddrPort, now time.Time) (nsGroupVerdict, error) {
anyWorking := false
anyBroken := false
var mostRecentFail time.Time
var mostRecentErr string
for _, srv := range servers {
h, ok := merged[srv]
if !ok {
continue
}
switch classifyUpstreamHealth(h, now) {
case upstreamFresh:
anyWorking = true
case upstreamBroken:
anyBroken = true
if h.LastFail.After(mostRecentFail) {
mostRecentFail = h.LastFail
mostRecentErr = h.LastErr
}
}
}
if anyWorking {
return nsVerdictHealthy, nil
}
if anyBroken {
if mostRecentErr == "" {
return nsVerdictUnhealthy, nil
}
return nsVerdictUnhealthy, errors.New(mostRecentErr)
}
return nsVerdictUndecided, nil
}
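// Examples with the 60s lookback and servers {A, B}:
//
//	A failed 20s ago, B succeeded 5s ago -> nsVerdictHealthy (fresh success wins)
//	A failed 20s ago, B has no record    -> nsVerdictUnhealthy, carrying A's error
//	no in-lookback record on either      -> nsVerdictUndecided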
// upstreamClassification is the per-upstream verdict within healthLookback.
type upstreamClassification int
const (
upstreamStale upstreamClassification = iota
upstreamFresh
upstreamBroken
)
// classifyUpstreamHealth compares the last ok and last fail timestamps
// against healthLookback and returns which one (if any) counts. Fresh
// wins when both are in-window and ok is newer; broken otherwise.
func classifyUpstreamHealth(h UpstreamHealth, now time.Time) upstreamClassification {
okRecent := !h.LastOk.IsZero() && now.Sub(h.LastOk) <= healthLookback
failRecent := !h.LastFail.IsZero() && now.Sub(h.LastFail) <= healthLookback
switch {
case okRecent && failRecent:
if h.LastOk.After(h.LastFail) {
return upstreamFresh
}
return upstreamBroken
case okRecent:
return upstreamFresh
case failRecent:
return upstreamBroken
}
return upstreamStale
}
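// Examples against the 60s healthLookback:
//
//	LastOk 70s ago, LastFail 10s ago -> upstreamBroken (the success aged out)
//	LastOk 10s ago, LastFail 40s ago -> upstreamFresh (both in-window, ok newer)
//	no observation within the window -> upstreamStale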
// nameServerAddrPorts flattens a NameServer list to AddrPorts.
func nameServerAddrPorts(ns []nbdns.NameServer) []netip.AddrPort {
out := make([]netip.AddrPort, 0, len(ns))
for _, n := range ns {
out = append(out, n.AddrPort())
}
return out
}
func joinAddrPorts(servers []netip.AddrPort) string {
parts := make([]string, 0, len(servers))
for _, s := range servers {
parts = append(parts, s.String())
}
return strings.Join(parts, ", ")
}
func generateGroupKey(nsGroup *nbdns.NameServerGroup) nsGroupID {
var servers []string
for _, ns := range nsGroup.NameServers {
servers = append(servers, ns.AddrPort().String())
}
return fmt.Sprintf("%v_%v", servers, nsGroup.Domains)
return nsGroupID(fmt.Sprintf("%v_%v", servers, nsGroup.Domains))
}
// groupNSGroupsByDomain groups nameserver groups by their match domains
@@ -1161,6 +1354,21 @@ func toZone(d domain.Domain) domain.Domain {
)
}
// unhealthyEmitReason returns the tag of the rule that fires the
// warning now, or "" if the group is still inside its grace window.
func unhealthyEmitReason(immediate, everHealthy bool, elapsed, delay time.Duration) string {
switch {
case immediate:
return "immediate"
case everHealthy:
return "ever-healthy"
case elapsed >= delay:
return "grace-elapsed"
default:
return ""
}
}
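// Usage examples (delay = 30s):
//
//	unhealthyEmitReason(true, false, 0, 30*time.Second)               == "immediate"
//	unhealthyEmitReason(false, true, 0, 30*time.Second)               == "ever-healthy"
//	unhealthyEmitReason(false, false, 31*time.Second, 30*time.Second) == "grace-elapsed"
//	unhealthyEmitReason(false, false, 5*time.Second, 30*time.Second)  == "" (still in grace)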
// PopulateManagementDomain populates the DNS cache with the management domain
func (s *DefaultServer) PopulateManagementDomain(mgmtURL *url.URL) error {
if s.mgmtCacheResolver != nil {