Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 72 additions & 20 deletions client/internal/dns/upstream.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,19 @@ const (

const testRecord = "com."

// netbirdCGNATRange is the CGNAT range (100.64.0.0/10) used by NetBird for
// overlay peer IP addresses. Addresses in this range are only reachable once
// the WireGuard tunnel to the peer is fully established.
var netbirdCGNATRange = netip.MustParsePrefix("100.64.0.0/10")

// isNetBirdOverlayAddr returns true if the given address is within the NetBird
// CGNAT range (100.64.0.0/10). Such addresses are only reachable after the
// WireGuard tunnel is established, so probing them at engine startup will
// always produce a false-positive "unable to reach" warning.
func isNetBirdOverlayAddr(addr netip.AddrPort) bool {
return netbirdCGNATRange.Contains(addr.Addr())
}
Comment on lines +48 to +59
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Do not treat every 100.64.0.0/10 resolver as a NetBird peer.

100.64.0.0/10 is shared CGNAT space, not NetBird-exclusive. With the current check, any resolver in that block is skipped, so probedAny stays false and the group never disables or emits the startup warning even if that address is just a local/ISP resolver that's actually down. DefaultServer.ProbeAvailability() and engine startup both rely on this path to disable groups when no real upstream responds.

At minimum, only skip when you can positively map the address to a NetBird peer/route; otherwise keep the normal probe behavior. Please also add a regression case for a non-NetBird 100.64/10 nameserver.

💡 Safer gating direction
-func isNetBirdOverlayAddr(addr netip.AddrPort) bool {
-	return netbirdCGNATRange.Contains(addr.Addr())
+func (u *upstreamResolverBase) shouldSkipStartupProbe(addr netip.AddrPort) bool {
+	if !netbirdCGNATRange.Contains(addr.Addr()) {
+		return false
+	}
+	return findPeerForIP(addr.Addr(), u.statusRecorder) != nil
 }
@@
-		if isNetBirdOverlayAddr(upstream) {
+		if u.shouldSkipStartupProbe(upstream) {
 			log.Debugf("skipping probe for NetBird overlay nameserver %s (tunnel not yet established)", upstream)
 			continue
 		}

Also applies to: 303-310, 339-343

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@client/internal/dns/upstream.go` around lines 48 - 59, The current
isNetBirdOverlayAddr/netbirdCGNATRange check incorrectly treats every
100.64.0.0/10 resolver as a NetBird peer; change isNetBirdOverlayAddr (and any
callers) to only skip probing when the address is both in netbirdCGNATRange AND
is positively associated with a NetBird peer/route (e.g., consult the local
peer-to-IP mapping or routing table used by the engine before returning true),
otherwise return false so DefaultServer.ProbeAvailability and engine startup
will probe it normally; update callers at the other noted sites (around lines
~303-310 and ~339-343) to use the revised predicate; also add a regression test
that creates a non-NetBird nameserver in 100.64.0.0/10 and asserts it is probed
and that groups are disabled/emits the expected startup warning when no real
upstream responds.


// upstreamClient abstracts a single DNS exchange with one upstream server,
// returning the response message, the round-trip duration, and any
// transport/resolution error. It exists as a seam so tests and probes can
// substitute the real DNS client.
type upstreamClient interface {
// exchange sends r to the upstream address (host:port string) and honors
// ctx for cancellation/timeout.
exchange(ctx context.Context, upstream string, r *dns.Msg) (*dns.Msg, time.Duration, error)
}
Expand All @@ -65,6 +78,7 @@ type upstreamResolverBase struct {
mutex sync.Mutex
reactivatePeriod time.Duration
upstreamTimeout time.Duration
wg sync.WaitGroup

deactivate func(error)
reactivate func()
Expand Down Expand Up @@ -115,6 +129,8 @@ func (u *upstreamResolverBase) MatchSubdomains() bool {
// Stop shuts the resolver down: it cancels the resolver's context so any
// in-flight probes or retry loops observing u.ctx abort, then blocks until
// every background goroutine registered on u.wg (e.g. the waitUntilResponse
// loop started by disable) has exited.
func (u *upstreamResolverBase) Stop() {
log.Debugf("stopping serving DNS for upstreams %s", u.upstreamServers)
u.cancel()

// Wait for tracked goroutines to notice the cancellation and finish,
// guaranteeing no work runs past Stop returning.
u.wg.Wait()
}

// ServeDNS handles a DNS request
Expand Down Expand Up @@ -259,51 +275,76 @@ func formatFailures(failures []upstreamFailure) string {
}

// ProbeAvailability tests all upstream servers simultaneously and
// disables the resolver if none work
// disables the resolver if none work.
//
// Nameservers configured with NetBird overlay IPs (100.64.0.0/10 CGNAT range)
// are skipped during this probe. Those addresses are only reachable once the
// WireGuard tunnel to the peer is established, which has not yet happened at
// engine startup. Probing them at this point always produces a false-positive
// "unable to reach" warning even though they will be available moments later.
// They continue to be exercised by real DNS queries and the waitUntilResponse
// retry loop if an actual failure occurs.
func (u *upstreamResolverBase) ProbeAvailability() {
u.mutex.Lock()
defer u.mutex.Unlock()

select {
case <-u.ctx.Done():
return
default:
}

// avoid probe if upstreams could resolve at least one query
if u.successCount.Load() > 0 {
return
}

var success bool
var probedAny bool
var mu sync.Mutex
var wg sync.WaitGroup

var errors *multierror.Error
var errs *multierror.Error
for _, upstream := range u.upstreamServers {
upstream := upstream
// Skip probing NetBird overlay addresses (100.64.0.0/10 CGNAT range).
// These peers are only reachable once the WireGuard tunnel is established.
// Probing them at engine startup produces a spurious warning; they will
// be exercised normally once real DNS queries arrive.
if isNetBirdOverlayAddr(upstream) {
log.Debugf("skipping probe for NetBird overlay nameserver %s (tunnel not yet established)", upstream)
continue
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

probedAny = true
wg.Add(1)
go func() {
go func(upstream netip.AddrPort) {
defer wg.Done()
err := u.testNameserver(upstream, 500*time.Millisecond)
err := u.testNameserver(u.ctx, nil, upstream, 500*time.Millisecond)
if err != nil {
errors = multierror.Append(errors, err)
mu.Lock()
errs = multierror.Append(errs, err)
mu.Unlock()
log.Warnf("probing upstream nameserver %s: %s", upstream, err)
return
}

mu.Lock()
defer mu.Unlock()
success = true
}()
mu.Unlock()
}(upstream)
}

wg.Wait()

select {
case <-u.ctx.Done():
return
default:
}

// All upstreams were overlay addresses — skip startup disable/warning.
// They will be exercised by real DNS queries once the tunnel is established.
if !probedAny {
return
}

// didn't find a working upstream server, let's disable and try later
if !success {
u.disable(errors.ErrorOrNil())
u.disable(errs.ErrorOrNil())

if u.statusRecorder == nil {
return
Expand Down Expand Up @@ -339,7 +380,7 @@ func (u *upstreamResolverBase) waitUntilResponse() {
}

for _, upstream := range u.upstreamServers {
if err := u.testNameserver(upstream, probeTimeout); err != nil {
if err := u.testNameserver(u.ctx, nil, upstream, probeTimeout); err != nil {
log.Tracef("upstream check for %s: %s", upstream, err)
} else {
// at least one upstream server is available, stop probing
Expand All @@ -364,7 +405,9 @@ func (u *upstreamResolverBase) waitUntilResponse() {
log.Infof("upstreams %s are responsive again. Adding them back to system", u.upstreamServersString())
u.successCount.Add(1)
u.reactivate()
u.mutex.Lock()
u.disabled = false
u.mutex.Unlock()
}

// isTimeout returns true if the given error is a network timeout error.
Expand All @@ -387,7 +430,11 @@ func (u *upstreamResolverBase) disable(err error) {
u.successCount.Store(0)
u.deactivate(err)
u.disabled = true
go u.waitUntilResponse()
u.wg.Add(1)
go func() {
defer u.wg.Done()
u.waitUntilResponse()
}()
}

func (u *upstreamResolverBase) upstreamServersString() string {
Expand All @@ -398,13 +445,18 @@ func (u *upstreamResolverBase) upstreamServersString() string {
return strings.Join(servers, ", ")
}

func (u *upstreamResolverBase) testNameserver(server netip.AddrPort, timeout time.Duration) error {
ctx, cancel := context.WithTimeout(u.ctx, timeout)
func (u *upstreamResolverBase) testNameserver(baseCtx context.Context, externalCtx context.Context, server netip.AddrPort, timeout time.Duration) error {
mergedCtx, cancel := context.WithTimeout(baseCtx, timeout)
defer cancel()

if externalCtx != nil {
stop2 := context.AfterFunc(externalCtx, cancel)
defer stop2()
}

r := new(dns.Msg).SetQuestion(testRecord, dns.TypeSOA)

_, _, err := u.upstreamClient.exchange(ctx, server.String(), r)
_, _, err := u.upstreamClient.exchange(mergedCtx, server.String(), r)
return err
}

Expand Down
Loading