Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 66 additions & 20 deletions client/internal/dns/upstream.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,19 @@ const (

const testRecord = "com."

// netbirdCGNATRange is the CGNAT range (100.64.0.0/10) used by NetBird for
// overlay peer IP addresses. Addresses in this range are only reachable once
// the WireGuard tunnel to the peer is fully established.
var netbirdCGNATRange = netip.MustParsePrefix("100.64.0.0/10")

// isNetBirdOverlayAddr returns true if the given address is within the NetBird
// CGNAT range (100.64.0.0/10). Such addresses are only reachable after the
// WireGuard tunnel is established, so probing them at engine startup will
// always produce a false-positive "unable to reach" warning.
func isNetBirdOverlayAddr(addr netip.AddrPort) bool {
return netbirdCGNATRange.Contains(addr.Addr())
}
Comment on lines +48 to +59

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Do not treat every 100.64.0.0/10 resolver as a NetBird peer.

100.64.0.0/10 is shared CGNAT space, not NetBird-exclusive. With the current check, any resolver in that block is skipped, so probedAny stays false and the group never disables or emits the startup warning even if that address is just a local/ISP resolver that's actually down. DefaultServer.ProbeAvailability() and engine startup both rely on this path to disable groups when no real upstream responds.

At minimum, only skip when you can positively map the address to a NetBird peer/route; otherwise keep the normal probe behavior. Please also add a regression case for a non-NetBird 100.64/10 nameserver.

💡 Safer gating direction
-func isNetBirdOverlayAddr(addr netip.AddrPort) bool {
-	return netbirdCGNATRange.Contains(addr.Addr())
+func (u *upstreamResolverBase) shouldSkipStartupProbe(addr netip.AddrPort) bool {
+	if !netbirdCGNATRange.Contains(addr.Addr()) {
+		return false
+	}
+	return findPeerForIP(addr.Addr(), u.statusRecorder) != nil
 }
@@
-		if isNetBirdOverlayAddr(upstream) {
+		if u.shouldSkipStartupProbe(upstream) {
 			log.Debugf("skipping probe for NetBird overlay nameserver %s (tunnel not yet established)", upstream)
 			continue
 		}

Also applies to: 303-310, 339-343

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@client/internal/dns/upstream.go` around lines 48 - 59, The current
isNetBirdOverlayAddr/netbirdCGNATRange check incorrectly treats every
100.64.0.0/10 resolver as a NetBird peer; change isNetBirdOverlayAddr (and any
callers) to only skip probing when the address is both in netbirdCGNATRange AND
is positively associated with a NetBird peer/route (e.g., consult the local
peer-to-IP mapping or routing table used by the engine before returning true),
otherwise return false so DefaultServer.ProbeAvailability and engine startup
will probe it normally; update callers at the other noted sites (around lines
~303-310 and ~339-343) to use the revised predicate; also add a regression test
that creates a non-NetBird nameserver in 100.64.0.0/10 and asserts it is probed
and that groups are disabled/emits the expected startup warning when no real
upstream responds.


type upstreamClient interface {
exchange(ctx context.Context, upstream string, r *dns.Msg) (*dns.Msg, time.Duration, error)
}
Expand All @@ -65,6 +78,7 @@ type upstreamResolverBase struct {
mutex sync.Mutex
reactivatePeriod time.Duration
upstreamTimeout time.Duration
wg sync.WaitGroup

deactivate func(error)
reactivate func()
Expand Down Expand Up @@ -115,6 +129,7 @@ func (u *upstreamResolverBase) MatchSubdomains() bool {
func (u *upstreamResolverBase) Stop() {
log.Debugf("stopping serving DNS for upstreams %s", u.upstreamServers)
u.cancel()
u.wg.Wait()
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// ServeDNS handles a DNS request
Expand Down Expand Up @@ -259,17 +274,19 @@ func formatFailures(failures []upstreamFailure) string {
}

// ProbeAvailability tests all upstream servers simultaneously and
// disables the resolver if none work
// disables the resolver if none work.
//
// Nameservers configured with NetBird overlay IPs (100.64.0.0/10 CGNAT range)
// are skipped during this probe. Those addresses are only reachable once the
// WireGuard tunnel to the peer is established, which has not yet happened at
// engine startup. Probing them at this point always produces a false-positive
// "unable to reach" warning even though they will be available moments later.
// They continue to be exercised by real DNS queries and the waitUntilResponse
// retry loop if an actual failure occurs.
func (u *upstreamResolverBase) ProbeAvailability() {
u.mutex.Lock()
defer u.mutex.Unlock()

select {
case <-u.ctx.Done():
return
default:
}

// avoid probe if upstreams could resolve at least one query
if u.successCount.Load() > 0 {
return
Expand All @@ -279,31 +296,49 @@ func (u *upstreamResolverBase) ProbeAvailability() {
var mu sync.Mutex
var wg sync.WaitGroup

var errors *multierror.Error
var errs *multierror.Error
for _, upstream := range u.upstreamServers {
upstream := upstream
// Skip probing NetBird overlay addresses (100.64.0.0/10 CGNAT range).
// These peers are only reachable once the WireGuard tunnel is established.
// Probing them at engine startup produces a spurious warning; they will
// be exercised normally once real DNS queries arrive.
if isNetBirdOverlayAddr(upstream) {
log.Debugf("skipping probe for NetBird overlay nameserver %s (tunnel not yet established)", upstream)
mu.Lock()
success = true
mu.Unlock()
continue
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}

wg.Add(1)
go func() {
go func(upstream netip.AddrPort) {
defer wg.Done()
err := u.testNameserver(upstream, 500*time.Millisecond)
err := u.testNameserver(u.ctx, nil, upstream, 500*time.Millisecond)
if err != nil {
errors = multierror.Append(errors, err)
mu.Lock()
errs = multierror.Append(errs, err)
mu.Unlock()
log.Warnf("probing upstream nameserver %s: %s", upstream, err)
return
}

mu.Lock()
defer mu.Unlock()
success = true
}()
mu.Unlock()
}(upstream)
}

wg.Wait()

select {
case <-u.ctx.Done():
return
default:
}

// didn't find a working upstream server, let's disable and try later
if !success {
u.disable(errors.ErrorOrNil())
u.disable(errs.ErrorOrNil())

if u.statusRecorder == nil {
return
Expand Down Expand Up @@ -339,7 +374,7 @@ func (u *upstreamResolverBase) waitUntilResponse() {
}

for _, upstream := range u.upstreamServers {
if err := u.testNameserver(upstream, probeTimeout); err != nil {
if err := u.testNameserver(u.ctx, nil, upstream, probeTimeout); err != nil {
log.Tracef("upstream check for %s: %s", upstream, err)
} else {
// at least one upstream server is available, stop probing
Expand All @@ -364,7 +399,9 @@ func (u *upstreamResolverBase) waitUntilResponse() {
log.Infof("upstreams %s are responsive again. Adding them back to system", u.upstreamServersString())
u.successCount.Add(1)
u.reactivate()
u.mutex.Lock()
u.disabled = false
u.mutex.Unlock()
}

// isTimeout returns true if the given error is a network timeout error.
Expand All @@ -387,7 +424,11 @@ func (u *upstreamResolverBase) disable(err error) {
u.successCount.Store(0)
u.deactivate(err)
u.disabled = true
go u.waitUntilResponse()
u.wg.Add(1)
go func() {
defer u.wg.Done()
u.waitUntilResponse()
}()
}

func (u *upstreamResolverBase) upstreamServersString() string {
Expand All @@ -398,13 +439,18 @@ func (u *upstreamResolverBase) upstreamServersString() string {
return strings.Join(servers, ", ")
}

func (u *upstreamResolverBase) testNameserver(server netip.AddrPort, timeout time.Duration) error {
ctx, cancel := context.WithTimeout(u.ctx, timeout)
func (u *upstreamResolverBase) testNameserver(baseCtx context.Context, externalCtx context.Context, server netip.AddrPort, timeout time.Duration) error {
mergedCtx, cancel := context.WithTimeout(baseCtx, timeout)
defer cancel()

if externalCtx != nil {
stop2 := context.AfterFunc(externalCtx, cancel)
defer stop2()
}

r := new(dns.Msg).SetQuestion(testRecord, dns.TypeSOA)

_, _, err := u.upstreamClient.exchange(ctx, server.String(), r)
_, _, err := u.upstreamClient.exchange(mergedCtx, server.String(), r)
return err
}

Expand Down
221 changes: 221 additions & 0 deletions client/internal/dns/upstream_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -475,3 +475,224 @@ func TestFormatFailures(t *testing.T) {
})
}
}

// TestIsNetBirdOverlayAddr verifies that the CGNAT range check correctly
// identifies NetBird overlay addresses vs regular addresses.
func TestIsNetBirdOverlayAddr(t *testing.T) {
testCases := []struct {
name string
addr string
expected bool
}{
{
name: "NetBird overlay address at start of range",
addr: "100.64.0.1:53",
expected: true,
},
{
name: "NetBird overlay address mid-range",
addr: "100.110.145.54:53",
expected: true,
},
{
name: "NetBird overlay address end of range",
addr: "100.127.255.254:53",
expected: true,
},
{
name: "LAN address",
addr: "192.168.1.5:53",
expected: false,
},
{
name: "LAN address secondary DC",
addr: "192.168.1.30:53",
expected: false,
},
{
name: "public DNS address",
addr: "8.8.8.8:53",
expected: false,
},
{
name: "address just below CGNAT range",
addr: "100.63.255.255:53",
expected: false,
},
{
name: "address just above CGNAT range",
addr: "100.128.0.0:53",
expected: false,
},
{
name: "loopback address",
addr: "127.0.0.1:53",
expected: false,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
addr := netip.MustParseAddrPort(tc.addr)
result := isNetBirdOverlayAddr(addr)
assert.Equal(t, tc.expected, result, "isNetBirdOverlayAddr(%s)", tc.addr)
})
}
}

// TestProbeAvailability_SkipsOverlayAddresses verifies that ProbeAvailability
// does not fire the "probe failed" warning event when all nameservers are
// NetBird overlay addresses (100.64.0.0/10). These are unreachable at engine
// startup because the WireGuard tunnel has not yet been established.
func TestProbeAvailability_SkipsOverlayAddresses(t *testing.T) {
overlayUpstream := netip.MustParseAddrPort("100.110.145.54:53")

probed := false
mockClient := &mockUpstreamResolverPerServer{
responses: map[string]mockUpstreamResponse{
overlayUpstream.String(): {err: fmt.Errorf("tunnel not up")},
},
rtt: time.Millisecond,
}

trackingClient := &trackingUpstreamClient{
inner: mockClient,
probed: &probed,
}

deactivated := false
resolver := &upstreamResolverBase{
ctx: context.Background(),
upstreamClient: trackingClient,
upstreamServers: []netip.AddrPort{overlayUpstream},
upstreamTimeout: UpstreamTimeout,
deactivate: func(error) { deactivated = true },
reactivate: func() {},
}

resolver.ProbeAvailability()

assert.False(t, probed, "overlay address should not have been probed")
assert.False(t, deactivated, "resolver should not have been deactivated for overlay address")
assert.False(t, resolver.disabled, "resolver should not be disabled")
}

// TestProbeAvailability_ProbesLANAddresses verifies that ProbeAvailability
// still probes non-overlay (LAN/public) nameservers normally.
func TestProbeAvailability_ProbesLANAddresses(t *testing.T) {
lanUpstream := netip.MustParseAddrPort("192.168.1.5:53")

probed := false
mockClient := &mockUpstreamResolverPerServer{
responses: map[string]mockUpstreamResponse{
lanUpstream.String(): {err: fmt.Errorf("connection refused")},
},
rtt: time.Millisecond,
}

trackingClient := &trackingUpstreamClient{
inner: mockClient,
probed: &probed,
}

deactivated := false
resolver := &upstreamResolverBase{
ctx: context.Background(),
upstreamClient: trackingClient,
upstreamServers: []netip.AddrPort{lanUpstream},
upstreamTimeout: UpstreamTimeout,
deactivate: func(error) { deactivated = true },
reactivate: func() {},
}

resolver.ProbeAvailability()

assert.True(t, probed, "LAN address should have been probed")
assert.True(t, deactivated, "resolver should have been deactivated when LAN probe fails")
}
Comment on lines +598 to +612

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

These failure-path tests leave the retry loop running.

Both resolvers use context.Background(). After ProbeAvailability() fails, disable() starts waitUntilResponse(), and that goroutine never exits because the mock never recovers and nothing cancels the context. That leaks work into the rest of the suite.

Use context.WithCancel here, set cancel, and register t.Cleanup(resolver.Stop) in both tests.

🧹 Suggested cleanup
-	resolver := &upstreamResolverBase{
-		ctx:             context.Background(),
+	ctx, cancel := context.WithCancel(context.Background())
+	resolver := &upstreamResolverBase{
+		ctx:             ctx,
+		cancel:          cancel,
 		upstreamClient:  trackingClient,
 		upstreamServers: []netip.AddrPort{lanUpstream},
 		upstreamTimeout: UpstreamTimeout,
 		deactivate:      func(error) { deactivated = true },
 		reactivate:      func() {},
 	}
+	t.Cleanup(resolver.Stop)

Apply the same pattern to the mixed-overlay failure test below.

Also applies to: 694-706

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@client/internal/dns/upstream_test.go` around lines 598 - 612, The tests use
context.Background() for upstreamResolverBase so when ProbeAvailability triggers
deactivate() the background context leaves waitUntilResponse() running and leaks
goroutines; fix both tests by creating the resolver with a cancellable context
via context.WithCancel (capture cancel) and register t.Cleanup(func(){ cancel();
resolver.Stop() }) or at minimum t.Cleanup(resolver.Stop) so the retry goroutine
started by waitUntilResponse() is stopped; update the tests that construct
upstreamResolverBase and the mixed-overlay failure test to use
context.WithCancel and t.Cleanup(resolver.Stop).


// TestProbeAvailability_MixedOverlayAndLAN verifies that when a nameserver
// group contains both overlay and LAN addresses, the overlay is skipped but
// the LAN address is still probed. A successful LAN probe prevents deactivation.
func TestProbeAvailability_MixedOverlayAndLAN(t *testing.T) {
overlayUpstream := netip.MustParseAddrPort("100.110.145.54:53")
lanUpstream := netip.MustParseAddrPort("192.168.1.5:53")

probedAddrs := make(map[string]bool)
mockClient := &mockUpstreamResolverPerServer{
responses: map[string]mockUpstreamResponse{
overlayUpstream.String(): {err: fmt.Errorf("tunnel not up")},
lanUpstream.String(): {msg: &dns.Msg{MsgHdr: dns.MsgHdr{Response: true}}},
},
rtt: time.Millisecond,
}

trackingClient := &trackingUpstreamClientPerAddr{
inner: mockClient,
probedAddrs: probedAddrs,
}

deactivated := false
resolver := &upstreamResolverBase{
ctx: context.Background(),
upstreamClient: trackingClient,
upstreamServers: []netip.AddrPort{overlayUpstream, lanUpstream},
upstreamTimeout: UpstreamTimeout,
deactivate: func(error) { deactivated = true },
reactivate: func() {},
}

resolver.ProbeAvailability()

assert.False(t, probedAddrs[overlayUpstream.String()], "overlay address should not have been probed")
assert.True(t, probedAddrs[lanUpstream.String()], "LAN address should have been probed")
assert.False(t, deactivated, "resolver should not be deactivated when LAN probe succeeds")
}

// TestProbeAvailability_OnlyOverlay_NoDeactivation verifies that a nameserver
// group containing only overlay addresses is never deactivated, even though
// no actual probe is performed.
func TestProbeAvailability_OnlyOverlay_NoDeactivation(t *testing.T) {
overlay1 := netip.MustParseAddrPort("100.110.145.54:53")
overlay2 := netip.MustParseAddrPort("100.64.0.1:53")

deactivated := false
resolver := &upstreamResolverBase{
ctx: context.Background(),
// upstreamClient intentionally nil — it must never be called
upstreamServers: []netip.AddrPort{overlay1, overlay2},
upstreamTimeout: UpstreamTimeout,
deactivate: func(error) { deactivated = true },
reactivate: func() {},
}

// Should not panic (nil client never called) and should not deactivate
require.NotPanics(t, func() {
resolver.ProbeAvailability()
})

assert.False(t, deactivated, "resolver should not be deactivated for overlay-only nameserver group")
assert.False(t, resolver.disabled, "resolver should not be disabled")
}

// trackingUpstreamClient records whether any probe was attempted, regardless of address.
type trackingUpstreamClient struct {
inner *mockUpstreamResolverPerServer
probed *bool
}

func (t *trackingUpstreamClient) exchange(ctx context.Context, upstream string, r *dns.Msg) (*dns.Msg, time.Duration, error) {
*t.probed = true
return t.inner.exchange(ctx, upstream, r)
}

// trackingUpstreamClientPerAddr records which addresses were probed.
type trackingUpstreamClientPerAddr struct {
inner *mockUpstreamResolverPerServer
probedAddrs map[string]bool
}

func (t *trackingUpstreamClientPerAddr) exchange(ctx context.Context, upstream string, r *dns.Msg) (*dns.Msg, time.Duration, error) {
t.probedAddrs[upstream] = true
return t.inner.exchange(ctx, upstream, r)
}