From 764bee5e2305599d535888c45ac8c0d192789b69 Mon Sep 17 00:00:00 2001 From: Michael Uray <25169478+MichaelUray@users.noreply.github.com> Date: Sat, 18 Apr 2026 06:16:29 +0000 Subject: [PATCH] [client] Fix WGIface.Close deadlock when DNS filter hook re-enters GetDevice WGIface.Close() took w.mu and held it across w.tun.Close(). The underlying wireguard-go device waits for its send/receive goroutines to drain before Close() returns, and some of those goroutines re-enter WGIface during shutdown. In particular, the userspace packet filter DNS hook in client/internal/dns.ServiceViaMemory.filterDNSTraffic calls s.wgInterface.GetDevice() on every packet, which also needs w.mu. With the Close-side holding the mutex, the read goroutine blocks in GetDevice and Close waits forever for that goroutine to exit: goroutine N (TestDNSPermanent_updateUpstream): WGIface.Close -> holds w.mu -> tun.Close -> sync.WaitGroup.Wait goroutine M (wireguard read routine): FilteredDevice.Read -> filterOutbound -> udpHooksDrop -> filterDNSTraffic.func1 -> WGIface.GetDevice -> sync.Mutex.Lock This surfaces as a 5 minute test timeout on the macOS Client/Unit CI job (panic: test timed out after 5m0s, running tests: TestDNSPermanent_updateUpstream). Release w.mu before calling w.tun.Close(). The other Close steps (wgProxyFactory.Free, waitUntilRemoved, Destroy) do not mutate any fields guarded by w.mu beyond what Free() already does, so the lock is not needed once the tun has started shutting down. A new unit test in iface_close_test.go uses a fake WGTunDevice to reproduce the deadlock deterministically without requiring CAP_NET_ADMIN. --- client/iface/iface.go | 11 ++- client/iface/iface_close_test.go | 113 +++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 client/iface/iface_close_test.go diff --git a/client/iface/iface.go b/client/iface/iface.go index 9b331d68c7c..655dd1682f4 100644 --- a/client/iface/iface.go +++ b/client/iface/iface.go @@ -217,7 +217,6 @@ func (w *WGIface) RemoveAllowedIP(peerKey string, allowedIP netip.Prefix) error // Close closes the tunnel interface func (w *WGIface) Close() error { w.mu.Lock() - defer w.mu.Unlock() var result *multierror.Error @@ -225,7 +224,15 @@ func (w *WGIface) Close() error { result = multierror.Append(result, fmt.Errorf("failed to free WireGuard proxy: %w", err)) } - if err := w.tun.Close(); err != nil { + // Release w.mu before calling w.tun.Close(): the underlying + // wireguard-go device.Close() waits for its send/receive goroutines + // to drain. Some of those goroutines re-enter WGIface methods that + // take w.mu (e.g. the packet filter DNS hook calls GetDevice()), so + // holding the mutex here would deadlock the shutdown path. + tun := w.tun + w.mu.Unlock() + + if err := tun.Close(); err != nil { result = multierror.Append(result, fmt.Errorf("failed to close wireguard interface %s: %w", w.Name(), err)) } diff --git a/client/iface/iface_close_test.go b/client/iface/iface_close_test.go new file mode 100644 index 00000000000..171e15d0a39 --- /dev/null +++ b/client/iface/iface_close_test.go @@ -0,0 +1,113 @@ +//go:build !android + +package iface + +import ( + "errors" + "sync" + "testing" + "time" + + wgdevice "golang.zx2c4.com/wireguard/device" + "golang.zx2c4.com/wireguard/tun/netstack" + + "github.com/netbirdio/netbird/client/iface/device" + "github.com/netbirdio/netbird/client/iface/udpmux" + "github.com/netbirdio/netbird/client/iface/wgaddr" + "github.com/netbirdio/netbird/client/iface/wgproxy" +) + +// fakeTunDevice implements WGTunDevice and lets the test control when +// Close() returns. It mimics the wireguard-go shutdown path, which blocks +// until its goroutines drain. Some of those goroutines (e.g. the packet +// filter DNS hook in client/internal/dns) call back into WGIface, so if +// WGIface.Close() held w.mu across tun.Close() the shutdown would +// deadlock. +type fakeTunDevice struct { + closeStarted chan struct{} + unblockClose chan struct{} +} + +func (f *fakeTunDevice) Create() (device.WGConfigurer, error) { + return nil, errors.New("not implemented") +} +func (f *fakeTunDevice) Up() (*udpmux.UniversalUDPMuxDefault, error) { + return nil, errors.New("not implemented") +} +func (f *fakeTunDevice) UpdateAddr(wgaddr.Address) error { return nil } +func (f *fakeTunDevice) WgAddress() wgaddr.Address { return wgaddr.Address{} } +func (f *fakeTunDevice) MTU() uint16 { return DefaultMTU } +func (f *fakeTunDevice) DeviceName() string { return "nb-close-test" } +func (f *fakeTunDevice) FilteredDevice() *device.FilteredDevice { return nil } +func (f *fakeTunDevice) Device() *wgdevice.Device { return nil } +func (f *fakeTunDevice) GetNet() *netstack.Net { return nil } +func (f *fakeTunDevice) GetICEBind() device.EndpointManager { return nil } + +func (f *fakeTunDevice) Close() error { + close(f.closeStarted) + <-f.unblockClose + return nil +} + +type fakeProxyFactory struct{} + +func (fakeProxyFactory) GetProxy() wgproxy.Proxy { return nil } +func (fakeProxyFactory) GetProxyPort() uint16 { return 0 } +func (fakeProxyFactory) Free() error { return nil } + +// TestWGIface_CloseReleasesMutexBeforeTunClose guards against a deadlock +// that surfaces as a macOS test-timeout in +// TestDNSPermanent_updateUpstream: WGIface.Close() used to hold w.mu +// while waiting for the wireguard-go device goroutines to finish, and +// one of those goroutines (the DNS filter hook) calls back into +// WGIface.GetDevice() which needs the same mutex. The fix is to drop +// the lock before tun.Close() returns control. +func TestWGIface_CloseReleasesMutexBeforeTunClose(t *testing.T) { + tun := &fakeTunDevice{ + closeStarted: make(chan struct{}), + unblockClose: make(chan struct{}), + } + w := &WGIface{ + tun: tun, + wgProxyFactory: fakeProxyFactory{}, + } + + closeDone := make(chan error, 1) + go func() { + closeDone <- w.Close() + }() + + select { + case <-tun.closeStarted: + case <-time.After(2 * time.Second): + close(tun.unblockClose) + t.Fatal("tun.Close() was never invoked") + } + + // Simulate the WireGuard read goroutine calling back into WGIface + // via the packet filter's DNS hook. If Close() still held w.mu + // during tun.Close(), this would block until the test timeout. + getDeviceDone := make(chan struct{}) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + _ = w.GetDevice() + close(getDeviceDone) + }() + + select { + case <-getDeviceDone: + case <-time.After(2 * time.Second): + close(tun.unblockClose) + wg.Wait() + t.Fatal("GetDevice() deadlocked while WGIface.Close was closing the tun") + } + + close(tun.unblockClose) + select { + case <-closeDone: + case <-time.After(2 * time.Second): + t.Fatal("WGIface.Close() never returned after the tun was unblocked") + } +}