From b5d6101d0dc3f2510ffc8e7f897512d7c01d4ea0 Mon Sep 17 00:00:00 2001
From: Jamie Liu
Date: Wed, 19 Mar 2025 11:55:07 -0700
Subject: [PATCH] kvm: replace machine.availableCond with futexes

Between when machine.Get() observes that a vCPU is not in state vCPUReady,
and when it observes that the same vCPU is not in state vCPUGuest, the vCPU
can racily transition from vCPUGuest to vCPUReady in bluepillGuestExit(),
causing machine.Get() to block in machine.available.Wait() despite a vCPU
being ready.

To fix this, drop the vCPU handoff mechanism (vCPUWaiter is still needed
for bounce()) and replace machine.available with a sequence counter futex.
This also improves utilization by allowing machine.Get() to grab the first
vCPU that becomes ready, rather than only the specific vCPU it tags for
handoff.

PiperOrigin-RevId: 738486108
---
 pkg/sentry/platform/kvm/BUILD              |  2 +
 pkg/sentry/platform/kvm/bluepill_unsafe.go |  6 +-
 pkg/sentry/platform/kvm/config.go          | 16 +++-
 pkg/sentry/platform/kvm/kvm_test.go        | 58 +++++++++++++
 pkg/sentry/platform/kvm/machine.go         | 95 ++++++++++++----------
 pkg/sentry/platform/kvm/machine_amd64.go   | 25 +++---
 pkg/sentry/platform/kvm/machine_arm64.go   | 18 ++--
 pkg/sentry/platform/kvm/machine_unsafe.go  | 38 +++++++++
 8 files changed, 187 insertions(+), 71 deletions(-)
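Note for reviewers: the heart of this change is the sequence-counter futex
protocol implemented in machine_unsafe.go and driven from machine.Get() and
machine.Put(). The sketch below is illustrative only and is not part of the
patch: it uses sync/atomic and golang.org/x/sys/unix in place of gVisor's
atomicbitops and hostsyscall packages, and a generic slot pool in place of
m.vCPUsByTID.

    package seqpool

    import (
        "sync/atomic"
        "unsafe"

        "golang.org/x/sys/unix"
    )

    // pool guards a fixed set of slots, each either ready (0) or acquired
    // (1). seq and waiters mirror machine.availableSeq and
    // machine.availableWaiters.
    type pool struct {
        slots   []atomic.Uint32
        seq     atomic.Uint32
        waiters atomic.Int32
    }

    // tryAcquire stands in for machine.Get()'s CompareAndSwap scan over
    // m.vCPUsByTID. It returns the acquired slot's index, or -1.
    func (p *pool) tryAcquire() int {
        for i := range p.slots {
            if p.slots[i].CompareAndSwap(0, 1) {
                return i
            }
        }
        return -1
    }

    // get blocks until a slot can be acquired.
    func (p *pool) get() int {
        if i := p.tryAcquire(); i >= 0 {
            return i // fast path: no waiter registration needed
        }
        // Register as a waiter so that release() issues FUTEX_WAKE.
        p.waiters.Add(1)
        defer p.waiters.Add(-1)
        for {
            // Load the epoch before scanning. If a slot is released after
            // a failed scan, seq no longer equals epoch and FUTEX_WAIT
            // returns EAGAIN instead of sleeping, so the wakeup cannot be
            // lost.
            epoch := p.seq.Load()
            if i := p.tryAcquire(); i >= 0 {
                return i
            }
            unix.Syscall6(unix.SYS_FUTEX, uintptr(unsafe.Pointer(&p.seq)),
                uintptr(unix.FUTEX_WAIT|unix.FUTEX_PRIVATE_FLAG),
                uintptr(epoch), 0, 0, 0)
        }
    }

    // release marks a slot ready and wakes one waiter, if any. The seq
    // increment happens after the slot becomes ready and before the
    // waiters check, mirroring availableNotify().
    func (p *pool) release(i int) {
        p.slots[i].Store(0)
        p.seq.Add(1)
        if p.waiters.Load() != 0 {
            unix.Syscall(unix.SYS_FUTEX, uintptr(unsafe.Pointer(&p.seq)),
                uintptr(unix.FUTEX_WAKE|unix.FUTEX_PRIVATE_FLAG), 1)
        }
    }

By contrast, in the old code a vCPU that raced from vCPUGuest to vCPUReady
between Get()'s two scans was neither acquirable nor tagged for handoff, so
Get() could block in machine.available.Wait() with nothing left to signal
it.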
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 2aac1a1ef7..8eeb0a84ae 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -130,6 +130,7 @@ go_test(
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/kvm/testutil",
         "//pkg/sentry/time",
+        "//pkg/sync",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
@@ -164,6 +165,7 @@ go_test(
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/kvm/testutil",
         "//pkg/sentry/time",
+        "//pkg/sync",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index fe161acdd0..beda31cd02 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -72,7 +72,11 @@ func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
 
 	// Return to the vCPUReady state; notify any waiters.
 	user := c.state.Load() & vCPUUser
-	switch c.state.Swap(user) {
+	oldState := c.state.Swap(user)
+	if user == 0 {
+		c.machine.availableNotify()
+	}
+	switch oldState {
 	case user | vCPUGuest: // Expected case.
 	case user | vCPUGuest | vCPUWaiter:
 		c.notify()
diff --git a/pkg/sentry/platform/kvm/config.go b/pkg/sentry/platform/kvm/config.go
index 8cf8fa5c87..76d28876db 100644
--- a/pkg/sentry/platform/kvm/config.go
+++ b/pkg/sentry/platform/kvm/config.go
@@ -18,6 +18,18 @@
 package kvm
 
+import "fmt"
+
 // Config sets configuration options for each platform instance.
-type Config struct{}
+type Config struct {
+	// MaxVCPUs is the maximum number of vCPUs the platform instance will
+	// create. If MaxVCPUs is 0, the platform will choose a reasonable default.
+	MaxVCPUs int
+}
 
-func (*machine) applyConfig(config *Config) error { return nil }
+func (m *machine) applyConfig(config *Config) error {
+	if config.MaxVCPUs < 0 {
+		return fmt.Errorf("invalid Config.MaxVCPUs: %d", config.MaxVCPUs)
+	}
+	m.maxVCPUs = config.MaxVCPUs
+	return nil
+}
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index 48a501c016..67c25782e7 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
 	ktime "gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // dummyFPState is initialized in TestMain.
@@ -480,6 +481,63 @@ func TestKernelVDSO(t *testing.T) {
 	})
 }
 
+// Regression test for b/404271139.
+func TestSingleVCPU(t *testing.T) {
+	// Create the machine.
+	deviceFile, err := OpenDevice("")
+	if err != nil {
+		t.Fatalf("error opening device file: %v", err)
+	}
+	k, err := New(deviceFile, Config{
+		MaxVCPUs: 1,
+	})
+	if err != nil {
+		t.Fatalf("error creating KVM instance: %v", err)
+	}
+	defer k.machine.Destroy()
+
+	// Ping-pong the single vCPU between two goroutines. The test passes if
+	// this does not deadlock.
+	stopC := make(chan struct{})
+	var doneWG sync.WaitGroup
+	defer func() {
+		close(stopC)
+		doneWG.Wait()
+	}()
+	var wakeC [2]chan struct{}
+	for i := range wakeC {
+		wakeC[i] = make(chan struct{}, 1)
+	}
+	for i := range wakeC {
+		doneWG.Add(1)
+		go func(i int) {
+			defer doneWG.Done()
+			for {
+				// Multiple ready channels in a select statement are
+				// chosen at random, so do a separate non-blocking
+				// receive from stopC first to ensure that it's
+				// honored in deterministic time.
+				select {
+				case <-stopC:
+					return
+				default:
+				}
+				select {
+				case <-stopC:
+					return
+				case <-wakeC[i]:
+					c := k.machine.Get()
+					bluepill(c)
+					wakeC[1-i] <- struct{}{}
+					k.machine.Put(c)
+				}
+			}
+		}(i)
+	}
+	wakeC[0] <- struct{}{}
+	time.Sleep(time.Second)
+}
+
 func BenchmarkApplicationSyscall(b *testing.B) {
 	var (
 		i int // Iteration includes machine.Get() / machine.Put().
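For reference, a platform consumer opts into the new knob the same way the
regression test above does. A minimal sketch, assuming only the exported
OpenDevice/New API as used in the test (newSingleVCPUPlatform is a
hypothetical helper, not part of this patch):

    package example

    import "gvisor.dev/gvisor/pkg/sentry/platform/kvm"

    // newSingleVCPUPlatform mirrors TestSingleVCPU's setup. With
    // MaxVCPUs: 1, every machine.Get() contends for the same vCPU, which
    // is what made the old handoff race observable.
    func newSingleVCPUPlatform() (*kvm.KVM, error) {
        deviceFile, err := kvm.OpenDevice("") // "" selects the default device path
        if err != nil {
            return nil, err
        }
        return kvm.New(deviceFile, kvm.Config{MaxVCPUs: 1})
    }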
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index a89dea1706..67c97ea234 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -60,8 +60,12 @@ type machine struct {
 	// mu protects vCPUs.
 	mu sync.RWMutex
 
-	// available is notified when vCPUs are available.
-	available sync.Cond
+	// availableWaiters is the number of goroutines waiting for a vCPU to
+	// become ready.
+	availableWaiters atomicbitops.Int32
+
+	// availableSeq is incremented whenever a vCPU becomes ready.
+	availableSeq atomicbitops.Uint32
 
 	// vCPUsByTID are the machine vCPUs.
 	//
@@ -270,7 +274,6 @@ var forceMappingEntireAddressSpace = false
 func newMachine(vm int, config *Config) (*machine, error) {
 	// Create the machine.
 	m := &machine{fd: vm}
-	m.available.L = &m.mu
 
 	if err := m.applyConfig(config); err != nil {
 		panic(fmt.Sprintf("error setting config parameters: %s", err))
@@ -532,6 +535,7 @@ func (m *machine) Get() *vCPU {
 	runtime.UnlockOSThread()
 	m.mu.Lock()
+	waiter := false
 
 	for {
 		runtime.LockOSThread()
 		tid = hosttid.Current()
@@ -540,6 +544,9 @@
 		if c := m.vCPUsByTID[tid]; c != nil {
 			c.lock()
 			m.mu.Unlock()
+			if waiter {
+				m.availableWaiters.Add(-1)
+			}
 			getVCPUCounter.Increment(&getVCPUAcquisitionReused)
 			return c
 		}
@@ -551,68 +558,62 @@
 			c.lock()
 			m.vCPUsByTID[tid] = c
 			m.mu.Unlock()
+			if waiter {
+				m.availableWaiters.Add(-1)
+			}
 			c.loadSegments(tid)
 			getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
 			return c
 		}
 
-		// Scan for an available vCPU.
+		if !waiter {
+			// Scan for an available vCPU, best-effort.
+			for origTID, c := range m.vCPUsByTID {
+				if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
+					delete(m.vCPUsByTID, origTID)
+					m.vCPUsByTID[tid] = c
+					m.mu.Unlock()
+					c.loadSegments(tid)
+					getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
+					return c
+				}
+			}
+
+			// Indicate that we are waiting for a vCPU.
+			waiter = true
+			m.availableWaiters.Add(1)
+		}
+
+		// Scan for an available vCPU, with m.availableWaiters != 0.
+		epoch := m.availableSeq.Load()
 		for origTID, c := range m.vCPUsByTID {
 			if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
 				delete(m.vCPUsByTID, origTID)
 				m.vCPUsByTID[tid] = c
 				m.mu.Unlock()
+				m.availableWaiters.Add(-1)
 				c.loadSegments(tid)
 				getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
 				return c
 			}
 		}
 
-		// Scan for something not in user mode.
-		for origTID, c := range m.vCPUsByTID {
-			if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
-				continue
-			}
-
-			// The vCPU is not be able to transition to
-			// vCPUGuest|vCPUWaiter or to vCPUUser because that
-			// transition requires holding the machine mutex, as we
-			// do now. There is no path to register a waiter on
-			// just the vCPUReady state.
-			for {
-				c.waitUntilNot(vCPUGuest | vCPUWaiter)
-				if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
-					break
-				}
-			}
-
-			// Steal the vCPU.
-			delete(m.vCPUsByTID, origTID)
-			m.vCPUsByTID[tid] = c
-			m.mu.Unlock()
-			c.loadSegments(tid)
-			getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
-			return c
-		}
-
-		// Everything is executing in user mode. Wait until something is
-		// available. As with m.mu.Lock() above, unlock the OS thread while we
-		// do this to avoid spawning additional system threads. Note that
-		// signaling the condition variable will have the extra effect of
-		// kicking the vCPUs out of guest mode if that's where they were.
+		// All vCPUs are already in guest mode. Wait until a vCPU becomes
+		// available. m.availableWait() blocks in the host, but unlocking the
+		// OS thread still makes waking up less expensive if sysmon steals our
+		// P while we're blocked.
 		runtime.UnlockOSThread()
-		m.available.Wait()
+		m.availableWait(epoch)
 	}
 }
 
 // Put puts the current vCPU.
 func (m *machine) Put(c *vCPU) {
-	c.unlock()
+	ready := c.unlock()
 	runtime.UnlockOSThread()
-
-	m.mu.RLock()
-	m.available.Signal()
-	m.mu.RUnlock()
+	if ready {
+		m.availableNotify()
+	}
 }
 
 // newDirtySet returns a new dirty set.
@@ -646,15 +647,16 @@ func (c *vCPU) lock() {
 	atomicbitops.OrUint32(&c.state, vCPUUser)
 }
 
-// unlock clears the vCPUUser bit.
+// unlock clears the vCPUUser bit. It returns true if it transitions the vCPU
+// state to vCPUReady.
 //
 //go:nosplit
-func (c *vCPU) unlock() {
+func (c *vCPU) unlock() bool {
 	origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest)
 	if origState == vCPUUser|vCPUGuest {
 		// Happy path: no exits are forced, and we can continue
 		// executing on our merry way with a single atomic access.
-		return
+		return false
 	}
 
 	// Clear the lock.
@@ -668,6 +670,7 @@
 	switch origState {
 	case vCPUUser:
 		// Normal state.
+		return true
 	case vCPUUser | vCPUGuest | vCPUWaiter:
 		// Force a transition: this must trigger a notification when we
 		// return from guest mode. We must clear vCPUWaiter here
@@ -677,11 +680,13 @@
 		// syscall in this period, BounceToKernel will hang.
 		atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
 		c.notify()
+		return false
 	case vCPUUser | vCPUWaiter:
 		// Waiting for the lock to be released; the responsibility is
 		// on us to notify the waiter and clear the associated bit.
		atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
 		c.notify()
+		return true
 	default:
 		panic("invalid state")
 	}
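A note on the machine.go changes above: after this change a vCPU can become
ready at two sites, and only the site that actually produces the vCPUReady
transition may call availableNotify(). The sketch below is a simplified
mirror of the two paths, with illustrative bit constants; the real state
bits, waiter handling, and unlock()'s single-CAS fast path live in
machine.go and bluepill_unsafe.go.

    package example

    import "sync/atomic"

    // Illustrative state bits (the real values live in machine.go).
    const (
        ready uint32 = 0      // no bits set: the vCPU may be grabbed by Get()
        user  uint32 = 1 << 0 // held by a machine.Get() caller
        guest uint32 = 1 << 1 // executing in guest mode
    )

    // put mirrors machine.Put() -> vCPU.unlock(): clear the user bit and
    // notify only if that leaves the vCPU ready.
    func put(state *atomic.Uint32, notify func()) {
        for {
            old := state.Load()
            if !state.CompareAndSwap(old, old&^user) {
                continue
            }
            // If the guest bit was set, the vCPU is still in guest mode;
            // guestExit below notifies once it leaves.
            if old&^user == ready {
                notify() // user -> ready: wake one blocked Get()
            }
            return
        }
    }

    // guestExit mirrors bluepillGuestExit(): drop the guest bit, keeping
    // only the user bit; if no Get() caller holds the vCPU, it just
    // became ready and a waiter can take it.
    func guestExit(state *atomic.Uint32, notify func()) {
        userBit := state.Load() & user
        state.Swap(userBit)
        if userBit == 0 {
            notify()
        }
    }

Notifying from both sites is exactly what the old condition-variable scheme
lacked: only Put() signaled, so a vCPU that became ready via the guest-exit
path woke nobody.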
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 342013d90b..1c7a4e01cb 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -497,21 +497,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
 }
 
 // getMaxVCPU get max vCPU number
 func (m *machine) getMaxVCPU() {
+	if m.maxVCPUs == 0 {
+		// The goal here is to avoid vCPU contention for reasonable
+		// workloads. But "reasonable" isn't defined well in this case.
+		// Let's say that CPU overcommit with factor 2 is still acceptable.
+		// We allocate one set of vCPUs for each Go runtime processor (P)
+		// and two more sets of vCPUs to run user code.
+		m.maxVCPUs = 3 * runtime.GOMAXPROCS(0)
+	}
+
+	// Apply the KVM limit.
 	maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
 	if errno != 0 {
-		m.maxVCPUs = _KVM_NR_VCPUS
-	} else {
-		m.maxVCPUs = int(maxVCPUs)
+		maxVCPUs = _KVM_NR_VCPUS
 	}
-
-	// The goal here is to avoid vCPU contentions for reasonable workloads.
-	// But "reasonable" isn't defined well in this case. Let's say that CPU
-	// overcommit with factor 2 is still acceptable. We allocate a set of
-	// vCPU for each goruntime processor (P) and two sets of vCPUs to run
-	// user code.
-	rCPUs := runtime.GOMAXPROCS(0)
-	if 3*rCPUs < m.maxVCPUs {
-		m.maxVCPUs = 3 * rCPUs
+	if m.maxVCPUs > int(maxVCPUs) {
+		m.maxVCPUs = int(maxVCPUs)
 	}
 }
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index bcc7fb7760..f60e8fff60 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -185,16 +185,12 @@ func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType,
 
 // getMaxVCPU get max vCPU number
 func (m *machine) getMaxVCPU() {
-	rmaxVCPUs := runtime.NumCPU()
-	smaxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
-	// compare the max vcpu number from runtime and syscall, use smaller one.
-	if errno != 0 {
-		m.maxVCPUs = rmaxVCPUs
-	} else {
-		if rmaxVCPUs < int(smaxVCPUs) {
-			m.maxVCPUs = rmaxVCPUs
-		} else {
-			m.maxVCPUs = int(smaxVCPUs)
-		}
+	if m.maxVCPUs == 0 {
+		m.maxVCPUs = runtime.NumCPU()
+	}
+
+	// Apply the KVM limit.
+	if maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS); errno == 0 && m.maxVCPUs > int(maxVCPUs) {
+		m.maxVCPUs = int(maxVCPUs)
 	}
 }
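The amd64 policy above now composes with Config.MaxVCPUs: an explicit
setting wins, otherwise the default is 3x GOMAXPROCS, and either value is
clamped to KVM's reported ceiling. As a worked sketch of the arithmetic (a
pure function with illustrative names; for example, configured=0,
gomaxprocs=8, and a KVM limit of 255 yield 24, while configured=1 yields 1):

    // defaultMaxVCPUs models getMaxVCPU() on amd64: one set of vCPUs per
    // runtime P plus two more sets for user code (i.e. 2x CPU overcommit),
    // clamped to the KVM_CAP_MAX_VCPUS ceiling.
    func defaultMaxVCPUs(configured, gomaxprocs, kvmLimit int) int {
        n := configured
        if n == 0 {
            n = 3 * gomaxprocs
        }
        if n > kvmLimit {
            n = kvmLimit
        }
        return n
    }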
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index cf2e36c5c7..cf8b37b4b8 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -114,6 +114,44 @@ func (a *atomicAddressSpace) get() *addressSpace {
 	return (*addressSpace)(atomic.LoadPointer(&a.pointer))
 }
 
+// availableNotify is called when a vCPU's state transitions to vCPUReady.
+//
+//go:nosplit
+func (m *machine) availableNotify() {
+	m.availableSeq.Add(1)
+	if m.availableWaiters.Load() == 0 {
+		return
+	}
+	errno := hostsyscall.RawSyscallErrno( // escapes: no.
+		unix.SYS_FUTEX,
+		uintptr(unsafe.Pointer(&m.availableSeq)),
+		linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
+		1)
+	if errno != 0 {
+		throw("futex wake error")
+	}
+}
+
+// availableWait blocks until availableNotify is called.
+//
+// Preconditions:
+//   - epoch was the value of m.availableSeq before the caller last checked
+//     that no vCPUs were in state vCPUReady.
+//   - m.availableWaiters must be non-zero.
+//
+//go:nosplit
+func (m *machine) availableWait(epoch uint32) {
+	_, _, errno := unix.Syscall6(
+		unix.SYS_FUTEX,
+		uintptr(unsafe.Pointer(&m.availableSeq)),
+		linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG,
+		uintptr(epoch),
+		0, 0, 0)
+	if errno != 0 && errno != unix.EINTR && errno != unix.EAGAIN {
+		panic("futex wait error")
+	}
+}
+
 // notify notifies that the vCPU has transitioned modes.
 //
 // This may be called by a signal handler and therefore throws on error.
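Finally, availableWait()'s correctness rests on a kernel-side guarantee:
FUTEX_WAIT atomically compares the futex word against the expected value
and returns EAGAIN instead of sleeping on a mismatch, which is why a
notification landing between the epoch read and the wait cannot be missed.
A minimal Linux-only demonstration of that property (standalone, not part
of this patch):

    package main

    import (
        "fmt"
        "sync/atomic"
        "unsafe"

        "golang.org/x/sys/unix"
    )

    func main() {
        var seq atomic.Uint32
        epoch := seq.Load() // a waiter reads the epoch (0)...
        seq.Add(1)          // ...and a notification slips in before it sleeps.
        _, _, errno := unix.Syscall6(unix.SYS_FUTEX,
            uintptr(unsafe.Pointer(&seq)),
            uintptr(unix.FUTEX_WAIT|unix.FUTEX_PRIVATE_FLAG),
            uintptr(epoch), 0, 0, 0)
        // The kernel sees seq == 1 != 0 and refuses to block.
        fmt.Println(errno == unix.EAGAIN) // true
    }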