diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 2aac1a1ef7..8eeb0a84ae 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -130,6 +130,7 @@ go_test(
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/kvm/testutil",
         "//pkg/sentry/time",
+        "//pkg/sync",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
@@ -164,6 +165,7 @@ go_test(
         "//pkg/sentry/platform",
         "//pkg/sentry/platform/kvm/testutil",
         "//pkg/sentry/time",
+        "//pkg/sync",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index fe161acdd0..beda31cd02 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -72,7 +72,11 @@ func bluepillGuestExit(c *vCPU, context unsafe.Pointer) {
 
 	// Return to the vCPUReady state; notify any waiters.
 	user := c.state.Load() & vCPUUser
-	switch c.state.Swap(user) {
+	oldState := c.state.Swap(user)
+	if user == 0 {
+		c.machine.availableNotify()
+	}
+	switch oldState {
 	case user | vCPUGuest: // Expected case.
 	case user | vCPUGuest | vCPUWaiter:
 		c.notify()
diff --git a/pkg/sentry/platform/kvm/config.go b/pkg/sentry/platform/kvm/config.go
index 8cf8fa5c87..76d28876db 100644
--- a/pkg/sentry/platform/kvm/config.go
+++ b/pkg/sentry/platform/kvm/config.go
@@ -18,6 +18,16 @@ package kvm
 
 // Config sets configuration options for each platform instance.
-type Config struct{}
+type Config struct {
+	// MaxVCPUs is the maximum number of vCPUs the platform instance will
+	// create. If MaxVCPUs is 0, the platform will choose a reasonable default.
+	MaxVCPUs int
+}
 
-func (*machine) applyConfig(config *Config) error { return nil }
+func (m *machine) applyConfig(config *Config) error {
+	if config.MaxVCPUs < 0 {
+		return fmt.Errorf("invalid Config.MaxVCPUs: %d", config.MaxVCPUs)
+	}
+	m.maxVCPUs = config.MaxVCPUs
+	return nil
+}
 
diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go
index 48a501c016..67c25782e7 100644
--- a/pkg/sentry/platform/kvm/kvm_test.go
+++ b/pkg/sentry/platform/kvm/kvm_test.go
@@ -32,6 +32,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/platform"
 	"gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
 	ktime "gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sync"
 )
 
 // dummyFPState is initialized in TestMain.
@@ -480,6 +481,63 @@ func TestKernelVDSO(t *testing.T) {
 	})
 }
 
+// Regression test for b/404271139.
+func TestSingleVCPU(t *testing.T) {
+	// Create the machine.
+	deviceFile, err := OpenDevice("")
+	if err != nil {
+		t.Fatalf("error opening device file: %v", err)
+	}
+	k, err := New(deviceFile, Config{
+		MaxVCPUs: 1,
+	})
+	if err != nil {
+		t.Fatalf("error creating KVM instance: %v", err)
+	}
+	defer k.machine.Destroy()
+
+	// Ping-pong the single vCPU between two goroutines. The test passes if
+	// this does not deadlock.
+	stopC := make(chan struct{})
+	var doneWG sync.WaitGroup
+	defer func() {
+		close(stopC)
+		doneWG.Wait()
+	}()
+	var wakeC [2]chan struct{}
+	for i := range wakeC {
+		wakeC[i] = make(chan struct{}, 1)
+	}
+	for i := range wakeC {
+		doneWG.Add(1)
+		go func(i int) {
+			defer doneWG.Done()
+			for {
+				// Multiple ready channels in a select statement are chosen
+				// at random, so have a separate non-blocking receive from
+				// stopC first to ensure that it's honored in deterministic
+				// time.
+				select {
+				case <-stopC:
+					return
+				default:
+				}
+				select {
+				case <-stopC:
+					return
+				case <-wakeC[i]:
+					c := k.machine.Get()
+					bluepill(c)
+					wakeC[1-i] <- struct{}{}
+					k.machine.Put(c)
+				}
+			}
+		}(i)
+	}
+	wakeC[0] <- struct{}{}
+	time.Sleep(time.Second)
+}
+
 func BenchmarkApplicationSyscall(b *testing.B) {
 	var (
 		i int // Iteration includes machine.Get() / machine.Put().
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index a89dea1706..67c97ea234 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -60,8 +60,12 @@ type machine struct {
 	// mu protects vCPUs.
 	mu sync.RWMutex
 
-	// available is notified when vCPUs are available.
-	available sync.Cond
+	// availableWaiters is the number of goroutines waiting for a vCPU to
+	// become ready.
+	availableWaiters atomicbitops.Int32
+
+	// availableSeq is incremented whenever a vCPU becomes ready.
+	availableSeq atomicbitops.Uint32
 
 	// vCPUsByTID are the machine vCPUs.
 	//
@@ -270,7 +274,6 @@ var forceMappingEntireAddressSpace = false
 func newMachine(vm int, config *Config) (*machine, error) {
 	// Create the machine.
 	m := &machine{fd: vm}
-	m.available.L = &m.mu
 
 	if err := m.applyConfig(config); err != nil {
 		panic(fmt.Sprintf("error setting config parameters: %s", err))
@@ -532,6 +535,7 @@ func (m *machine) Get() *vCPU {
 	runtime.UnlockOSThread()
 	m.mu.Lock()
 
+	waiter := false
 	for {
 		runtime.LockOSThread()
 		tid = hosttid.Current()
@@ -540,6 +544,9 @@ func (m *machine) Get() *vCPU {
 		if c := m.vCPUsByTID[tid]; c != nil {
 			c.lock()
 			m.mu.Unlock()
+			if waiter {
+				m.availableWaiters.Add(-1)
+			}
 			getVCPUCounter.Increment(&getVCPUAcquisitionReused)
 			return c
 		}
@@ -551,68 +558,62 @@ func (m *machine) Get() *vCPU {
 			c.lock()
 			m.vCPUsByTID[tid] = c
 			m.mu.Unlock()
+			if waiter {
+				m.availableWaiters.Add(-1)
+			}
 			c.loadSegments(tid)
 			getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
 			return c
 		}
 
-		// Scan for an available vCPU.
+		if !waiter {
+			// Scan for an available vCPU, best-effort.
+			for origTID, c := range m.vCPUsByTID {
+				if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
+					delete(m.vCPUsByTID, origTID)
+					m.vCPUsByTID[tid] = c
+					m.mu.Unlock()
+					c.loadSegments(tid)
+					getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
+					return c
+				}
+			}
+
+			// Indicate that we are waiting for a vCPU.
+			waiter = true
+			m.availableWaiters.Add(1)
+		}
+
+		// Scan for an available vCPU, with m.availableWaiters != 0.
+		epoch := m.availableSeq.Load()
 		for origTID, c := range m.vCPUsByTID {
 			if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
 				delete(m.vCPUsByTID, origTID)
 				m.vCPUsByTID[tid] = c
 				m.mu.Unlock()
+				m.availableWaiters.Add(-1)
 				c.loadSegments(tid)
 				getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
 				return c
 			}
 		}
 
-		// Scan for something not in user mode.
-		for origTID, c := range m.vCPUsByTID {
-			if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
-				continue
-			}
-
-			// The vCPU is not be able to transition to
-			// vCPUGuest|vCPUWaiter or to vCPUUser because that
-			// transition requires holding the machine mutex, as we
-			// do now. There is no path to register a waiter on
-			// just the vCPUReady state.
-			for {
-				c.waitUntilNot(vCPUGuest | vCPUWaiter)
-				if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
-					break
-				}
-			}
-
-			// Steal the vCPU.
-			delete(m.vCPUsByTID, origTID)
-			m.vCPUsByTID[tid] = c
-			m.mu.Unlock()
-			c.loadSegments(tid)
-			getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
-			return c
-		}
-
-		// Everything is executing in user mode. Wait until something is
-		// available. As with m.mu.Lock() above, unlock the OS thread while we
-		// do this to avoid spawning additional system threads. Note that
-		// signaling the condition variable will have the extra effect of
-		// kicking the vCPUs out of guest mode if that's where they were.
+		// All vCPUs are already in guest mode. Wait until a vCPU becomes
+		// available. m.availableWait() blocks in the host, but unlocking the
+		// OS thread still makes waking up less expensive if sysmon steals our
+		// P while we're blocked.
 		runtime.UnlockOSThread()
-		m.available.Wait()
+		m.availableWait(epoch)
 	}
 }
 
 // Put puts the current vCPU.
 func (m *machine) Put(c *vCPU) {
-	c.unlock()
+	ready := c.unlock()
 	runtime.UnlockOSThread()
-
-	m.mu.RLock()
-	m.available.Signal()
-	m.mu.RUnlock()
+	if ready {
+		m.availableNotify()
+	}
 }
 
 // newDirtySet returns a new dirty set.
@@ -646,15 +647,16 @@ func (c *vCPU) lock() {
 	atomicbitops.OrUint32(&c.state, vCPUUser)
 }
 
-// unlock clears the vCPUUser bit.
+// unlock clears the vCPUUser bit. It returns true if it transitions the vCPU
+// state to vCPUReady.
 //
 //go:nosplit
-func (c *vCPU) unlock() {
+func (c *vCPU) unlock() bool {
 	origState := atomicbitops.CompareAndSwapUint32(&c.state, vCPUUser|vCPUGuest, vCPUGuest)
 	if origState == vCPUUser|vCPUGuest {
 		// Happy path: no exits are forced, and we can continue
 		// executing on our merry way with a single atomic access.
-		return
+		return false
 	}
 
 	// Clear the lock.
@@ -668,6 +670,7 @@ func (c *vCPU) unlock() {
 	switch origState {
 	case vCPUUser:
 		// Normal state.
+		return true
 	case vCPUUser | vCPUGuest | vCPUWaiter:
 		// Force a transition: this must trigger a notification when we
 		// return from guest mode. We must clear vCPUWaiter here
@@ -677,11 +680,13 @@ func (c *vCPU) unlock() {
 		// syscall in this period, BounceToKernel will hang.
 		atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
 		c.notify()
+		return false
 	case vCPUUser | vCPUWaiter:
 		// Waiting for the lock to be released; the responsibility is
 		// on us to notify the waiter and clear the associated bit.
 		atomicbitops.AndUint32(&c.state, ^vCPUWaiter)
 		c.notify()
+		return true
 	default:
 		panic("invalid state")
 	}
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index 342013d90b..1c7a4e01cb 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -497,21 +497,22 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
 
 // getMaxVCPU get max vCPU number
 func (m *machine) getMaxVCPU() {
+	if m.maxVCPUs == 0 {
+		// The goal here is to avoid vCPU contention for reasonable workloads.
+		// But "reasonable" isn't defined well in this case. Let's say that CPU
+		// overcommit with factor 2 is still acceptable. We allocate a set of
+		// vCPUs for each Go runtime processor (P) and two sets of vCPUs to
+		// run user code.
+		m.maxVCPUs = 3 * runtime.GOMAXPROCS(0)
+	}
+
+	// Apply KVM limit.
 	maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
 	if errno != 0 {
-		m.maxVCPUs = _KVM_NR_VCPUS
-	} else {
-		m.maxVCPUs = int(maxVCPUs)
+		maxVCPUs = _KVM_NR_VCPUS
 	}
-
-	// The goal here is to avoid vCPU contentions for reasonable workloads.
-	// But "reasonable" isn't defined well in this case. Let's say that CPU
-	// overcommit with factor 2 is still acceptable. We allocate a set of
-	// vCPU for each goruntime processor (P) and two sets of vCPUs to run
-	// user code.
-	rCPUs := runtime.GOMAXPROCS(0)
-	if 3*rCPUs < m.maxVCPUs {
-		m.maxVCPUs = 3 * rCPUs
+	if m.maxVCPUs > int(maxVCPUs) {
+		m.maxVCPUs = int(maxVCPUs)
 	}
 }
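To make the sizing rule above concrete: with no explicit Config.MaxVCPUs, the pool defaults to three vCPUs per runtime P and is then clamped to whatever KVM_CAP_MAX_VCPUS reports. The small, self-contained sketch below models that arithmetic; the helper name effectiveMaxVCPUs and the example numbers are invented for illustration and are not part of the patch. The arm64 variant that follows applies the same clamp but defaults to runtime.NumCPU().

package main

import (
	"fmt"
	"runtime"
)

// effectiveMaxVCPUs models the policy above: honor an explicit
// Config.MaxVCPUs if one was set, otherwise default to three vCPUs per
// runtime P (one set for the Go runtime and two sets for user code, i.e.
// roughly 2x CPU overcommit), and never exceed the KVM-reported limit.
func effectiveMaxVCPUs(configured, kvmCap int) int {
	n := configured
	if n == 0 {
		n = 3 * runtime.GOMAXPROCS(0)
	}
	if n > kvmCap {
		n = kvmCap
	}
	return n
}

func main() {
	runtime.GOMAXPROCS(8) // force 8 Ps for a deterministic example
	// Default: 3 * 8 = 24 vCPUs, well under an illustrative KVM cap of 4096.
	fmt.Println(effectiveMaxVCPUs(0, 4096)) // 24
	// An explicit Config.MaxVCPUs of 1, as in TestSingleVCPU, is used as-is.
	fmt.Println(effectiveMaxVCPUs(1, 4096)) // 1
}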
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index bcc7fb7760..f60e8fff60 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -185,16 +185,12 @@ func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType,
 
 // getMaxVCPU get max vCPU number
 func (m *machine) getMaxVCPU() {
-	rmaxVCPUs := runtime.NumCPU()
-	smaxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
-	// compare the max vcpu number from runtime and syscall, use smaller one.
-	if errno != 0 {
-		m.maxVCPUs = rmaxVCPUs
-	} else {
-		if rmaxVCPUs < int(smaxVCPUs) {
-			m.maxVCPUs = rmaxVCPUs
-		} else {
-			m.maxVCPUs = int(smaxVCPUs)
-		}
+	if m.maxVCPUs == 0 {
+		m.maxVCPUs = runtime.NumCPU()
+	}
+
+	// Apply KVM limit.
+	if maxVCPUs, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS); errno == 0 && m.maxVCPUs > int(maxVCPUs) {
+		m.maxVCPUs = int(maxVCPUs)
 	}
 }
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index cf2e36c5c7..cf8b37b4b8 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -114,6 +114,44 @@ func (a *atomicAddressSpace) get() *addressSpace {
 	return (*addressSpace)(atomic.LoadPointer(&a.pointer))
 }
 
+// availableNotify is called when a vCPU's state transitions to vCPUReady.
+//
+//go:nosplit
+func (m *machine) availableNotify() {
+	m.availableSeq.Add(1)
+	if m.availableWaiters.Load() == 0 {
+		return
+	}
+	errno := hostsyscall.RawSyscallErrno( // escapes: no.
+		unix.SYS_FUTEX,
+		uintptr(unsafe.Pointer(&m.availableSeq)),
+		linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
+		1)
+	if errno != 0 {
+		throw("futex wake error")
+	}
+}
+
+// availableWait blocks until availableNotify is called.
+//
+// Preconditions:
+//   - epoch was the value of m.availableSeq before the caller last checked
+//     that no vCPUs were in state vCPUReady.
+//   - m.availableWaiters must be non-zero.
+//
+//go:nosplit
+func (m *machine) availableWait(epoch uint32) {
+	_, _, errno := unix.Syscall6(
+		unix.SYS_FUTEX,
+		uintptr(unsafe.Pointer(&m.availableSeq)),
+		linux.FUTEX_WAIT|linux.FUTEX_PRIVATE_FLAG,
+		uintptr(epoch),
+		0, 0, 0)
+	if errno != 0 && errno != unix.EINTR && errno != unix.EAGAIN {
+		panic("futex wait error")
+	}
+}
+
 // notify notifies that the vCPU has transitioned modes.
 //
 // This may be called by a signal handler and therefore throws on error.
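As an aid to reviewing availableNotify/availableWait: the sketch below is a minimal, standalone model of the sequence-counter-plus-futex handoff they implement. The names event, notify, and wait are invented for the example, and the futex constants come from golang.org/x/sys/unix rather than gVisor's linux package. The waiter snapshots the counter before its final scan for a ready vCPU and sleeps only if the counter is still at that value; the notifier bumps the counter before FUTEX_WAKE, so a notification that races with the scan makes FUTEX_WAIT return EAGAIN instead of sleeping on a stale value. That ordering is what lets the new Put() drop the old RLock/Signal sequence without introducing a lost-wakeup window. Unlike the patch, this sketch tracks the waiter count inside wait() itself; in machine.Get() that bookkeeping happens under the machine mutex.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
	"unsafe"

	"golang.org/x/sys/unix"
)

// event pairs a futex word (seq) with a waiter count, mirroring
// machine.availableSeq and machine.availableWaiters in the patch above.
type event struct {
	seq     uint32 // bumped on every notify; this is the futex word
	waiters int32  // number of goroutines currently blocked in wait
}

// notify bumps the sequence number and wakes at most one waiter.
func (e *event) notify() {
	atomic.AddUint32(&e.seq, 1)
	if atomic.LoadInt32(&e.waiters) == 0 {
		return // fast path: nobody is sleeping, skip the syscall
	}
	unix.Syscall6(unix.SYS_FUTEX,
		uintptr(unsafe.Pointer(&e.seq)),
		uintptr(unix.FUTEX_WAKE|unix.FUTEX_PRIVATE_FLAG),
		1, 0, 0, 0) // wake at most one waiter
}

// wait blocks until notify is called after the caller observed epoch. If the
// counter has already moved past epoch, FUTEX_WAIT fails with EAGAIN and wait
// returns immediately, so the wakeup cannot be lost.
func (e *event) wait(epoch uint32) {
	atomic.AddInt32(&e.waiters, 1)
	defer atomic.AddInt32(&e.waiters, -1)
	for atomic.LoadUint32(&e.seq) == epoch {
		// EINTR and EAGAIN simply fall through and re-check the word.
		unix.Syscall6(unix.SYS_FUTEX,
			uintptr(unsafe.Pointer(&e.seq)),
			uintptr(unix.FUTEX_WAIT|unix.FUTEX_PRIVATE_FLAG),
			uintptr(epoch), 0, 0, 0)
	}
}

func main() {
	var e event
	// Snapshot the epoch before the final availability check, as
	// machine.Get() loads availableSeq before its last scan of vCPUsByTID.
	epoch := atomic.LoadUint32(&e.seq)
	go func() {
		time.Sleep(10 * time.Millisecond)
		e.notify() // analogous to Put() or bluepillGuestExit() notifying
	}()
	e.wait(epoch)
	fmt.Println("woken without a lost-wakeup window")
}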