Skip to content

Commit

Permalink
Merge pull request #25 from XuehaiPan/fix-get-running-processes
Browse files Browse the repository at this point in the history
Fix GetComputeRunningProcesses on CUDA 10.x
  • Loading branch information
elezar authored Aug 13, 2021
2 parents 10a3a25 + 9eb2e80 commit 053ac68
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 16 deletions.
69 changes: 61 additions & 8 deletions gen/nvml/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -931,20 +931,54 @@ func (Device Device) GetBridgeChipInfo() (BridgeChipHierarchy, Return) {
return DeviceGetBridgeChipInfo(Device)
}

// AsProcessInfoPointer reinterprets p as a *ProcessInfo without copying.
//
// It is a helper for DeviceGet{Compute,Graphics}RunningProcesses, which pass a
// ProcessInfo_v1 buffer to a binding whose parameter type is *ProcessInfo.
// ProcessInfo_v1 has fewer fields than ProcessInfo, so the returned pointer
// should only be handed to the v1 library call, not read as a ProcessInfo.
func (p *ProcessInfo_v1) AsProcessInfoPointer() *ProcessInfo {
	raw := unsafe.Pointer(p)
	return (*ProcessInfo)(raw)
}

// ToProcessInfo converts a v1 process record into the v2 ProcessInfo layout.
//
// It is a helper for DeviceGet{Compute,Graphics}RunningProcesses. Pid and
// UsedGpuMemory are copied across; the instance IDs, which v1 does not carry,
// are set to 0xFFFFFFFF to mark them as invalid.
func (p ProcessInfo_v1) ToProcessInfo() ProcessInfo {
	// Marker used for fields that have no counterpart in the v1 structure.
	const invalidInstanceID = 0xFFFFFFFF

	var info ProcessInfo
	info.Pid = p.Pid
	info.UsedGpuMemory = p.UsedGpuMemory
	info.GpuInstanceId = invalidInstanceID     // GPU instance ID is invalid in v1
	info.ComputeInstanceId = invalidInstanceID // Compute instance ID is invalid in v1
	return info
}

// nvml.DeviceGetComputeRunningProcesses()
func DeviceGetComputeRunningProcesses(Device Device) ([]ProcessInfo, Return) {
var Infos []ProcessInfo // This is the v2 version of process info data structure
var InfoCount uint32 = 1 // Will be reduced upon returning
var ret = SUCCESS // Will be changed upon returning
for {
Infos := make([]ProcessInfo, InfoCount)
ret := nvmlDeviceGetComputeRunningProcesses(Device, &InfoCount, &Infos[0])
if ret == SUCCESS {
return Infos[:InfoCount], ret
if usesNvmlDeviceGetComputeRunningProcesses_v1 {
var v1Infos = make([]ProcessInfo_v1, InfoCount)
ret = nvmlDeviceGetComputeRunningProcesses_v1(Device, &InfoCount, (&v1Infos[0]).AsProcessInfoPointer()) // Call v1 version
if ret == SUCCESS {
// Convert process info data structure from v1 to v2
for i := uint32(0); i < InfoCount; i++ {
Infos = append(Infos, v1Infos[i].ToProcessInfo())
}
break
}
} else {
Infos = make([]ProcessInfo, InfoCount)
ret = nvmlDeviceGetComputeRunningProcesses(Device, &InfoCount, &Infos[0]) // Call v2 version directly
if ret == SUCCESS {
break
}
}
if ret != ERROR_INSUFFICIENT_SIZE {
return nil, ret
}
InfoCount *= 2
}

if InfoCount == 0 {
return []ProcessInfo{}, SUCCESS
}
return Infos[:InfoCount], SUCCESS
}

func (Device Device) GetComputeRunningProcesses() ([]ProcessInfo, Return) {
Expand All @@ -953,18 +987,37 @@ func (Device Device) GetComputeRunningProcesses() ([]ProcessInfo, Return) {

// nvml.DeviceGetGraphicsRunningProcesses()
func DeviceGetGraphicsRunningProcesses(Device Device) ([]ProcessInfo, Return) {
var Infos []ProcessInfo // This is the v2 version of process info data structure
var InfoCount uint32 = 1 // Will be reduced upon returning
var ret = SUCCESS // Will be changed upon returning
for {
Infos := make([]ProcessInfo, InfoCount)
ret := nvmlDeviceGetGraphicsRunningProcesses(Device, &InfoCount, &Infos[0])
if ret == SUCCESS {
return Infos[:InfoCount], ret
if usesNvmlDeviceGetGraphicsRunningProcesses_v1 {
var v1Infos = make([]ProcessInfo_v1, InfoCount)
ret = nvmlDeviceGetGraphicsRunningProcesses_v1(Device, &InfoCount, (&v1Infos[0]).AsProcessInfoPointer()) // Call v1 version
if ret == SUCCESS {
// Convert process info data structure from v1 to v2
for i := uint32(0); i < InfoCount; i++ {
Infos = append(Infos, v1Infos[i].ToProcessInfo())
}
break
}
} else {
Infos = make([]ProcessInfo, InfoCount)
ret = nvmlDeviceGetGraphicsRunningProcesses(Device, &InfoCount, &Infos[0]) // Call v2 version directly
if ret == SUCCESS {
break
}
}
if ret != ERROR_INSUFFICIENT_SIZE {
return nil, ret
}
InfoCount *= 2
}

if InfoCount == 0 {
return []ProcessInfo{}, SUCCESS
}
return Infos[:InfoCount], SUCCESS
}

func (Device Device) GetGraphicsRunningProcesses() ([]ProcessInfo, Return) {
Expand Down
12 changes: 12 additions & 0 deletions gen/nvml/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,16 @@ var nvmlComputeInstanceGetInfo = nvmlComputeInstanceGetInfo_v1
var nvmlDeviceGetComputeRunningProcesses = nvmlDeviceGetComputeRunningProcesses_v1
var nvmlDeviceGetGraphicsRunningProcesses = nvmlDeviceGetGraphicsRunningProcesses_v1

// ProcessInfo_v1 matches the ProcessInfo_st definition before CUDA 11.
//
// It carries only Pid and UsedGpuMemory; the v2 ProcessInfo additionally has
// GpuInstanceId and ComputeInstanceId. Pointers to this type are reinterpreted
// as *ProcessInfo through unsafe.Pointer (see AsProcessInfoPointer), so the
// field order and types must not be changed.
type ProcessInfo_v1 struct {
Pid uint32
UsedGpuMemory uint64
}
// ProcessInfo_v2 is a named type whose underlying type is ProcessInfo, giving
// the v2 layout an explicit name alongside ProcessInfo_v1.
type ProcessInfo_v2 ProcessInfo // Defined by cgo from nvml.h (always v2)

// These flags tell DeviceGet{Compute,Graphics}RunningProcesses whether the _v1
// entry point is in use. Both start as true; updateVersionedSymbols sets a
// flag to false when the lookup of the corresponding _v2 symbol succeeds.
var usesNvmlDeviceGetComputeRunningProcesses_v1 = true
var usesNvmlDeviceGetGraphicsRunningProcesses_v1 = true

// updateVersionedSymbols()
func updateVersionedSymbols() {
err := nvml.Lookup("nvmlInit_v2")
Expand Down Expand Up @@ -153,10 +163,12 @@ func updateVersionedSymbols() {
err = nvml.Lookup("nvmlDeviceGetComputeRunningProcesses_v2")
if err == nil {
nvmlDeviceGetComputeRunningProcesses = nvmlDeviceGetComputeRunningProcesses_v2
usesNvmlDeviceGetComputeRunningProcesses_v1 = false
}
err = nvml.Lookup("nvmlDeviceGetGraphicsRunningProcesses_v2")
if err == nil {
nvmlDeviceGetGraphicsRunningProcesses = nvmlDeviceGetGraphicsRunningProcesses_v2
usesNvmlDeviceGetGraphicsRunningProcesses_v1 = false
}

}
69 changes: 61 additions & 8 deletions pkg/nvml/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -931,20 +931,54 @@ func (Device Device) GetBridgeChipInfo() (BridgeChipHierarchy, Return) {
return DeviceGetBridgeChipInfo(Device)
}

// AsProcessInfoPointer reinterprets p as a *ProcessInfo without copying.
//
// It is a helper for DeviceGet{Compute,Graphics}RunningProcesses, which pass a
// ProcessInfo_v1 buffer to a binding whose parameter type is *ProcessInfo.
// ProcessInfo_v1 has fewer fields than ProcessInfo, so the returned pointer
// should only be handed to the v1 library call, not read as a ProcessInfo.
func (p *ProcessInfo_v1) AsProcessInfoPointer() *ProcessInfo {
	raw := unsafe.Pointer(p)
	return (*ProcessInfo)(raw)
}

// ToProcessInfo converts a v1 process record into the v2 ProcessInfo layout.
//
// It is a helper for DeviceGet{Compute,Graphics}RunningProcesses. Pid and
// UsedGpuMemory are copied across; the instance IDs, which v1 does not carry,
// are set to 0xFFFFFFFF to mark them as invalid.
func (p ProcessInfo_v1) ToProcessInfo() ProcessInfo {
	// Marker used for fields that have no counterpart in the v1 structure.
	const invalidInstanceID = 0xFFFFFFFF

	var info ProcessInfo
	info.Pid = p.Pid
	info.UsedGpuMemory = p.UsedGpuMemory
	info.GpuInstanceId = invalidInstanceID     // GPU instance ID is invalid in v1
	info.ComputeInstanceId = invalidInstanceID // Compute instance ID is invalid in v1
	return info
}

// nvml.DeviceGetComputeRunningProcesses()
func DeviceGetComputeRunningProcesses(Device Device) ([]ProcessInfo, Return) {
var Infos []ProcessInfo // This is the v2 version of process info data structure
var InfoCount uint32 = 1 // Will be reduced upon returning
var ret = SUCCESS // Will be changed upon returning
for {
Infos := make([]ProcessInfo, InfoCount)
ret := nvmlDeviceGetComputeRunningProcesses(Device, &InfoCount, &Infos[0])
if ret == SUCCESS {
return Infos[:InfoCount], ret
if usesNvmlDeviceGetComputeRunningProcesses_v1 {
var v1Infos = make([]ProcessInfo_v1, InfoCount)
ret = nvmlDeviceGetComputeRunningProcesses_v1(Device, &InfoCount, (&v1Infos[0]).AsProcessInfoPointer()) // Call v1 version
if ret == SUCCESS {
// Convert process info data structure from v1 to v2
for i := uint32(0); i < InfoCount; i++ {
Infos = append(Infos, v1Infos[i].ToProcessInfo())
}
break
}
} else {
Infos = make([]ProcessInfo, InfoCount)
ret = nvmlDeviceGetComputeRunningProcesses(Device, &InfoCount, &Infos[0]) // Call v2 version directly
if ret == SUCCESS {
break
}
}
if ret != ERROR_INSUFFICIENT_SIZE {
return nil, ret
}
InfoCount *= 2
}

if InfoCount == 0 {
return []ProcessInfo{}, SUCCESS
}
return Infos[:InfoCount], SUCCESS
}

func (Device Device) GetComputeRunningProcesses() ([]ProcessInfo, Return) {
Expand All @@ -953,18 +987,37 @@ func (Device Device) GetComputeRunningProcesses() ([]ProcessInfo, Return) {

// nvml.DeviceGetGraphicsRunningProcesses()
func DeviceGetGraphicsRunningProcesses(Device Device) ([]ProcessInfo, Return) {
var Infos []ProcessInfo // This is the v2 version of process info data structure
var InfoCount uint32 = 1 // Will be reduced upon returning
var ret = SUCCESS // Will be changed upon returning
for {
Infos := make([]ProcessInfo, InfoCount)
ret := nvmlDeviceGetGraphicsRunningProcesses(Device, &InfoCount, &Infos[0])
if ret == SUCCESS {
return Infos[:InfoCount], ret
if usesNvmlDeviceGetGraphicsRunningProcesses_v1 {
var v1Infos = make([]ProcessInfo_v1, InfoCount)
ret = nvmlDeviceGetGraphicsRunningProcesses_v1(Device, &InfoCount, (&v1Infos[0]).AsProcessInfoPointer()) // Call v1 version
if ret == SUCCESS {
// Convert process info data structure from v1 to v2
for i := uint32(0); i < InfoCount; i++ {
Infos = append(Infos, v1Infos[i].ToProcessInfo())
}
break
}
} else {
Infos = make([]ProcessInfo, InfoCount)
ret = nvmlDeviceGetGraphicsRunningProcesses(Device, &InfoCount, &Infos[0]) // Call v2 version directly
if ret == SUCCESS {
break
}
}
if ret != ERROR_INSUFFICIENT_SIZE {
return nil, ret
}
InfoCount *= 2
}

if InfoCount == 0 {
return []ProcessInfo{}, SUCCESS
}
return Infos[:InfoCount], SUCCESS
}

func (Device Device) GetGraphicsRunningProcesses() ([]ProcessInfo, Return) {
Expand Down
12 changes: 12 additions & 0 deletions pkg/nvml/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,16 @@ var nvmlComputeInstanceGetInfo = nvmlComputeInstanceGetInfo_v1
var nvmlDeviceGetComputeRunningProcesses = nvmlDeviceGetComputeRunningProcesses_v1
var nvmlDeviceGetGraphicsRunningProcesses = nvmlDeviceGetGraphicsRunningProcesses_v1

// ProcessInfo_v1 matches the ProcessInfo_st definition before CUDA 11.
//
// It carries only Pid and UsedGpuMemory; the v2 ProcessInfo additionally has
// GpuInstanceId and ComputeInstanceId. Pointers to this type are reinterpreted
// as *ProcessInfo through unsafe.Pointer (see AsProcessInfoPointer), so the
// field order and types must not be changed.
type ProcessInfo_v1 struct {
Pid uint32
UsedGpuMemory uint64
}
// ProcessInfo_v2 is a named type whose underlying type is ProcessInfo, giving
// the v2 layout an explicit name alongside ProcessInfo_v1.
type ProcessInfo_v2 ProcessInfo // Defined by cgo from nvml.h (always v2)

// These flags tell DeviceGet{Compute,Graphics}RunningProcesses whether the _v1
// entry point is in use. Both start as true; updateVersionedSymbols sets a
// flag to false when the lookup of the corresponding _v2 symbol succeeds.
var usesNvmlDeviceGetComputeRunningProcesses_v1 = true
var usesNvmlDeviceGetGraphicsRunningProcesses_v1 = true

// updateVersionedSymbols()
func updateVersionedSymbols() {
err := nvml.Lookup("nvmlInit_v2")
Expand Down Expand Up @@ -153,10 +163,12 @@ func updateVersionedSymbols() {
err = nvml.Lookup("nvmlDeviceGetComputeRunningProcesses_v2")
if err == nil {
nvmlDeviceGetComputeRunningProcesses = nvmlDeviceGetComputeRunningProcesses_v2
usesNvmlDeviceGetComputeRunningProcesses_v1 = false
}
err = nvml.Lookup("nvmlDeviceGetGraphicsRunningProcesses_v2")
if err == nil {
nvmlDeviceGetGraphicsRunningProcesses = nvmlDeviceGetGraphicsRunningProcesses_v2
usesNvmlDeviceGetGraphicsRunningProcesses_v1 = false
}

}

0 comments on commit 053ac68

Please sign in to comment.