Skip to content

Commit

Permalink
Choose the GPU with the least overall usage
Browse files (browse the repository at this point in the history)
  • Loading branch information
ioppermann committed Dec 10, 2024
1 parent 64a2136 commit 893f8c2
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 17 deletions.
36 changes: 22 additions & 14 deletions resources/resources.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"os"
"sort"
"sync"
"time"

Expand Down Expand Up @@ -474,7 +475,7 @@ func (r *resources) Request(req Request) (Response, error) {
return res, fmt.Errorf("some GPU resources requested but no GPU available")
}

foundGPU := -1
fittingGPU := []psutil.GPUInfo{}
for _, g := range gpustat {
if req.GPUUsage > 0 && g.Usage+req.GPUUsage > r.maxGPU {
logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu": g.Usage}).Log("Rejected, GPU usage limit exceeded")
Expand All @@ -499,24 +500,31 @@ func (r *resources) Request(req Request) (Response, error) {
continue
}

foundGPU = g.Index

logger = logger.Debug().WithFields(log.Fields{
"cur_gpu": foundGPU,
"cur_gpu_general": g.Usage,
"cur_gpu_encoder": g.Encoder,
"cur_gpu_decoder": g.Decoder,
"cur_gpu_memory": gpuMemoryUsage,
})

break
fittingGPU = append(fittingGPU, g)
}

if foundGPU < 0 {
if len(fittingGPU) == 0 {
return res, fmt.Errorf("all GPU usage limits are exceeded")
}

res.GPU = foundGPU
sort.SliceStable(fittingGPU, func(a, b int) bool {
loadA := fittingGPU[a].Usage + fittingGPU[a].Encoder + fittingGPU[a].Decoder
loadB := fittingGPU[b].Usage + fittingGPU[b].Encoder + fittingGPU[b].Decoder

return loadA < loadB
})

foundGPU := fittingGPU[0]

logger = logger.Debug().WithFields(log.Fields{
"cur_gpu": foundGPU.Index,
"cur_gpu_general": foundGPU.Usage,
"cur_gpu_encoder": foundGPU.Encoder,
"cur_gpu_decoder": foundGPU.Decoder,
"cur_gpu_memory": float64(foundGPU.MemoryUsed) / float64(foundGPU.MemoryTotal) * 100,
})

res.GPU = foundGPU.Index
}

logger.Debug().WithFields(log.Fields{
Expand Down
10 changes: 7 additions & 3 deletions resources/resources_test.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -725,16 +725,20 @@ func TestRequestGPULimitsMoreGPU(t *testing.T) {
MaxMemory: 100,
MaxGPU: 60,
MaxGPUMemory: 60,
PSUtil: psutil.New(2),
PSUtil: psutil.New(3),
})
require.NoError(t, err)

_, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 50, GPUMemory: 10})
require.Error(t, err)

res, err := r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 10})
res, err := r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 10, GPUMemory: 10})
require.NoError(t, err)
require.Equal(t, 2, res.GPU)

res, err = r.Request(Request{CPU: 10, Memory: 10, GPUEncoder: 30, GPUMemory: 10})
require.NoError(t, err)
require.Equal(t, 1, res.GPU)
require.Equal(t, 2, res.GPU)
}

func TestHasLimits(t *testing.T) {
Expand Down

0 comments on commit 893f8c2

Please sign in to comment.