Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions cmd/gpu_nfdhook/labeler.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,18 +113,17 @@ func fallback() uint64 {
return getEnvVarNumber(memoryOverrideEnv)
}

// getMemoryAmount reads the GPU memory amount from the system.
func (l *labeler) getMemoryAmount(gpuName string) uint64 {
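// getTileMemoryAmount sums the memory of all GPU tiles found under sysfs and returns the total,
// minus the reserved amount, together with the tile count. If the glob fails or no memory is
// found, it returns the fallback amount and a tile count of 1.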
func (l *labeler) getTileMemoryAmount(gpuName string) (mem, numTiles uint64) {
reserved := getEnvVarNumber(memoryReservedEnv)
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range")

files, err := filepath.Glob(filePath)
if err != nil {
klog.V(4).Info("Can't read sysfs folder", err)
return fallback()
return fallback(), 1
}

mem := uint64(0)
for _, fileName := range files {
dat, err := ioutil.ReadFile(fileName)
if err != nil {
Expand All @@ -138,14 +137,15 @@ func (l *labeler) getMemoryAmount(gpuName string) uint64 {
continue
}

numTiles++
mem += n
}

if mem == 0 {
return fallback()
return fallback(), 1
}

return mem - reserved
return mem - reserved, numTiles
}

// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
Expand All @@ -159,7 +159,7 @@ func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) {
}

// createCapabilityLabels creates labels from the gpu capability file under debugfs.
func (l *labeler) createCapabilityLabels(cardNum string) {
func (l *labeler) createCapabilityLabels(cardNum string, numTiles uint64) {
// try to read the capabilities from the i915_capabilities file
file, err := os.Open(filepath.Join(l.debugfsDRIDir, cardNum, "i915_capabilities"))
if err != nil {
Expand All @@ -172,6 +172,7 @@ func (l *labeler) createCapabilityLabels(cardNum string) {
searchStringActionMap := map[string]func(string){
"platform: ": func(platformName string) {
l.labels.addNumericLabel(labelNamespace+"platform_"+platformName+".count", 1)
l.labels[labelNamespace+"platform_"+platformName+".tiles"] = strconv.FormatInt(int64(numTiles), 10)
l.labels[labelNamespace+"platform_"+platformName+".present"] = "true"
},
"gen: ": func(genName string) {
Expand Down Expand Up @@ -212,11 +213,13 @@ func (l *labeler) createLabels() error {
return errors.Wrap(err, "gpu name parsing error")
}

// read the memory amount (used for the max allocation value) and the tile count (used for the platform tiles label)
memoryAmount, numTiles := l.getTileMemoryAmount(gpuName)

// try to add capability labels
l.createCapabilityLabels(gpuNum)
l.createCapabilityLabels(gpuNum, numTiles)

// read the memory amount to find a proper max allocation value
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(l.getMemoryAmount(gpuName)))
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(memoryAmount))
}
gpuCount := len(gpuNameList)
// add gpu list label (example: "card0.card1.card2")
Expand Down
5 changes: 5 additions & 0 deletions cmd/gpu_nfdhook/labeler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ func getTestCases() []testcase {
"gpu.intel.com/memory.max": "8086",
"gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "1",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
},
Expand Down Expand Up @@ -87,6 +88,7 @@ func getTestCases() []testcase {
"gpu.intel.com/memory.max": "8088",
"gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "2",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
},
Expand Down Expand Up @@ -114,6 +116,7 @@ func getTestCases() []testcase {
"gpu.intel.com/memory.max": "8000",
"gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "1",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
},
Expand All @@ -138,6 +141,7 @@ func getTestCases() []testcase {
"gpu.intel.com/memory.max": "16000000000",
"gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "1",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
},
Expand All @@ -161,6 +165,7 @@ func getTestCases() []testcase {
"gpu.intel.com/memory.max": "16000000000",
"gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "1",
"gpu.intel.com/cards": "card0",
},
},
Expand Down