Skip to content

Commit 50b9056

Browse files
committed
count memory up to NumGPU
1 parent 9c76b30 commit 50b9056

File tree

1 file changed

+11
-9
lines changed

1 file changed

+11
-9
lines changed

llm/memory.go

+11-9
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
5353
opts.NumCtx = max(opts.NumCtx, 2048)
5454
}
5555

56+
layers := ggml.Tensors().Layers()
57+
// add one layer worth of memory as a buffer
58+
memoryMinimum += layers["blk.0"].size()
59+
5660
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
5761
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
5862

@@ -73,13 +77,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
7377
graphPartialOffload = graphFullOffload
7478
}
7579

76-
layers := ggml.Tensors().Layers()
77-
7880
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
79-
memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
81+
memoryRequiredTotal := memoryMinimum + graphFullOffload
8082

8183
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
82-
memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
84+
memoryRequiredPartial := memoryMinimum + graphPartialOffload
8385

8486
var memoryLayerOutput uint64
8587
if layer, ok := layers["output_norm"]; ok {
@@ -106,7 +108,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
106108
memoryLayer += kv / ggml.KV().BlockCount()
107109

108110
memoryRequiredTotal += memoryLayer
109-
if memoryAvailable > memoryRequiredPartial+memoryLayer {
111+
if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
110112
memoryRequiredPartial += memoryLayer
111113
layerCount++
112114
}
@@ -117,7 +119,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
117119
memoryRequiredTotal += memoryLayerOutput
118120
}
119121

120-
if memoryAvailable > memoryRequiredTotal {
122+
if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
121123
layerCount = int(ggml.KV().BlockCount()) + 1
122124
memoryRequiredPartial = memoryRequiredTotal
123125
}
@@ -128,10 +130,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
128130
"offload to gpu",
129131
slog.Group(
130132
"layers",
131-
// actual number of layers offloaded
132-
"real", opts.NumGPU,
133+
// requested number of layers to offload
134+
"requested", opts.NumGPU,
133135
// estimated number of layers that can be offloaded
134-
"estimate", layerCount,
136+
"real", layerCount,
135137
),
136138
slog.Group(
137139
"memory",

0 commit comments

Comments
 (0)