Commit 0e331c7

Merge pull request ollama#4328 from ollama/mxyng/mem
count memory up to NumGPU if set by user
2 parents: a4b8d1f + 1d359e7

File tree

1 file changed (+21, -15 lines)


llm/memory.go

@@ -53,6 +53,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
+	layers := ggml.Tensors().Layers()
+	// add one layer worth of memory as a buffer
+	if blk0, ok := layers["blk.0"]; ok {
+		memoryMinimum += blk0.size()
+	}
+
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
@@ -73,13 +79,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		graphPartialOffload = graphFullOffload
 	}
 
-	layers := ggml.Tensors().Layers()
-
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
 
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
 
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
@@ -100,15 +104,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
 	var layerCount int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
+		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
+			memoryLayer := blk.size()
 
-		// KV is proportional to the number of layers
-		memoryLayer += kv / ggml.KV().BlockCount()
+			// KV is proportional to the number of layers
+			memoryLayer += kv / ggml.KV().BlockCount()
 
-		memoryRequiredTotal += memoryLayer
-		if memoryAvailable > memoryRequiredPartial+memoryLayer {
-			memoryRequiredPartial += memoryLayer
-			layerCount++
+			memoryRequiredTotal += memoryLayer
+			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
+				memoryRequiredPartial += memoryLayer
+				layerCount++
+			}
 		}
 	}
 
@@ -117,7 +123,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryRequiredTotal += memoryLayerOutput
 	}
 
-	if memoryAvailable > memoryRequiredTotal {
+	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
 	}
@@ -128,10 +134,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		"offload to gpu",
 		slog.Group(
 			"layers",
-			// actual number of layers offloaded
-			"real", opts.NumGPU,
+			// requested number of layers to offload
+			"requested", opts.NumGPU,
 			// estimated number of layers that can be offloaded
-			"estimate", layerCount,
+			"real", layerCount,
 		),
 		slog.Group(
 			"memory",
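
The heart of this change is the new gate in the layer loop (and its twin in the full-offload check below it): when the user sets NumGPU (>= 0), layers are counted up to that request regardless of free memory, while a negative NumGPU keeps the old memory-based estimate. A minimal sketch of that rule as a standalone predicate — the function name and parameters here are stand-ins for values EstimateGPULayers reads from opts and the GPU probe, not part of the patch:

    package main

    import "fmt"

    // offloadLayer reports whether the next layer (layerCount+1) should be
    // offloaded, mirroring the condition this commit introduces: a user-set
    // NumGPU (>= 0) wins outright; a negative NumGPU falls back to the
    // memory-based estimate.
    func offloadLayer(numGPU, layerCount int, memoryAvailable, memoryRequiredPartial, memoryLayer uint64) bool {
    	if numGPU >= 0 {
    		return layerCount+1 <= numGPU // honor the request, ignore memory
    	}
    	return memoryAvailable > memoryRequiredPartial+memoryLayer
    }

    func main() {
    	// user pinned 4 layers: the 4th is offloaded even though memory is short
    	fmt.Println(offloadLayer(4, 3, 1<<30, 1<<30, 1<<28)) // true
    	// automatic mode: the same layer is rejected for lack of headroom
    	fmt.Println(offloadLayer(-1, 3, 1<<30, 1<<30, 1<<28)) // false
    }

The same condition reappears after the loop, so a sufficiently large NumGPU also marks the model as fully offloadable (layerCount = BlockCount() + 1) without consulting memoryRequiredTotal.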
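
The fp16 KV-cache formula in the first hunk is easy to sanity-check by hand. A quick sketch with assumed 7B-class dimensions (n_ctx = 2048, n_layer = 32, n_embd = 4096, n_head = n_head_kv = 32); these numbers are illustrative, whereas the real code reads them from ggml.KV():

    package main

    import "fmt"

    func main() {
    	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
    	// Assumed model dimensions; EstimateGPULayers gets these from
    	// BlockCount(), EmbeddingLength(), HeadCount(), and HeadCountKV().
    	var (
    		nCtx    uint64 = 2048
    		nLayer  uint64 = 32
    		nEmbd   uint64 = 4096
    		nHead   uint64 = 32
    		nHeadKV uint64 = 32
    	)
    	kv := 2 * 2 * nCtx * nLayer * nEmbd / nHead * nHeadKV
    	fmt.Printf("kv cache: %d bytes (%.2f GiB)\n", kv, float64(kv)/(1<<30)) // 1.00 GiB
    	// per-layer share added to memoryLayer inside the loop
    	fmt.Printf("per layer: %d bytes (%.0f MiB)\n", kv/nLayer, float64(kv/nLayer)/(1<<20)) // 32 MiB
    }

The trailing / n_head * n_head_kv is what makes grouped-query-attention models cheaper: the cache scales with the number of KV heads rather than query heads. The per-layer share, kv / BlockCount(), is exactly what the loop adds to each memoryLayer.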