@@ -53,6 +53,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
53
53
opts .NumCtx = max (opts .NumCtx , 2048 )
54
54
}
55
55
56
+ layers := ggml .Tensors ().Layers ()
57
+ // add one layer worth of memory as a buffer
58
+ memoryMinimum += layers ["blk.0" ].size ()
59
+
56
60
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
57
61
var kv uint64 = 2 * 2 * uint64 (opts .NumCtx ) * ggml .KV ().BlockCount () * ggml .KV ().EmbeddingLength () / ggml .KV ().HeadCount () * ggml .KV ().HeadCountKV ()
58
62
@@ -73,13 +77,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
73
77
graphPartialOffload = graphFullOffload
74
78
}
75
79
76
- layers := ggml .Tensors ().Layers ()
77
-
78
80
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
79
- memoryRequiredTotal := memoryMinimum + graphFullOffload + layers [ "blk.0" ]. size ()
81
+ memoryRequiredTotal := memoryMinimum + graphFullOffload
80
82
81
83
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
82
- memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers [ "blk.0" ]. size ()
84
+ memoryRequiredPartial := memoryMinimum + graphPartialOffload
83
85
84
86
var memoryLayerOutput uint64
85
87
if layer , ok := layers ["output_norm" ]; ok {
@@ -106,7 +108,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
106
108
memoryLayer += kv / ggml .KV ().BlockCount ()
107
109
108
110
memoryRequiredTotal += memoryLayer
109
- if memoryAvailable > memoryRequiredPartial + memoryLayer {
111
+ if ( opts . NumGPU >= 0 && layerCount + 1 <= opts . NumGPU ) || ( opts . NumGPU < 0 && memoryAvailable > memoryRequiredPartial + memoryLayer ) {
110
112
memoryRequiredPartial += memoryLayer
111
113
layerCount ++
112
114
}
@@ -117,7 +119,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
117
119
memoryRequiredTotal += memoryLayerOutput
118
120
}
119
121
120
- if memoryAvailable > memoryRequiredTotal {
122
+ if ( opts . NumGPU >= 0 && layerCount + 1 <= opts . NumGPU ) || ( opts . NumGPU < 0 && memoryAvailable > memoryRequiredTotal ) {
121
123
layerCount = int (ggml .KV ().BlockCount ()) + 1
122
124
memoryRequiredPartial = memoryRequiredTotal
123
125
}
@@ -128,10 +130,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
128
130
"offload to gpu" ,
129
131
slog .Group (
130
132
"layers" ,
131
- // actual number of layers offloaded
132
- "real " , opts .NumGPU ,
133
+ // requested number of layers to offload
134
+ "requested " , opts .NumGPU ,
133
135
// estimated number of layers that can be offloaded
134
- "estimate " , layerCount ,
136
+ "real " , layerCount ,
135
137
),
136
138
slog .Group (
137
139
"memory" ,
0 commit comments