@@ -53,6 +53,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
53
53
opts .NumCtx = max (opts .NumCtx , 2048 )
54
54
}
55
55
56
+ layers := ggml .Tensors ().Layers ()
57
+ // add one layer's worth of memory as a buffer
58
+ if blk0 , ok := layers ["blk.0" ]; ok {
59
+ memoryMinimum += blk0 .size ()
60
+ }
61
+
56
62
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
57
63
var kv uint64 = 2 * 2 * uint64 (opts .NumCtx ) * ggml .KV ().BlockCount () * ggml .KV ().EmbeddingLength () / ggml .KV ().HeadCount () * ggml .KV ().HeadCountKV ()
58
64
@@ -73,13 +79,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
73
79
graphPartialOffload = graphFullOffload
74
80
}
75
81
76
- layers := ggml .Tensors ().Layers ()
77
-
78
82
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
79
- memoryRequiredTotal := memoryMinimum + graphFullOffload + layers [ "blk.0" ]. size ()
83
+ memoryRequiredTotal := memoryMinimum + graphFullOffload
80
84
81
85
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
82
- memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers [ "blk.0" ]. size ()
86
+ memoryRequiredPartial := memoryMinimum + graphPartialOffload
83
87
84
88
var memoryLayerOutput uint64
85
89
if layer , ok := layers ["output_norm" ]; ok {
@@ -100,15 +104,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
100
104
101
105
var layerCount int
102
106
for i := 0 ; i < int (ggml .KV ().BlockCount ()); i ++ {
103
- memoryLayer := layers [fmt .Sprintf ("blk.%d" , i )].size ()
107
+ if blk , ok := layers [fmt .Sprintf ("blk.%d" , i )]; ok {
108
+ memoryLayer := blk .size ()
104
109
105
- // KV is proportional to the number of layers
106
- memoryLayer += kv / ggml .KV ().BlockCount ()
110
+ // KV is proportional to the number of layers
111
+ memoryLayer += kv / ggml .KV ().BlockCount ()
107
112
108
- memoryRequiredTotal += memoryLayer
109
- if memoryAvailable > memoryRequiredPartial + memoryLayer {
110
- memoryRequiredPartial += memoryLayer
111
- layerCount ++
113
+ memoryRequiredTotal += memoryLayer
114
+ if (opts .NumGPU >= 0 && layerCount + 1 <= opts .NumGPU ) || (opts .NumGPU < 0 && memoryAvailable > memoryRequiredPartial + memoryLayer ) {
115
+ memoryRequiredPartial += memoryLayer
116
+ layerCount ++
117
+ }
112
118
}
113
119
}
114
120
@@ -117,7 +123,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
117
123
memoryRequiredTotal += memoryLayerOutput
118
124
}
119
125
120
- if memoryAvailable > memoryRequiredTotal {
126
+ if ( opts . NumGPU >= 0 && layerCount + 1 <= opts . NumGPU ) || ( opts . NumGPU < 0 && memoryAvailable > memoryRequiredTotal ) {
121
127
layerCount = int (ggml .KV ().BlockCount ()) + 1
122
128
memoryRequiredPartial = memoryRequiredTotal
123
129
}
@@ -128,10 +134,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
128
134
"offload to gpu" ,
129
135
slog .Group (
130
136
"layers" ,
131
- // actual number of layers offloaded
132
- "real " , opts .NumGPU ,
137
+ // requested number of layers to offload
138
+ "requested " , opts .NumGPU ,
133
139
// number of layers selected for offload: bounded by the requested count when one is given, otherwise by available memory
134
- "estimate " , layerCount ,
140
+ "real " , layerCount ,
135
141
),
136
142
slog .Group (
137
143
"memory" ,
0 commit comments