- added functionality to find the smallest fitting buffer instead of the first found buffer that is >= the requested size (see the sketch below)
-- this prevents two buffer allocations in sequence from grabbing a huge buffer for a small tensor and then requiring a new buffer for the 2nd tensor
-- in my test this saved 1 GB of VRAM that is now free for more offloading
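The following is a minimal sketch of that best-fit strategy, assuming a simple fixed-size buffer pool; the struct, pool size, and function names (cuda_buffer, g_pool, pool_malloc_best_fit) are illustrative and not the actual ggml-cuda identifiers:

```cpp
#include <cstddef>

// Illustrative pool entry: ptr == nullptr means the slot is empty or currently handed out.
struct cuda_buffer {
    void * ptr  = nullptr;
    size_t size = 0;
};

static cuda_buffer g_pool[256]; // hypothetical cached-buffer pool

// Best-fit lookup: return the smallest cached buffer that is still >= the requested
// size, instead of the first buffer that happens to be big enough.
static void * pool_malloc_best_fit(size_t size, size_t * actual_size) {
    int best = -1;
    for (int i = 0; i < 256; ++i) {
        const cuda_buffer & b = g_pool[i];
        if (b.ptr != nullptr && b.size >= size) {
            if (best == -1 || b.size < g_pool[best].size) {
                best = i; // a smaller buffer that still fits - prefer it
            }
        }
    }
    if (best < 0) {
        return nullptr; // nothing cached fits; caller falls back to a fresh allocation
    }
    void * ptr   = g_pool[best].ptr;
    *actual_size = g_pool[best].size;
    g_pool[best].ptr = nullptr; // mark the slot as handed out
    return ptr;
}
```

With a first-fit pool, a small tensor can claim a multi-GB cached buffer and force the next large request to allocate a fresh one; best-fit keeps the big buffers available for the requests that actually need them.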
cuda free buffers:
- added a helper function that frees all unused buffers of a device, so that the huge F32 buffers used for cuBLAS do not keep occupying VRAM needlessly after token ingestion (a rough sketch follows below)
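A rough sketch of such a cleanup helper, reusing the illustrative pool from the sketch above (the function name is hypothetical; the real helper lives in the CUDA backend):

```cpp
#include <cuda_runtime.h>

// Free every cached-but-unused buffer in the pool and return the VRAM to the driver.
// Meant to be called after batched prompt ingestion, when the large F32 cuBLAS
// scratch buffers are no longer needed.
static void pool_free_all_unused() {
    for (int i = 0; i < 256; ++i) {
        cuda_buffer & b = g_pool[i];
        if (b.ptr != nullptr) { // slot holds a cached, currently unused buffer
            cudaFree(b.ptr);
            b.ptr  = nullptr;
            b.size = 0;
        }
    }
}
```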
libfalcon:
- corrected vram_overhead calculation to account for the actual non-weight buffers needed during inference
- added vram_overhead for n_batch > 1, as this switches ingestion into a 32-bit dequantization mode for cuBLAS that needs almost 2 GB of VRAM in buffers (worked numbers after this list)
- corrected the automated layer distribution to fill VRAM as much as possible with layers
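Taking the constants from the diff below as a worked example: for FALCON_40B with n_batch > 1 the estimated overhead grows from 1250 MB to 1250 + (1024 + 288 + 256) = 2818 MB, and for FALCON_7B with n_batch > 1 to 1250 + (315 + 80 + 78) = 1723 MB, on top of the weights and scratch buffers.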
From here on it is recommended to use --ngl 100 and -b 1 for CUDA processing.
For -t it is recommended to use either 1 thread or one thread less than the number of CPU cores (the best value depends on the CPU and GPU used); an example invocation follows.
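An invocation following these recommendations could look like the line below. Only --ngl 100, -b 1, and -t come from the recommendation above; the falcon_main binary name, the model path, the prompt flag, and -t 7 (assuming an 8-core CPU) are illustrative assumptions, not taken from this commit:

```
falcon_main -m models/falcon-40b/ggml-model-q4_0.bin -p "Hello" --ngl 100 -b 1 -t 7
```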
@@ … @@ size_t vram_reserved=1024*1024*512; //will be adapted by model
+    size_t vram_total=0;
+    size_t vram_free=0;
+    const size_t vram_reserved=512*MB; // that amount of VRAM is to stay free on GPU (headroom for other processes - may be reduced in pure server environments)
+    size_t vram_overhead = 1250*MB; // this amount of vram is estimated for non weight storage buffers on VRAM (no big difference between 7B and 40B, needs to increase when more work is offloaded in the future)
+    // cublas is used in 32 bit mode, temporary cuda storage/conversion buffers are needed for batch ingestion ( could be run in 16 bit mode without performance downgrade and save half the VRAM)
+    if (model.type == FALCON_40B && n_batch > 1)
+        vram_overhead += (1024+288+256) * MB;
+    if (model.type == FALCON_7B && n_batch > 1)
+        vram_overhead += (315+80+78) * MB;
 #if defined(GGML_USE_CUBLAS)
     cudaMemGetInfo(&vram_free, &vram_total); // this should go in ggml-cuda.cu but I don't want to make Johannes life harder by modifying that yet
@@ -1252,12 +1246,14 @@ size_t vram_reserved=1024*1024*512; //will be adapted by model
     }

     const int i_gpu_start = n_layer - n_gpu_layers;
-    int i_gpu_end = n_layer; // allows to terminate the offloading earlier. TODO: instead do a proper calculation run and determine the start before the loop
+    int i_gpu_last = n_layer; // allows to terminate the offloading earlier. TODO: instead do a proper calculation run and determine the start before the loop
@@ -1288,14 +1284,15 @@ size_t vram_reserved=1024*1024*512; //will be adapted by model
         vram_layer = calculate_layer_vram_bytes(layer);
         vram_weights += vram_layer;
         vram_free = (vram_layer > vram_free) ? 0 : vram_free - vram_layer; // simulate the layer being loaded in VRAM
-
-        if (vram_free <= (vram_overhead+vram_scratch+vram_reserved))
+        // test if we have enough VRAM to load the next layer
+        if (i < n_layer && vram_free <= (vram_overhead+vram_scratch+vram_reserved+vram_layer))
         {
             // this needs some polishing (instead of fiddling with --ngl I'd like the option to auto-fill the vram with as many layers as possible as an alternative)
-            fprintf(stderr, "WARNING: Not enough VRAM to load the model as configured - at layer %d of %d\n", i, n_layer);
+            fprintf(stderr, "INFO: Not enough VRAM to load all requested layers - at layer %d of %d: skipping\n", i, n_layer);
             n_gpu_layers = i+1;
-            model.n_gpu_layers = n_gpu_layers;
-            i_gpu_end = i;
+            model.n_gpu_layers = n_gpu_layers;
+            i_gpu_last = i;
+            model.i_gpu_last = i_gpu_last;
         }
     }
@@ -1335,7 +1332,7 @@ size_t vram_reserved=1024*1024*512; //will be adapted by model
     if (n_gpu_layers > (int) hparams.n_layer) {
         fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);