Commit 0661e6a

sched : add a new split if the current one has too many inputs
reduce max inputs per split; more cleanup
1 parent 9809075 commit 0661e6a
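
The core of the change is in ggml_backend_sched_split_graph (see the ggml-backend.c diff below): besides closing a split when the node's backend changes or a weight lives on another backend, the scheduler now also closes a split once it has accumulated GGML_SCHED_MAX_SPLIT_INPUTS inputs and the next node would pull in yet another input from a different backend. The following is a minimal standalone sketch of that policy only, using hypothetical split_t/node_t types rather than the ggml API:

    // sketch of the "too many inputs" split policy (hypothetical types, not the ggml API)
    #define MAX_SPLIT_INPUTS 4
    #define MAX_SRC          8

    typedef struct { int n_inputs; } split_t;
    typedef struct { int backend_id; int n_srcs; int src_backend_ids[MAX_SRC]; } node_t;

    // returns 1 if node should start a new split instead of joining the current one
    int needs_new_split(const split_t * split, const node_t * node, int cur_backend_id) {
        if (node->backend_id != cur_backend_id) {
            return 1; // the node runs on a different backend: always split
        }
        if (split->n_inputs < MAX_SPLIT_INPUTS) {
            return 0; // the current split still has room for more inputs
        }
        // the split is full: start a new one if this node would pull in
        // another input from a different backend
        for (int j = 0; j < node->n_srcs; j++) {
            if (node->src_backend_ids[j] != cur_backend_id) {
                return 1;
            }
        }
        return 0;
    }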

File tree

examples/llama-bench/llama-bench.cpp
ggml-backend.c
ggml-cuda.cu
ggml-cuda.h
llama.cpp

5 files changed: +81 -70 lines changed

examples/llama-bench/llama-bench.cpp

Lines changed: 2 additions & 2 deletions
@@ -114,10 +114,10 @@ static std::string get_cpu_info() {
 static std::string get_gpu_info() {
     std::string id;
 #ifdef GGML_USE_CUBLAS
-    int count = ggml_cuda_get_device_count();
+    int count = ggml_backend_cuda_get_device_count();
     for (int i = 0; i < count; i++) {
         char buf[128];
-        ggml_cuda_get_device_description(i, buf, sizeof(buf));
+        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
         id += buf;
         if (i < count - 1) {
             id += "/";

ggml-backend.c

Lines changed: 34 additions & 12 deletions
@@ -768,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        if (cpu_plan->cplan.work_data == NULL) {
+            free(cpu_plan);
+            return NULL;
+        }
     }
 
     cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@@ -1007,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 1024
+#define GGML_SCHED_MAX_SPLITS 2048
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#define GGML_SCHED_MAX_SPLIT_INPUTS 4
 #endif
 
 #ifndef GGML_SCHED_MAX_COPIES
@@ -1422,31 +1426,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
             GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
 
-            // check if a weight is on a different backend and start a new split if so
-            // by starting a new split, the memory of the previously offloaded weights can be reused
-            bool offload = false;
+            // check if we should start a new split based on the sources of the current node
+            bool need_new_split = false;
             if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
                 for (int j = 0; j < GGML_MAX_SRC; j++) {
                     struct ggml_tensor * src = node->src[j];
                     if (src == NULL) {
                         continue;
                     }
+                    // check if a weight is on a different backend
+                    // by starting a new split, the memory of the previously offloaded weights can be reused
                     if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                         int src_backend_id = tensor_backend_id(src);
                         if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
-                            offload = true;
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                    // check if the split has too many inputs
+                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+                        const size_t id = hash_id(src);
+                        int src_backend_id = sched->tensor_backend_id[id];
+                        if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                            //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
+                            need_new_split = true;
                             break;
                         }
                     }
                 }
             }
 
-            if (node_backend_id != cur_backend_id || offload) {
+            if (node_backend_id != cur_backend_id || need_new_split) {
                 split->i_end = i;
                 i_split++;
                 if (i_split >= sched->splits_capacity) {
                     sched->splits_capacity *= 2;
                     sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                    GGML_ASSERT(sched->splits != NULL);
                 }
                 GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
                 split = &sched->splits[i_split];
@@ -1523,13 +1539,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
     // create copies of the graph for each split
     // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
+            assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
             struct ggml_tensor * input = split->inputs[j];
             const size_t input_id = hash_id(input);
             struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
@@ -1546,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }
 
         for (int j = split->i_start; j < split->i_end; j++) {
+            assert(graph_copy->size > graph_copy->n_nodes);
            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
@@ -1630,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 }
                 ggml_backend_tensor_copy(input, input_cpy);
             } else {
+                // wait for the split backend to finish using the input before overwriting it
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                 } else {
                     ggml_backend_synchronize(split_backend);
-                    ggml_backend_synchronize(input_backend);
                 }
-
                 ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
             }
         }
@@ -1709,8 +1727,10 @@ ggml_backend_sched_t ggml_backend_sched_new(
     sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
+
+    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
 
     sched->n_backends = n_backends;
 
@@ -1770,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 }
 
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
     ggml_backend_sched_split_graph(sched, measure_graph);
 
     // TODO: extract this to a separate function
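
One consequence of the new constants, visible in the ggml_backend_sched_new and graph-copy hunks above: the copied graph now reserves n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2 extra node slots, and the node/leaf backend-id arrays are sized for the worst case of GGML_SCHED_MAX_SPLITS splits. Illustrative arithmetic only, with a made-up graph_size rather than a value from the commit:

    // back-of-the-envelope check of the new worst-case sizing (not code from the commit)
    #include <stdio.h>

    int main(void) {
        const int max_splits       = 2048; // GGML_SCHED_MAX_SPLITS
        const int max_split_inputs = 4;    // GGML_SCHED_MAX_SPLIT_INPUTS
        const int graph_size       = 2048; // hypothetical graph size

        const int nodes_size = graph_size + max_splits * max_split_inputs * 2;
        printf("node_backend_ids / leaf_backend_ids entries: %d\n", nodes_size); // 2048 + 16384 = 18432
        return 0;
    }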

ggml-cuda.cu

Lines changed: 10 additions & 23 deletions
@@ -7791,11 +7791,11 @@ struct cuda_pool_alloc {
 
 static bool g_cublas_loaded = false;
 
-GGML_CALL bool ggml_cublas_loaded(void) {
+static bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
 
-GGML_CALL void ggml_init_cublas() {
+static void ggml_init_cublas() {
     static bool initialized = false;
 
     if (!initialized) {
@@ -7884,7 +7884,7 @@ GGML_CALL void ggml_init_cublas() {
     }
 }
 
-GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
+static void * ggml_cuda_host_malloc(size_t size) {
     if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
         return nullptr;
     }
@@ -7902,7 +7902,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
     return ptr;
 }
 
-GGML_CALL void ggml_cuda_host_free(void * ptr) {
+static void ggml_cuda_host_free(void * ptr) {
     CUDA_CHECK(cudaFreeHost(ptr));
 }
 
@@ -9569,21 +9569,6 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
 
-GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-            src1->type == GGML_TYPE_F32 &&
-             dst->type == GGML_TYPE_F32 &&
-            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
-}
-
 static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
     GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
@@ -10336,7 +10321,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
     return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }
 
-GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
+static void ggml_cuda_set_main_device(const int main_device) {
     if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                 main_device, g_device_count, g_main_device);
@@ -10351,7 +10336,7 @@ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
     }
 }
 
-GGML_CALL bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) {
+static bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) {
     if (!g_cublas_loaded) return false;
 
     if (tensor->op == GGML_OP_MUL_MAT) {
@@ -10505,15 +10490,15 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) {
     return true;
 }
 
-GGML_CALL int ggml_cuda_get_device_count() {
+static int ggml_cuda_get_device_count() {
     int device_count;
     if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
         return 0;
     }
     return device_count;
 }
 
-GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+static void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
@@ -11397,6 +11382,8 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
     const int min_batch_size = 32;
 
     return op->ne[1] > min_batch_size && op->op != GGML_OP_GET_ROWS;
+
+    UNUSED(backend);
 }
 
 static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {

ggml-cuda.h

Lines changed: 3 additions & 4 deletions
@@ -17,18 +17,17 @@ extern "C" {
 
 #define GGML_CUDA_MAX_DEVICES 16
 
-// TODO: remove this
-GGML_API GGML_CALL int  ggml_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
-
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
 
 GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
 
+// device buffer
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+
 // split tensor buffer that splits matrices by rows across multiple devices
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
 

llama.cpp

Lines changed: 32 additions & 29 deletions
@@ -8612,16 +8612,18 @@ static struct ggml_cgraph * llama_build_graph(
         }
 
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
-        // to fix this, we assign the norm layer manually to the backend of its layer
-        // FIXME: interferes with auto offloading of large batches
-        //if (il != -1 && strcmp(name, "norm") == 0) {
-        //    for (auto * backend : lctx.backends) {
-        //        if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
-        //            ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
-        //            break;
-        //        }
-        //    }
-        //}
+        // FIXME: fix in ggml_backend_sched
+        const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+        if (batch.n_tokens <= 32 || full_offload) {
+            if (il != -1 && strcmp(name, "norm") == 0) {
+                for (auto * backend : lctx.backends) {
+                    if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                        break;
+                    }
+                }
+            }
+        }
     };
 
     struct ggml_cgraph * result = NULL;
@@ -13119,27 +13121,25 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(ctx->backend_metal);
         }
 #elif defined(GGML_USE_CUBLAS)
-        if (model->n_gpu_layers >= 0) { // TODO: make auto-offload configurable
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
-            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-                ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
+            ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else {
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+            for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
+                ggml_backend_t backend = ggml_backend_cuda_init(device);
                 if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
+                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
                     llama_free(ctx);
                     return nullptr;
                 }
                 ctx->backends.push_back(backend);
-            } else {
-                // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
-                for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
-                    ggml_backend_t backend = ggml_backend_cuda_init(device);
-                    if (backend == nullptr) {
-                        LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
-                        llama_free(ctx);
-                        return nullptr;
-                    }
-                    ctx->backends.push_back(backend);
-                }
             }
         }
 #elif defined(GGML_USE_VULKAN)
@@ -13297,14 +13297,17 @@ struct llama_context * llama_new_context_with_model(
             ggml_backend_t backend = ctx->backends[i];
             ggml_backend_buffer_type_t buft = backend_buft[i];
             size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
-            LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
-                    ggml_backend_buft_name(buft),
-                    size / 1024.0 / 1024.0);
+            if (size > 1) {
+                LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                        ggml_backend_buft_name(buft),
+                        size / 1024.0 / 1024.0);
+            }
         }
 
         // note: the number of splits during measure is higher than during inference due to the kv shift
         int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-        LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
+        LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
+        LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
     }
 }
 
