
Commit f000018

remove extra, backend from ggml.c, ggml.h
1 parent e645d12 commit f000018

File tree (4 files changed, +38 −56 lines changed):

  ggml-cuda.cu
  ggml.c
  ggml.h
  llama.cpp

ggml-cuda.cu

Lines changed: 6 additions & 2 deletions
@@ -1196,7 +1196,7 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true);
 }
 
-void ggml_cuda_noop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
     (void) dst;
@@ -1287,6 +1287,10 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
+    if (tensor->src0 != nullptr && tensor->src0->op == GGML_OP_RESHAPE) {
+        ggml_cuda_assign_buffers(tensor->src0);
+    }
+
     const size_t size = ggml_nbytes(tensor);
     const size_t scratch_size = g_n_batch * GGML_CUDA_SCRATCH_SIZE_PER_BATCH;
     GGML_ASSERT(size <= scratch_size);
@@ -1367,7 +1371,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
             if (!any_on_device) {
                 return false;
             }
-            func = ggml_cuda_noop;
+            func = ggml_cuda_nop;
             break;
         case GGML_OP_ROPE:
             if (!any_on_device) {
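The new guard in ggml_cuda_assign_buffers exists because, after this commit, a reshape result no longer inherits its parent's backend (see the ggml.c hunks below), so the function walks into a reshape parent and assigns its buffer as well. A minimal standalone sketch of that recursion, using hypothetical stub types in place of the real ggml structs:

```cpp
#include <cstdio>

// Hypothetical stand-ins for ggml's tensor/op types (not the real API).
enum sketch_op { OP_NONE, OP_RESHAPE, OP_MUL_MAT };

struct sketch_tensor {
    sketch_op       op     = OP_NONE;
    sketch_tensor * src0   = nullptr;
    bool            on_gpu = false;   // stands in for "backend == GPU, extra buffer set"
};

// Same shape as ggml_cuda_assign_buffers: if this tensor's input is a reshape,
// give the reshape a GPU buffer first, then this tensor.
void sketch_assign_buffers(sketch_tensor * tensor) {
    if (tensor->src0 != nullptr && tensor->src0->op == OP_RESHAPE) {
        sketch_assign_buffers(tensor->src0);
    }
    tensor->on_gpu = true;             // the real code allocates scratch/VRAM here
}

int main() {
    sketch_tensor weights;
    sketch_tensor reshaped; reshaped.op = OP_RESHAPE; reshaped.src0 = &weights;
    sketch_tensor matmul;   matmul.op   = OP_MUL_MAT; matmul.src0   = &reshaped;

    sketch_assign_buffers(&matmul);    // offloads the reshape and then the matmul
    std::printf("reshaped on GPU: %d, matmul on GPU: %d\n", reshaped.on_gpu, matmul.on_gpu);
    return 0;
}
```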

ggml.c

Lines changed: 1 addition & 29 deletions
@@ -3639,8 +3639,6 @@ struct ggml_context {
 
     struct ggml_scratch scratch;
     struct ggml_scratch scratch_save;
-
-    enum ggml_backend default_backend;
 };
 
 struct ggml_context_container {
@@ -3967,7 +3965,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
        /*.objects_end     =*/ NULL,
        /*.scratch         =*/ { 0, 0, NULL, },
        /*.scratch_save    =*/ { 0, 0, NULL, },
-       /*.default_backend =*/ GGML_BACKEND_CPU,
     };
 
     GGML_ASSERT(ctx->mem_buffer != NULL);
@@ -4026,10 +4023,6 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
     ctx->no_alloc = no_alloc;
 }
 
-void ggml_set_default_backend(struct ggml_context * ctx, enum ggml_backend backend) {
-    ctx->default_backend = backend;
-}
-
 void * ggml_get_mem_buffer(struct ggml_context * ctx) {
     return ctx->mem_buffer;
 }
@@ -4141,7 +4134,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     *result = (struct ggml_tensor) {
         /*.type    =*/ type,
-        /*.backend =*/ ctx->default_backend,
+        /*.backend =*/ GGML_BACKEND_CPU,
         /*.n_dims  =*/ n_dims,
         /*.ne      =*/ { 1, 1, 1, 1 },
         /*.nb      =*/ { 0, 0, 0, 0 },
@@ -4174,15 +4167,6 @@ struct ggml_tensor * ggml_new_tensor_impl(
         result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
     }
 
-#ifdef GGML_USE_CUBLAS
-    if (result->backend == GGML_BACKEND_GPU) {
-        ggml_cuda_assign_buffers(result);
-    }
-#else
-    GGML_ASSERT(result->backend == GGML_BACKEND_CPU);
-#endif // GGML_USE_CUBLAS
-    GGML_ASSERT(result->backend != GGML_BACKEND_GPU_SPLIT);
-
     ctx->n_objects++;
 
     return result;
@@ -4537,8 +4521,6 @@ struct ggml_tensor * ggml_view_tensor(
     result->nb[1] = src->nb[1];
     result->nb[2] = src->nb[2];
     result->nb[3] = src->nb[3];
-    result->backend = src->backend;
-    result->extra = src->extra;
 
     return result;
 }
@@ -5691,8 +5673,6 @@ struct ggml_tensor * ggml_reshape(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
-    result->backend = a->backend;
-    result->extra = a->extra;
 
     return result;
 }
@@ -5717,8 +5697,6 @@ struct ggml_tensor * ggml_reshape_1d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
-    result->backend = a->backend;
-    result->extra = a->extra;
 
     return result;
 }
@@ -5744,8 +5722,6 @@ struct ggml_tensor * ggml_reshape_2d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
-    result->backend = a->backend;
-    result->extra = a->extra;
 
     return result;
 }
@@ -5772,8 +5748,6 @@ struct ggml_tensor * ggml_reshape_3d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
-    result->backend = a->backend;
-    result->extra = a->extra;
 
     return result;
 }
@@ -5802,8 +5776,6 @@ struct ggml_tensor * ggml_reshape_4d(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src0 = a;
     result->src1 = NULL;
-    result->backend = a->backend;
-    result->extra = a->extra;
 
     return result;
 }

ggml.h

Lines changed: 0 additions & 1 deletion
@@ -479,7 +479,6 @@ extern "C" {
 
     GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
-    GGML_API void   ggml_set_default_backend(struct ggml_context * ctx, enum ggml_backend backend);
 
     GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
     GGML_API size_t ggml_get_mem_size  (struct ggml_context * ctx);

llama.cpp

Lines changed: 31 additions & 24 deletions
@@ -60,6 +60,9 @@ static const size_t MB = 1024*1024;
 // TODO: dynamically determine these sizes
 //       needs modifications in ggml
 
+typedef void (*offload_func_t)(struct ggml_tensor * tensor);
+void llama_nop(struct ggml_tensor * tensor) {} // do nothing by default
+
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> k_sizes = {
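The typedef plus llama_nop set up a small function-pointer dispatch: each layer picks its offload behaviour once, either the no-op or ggml_cuda_assign_buffers, and the graph-building code in the hunks below then calls offload_func(...) unconditionally on every intermediate result. A standalone sketch of that pattern with assumed stub types (the i_gpu_start cutoff mirrors the layer loop in llama_eval_internal):

```cpp
#include <cstdio>

struct tensor_stub { bool on_gpu = false; };             // stand-in for ggml_tensor

typedef void (*offload_func_t)(tensor_stub * t);

void nop_offload(tensor_stub *) {}                       // CPU layers: do nothing
void gpu_offload(tensor_stub * t) { t->on_gpu = true; }  // stands in for ggml_cuda_assign_buffers

int main() {
    const int n_layer      = 4;
    const int n_gpu_layers = 2;
    const int i_gpu_start  = n_layer - n_gpu_layers;

    for (int il = 0; il < n_layer; ++il) {
        // pick the offload behaviour once per layer ...
        offload_func_t offload_func = nop_offload;
        if (il >= i_gpu_start) {
            offload_func = gpu_offload;
        }

        // ... then call it unconditionally on each intermediate result
        tensor_stub cur;
        offload_func(&cur);
        std::printf("layer %d offloaded: %d\n", il, cur.on_gpu);
    }
    return 0;
}
```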
@@ -1300,10 +1303,11 @@ static bool llama_eval_internal(
     const int i_gpu_start = n_layer - n_gpu_layers;
 
     for (int il = 0; il < n_layer; ++il) {
-        ggml_backend backend_offload = GGML_BACKEND_CPU;
+        offload_func_t offload_func = llama_nop;
+
 #ifdef GGML_USE_CUBLAS
         if (il >= i_gpu_start) {
-            backend_offload = GGML_BACKEND_GPU;
+            offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
         }
 #endif // GGML_USE_CUBLAS
 
@@ -1313,40 +1317,31 @@ static bool llama_eval_internal(
 
         // norm
         {
-            ggml_set_default_backend(ctx0, backend_offload);
             cur = ggml_rms_norm(ctx0, inpL);
+            offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+            offload_func(cur);
             ggml_set_name(cur, "attention_norm_0");
         }
 
         // self-attention
         {
             // compute Q and K and RoPE them
             struct ggml_tensor * tmpq = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N);
+            offload_func(cur);
             ggml_set_name(tmpq, "tmpq");
             struct ggml_tensor * tmpk = ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N);
+            offload_func(cur);
             ggml_set_name(tmpk, "tmpk");
-            ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
 
-#ifdef GGML_USE_CUBLAS
-            struct ggml_tensor * Kcur;
-            struct ggml_tensor * Qcur;
-            if (backend_offload == GGML_BACKEND_GPU) {
-                Kcur = ggml_rope(ctx0, tmpk, n_past, n_rot, 0);
-                Qcur = ggml_rope(ctx0, tmpq, n_past, n_rot, 0);
-            } else {
-                Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
-                Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
-            }
-#else
             struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, tmpk, n_past, n_rot, 0);
+            ggml_set_name(Kcur, "Kcur");
+
             struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, tmpq, n_past, n_rot, 0);
-#endif // GGML_USE_CUBLAS
             ggml_set_name(Qcur, "Qcur");
-            ggml_set_name(Kcur, "Kcur");
 
             // store key and value to memory
             {
@@ -1430,62 +1425,70 @@ static bool llama_eval_internal(
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
             ggml_set_name(cur, "KQV_merged_contiguous");
 
-            ggml_set_default_backend(ctx0, backend_offload);
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_wo");
         }
 
         lctx.use_buf(ctx0, 1);
         //ggml_cuda_set_scratch(1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
         ggml_set_name(inpFF, "inpFF");
 
         // feed-forward network
         {
             // norm
             {
                 cur = ggml_rms_norm(ctx0, inpFF);
+                offload_func(cur);
                 ggml_set_name(cur, "rms_norm_1");
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
                 ggml_set_name(cur, "ffn_norm");
             }
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
                     cur);
-            ggml_set_name(cur, "result_w3");
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w1,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_w2");
 
             // SILU activation
             cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
             ggml_set_name(cur, "silu");
 
             cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
             ggml_set_name(cur, "silu_x_result_w3");
 
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].w2,
                     cur);
+            offload_func(cur);
             ggml_set_name(cur, "result_w2");
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
         ggml_set_name(cur, "inpFF_+_result_w2");
 
         // input for next layer
         inpL = cur;
 
-        ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
     }
 
     lctx.use_buf(ctx0, 0);
@@ -1494,28 +1497,32 @@ static bool llama_eval_internal(
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
+    offload_func_t offload_func = llama_nop;
+
 #ifdef GGML_USE_CUBLAS
-    if (n_gpu_layers > n_layer) {
-        ggml_set_default_backend(ctx0, GGML_BACKEND_GPU);
-    }
+    if (n_gpu_layers > n_layer) {
+        offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
+    }
 #endif // GGML_USE_CUBLAS
 
     // norm
     {
         cur = ggml_rms_norm(ctx0, inpL);
+        offload_func(cur);
         ggml_set_name(cur, "rms_norm_inpL");
 
         cur = ggml_rms_norm(ctx0, cur);
+        offload_func(cur);
         ggml_set_name(cur, "rms_norm_after");
 
         // cur = cur*norm(broadcasted)
         cur = ggml_mul(ctx0, cur, model.norm);
+        offload_func(cur);
         ggml_set_name(cur, "result_norm");
 
         embeddings = cur;
     }
 
-    ggml_set_default_backend(ctx0, GGML_BACKEND_CPU);
 
     // lm_head
     cur = ggml_mul_mat(ctx0, model.output, cur);
