@@ -3345,7 +3345,6 @@ static struct ggml_tensor * llm_build_ffn(
3345
3345
// if max_alibi_bias > 0 then apply ALiBi
3346
3346
static struct ggml_tensor * llm_build_kqv (
3347
3347
struct ggml_context * ctx,
3348
- struct ggml_tensor * cur,
3349
3348
const llama_hparams & hparams,
3350
3349
const llama_kv_cache & kv,
3351
3350
struct ggml_tensor * wo,
@@ -3411,7 +3410,7 @@ static struct ggml_tensor * llm_build_kqv(
3411
3410
struct ggml_tensor * kqv_merged = ggml_permute (ctx, kqv, 0 , 2 , 1 , 3 );
3412
3411
cb (kqv_merged, " kqv_merged" , il);
3413
3412
3414
- cur = ggml_cont_2d (ctx, kqv_merged, n_embd, n_tokens);
3413
+ struct ggml_tensor * cur = ggml_cont_2d (ctx, kqv_merged, n_embd, n_tokens);
3415
3414
cb (cur, " kqv_merged_cont" , il);
3416
3415
3417
3416
cur = ggml_mul_mat (ctx, wo, cur);
@@ -3565,7 +3564,7 @@ struct llm_build_context {
3565
3564
3566
3565
llm_build_kv_store (ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
3567
3566
3568
- cur = llm_build_kqv (ctx0, cur, hparams, kv_self,
3567
+ cur = llm_build_kqv (ctx0, hparams, kv_self,
3569
3568
model.layers [il].wo , NULL ,
3570
3569
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1 .0f , cb, il);
3571
3570
cb (cur, " kqv_out" , il);
@@ -3677,7 +3676,7 @@ struct llm_build_context {
3677
3676
// apply ALiBi for 13B model
3678
3677
const float max_alibi_bias = model.type == MODEL_13B ? 8 .0f : -1 .0f ;
3679
3678
3680
- cur = llm_build_kqv (ctx0, cur, hparams, kv_self,
3679
+ cur = llm_build_kqv (ctx0, hparams, kv_self,
3681
3680
model.layers [il].wo , NULL ,
3682
3681
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il);
3683
3682
cb (cur, " kqv_out" , il);
@@ -3795,7 +3794,7 @@ struct llm_build_context {
3795
3794
3796
3795
llm_build_kv_store (ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
3797
3796
3798
- cur = llm_build_kqv (ctx0, attn_norm, hparams, kv_self,
3797
+ cur = llm_build_kqv (ctx0, hparams, kv_self,
3799
3798
model.layers [il].wo , NULL ,
3800
3799
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1 .0f , cb, il);
3801
3800
cb (cur, " kqv_out" , il);
@@ -3895,7 +3894,7 @@ struct llm_build_context {
3895
3894
3896
3895
llm_build_kv_store (ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
3897
3896
3898
- cur = llm_build_kqv (ctx0, cur, hparams, kv_self,
3897
+ cur = llm_build_kqv (ctx0, hparams, kv_self,
3899
3898
model.layers [il].wo , model.layers [il].bo ,
3900
3899
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1 .0f , cb, il);
3901
3900
cb (cur, " kqv_out" , il);
@@ -4100,7 +4099,7 @@ struct llm_build_context {
4100
4099
llm_build_kv_store (ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4101
4100
4102
4101
// TODO: not tested, could be broken
4103
- cur = llm_build_kqv (ctx0, Q, hparams, kv_self,
4102
+ cur = llm_build_kqv (ctx0, hparams, kv_self,
4104
4103
model.layers [il].wo , model.layers [il].bo ,
4105
4104
Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1 .0f , cb, il);
4106
4105
cb (cur, " kqv_out" , il);
@@ -4191,7 +4190,7 @@ struct llm_build_context {
4191
4190
4192
4191
llm_build_kv_store (ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4193
4192
4194
- cur = llm_build_kqv (ctx0, Qcur, hparams, kv_self,
4193
+ cur = llm_build_kqv (ctx0, hparams, kv_self,
4195
4194
model.layers [il].wo , NULL ,
4196
4195
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8 .0f , cb, il);
4197
4196
cb (cur, " kqv_out" , il);
@@ -4288,7 +4287,7 @@ struct llm_build_context {
4288
4287
4289
4288
llm_build_kv_store (ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4290
4289
4291
- cur = llm_build_kqv (ctx0, Qcur, hparams, kv_self,
4290
+ cur = llm_build_kqv (ctx0, hparams, kv_self,
4292
4291
model.layers [il].wo , model.layers [il].bo ,
4293
4292
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8 .0f , cb, il);
4294
4293
cb (cur, " kqv_out" , il);
@@ -4382,7 +4381,7 @@ struct llm_build_context {
4382
4381
4383
4382
llm_build_kv_store (ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
4384
4383
4385
- cur = llm_build_kqv (ctx0, Qcur, hparams, kv_self,
4384
+ cur = llm_build_kqv (ctx0, hparams, kv_self,
4386
4385
model.layers [il].wo , NULL ,
4387
4386
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias , cb, il);
4388
4387
cb (cur, " kqv_out" , il);
0 commit comments