@@ -3638,7 +3638,6 @@ static struct ggml_cgraph * llm_build_baichaun(
 
         // self-attention
         {
-            // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
             cb(Qcur, "Qcur", il);
 
@@ -3676,12 +3675,9 @@ static struct ggml_cgraph * llm_build_baichaun(
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
             cb(K, "K", il);
 
-            // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
             cb(KQ, "KQ", il);
 
-            // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
             cb(KQ_scaled, "KQ_scaled", il);
 
@@ -3694,19 +3690,17 @@ static struct ggml_cgraph * llm_build_baichaun(
                     break;
                 case MODEL_13B:
                     // TODO: replace with ggml_add()
-                    KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
+                    KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); // TODO: n_head or n_head_kv
                     cb(KQ_scaled_alibi, "KQ_scaled_alibi", il);
                     KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
                     break;
                 default:
                     GGML_ASSERT(false);
             }
 
-            // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
             cb(KQ_soft_max, "KQ_soft_max", il);
 
-            // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
                         n_kv, n_embd_head, n_head_kv,
@@ -3718,15 +3712,12 @@ static struct ggml_cgraph * llm_build_baichaun(
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             cb(KQV, "KQV", il);
 
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
             cb(KQV_merged, "KQV_merged", il);
 
-            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
             cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             cb(cur, "KQV_merged_contiguous", il);
 
-            // projection (no bias)
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
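
Note: the comments removed above spelled out the attention pipeline that every builder touched by this commit repeats. For reference, a minimal sketch of that pipeline, using only the ggml calls visible in the hunks; KQ_scale is assumed to hold 1/sqrt(n_embd_head) (as the removed comments stated), and the real builders' KV-cache and shape wiring is omitted:

    #include "ggml.h"

    // Schematic only: not the actual builder code.
    static struct ggml_tensor * attn_sketch(struct ggml_context * ctx0,
            struct ggml_tensor * Q, struct ggml_tensor * K, struct ggml_tensor * V,
            struct ggml_tensor * KQ_scale, struct ggml_tensor * KQ_mask,
            struct ggml_tensor * wo, int64_t n_embd, int64_t n_tokens) {
        struct ggml_tensor * KQ          = ggml_mul_mat (ctx0, K, Q);                // K * Q
        struct ggml_tensor * KQ_scaled   = ggml_scale   (ctx0, KQ, KQ_scale);        // KQ / sqrt(n_embd_head)
        struct ggml_tensor * KQ_masked   = ggml_add     (ctx0, KQ_scaled, KQ_mask);  // mask out future positions
        struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);           // attention weights
        struct ggml_tensor * KQV         = ggml_mul_mat (ctx0, V, KQ_soft_max);      // weighted sum of V
        struct ggml_tensor * KQV_merged  = ggml_permute (ctx0, KQV, 0, 2, 1, 3);     // move the head dim back
        struct ggml_tensor * cur         = ggml_cont_2d (ctx0, KQV_merged, n_embd, n_tokens); // contiguous [n_embd, n_tokens]
        return ggml_mul_mat(ctx0, wo, cur);                                          // output projection (no bias)
    }
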
@@ -3882,7 +3873,6 @@ static struct ggml_cgraph * llm_build_falcon(
                 cur = attn_norm;
             }
 
-            // compute QKV
             cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);
 
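
Note: the wqkv matmul above produces Q, K and V fused in a single tensor; the hunk does not show how they are split. A hedged sketch of the idea, assuming a simple [Q | K | V] row layout (the actual Falcon builder interleaves heads and slices with ggml_view_3d, so the helper and offsets below are illustrative, not copied from the source):

    #include "ggml.h"

    // Hypothetical helper: take Q/K/V views out of a fused QKV projection.
    static void split_qkv_sketch(struct ggml_context * ctx0, struct ggml_tensor * qkv,
            int64_t n_embd, int64_t n_embd_gqa, int64_t n_tokens,
            struct ggml_tensor ** Qcur, struct ggml_tensor ** Kcur, struct ggml_tensor ** Vcur) {
        const size_t es  = ggml_element_size(qkv);
        const size_t nb1 = es * qkv->ne[0]; // byte stride of one fused row

        *Qcur = ggml_view_2d(ctx0, qkv, n_embd,     n_tokens, nb1, 0);
        *Kcur = ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, nb1, es *  n_embd);
        *Vcur = ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, nb1, es * (n_embd + n_embd_gqa));
    }
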
@@ -4106,24 +4096,18 @@ static struct ggml_cgraph * llm_build_starcoder(
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
             cb(K, "K", il);
 
-            // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
             cb(KQ, "KQ", il);
 
-            // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             cb(KQ_scaled, "KQ_scaled", il);
 
-            // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
             cb(KQ_masked, "KQ_masked", il);
 
-            // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             cb(KQ_soft_max, "KQ_soft_max", il);
 
-            // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
                         n_kv, n_embd_head, n_head_kv,
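
Note: the V view above is cut off at the hunk boundary; the removed "split cached V into n_head heads" comments referred to this step. A hedged sketch of what the full view plausibly looks like, assuming the V cache stores each layer transposed as n_embd_gqa rows of n_ctx values (the strides and offset are assumptions, not copied from this diff):

    // Sketch: view layer il of the transposed V cache as [n_kv, n_embd_head, n_head_kv].
    struct ggml_tensor * V =
        ggml_view_3d(ctx0, kv_self.v,
                n_kv, n_embd_head, n_head_kv,
                ggml_element_size(kv_self.v)*n_ctx,                // nb1: step along the head dimension
                ggml_element_size(kv_self.v)*n_ctx*n_embd_head,    // nb2: step between heads
                ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); // offset: start of layer il
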
@@ -4142,7 +4126,6 @@ static struct ggml_cgraph * llm_build_starcoder(
             cb(cur, "KQV_merged_contiguous", il);
         }
 
-        // Projection
         cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
         cb(cur, "result_wo", il);
 
@@ -4506,8 +4489,6 @@ static struct ggml_cgraph * llm_build_refact(
     const int32_t n_kv    = worst_case ? n_ctx            : kv_self.n;
     const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
 
-    //printf("n_kv = %d\n", n_kv);
-
     auto & buf_compute = lctx.buf_compute;
 
     struct ggml_init_params params = {
@@ -4584,27 +4565,21 @@ static struct ggml_cgraph * llm_build_refact(
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
             cb(K, "K", il);
 
-            // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
             cb(KQ, "KQ", il);
 
-            // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
             cb(KQ_scaled, "KQ_scaled", il);
 
-            // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
             cb(KQ_scaled_alibi, "KQ_scaled_alibi", il);
 
             struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
             cb(KQ_masked, "KQ_masked", il);
 
-            // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
             cb(KQ_soft_max, "KQ_soft_max", il);
 
-            // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
                         n_kv, n_embd_head, n_head_kv,
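
Note: ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8) above adds the ALiBi position bias to the scaled scores instead of a rotary encoding; the last argument is the maximum bias. A rough sketch of the bias it corresponds to, assuming a power-of-two n_head (the slope schedule for other head counts is an implementation detail of ggml_alibi):

    #include <math.h>

    // Illustration of the ALiBi bias, not the ggml kernel.
    // Head h (0-based) gets slope m_h = 2^(-max_bias * (h + 1) / n_head); the score of
    // query i attending to key j receives m_h * (j - i), which equals m_h * j up to a
    // per-row constant that the softmax cancels.
    static float alibi_slope(int h, int n_head, float max_bias) {
        return powf(2.0f, -max_bias * (float)(h + 1) / (float)n_head);
    }

    static float alibi_bias(int h, int n_head, float max_bias, int i_query, int j_key) {
        return alibi_slope(h, n_head, max_bias) * (float)(j_key - i_query);
    }
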
@@ -4616,15 +4591,12 @@ static struct ggml_cgraph * llm_build_refact(
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             cb(KQV, "KQV", il);
 
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
             cb(KQV_merged, "KQV_merged", il);
 
-            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
             cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             cb(cur, "KQV_merged_contiguous", il);
 
-            // projection (no bias)
             cur = ggml_mul_mat(ctx0,
                     model.layers[il].wo,
                     cur);
@@ -4789,27 +4761,21 @@ static struct ggml_cgraph * llm_build_bloom(
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
             cb(K, "K", il);
 
-            // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
             cb(KQ, "KQ", il);
 
-            // KQ_scaled = KQ / sqrt(n_embd_head)
-            // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             cb(KQ_scaled, "KQ_scaled", il);
 
             struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
             cb(KQ_scaled_alibi, "KQ_scaled_alibi", il);
 
-            // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
             cb(KQ_masked, "KQ_masked", il);
 
-            // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             cb(KQ_soft_max, "KQ_soft_max", il);
 
-            // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
                         n_kv, n_embd_head, n_head_kv,
@@ -4821,16 +4787,13 @@ static struct ggml_cgraph * llm_build_bloom(
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             cb(KQV, "KQV", il);
 
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
             cb(KQV_merged, "KQV_merged", il);
 
-            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
             cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
             cb(cur, "KQV_merged_contiguous", il);
         }
 
-        // Projection
         cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
         cb(cur, "result_wo", il);
 