Skip to content

Commit c9121fd

Browse files
committed
llama : remove obsolete comments in build graphs
1 parent a104abe commit c9121fd

File tree

1 file changed

+1
-38
lines changed

1 file changed

+1
-38
lines changed

llama.cpp

Lines changed: 1 addition & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -3638,7 +3638,6 @@ static struct ggml_cgraph * llm_build_baichaun(
36383638

36393639
// self-attention
36403640
{
3641-
// compute Q and K and RoPE them
36423641
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
36433642
cb(Qcur, "Qcur", il);
36443643

@@ -3676,12 +3675,9 @@ static struct ggml_cgraph * llm_build_baichaun(
36763675
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
36773676
cb(K, "K", il);
36783677

3679-
// K * Q
36803678
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
36813679
cb(KQ, "KQ", il);
36823680

3683-
// KQ_scaled = KQ / sqrt(n_embd_head)
3684-
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
36853681
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
36863682
cb(KQ_scaled, "KQ_scaled", il);
36873683

@@ -3694,19 +3690,17 @@ static struct ggml_cgraph * llm_build_baichaun(
36943690
break;
36953691
case MODEL_13B:
36963692
// TODO: replace with ggml_add()
3697-
KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
3693+
KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); // TODO: n_head or n_head_kv
36983694
cb(KQ_scaled_alibi, "KQ_scaled_alibi", il);
36993695
KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
37003696
break;
37013697
default:
37023698
GGML_ASSERT(false);
37033699
}
37043700

3705-
// KQ = soft_max(KQ_masked)
37063701
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
37073702
cb(KQ_soft_max, "KQ_soft_max", il);
37083703

3709-
// split cached V into n_head heads
37103704
struct ggml_tensor * V =
37113705
ggml_view_3d(ctx0, kv_self.v,
37123706
n_kv, n_embd_head, n_head_kv,
@@ -3718,15 +3712,12 @@ static struct ggml_cgraph * llm_build_baichaun(
37183712
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
37193713
cb(KQV, "KQV", il);
37203714

3721-
// KQV_merged = KQV.permute(0, 2, 1, 3)
37223715
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
37233716
cb(KQV_merged, "KQV_merged", il);
37243717

3725-
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
37263718
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
37273719
cb(cur, "KQV_merged_contiguous", il);
37283720

3729-
// projection (no bias)
37303721
cur = ggml_mul_mat(ctx0,
37313722
model.layers[il].wo,
37323723
cur);
@@ -3882,7 +3873,6 @@ static struct ggml_cgraph * llm_build_falcon(
38823873
cur = attn_norm;
38833874
}
38843875

3885-
// compute QKV
38863876
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
38873877
cb(cur, "wqkv", il);
38883878

@@ -4106,24 +4096,18 @@ static struct ggml_cgraph * llm_build_starcoder(
41064096
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
41074097
cb(K, "K", il);
41084098

4109-
// K * Q
41104099
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
41114100
cb(KQ, "KQ", il);
41124101

4113-
// KQ_scaled = KQ / sqrt(n_embd_head)
4114-
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
41154102
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
41164103
cb(KQ_scaled, "KQ_scaled", il);
41174104

4118-
// KQ_masked = mask_past(KQ_scaled)
41194105
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
41204106
cb(KQ_masked, "KQ_masked", il);
41214107

4122-
// KQ = soft_max(KQ_masked)
41234108
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
41244109
cb(KQ_soft_max, "KQ_soft_max", il);
41254110

4126-
// split cached V into n_head heads
41274111
struct ggml_tensor * V =
41284112
ggml_view_3d(ctx0, kv_self.v,
41294113
n_kv, n_embd_head, n_head_kv,
@@ -4142,7 +4126,6 @@ static struct ggml_cgraph * llm_build_starcoder(
41424126
cb(cur, "KQV_merged_contiguous", il);
41434127
}
41444128

4145-
// Projection
41464129
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
41474130
cb(cur, "result_wo", il);
41484131

@@ -4506,8 +4489,6 @@ static struct ggml_cgraph * llm_build_refact(
45064489
const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
45074490
const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
45084491

4509-
// printf("n_kv = %d\n", n_kv);
4510-
45114492
auto & buf_compute = lctx.buf_compute;
45124493

45134494
struct ggml_init_params params = {
@@ -4584,27 +4565,21 @@ static struct ggml_cgraph * llm_build_refact(
45844565
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
45854566
cb(K, "K", il);
45864567

4587-
// K * Q
45884568
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
45894569
cb(KQ, "KQ", il);
45904570

4591-
// KQ_scaled = KQ / sqrt(n_embd_head)
4592-
// KQ_scaled shape [n_kv, n_tokens, n_head, 1]
45934571
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
45944572
cb(KQ_scaled, "KQ_scaled", il);
45954573

4596-
// KQ_masked = mask_past(KQ_scaled)
45974574
struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
45984575
cb(KQ_scaled_alibi, "KQ_scaled_alibi", il);
45994576

46004577
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
46014578
cb(KQ_masked, "KQ_masked", il);
46024579

4603-
// KQ = soft_max(KQ_masked)
46044580
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
46054581
cb(KQ_soft_max, "KQ_soft_max", il);
46064582

4607-
// split cached V into n_head heads
46084583
struct ggml_tensor * V =
46094584
ggml_view_3d(ctx0, kv_self.v,
46104585
n_kv, n_embd_head, n_head_kv,
@@ -4616,15 +4591,12 @@ static struct ggml_cgraph * llm_build_refact(
46164591
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
46174592
cb(KQV, "KQV", il);
46184593

4619-
// KQV_merged = KQV.permute(0, 2, 1, 3)
46204594
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
46214595
cb(KQV_merged, "KQV_merged", il);
46224596

4623-
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
46244597
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
46254598
cb(cur, "KQV_merged_contiguous", il);
46264599

4627-
// projection (no bias)
46284600
cur = ggml_mul_mat(ctx0,
46294601
model.layers[il].wo,
46304602
cur);
@@ -4789,27 +4761,21 @@ static struct ggml_cgraph * llm_build_bloom(
47894761
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
47904762
cb(K, "K", il);
47914763

4792-
// K * Q
47934764
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
47944765
cb(KQ, "KQ", il);
47954766

4796-
// KQ_scaled = KQ / sqrt(n_embd_head)
4797-
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
47984767
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
47994768
cb(KQ_scaled, "KQ_scaled", il);
48004769

48014770
struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
48024771
cb(KQ_scaled_alibi, "KQ_scaled_alibi", il);
48034772

4804-
// KQ_masked = mask_past(KQ_scaled)
48054773
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
48064774
cb(KQ_masked, "KQ_masked", il);
48074775

4808-
// KQ = soft_max(KQ_masked)
48094776
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
48104777
cb(KQ_soft_max, "KQ_soft_max", il);
48114778

4812-
// split cached V into n_head heads
48134779
struct ggml_tensor * V =
48144780
ggml_view_3d(ctx0, kv_self.v,
48154781
n_kv, n_embd_head, n_head_kv,
@@ -4821,16 +4787,13 @@ static struct ggml_cgraph * llm_build_bloom(
48214787
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
48224788
cb(KQV, "KQV", il);
48234789

4824-
// KQV_merged = KQV.permute(0, 2, 1, 3)
48254790
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
48264791
cb(KQV_merged, "KQV_merged", il);
48274792

4828-
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
48294793
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
48304794
cb(cur, "KQV_merged_contiguous", il);
48314795
}
48324796

4833-
// Projection
48344797
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
48354798
cb(cur, "result_wo", il);
48364799

0 commit comments

Comments
 (0)