@@ -510,22 +510,9 @@ struct llama_state {
// global state
static llama_state g_state;

-template <typename T>
-static T checked_mul(T a, T b) {
-    T ret = a * b;
-    if (a != 0 && ret / a != b) {
-        throw std::runtime_error(format("overflow multiplying %llu * %llu",
-                (unsigned long long) a, (unsigned long long) b));
-    }
-    return ret;
-}
-
-static size_t checked_div(size_t a, size_t b) {
-    if (b == 0 || a % b != 0) {
-        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
-    }
-    return a / b;
-}
+//
+// model loading and saving
+//

static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
    char buf[256];
@@ -536,14 +523,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
    return buf;
}

-static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
-    size_t size = ggml_type_size(type);
-    for (uint32_t dim : ne) {
-        size = checked_mul<size_t>(size, dim);
-    }
-    return size / ggml_blck_size(type);
-}
-
struct gguf_load_tensor {
    std::string name;
    enum ggml_type type = GGML_TYPE_F32;
@@ -573,20 +552,19 @@ struct gguf_file_loader {

    struct ggml_context * ctx_data = NULL;

-    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
+    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map) : file(fname, "rb") {
        fprintf(stderr, "llama.cpp: loading model from %s\n", fname);

-        struct gguf_init_params params = {
-            /* .no_alloc = */ true,
-            /* .ctx = */ &ctx_data,
-        };
+        struct gguf_init_params params = {
+            /* .no_alloc = */ true,
+            /* .ctx = */ &ctx_data,
+        };

-        gguf_ctx = gguf_init_from_file(fname, params);
-        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
+        gguf_ctx = gguf_init_from_file(fname, params);
+        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);

-        read_hparams();
-        read_vocab();
+        read_hparams();
+        read_vocab();
        read_tensor_metadata(tensors_map);
    }
@@ -636,18 +614,18 @@ struct gguf_file_loader {

    void read_vocab() {
        vocab.id_to_token.resize(hparams.n_vocab);
-        int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
+
+        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
        if (token_idx == -1) {
            throw std::runtime_error("cannot find token list in GGUF file\n");
        }

-        int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
+        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
        if (score_idx == -1) {
            throw std::runtime_error("cannot find token scores list in GGUF file\n");
        }

        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-
            std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);

            vocab.token_to_id[word] = i;
@@ -701,7 +679,7 @@ struct gguf_file_loader {
            tensor.file_off = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, i);

            tensor.name = name;
-            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            tensor.size = ggml_nbytes(cur);

            tensors_map.tensors.push_back(tensor);
            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
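
With this hunk, the tensor size is taken from ggml's own accounting rather than the removed `llama_calc_tensor_size()` helper. A rough sketch of the equivalent calculation, assuming only ggml's public helpers (`ggml_nelements`, `ggml_type_size`, `ggml_blck_size`) and not code from this commit:

    // Rough sketch only: approximately what ggml_nbytes(cur) accounts for.
    // Quantized types store elements in blocks, so the element count is
    // multiplied by the per-block byte size and divided by the block size.
    static size_t tensor_nbytes_sketch(const struct ggml_tensor * t) {
        return (size_t) ggml_nelements(t) * ggml_type_size(t->type) / ggml_blck_size(t->type);
    }
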
@@ -786,7 +764,7 @@ struct gguf_file_saver {
        gguf_type arr_type;
        int n_arr;

-        switch (vtype) {
+        switch (vtype) {
            case GGUF_TYPE_BOOL:
                bool_val = gguf_get_val_bool(fl->gguf_ctx, i);
                file.write_val<bool>(key, GGUF_TYPE_BOOL, bool_val);
@@ -809,7 +787,7 @@ struct gguf_file_saver {
                break;
            case GGUF_TYPE_STRING:
                str_val = gguf_get_val_str(fl->gguf_ctx, i);
-                file.write_val<std::string>(key, GGUF_TYPE_STRING, str_val);
+                file.write_str(key, GGUF_TYPE_STRING, str_val);
                break;
            case GGUF_TYPE_UINT16:
                u16_val = gguf_get_val_u16(fl->gguf_ctx, i);
@@ -825,7 +803,7 @@ struct gguf_file_saver {
                break;
            case GGUF_TYPE_ARRAY:
                arr_type = gguf_get_arr_type(fl->gguf_ctx, i);
-                n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
+                n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
                if (arr_type == GGUF_TYPE_FLOAT32) {
                    write_hparam_arr_f32(key, arr_type, i, n_arr);
                } else if (arr_type == GGUF_TYPE_STRING) {
@@ -922,20 +900,6 @@ struct llama_model_loader {
        }
    }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
-
-        return get_tensor_for(lt, backend);
-    }
-
    struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
        struct ggml_tensor * tensor;
        if (backend != GGML_BACKEND_CPU) {
@@ -959,16 +923,41 @@ struct llama_model_loader {
        return tensor;
    }

+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+        }
+        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
+        }
+
+        return get_tensor_for(lt, backend);
+    }
+
    void done_getting_tensors() const {
        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
            throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
        }
    }

-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
-        size_t data_size = 0;
+    void load_data_for(gguf_load_tensor & lt) const {
+        if (use_mmap) {
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            gguf_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
+        size_t data_size = 0;
        size_t prefetch_size = 0;
-        size_t lock_size = 0;
+        size_t lock_size = 0;
+
        for (const gguf_load_tensor & lt : tensors_map.tensors) {
            data_size += lt.size;
            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -1030,31 +1019,6 @@ struct llama_model_loader {
            done_size += lt.size;
        }
    }
-
-    void load_data_for(gguf_load_tensor & lt) {
-        if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
-        } else {
-            gguf_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
-        }
-
-        if (0) {
-            print_checksum(lt);
-        }
-    }
-
-    static void print_checksum(gguf_load_tensor & lt) {
-        uint32_t sum = 0;
-        for (size_t i = 0; i < lt.size; i++) {
-            uint8_t byte = lt.data[i];
-            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
-        }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
-            llama_format_tensor_shape(lt.ne).c_str(), lt.size);
-    }
-
};

//
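
After these two hunks, the loader's contract is: request each expected tensor with `get_tensor()`, confirm with `done_getting_tensors()` that nothing in the file went unclaimed, then pull the bytes in with `load_all_data()`, which either points `lt.data` into the mmap or reads from the file via `load_data_for()`. A hypothetical call sequence, assuming a constructed `llama_model_loader ml` is in scope and using a made-up tensor name and dimensions (this is not a call site from the commit):

    // Hypothetical usage sketch; `ml`, `n_embd` and `n_vocab` are assumed to be
    // in scope, and "output.weight" stands in for a real tensor name.
    struct ggml_tensor * w = ml.get_tensor("output.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
    ml.done_getting_tensors();  // throws if the file holds tensors that were never requested
    ml.load_all_data(/*progress_callback=*/nullptr, /*progress_callback_user_data=*/nullptr, /*lmlock=*/nullptr);
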
@@ -1184,18 +1148,18 @@ int64_t llama_time_us() {
}

//
-// model loading
+// load LLaMA models
//

-static const char *gguf_file_version_name(gguf_file_version version) {
+static const char * gguf_file_version_name(gguf_file_version version) {
    switch (version) {
        case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
-    }
+    }

    return "unknown";
}

-static const char *llama_ftype_name(enum llama_ftype ftype) {
+static const char * llama_ftype_name(enum llama_ftype ftype) {
    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32: return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1206,24 +1170,26 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
        case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
        case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+
        // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
-        default: return "unknown, may not work";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
+
+        default: return "unknown, may not work";
    }
}

-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
    switch (type) {
-        case MODEL_3B: return "3B";
-        case MODEL_7B: return "7B";
+        case MODEL_3B: return "3B";
+        case MODEL_7B: return "7B";
        case MODEL_13B: return "13B";
        case MODEL_30B: return "30B";
        case MODEL_65B: return "65B";
@@ -1604,7 +1570,6 @@ static struct ggml_cgraph * llama_build_graph(
    const int64_t n_embd_head = hparams.n_embd_head();
    const int64_t n_embd_gqa = hparams.n_embd_gqa();

-
    GGML_ASSERT(n_embd_head == hparams.n_rot);

    const float freq_base = hparams.rope_freq_base;
@@ -1713,7 +1678,7 @@ static struct ggml_cgraph * llama_build_graph(

        struct ggml_tensor * inpSA = inpL;

-        lctx.use_buf(ctx0, 0);
+        llama_context::use_buf(ctx0, 0);

        // norm
        {
@@ -1852,7 +1817,7 @@ static struct ggml_cgraph * llama_build_graph(
            ggml_set_name(cur, "result_wo");
        }

-        lctx.use_buf(ctx0, 1);
+        llama_context::use_buf(ctx0, 1);

        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
        offload_func(inpFF);
@@ -1908,7 +1873,7 @@ static struct ggml_cgraph * llama_build_graph(
        inpL = cur;
    }

-    lctx.use_buf(ctx0, 0);
+    llama_context::use_buf(ctx0, 0);

    // norm
    {
@@ -1926,7 +1891,7 @@ static struct ggml_cgraph * llama_build_graph(
    cur = ggml_mul_mat(ctx0, model.output, cur);
    ggml_set_name(cur, "result_output");

-    lctx.use_buf(ctx0, -1);
+    llama_context::use_buf(ctx0, -1);

    // logits -> probs
    // cur = ggml_soft_max_inplace(ctx0, cur);
@@ -2996,9 +2961,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
        }
    }

-    const auto rejects =
-        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (auto & reject : rejects) {
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
        candidates->data[reject.index].logit = -INFINITY;
    }

@@ -3725,7 +3689,7 @@ void llama_free(struct llama_context * ctx) {
int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
-        const llama_model_quantize_params *params) {
+        const llama_model_quantize_params * params) {
    try {
        llama_model_quantize_internal(fname_inp, fname_out, params);
        return 0;
@@ -4343,8 +4307,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
    GGML_UNUSED(n_token_capacity);
    GGML_UNUSED(n_token_count_out);

-
-    // TODO: implement with GGUF format
+    // TODO: implement with GGUF format
    return true;
}

@@ -4389,7 +4352,6 @@ int llama_eval(
    return 0;
}

-
int llama_eval_embd(
        struct llama_context * ctx,
        const float * embd,