@@ -573,20 +573,19 @@ struct gguf_file_loader {

     struct ggml_context * ctx_data = NULL;

-    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
+    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map) : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);

-        struct gguf_init_params params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &ctx_data,
-        };
+        struct gguf_init_params params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &ctx_data,
+        };

-        gguf_ctx = gguf_init_from_file(fname, params);
-        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
+        gguf_ctx = gguf_init_from_file(fname, params);
+        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);

-        read_hparams();
-        read_vocab();
+        read_hparams();
+        read_vocab();
         read_tensor_metadata(tensors_map);
     }
@@ -636,18 +635,18 @@ struct gguf_file_loader {

     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
-        int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
+
+        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
         if (token_idx == -1) {
             throw std::runtime_error("cannot find token list in GGUF file\n");
         }

-        int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
+        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
         if (score_idx == -1) {
             throw std::runtime_error("cannot find token scores list in GGUF file\n");
         }

         for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-
             std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);

             vocab.token_to_id[word] = i;
@@ -786,7 +785,7 @@ struct gguf_file_saver {
             gguf_type arr_type;
             int n_arr;

-            switch (vtype) {
+            switch (vtype) {
                 case GGUF_TYPE_BOOL:
                     bool_val = gguf_get_val_bool(fl->gguf_ctx, i);
                     file.write_val<bool>(key, GGUF_TYPE_BOOL, bool_val);
@@ -809,7 +808,7 @@ struct gguf_file_saver {
                     break;
                 case GGUF_TYPE_STRING:
                     str_val = gguf_get_val_str(fl->gguf_ctx, i);
-                    file.write_val<std::string>(key, GGUF_TYPE_STRING, str_val);
+                    file.write_str(key, GGUF_TYPE_STRING, str_val);
                     break;
                 case GGUF_TYPE_UINT16:
                     u16_val = gguf_get_val_u16(fl->gguf_ctx, i);
@@ -825,7 +824,7 @@
                     break;
                 case GGUF_TYPE_ARRAY:
                     arr_type = gguf_get_arr_type(fl->gguf_ctx, i);
-                    n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
+                    n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
                     if (arr_type == GGUF_TYPE_FLOAT32) {
                         write_hparam_arr_f32(key, arr_type, i, n_arr);
                     } else if (arr_type == GGUF_TYPE_STRING) {
@@ -922,20 +921,6 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                        name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
-
-        return get_tensor_for(lt, backend);
-    }
-
     struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (backend != GGML_BACKEND_CPU) {
@@ -959,16 +944,41 @@ struct llama_model_loader {
         return tensor;
     }

+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+        }
+        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                        name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
+        }
+
+        return get_tensor_for(lt, backend);
+    }
+
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }

-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
-        size_t data_size = 0;
+    void load_data_for(gguf_load_tensor & lt) const {
+        if (use_mmap) {
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            gguf_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
+        size_t data_size = 0;
         size_t prefetch_size = 0;
-        size_t lock_size = 0;
+        size_t lock_size = 0;
+
         for (const gguf_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -1030,31 +1040,6 @@ struct llama_model_loader {
             done_size += lt.size;
         }
     }
-
-    void load_data_for(gguf_load_tensor & lt) {
-        if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
-        } else {
-            gguf_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
-        }
-
-        if (0) {
-            print_checksum(lt);
-        }
-    }
-
-    static void print_checksum(gguf_load_tensor & lt) {
-        uint32_t sum = 0;
-        for (size_t i = 0; i < lt.size; i++) {
-            uint8_t byte = lt.data[i];
-            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
-        }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
-            llama_format_tensor_shape(lt.ne).c_str(), lt.size);
-    }
-
 };

 //
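For reference, the helper deleted at the end of llama_model_loader computed an sdbm hash over a tensor's bytes, and it was only reachable through the dead `if (0)` branch, so removing it changes no behavior. A minimal standalone sketch of that same rolling hash, decoupled from the gguf_load_tensor type (the sample buffer below is just a placeholder), looks like this:

// Standalone sketch (not part of this patch): the sdbm rolling hash used by
// the removed print_checksum() helper, applied to a plain byte buffer.
#include <cstdint>
#include <cstdio>

static uint32_t sdbm_hash(const uint8_t * data, size_t size) {
    uint32_t sum = 0;
    for (size_t i = 0; i < size; i++) {
        sum = data[i] + (sum << 6) + (sum << 16) - sum; // sdbm recurrence
    }
    return sum;
}

int main() {
    const uint8_t buf[] = { 'g', 'g', 'u', 'f' };
    printf("checksum: %#08x\n", sdbm_hash(buf, sizeof(buf)));
    return 0;
}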
@@ -1187,15 +1172,15 @@ int64_t llama_time_us() {
 // model loading
 //

-static const char *gguf_file_version_name(gguf_file_version version) {
+static const char * gguf_file_version_name(gguf_file_version version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
-    }
+    }

     return "unknown";
 }

-static const char *llama_ftype_name(enum llama_ftype ftype) {
+static const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
@@ -1220,10 +1205,10 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
     }
 }

-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_3B: return "3B";
-        case MODEL_7B: return "7B";
+        case MODEL_3B: return "3B";
+        case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
@@ -2996,9 +2981,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }

-    const auto rejects =
-        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (auto & reject : rejects) {
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
         candidates->data[reject.index].logit = -INFINITY;
     }
@@ -3725,7 +3709,7 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        const llama_model_quantize_params *params) {
+        const llama_model_quantize_params * params) {
     try {
         llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
@@ -4343,8 +4327,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     GGML_UNUSED(n_token_capacity);
     GGML_UNUSED(n_token_count_out);

-
-    // TODO: implement with GGUF format
+    // TODO: implement with GGUF format
     return true;
 }
@@ -4389,7 +4372,6 @@ int llama_eval(
     return 0;
 }

-
 int llama_eval_embd(
         struct llama_context * ctx,
         const float * embd,
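For context, the metadata calls exercised by gguf_file_loader in the hunks above (gguf_init_from_file, gguf_find_key, gguf_get_arr_n, gguf_get_arr_str) can be driven from a small standalone program. The following is a hedged sketch, not part of the patch: it assumes ggml's GGUF API as declared in ggml.h at the time of this change, and the path "model.gguf" and the 10-token print limit are placeholders.

// Standalone sketch: open a GGUF file with no_alloc = true and print the first
// few entries of the tokenizer token list, mirroring gguf_file_loader::read_vocab().
#include <cstdio>
#include <string>

#include "ggml.h"

int main() {
    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
        /* .no_alloc = */ true,      // read metadata only, do not allocate tensor data
        /* .ctx      = */ &ctx_data,
    };

    struct gguf_context * gguf_ctx = gguf_init_from_file("model.gguf", params);
    if (!gguf_ctx) {
        fprintf(stderr, "failed to open GGUF file\n");
        return 1;
    }

    const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
    if (token_idx == -1) {
        fprintf(stderr, "cannot find token list in GGUF file\n");
        gguf_free(gguf_ctx);
        return 1;
    }

    const int n_tokens = gguf_get_arr_n(gguf_ctx, token_idx);
    for (int i = 0; i < n_tokens && i < 10; i++) {
        std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);
        printf("token %d: %s\n", i, word.c_str());
    }

    if (ctx_data) {
        ggml_free(ctx_data);
    }
    gguf_free(gguf_ctx);
    return 0;
}

The no_alloc = true flag mirrors the loader above: key/value metadata and tensor info are parsed without allocating the tensor data buffers, which is what lets read_hparams(), read_vocab(), and read_tensor_metadata() run before any weights are actually read.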