@@ -695,172 +695,14 @@ struct gguf_file_loader {
 
             tensor.name = name;
             tensor.size = ggml_nbytes(cur);
+            tensor.ggml_tensor = cur;
 
             tensors_map.tensors.push_back(tensor);
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
         }
     }
 };
 
-struct gguf_file_saver {
-    // TODO
-    // this implementation now assumes that the data section is of the same length as the unquantized model.
-    // this is needed to write tensor metadata and weights in a single pass by seeking to appropriate positions in the file.
-    // this may not be true when we add quantization version and change ftype description (currently it's string according to the specs,
-    // but better to have it as uint32).
-    // we need to calculate the delta in number of bytes written with a counter as a struct member.
-
-    gguf_context * ctx; // loaded gguf context (used to re-write the KV section (good enough for now))
-
-    gguf_file file;
-    size_t info_offset;
-    size_t tensor_offset;
-
-    gguf_file_saver(const char * fname, gguf_context * ctx) : ctx(ctx), file(fname, "wb") {
-        LLAMA_LOG_INFO("%s: saving model to %s\n", __func__, fname);
-
-        write_header();
-        write_kv();
-    }
-
-    void write_header() {
-        file.write_i32(GGUF_MAGIC);
-        file.write_i32(GGUF_VERSION);
-        file.write_i32(gguf_get_n_tensors(ctx));
-        file.write_i32(gguf_get_n_kv(ctx));
-    }
-
-    void write_kv_arr_i32(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<int32_t> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            int32_t val = gguf_get_arr_i32(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr<int32_t>(key, type, data);
-    }
-
-    void write_kv_arr_f32(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<float> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            float val = gguf_get_arr_f32(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr<float>(key, type, data);
-    }
-
-    void write_kv_arr_str(const std::string & key, enum gguf_type type, int i, int n_arr) {
-        std::vector<std::string> data(n_arr);
-
-        for (int j = 0; j < n_arr; ++j) {
-            std::string val = gguf_get_arr_str(ctx, i, j);
-            data[j] = val;
-        }
-
-        file.write_arr(key, type, data);
-    }
-
-    // re-write the key-value section from the loaded file
-    void write_kv() {
-        const int32_t n_kv = gguf_get_n_kv(ctx);
-        for (int i = 0; i < n_kv; ++i) {
-            const char * key = gguf_get_key(ctx, i);
-            LLAMA_LOG_INFO("%s: writing key '%s'\n", __func__, key);
-
-            if (strcmp(key, "general.quantization_version") == 0) {
-                file.write_val<uint32_t>("general.quantization_version", GGUF_TYPE_UINT32, GGML_QNT_VERSION);
-            } else {
-                const gguf_type vtype = gguf_get_kv_type(ctx, i);
-
-                switch (vtype) {
-                    case GGUF_TYPE_BOOL:    file.write_val<bool>    (key, GGUF_TYPE_BOOL,    gguf_get_val_bool(ctx, i)); break;
-                    case GGUF_TYPE_FLOAT32: file.write_val<float>   (key, GGUF_TYPE_FLOAT32, gguf_get_val_f32 (ctx, i)); break;
-                    case GGUF_TYPE_INT16:   file.write_val<int16_t> (key, GGUF_TYPE_INT16,   gguf_get_val_i16 (ctx, i)); break;
-                    case GGUF_TYPE_INT32:   file.write_val<int32_t> (key, GGUF_TYPE_INT32,   gguf_get_val_i32 (ctx, i)); break;
-                    case GGUF_TYPE_INT8:    file.write_val<int8_t>  (key, GGUF_TYPE_INT8,    gguf_get_val_i8  (ctx, i)); break;
-                    case GGUF_TYPE_STRING:  file.write_str          (key, GGUF_TYPE_STRING,  gguf_get_val_str (ctx, i)); break;
-                    case GGUF_TYPE_UINT16:  file.write_val<uint16_t>(key, GGUF_TYPE_UINT16,  gguf_get_val_u16 (ctx, i)); break;
-                    case GGUF_TYPE_UINT32:  file.write_val<uint32_t>(key, GGUF_TYPE_UINT32,  gguf_get_val_u32 (ctx, i)); break;
-                    case GGUF_TYPE_UINT8:   file.write_val<uint8_t> (key, GGUF_TYPE_UINT8,   gguf_get_val_u8  (ctx, i)); break;
-                    case GGUF_TYPE_ARRAY:
-                        {
-                            const gguf_type arr_type = gguf_get_arr_type(ctx, i);
-                            const int n_arr = gguf_get_arr_n(ctx, i);
-
-                            switch (arr_type) {
-                                case GGUF_TYPE_FLOAT32: write_kv_arr_f32(key, arr_type, i, n_arr); break;
-                                case GGUF_TYPE_INT32:   write_kv_arr_i32(key, arr_type, i, n_arr); break;
-                                case GGUF_TYPE_STRING:  write_kv_arr_str(key, arr_type, i, n_arr); break;
-                                default:
-                                    throw std::runtime_error(format("cannot recognize array type for key %s\n", key));
-                            }
-                        } break;
-                    default:
-                        throw std::runtime_error(format("cannot recognize value type for key %s\n", key));
-                }
-            }
-        }
-
-        info_offset = file.tell();
-
-        GGML_ASSERT(gguf_get_data_offset(ctx) >= info_offset);
-
-        const size_t count = gguf_get_data_offset(ctx) - info_offset;
-
-        file.write_zeros(count);
-        file.seek(info_offset, SEEK_SET);
-    }
-
-    size_t write_tensor_info(gguf_load_tensor & tensor, enum ggml_type type) {
-        size_t total_written = 0;
-        file.seek(info_offset, SEEK_SET);
-        total_written += file.write_str(tensor.name);
-
-        int32_t n_dims = tensor.ne.size();
-        total_written += file.write_i32(n_dims);
-        for (int32_t i = 0; i < n_dims; ++i) {
-            total_written += file.write_i32(tensor.ne[i]);
-        }
-
-        total_written += file.write_i32(type);
-        total_written += file.write_u64(tensor_offset);
-        info_offset += total_written; // position to write info of the next tensor
-
-        file.seek(0, SEEK_END);
-
-        return total_written;
-    }
-
-    void write_tensor(gguf_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
-        switch (new_type) {
-            case GGML_TYPE_F32:
-            case GGML_TYPE_F16:
-            case GGML_TYPE_Q4_0:
-            case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q5_0:
-            case GGML_TYPE_Q5_1:
-            case GGML_TYPE_Q8_0:
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q3_K:
-            case GGML_TYPE_Q4_K:
-            case GGML_TYPE_Q5_K:
-            case GGML_TYPE_Q6_K:
-                break;
-            default: GGML_ASSERT(false);
-        }
-
-        write_tensor_info(tensor, new_type);
-        file.write_raw(new_data, new_size);
-        size_t padded_size = GGML_PAD(new_size, GGUF_DEFAULT_ALIGNMENT); // TODO: handle custom alignment
-        size_t pad = padded_size - new_size;
-        file.write_zeros(pad);
-        tensor_offset += padded_size; // offset of the next tensor
-    }
-};
-
 struct llama_model_loader {
     std::unique_ptr<gguf_file_loader> file_loader;
     gguf_load_tensors_map tensors_map;
@@ -897,7 +739,6 @@ struct llama_model_loader {
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
         ggml_set_name(tensor, lt.name.c_str());
-        GGML_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
 
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ggml_ctx, use_mmap);
@@ -3245,7 +3086,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
-    gguf_file_saver file_saver(fname_out.c_str(), model_loader->file_loader->gguf_ctx);
+
+    struct gguf_context * ctx_out = gguf_init_empty();
+
+    // copy the KV pairs from the input file
+    gguf_set_kv     (ctx_out, model_loader->file_loader->gguf_ctx);
+    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv = 0;
@@ -3279,6 +3125,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
 
+    std::vector<std::vector<uint8_t>> work_map(model_loader->tensors_map.tensors.size());
+
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
         read_data.resize(tensor.size);
         tensor.data = read_data.data();
@@ -3437,12 +3285,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
         total_size_org += tensor.size;
         total_size_new += new_size;
-        file_saver.write_tensor(tensor, new_type, new_data, new_size);
+
+        // TODO: temp fix until we have stream support in gguf
+        work_map[idx - 1] = std::vector<uint8_t>((char *) new_data, (char *) new_data + new_size);
+
+        gguf_add_tensor_ex(ctx_out, tensor.ggml_tensor, new_type, work_map[idx - 1].data(), new_size);
     }
 
+    gguf_write_to_file(ctx_out, fname_out.c_str());
+    gguf_free(ctx_out);
+
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
 
+    // print histogram for all tensors
     {
        int64_t sum_all = 0;
        for (size_t i = 0; i < hist_all.size(); i++) {
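Note on the work_map change in the last hunk: since gguf_write_to_file emits the entire output file in one pass rather than streaming tensors as they are quantized, the quantizer must now keep a copy of every quantized tensor in memory until the final write. Below is a minimal, self-contained sketch of that buffering pattern in plain C++; the names and sizes are illustrative only and are not the llama.cpp/gguf API.

// buffering sketch: copy each per-iteration result into a long-lived vector,
// then "write" everything at the end in a single pass
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // one buffer per tensor, sized up front like work_map in the patch
    std::vector<std::vector<uint8_t>> work_map(3);

    for (size_t idx = 0; idx < work_map.size(); ++idx) {
        // stand-in for the shared scratch buffer that holds one quantized tensor
        std::vector<uint8_t> new_data(16, static_cast<uint8_t>(idx));

        // copy the bytes so they outlive this iteration;
        // the scratch buffer is reused for the next tensor
        work_map[idx] = std::vector<uint8_t>(new_data.begin(), new_data.end());
    }

    // single-pass write at the end, analogous to gguf_write_to_file(ctx_out, ...)
    size_t total = 0;
    for (const auto & buf : work_map) {
        total += buf.size();
    }
    std::printf("buffered %zu bytes across %zu tensors\n", total, work_map.size());
    return 0;
}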