 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_VERSION_GGJT_V3   3
+
 //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
 typedef struct {
     int dim; // transformer dimension
@@ -49,10 +52,10 @@ typedef struct {
     // float* freq_cis_real; // (seq_len, dim/2)
     // float* freq_cis_imag; // (seq_len, dim/2)
     // (optional) classifier weights for the logits, on the last layer
-    // float* wcls;
+    float* wcls;
 } TransformerWeights;
 
-void malloc_weights(TransformerWeights* w, Config* p) {
+void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
     // we calloc instead of malloc to keep valgrind happy
     w->token_embedding_table = new float[p->vocab_size * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n", __func__, p->vocab_size, p->dim, p->vocab_size * p->dim);
@@ -86,9 +89,16 @@ void malloc_weights(TransformerWeights* w, Config* p) {
 
     w->rms_final_weight = new float[p->dim]();
     printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n", __func__, p->dim);
+
+    if (shared_weights) {
+        w->wcls = NULL;
+    } else {
+        w->wcls = new float[p->vocab_size * p->dim]();
+        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n", __func__, p->vocab_size, p->dim, p->vocab_size * p->dim);
+    }
 }
 
-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
     if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
     if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@@ -100,6 +110,22 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
     if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
     if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
     if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+
+    // Skip freq_cis_real & freq_cis_imag
+    int head_size = p->dim / p->n_heads;
+    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
+
+    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+
+    // Check we didn't forget to read anything
+    auto curr = ftell(f);
+    fseek(f, 0, SEEK_END);
+    auto end = ftell(f);
+    if (curr != end) {
+        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
+        return 1;
+    }
+
     return 0;
 }
 
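A standalone sketch (not part of the patch) of the llama2.c checkpoint layout that checkpoint_init_weights() walks. The tensor order and the Config fields are assumed from Karpathy's llama2.c exporter rather than shown in this diff, and the numbers in main() are hypothetical. The rope tables freq_cis_real and freq_cis_imag are each seq_len * head_size/2 floats, which is why a single fseek over seq_len * head_size floats skips both of them.

#include <cstdio>
#include <cstddef>

typedef struct {
    int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len;
} Config;

// number of floats expected after the Config header; the end-of-file check above
// succeeds when the file size equals sizeof(Config) + expected_floats(...) * sizeof(float)
static size_t expected_floats(const Config & p, bool shared_weights) {
    const size_t head_size = p.dim / p.n_heads;
    size_t n = 0;
    n += (size_t) p.vocab_size * p.dim;                   // token_embedding_table
    n += (size_t) p.n_layers * p.dim;                     // rms_att_weight
    n += (size_t) p.n_layers * p.dim * p.dim * 4;         // wq, wk, wv, wo (assuming n_heads == n_kv_heads)
    n += (size_t) p.n_layers * p.dim;                     // rms_ffn_weight
    n += (size_t) p.n_layers * p.dim * p.hidden_dim * 3;  // w1, w2, w3
    n += (size_t) p.dim;                                  // rms_final_weight
    n += (size_t) p.seq_len * head_size;                  // freq_cis_real + freq_cis_imag (skipped above)
    if (!shared_weights) {
        n += (size_t) p.vocab_size * p.dim;               // wcls
    }
    return n;
}

int main() {
    Config p = { 288, 768, 6, 6, 6, 32000, 256 };         // hypothetical stories15M-like header values
    printf("expected file size: %zu bytes\n", sizeof(Config) + expected_floats(p, true) * sizeof(float));
    return 0;
}
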
@@ -115,6 +141,7 @@ void free_weights(TransformerWeights* w) {
     delete w->w2;
     delete w->w3;
     delete w->rms_final_weight;
+    if (w->wcls) delete w->wcls;
 }
 
 void print_sample_weights(TransformerWeights *w){
@@ -131,6 +158,7 @@ void print_sample_weights(TransformerWeights *w){
     printf("%f\n", w->w2[0]);
     printf("%f\n", w->w3[0]);
     printf("%f\n", w->rms_att_weight[0]);
+    if (w->wcls) printf("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -509,26 +537,28 @@ bool is_ggml_file(const char *filename) {
 }
 
 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
-    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
-    if (is_ggml_file(filename)) {
-
-        struct llama_context_params llama_params = llama_context_default_params();
-        llama_params.vocab_only = true;
-
-        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
-        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
-
-        const int n_vocab = llama_n_vocab(lctx);
-        vocab->id_to_token.resize(n_vocab);
-        for (int i=0; i<n_vocab; ++i) {
-            vocab->id_to_token[i].text  = llama_token_get_text(lctx, i);
-            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
-            vocab->id_to_token[i].type  = llama_token_get_type(lctx, i);
-            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
-        }
-        llama_free(lctx);
-        llama_free_model(lmodel);
-    } else { // assume llama2.c vocabulary
+#pragma message("TODO: implement reading vocabulary using gguf")
+    // // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+    // if (is_ggml_file(filename)) {
+    //
+    //     struct llama_context_params llama_params = llama_context_default_params();
+    //     llama_params.vocab_only = true;
+    //
+    //     struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+    //     struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+    //
+    //     const int n_vocab = llama_n_vocab(lctx);
+    //     vocab->id_to_token.resize(n_vocab);
+    //     for (int i=0; i<n_vocab; ++i) {
+    //         vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
+    //         vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+    //         vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
+    //         vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
+    //     }
+    //     llama_free(lctx);
+    //     llama_free_model(lmodel);
+    // } else
+    { // assume llama2.c vocabulary
         printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
         llama_file file(filename, "rb");
         const int n_vocab = config->vocab_size;
@@ -538,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
             std::string text = file.read_string(len);
+            // Special-case handling of <0xXX> single byte tokens.
+            char byte_val;
+            if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
+                char cstr[2] = { byte_val, 0 };
+                text = cstr;
+            }
             vocab->id_to_token[i].text = text;
             vocab->id_to_token[i].score = score;
             vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
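The sscanf special case above undoes sentencepiece-style byte tokens: a vocabulary entry written as the literal string "<0xXX>" stands for the single raw byte 0xXX. A small self-contained illustration of the same pattern (the helper name is made up for this example):

#include <cstdio>
#include <string>

// turn a "<0xXX>" vocab entry back into the single byte it denotes; other strings pass through unchanged
static std::string decode_byte_token(std::string text) {
    char byte_val;
    if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
        char cstr[2] = { byte_val, 0 };
        text = cstr;
    }
    return text;
}

int main() {
    printf("[%s]\n", decode_byte_token("<0x0A>").c_str()); // prints a real newline between the brackets
    printf("[%s]\n", decode_byte_token("hello").c_str());  // left unchanged
    return 0;
}
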
@@ -589,83 +625,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     }
 
 #pragma message("TODO: implement file saving using gguf")
-    (void) vocab;
-    (void) model;
-    (void) w;
-    // // write_magic
-    // file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    // file.write_u32(LLAMA_FILE_VERSION); // version
-    // // write_hparams
-    // file.write_u32(model->hparams.n_vocab);
-    // file.write_u32(model->hparams.n_embd);
-    // file.write_u32(model->hparams.n_mult);
-    // file.write_u32(model->hparams.n_head);
-    // file.write_u32(model->hparams.n_layer);
-    // file.write_u32(model->hparams.n_rot);
-    // file.write_u32(LLAMA_FTYPE_ALL_F32);
-    //
-    // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-    // uint32_t n_vocab = model->hparams.n_vocab;
-    // for (uint32_t i = 0; i < n_vocab; i++) {
-    //     const auto & token_data = vocab->id_to_token.at(i);
-    //     file.write_u32((uint32_t) token_data.tok.size());
-    //     file.write_raw(token_data.tok.data(), token_data.tok.size());
-    //     file.write_raw(&token_data.score, sizeof(token_data.score));
-    // }
-    //
-    // // stuff AK weights into GG weights one by one.
-    // // w->token_embedding_table -> model->tok_embeddings
-    // // float*                   -> struct ggml_tensor
-    // stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-    // stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-    //
-    // stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    // //print_row(model->norm, 0);
-    //
-    // // for rms-att-weight
-    // int row_length = model->hparams.n_embd;
-    // const auto & hparams = model->hparams;
-    // //int n_ff = model->hparams.n_embd;
-    // int n_ff = get_n_ff(&hparams);
-    //
-    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-    //     auto & layer = model->layers[i];
-    //     // 1d
-    //     stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-    //
-    //     // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-    //     stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-    //
-    //     stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-    //     stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
-    // }
-    // // write tensors
-    // write_tensor(&file, model->tok_embeddings);
-    // write_tensor(&file, model->norm);
-    // write_tensor(&file, model->output); // ?
-    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-    //     auto & layer = model->layers[i];
-    //
-    //     write_tensor(&file, layer.attention_norm);
-    //     write_tensor(&file, layer.wq);
-    //     write_tensor(&file, layer.wk);
-    //     write_tensor(&file, layer.wv);
-    //     write_tensor(&file, layer.wo);
-    //     write_tensor(&file, layer.ffn_norm);
-    //     write_tensor(&file, layer.w1);
-    //     write_tensor(&file, layer.w2);
-    //     write_tensor(&file, layer.w3);
-    // }
+    // write_magic
+    file.write_u32(LLAMA_FILE_MAGIC_GGJT);      // magic
+    file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
+    // write_hparams
+    file.write_u32(model->hparams.n_vocab);
+    file.write_u32(model->hparams.n_embd);
+    file.write_u32(model->hparams.n_mult);
+    file.write_u32(model->hparams.n_head);
+    file.write_u32(model->hparams.n_layer);
+    file.write_u32(model->hparams.n_rot);
+    file.write_u32(LLAMA_FTYPE_ALL_F32);
+
+    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+    uint32_t n_vocab = model->hparams.n_vocab;
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        const auto & token_data = vocab->id_to_token.at(i);
+        file.write_u32((uint32_t) token_data.text.size());
+        file.write_raw(token_data.text.data(), token_data.text.size());
+        file.write_raw(&token_data.score, sizeof(token_data.score));
+    }
+
+    // stuff AK weights into GG weights one by one.
+    // w->token_embedding_table -> model->tok_embeddings
+    // float*                   -> struct ggml_tensor
+    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+
+    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    //print_row(model->norm, 0);
+
+    // for rms-att-weight
+    int row_length = model->hparams.n_embd;
+    const auto & hparams = model->hparams;
+    //int n_ff = model->hparams.n_embd;
+    int n_ff = get_n_ff(&hparams);
+
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+        auto & layer = model->layers[i];
+        // 1d
+        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+
+        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+    }
+    // write tensors
+    write_tensor(&file, model->tok_embeddings);
+    write_tensor(&file, model->norm);
+    write_tensor(&file, model->output); // ?
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        write_tensor(&file, layer.attention_norm);
+        write_tensor(&file, layer.wq);
+        write_tensor(&file, layer.wk);
+        write_tensor(&file, layer.wv);
+        write_tensor(&file, layer.wo);
+        write_tensor(&file, layer.ffn_norm);
+        write_tensor(&file, layer.w1);
+        write_tensor(&file, layer.w2);
+        write_tensor(&file, layer.w3);
+    }
 }
 
 struct train_params get_default_train_params() {
     struct train_params params;
-    params.fn_vocab_model          = "models/ggml-vocab.bin";
+    params.fn_vocab_model          = "tokenizer.bin";
     params.fn_llama2c_output_model = "ak_llama_model.bin";
     params.fn_train_data           = "shakespeare.txt";
     params.fn_checkpoint_in        = "checkpoint.bin";
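The un-commented writer above emits the legacy ggjt v3 container rather than gguf (hence the remaining TODO): a 'ggjt' magic, a version word, seven u32 hyperparameters, then the vocabulary and the tensors, all through the file helper the function already holds. A rough sketch of just the header bytes, using plain stdio and made-up hyperparameter values:

#include <cstdio>
#include <cstdint>

int main() {
    const uint32_t magic   = 0x67676a74u; // 'ggjt', the same value as LLAMA_FILE_MAGIC_GGJT
    const uint32_t version = 3;
    // hypothetical hparams in the order written above:
    // n_vocab, n_embd, n_mult, n_head, n_layer, n_rot, ftype (0 = all tensors stored as F32)
    const uint32_t hparams[7] = { 32000, 288, 32, 6, 6, 48, 0 };

    FILE * f = fopen("header-only.ggjt", "wb");
    if (!f) {
        return 1;
    }
    fwrite(&magic,   sizeof(magic),    1, f);
    fwrite(&version, sizeof(version),  1, f);
    fwrite(hparams,  sizeof(uint32_t), 7, f);
    fclose(f);
    return 0;
}
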
@@ -718,7 +751,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help                       show this help message and exit\n");
-    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
     fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
     fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
     fprintf(stderr, "\n");
@@ -791,9 +824,12 @@ int main(int argc, char ** argv) {
         if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
         // read in the config header
         if (fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+        auto shared_weights = config.vocab_size > 0;
+        config.vocab_size = abs(config.vocab_size);
+
         // read in the Transformer weights
-        malloc_weights(&weights, &config);
-        if (checkpoint_init_weights(&weights, &config, file)) { return 1; }
+        malloc_weights(&weights, &config, shared_weights);
+        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
         fclose(file);
     }
 
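The sign trick in the two added lines appears to follow llama2.c's export convention: vocab_size is written as a positive count when the output classifier is tied to the token embeddings, and negated when a separate wcls tensor follows in the file, so the sign carries shared_weights and abs() recovers the real vocabulary size. In isolation (the header value below is hypothetical):

#include <cstdio>
#include <cstdlib>

int main() {
    int header_vocab_size = -32000;                 // hypothetical value read from a checkpoint header
    bool shared_weights   = header_vocab_size > 0;  // same test as in main() above
    int vocab_size        = abs(header_vocab_size);
    printf("vocab_size = %d, shared_weights = %s\n", vocab_size, shared_weights ? "true" : "false");
    return 0;
}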