
Commit 9538524

examples : restore the functionality to import llama2.c models (#2685)
* Fix import of llama2.c models that don't share weights between embedding layers
* llama2c: reinstate ggmlv3 conversion output + update readme w/ gguf conv
* llama2.c: comment out legacy "load from ggml model" logic
* llama2.c: convert special-cased "<0xXX>" single byte tokens from tokenizer.bin
1 parent 335acd2 commit 9538524

File tree

2 files changed: +144 additions, -104 deletions

examples/convert-llama2c-to-ggml/README.md

Lines changed: 9 additions & 5 deletions
@@ -12,15 +12,19 @@ usage: ./convert-llama2c-to-ggml [options]
 
 options:
   -h, --help                       show this help message and exit
-  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'models/ggml-vocab.bin')
+  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'tokenizer.bin')
   --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
   --llama2c-output-model FNAME     model path to save the converted llama2.c model (default ak_llama_model.bin')
 ```
 
-An example command is as follows:
+An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
 
-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
 
-Now you can use the model with command like:
+For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
 
-`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
+`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
+
+Now you can use the model with a command like:
+
+`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

Lines changed: 135 additions & 99 deletions
@@ -17,6 +17,9 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_VERSION_GGJT_V3   3
+
 //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
 typedef struct {
     int dim; // transformer dimension
@@ -49,10 +52,10 @@ typedef struct {
     // float* freq_cis_real; // (seq_len, dim/2)
     // float* freq_cis_imag; // (seq_len, dim/2)
     // (optional) classifier weights for the logits, on the last layer
-    //float* wcls;
+    float* wcls;
 } TransformerWeights;
 
-void malloc_weights(TransformerWeights* w, Config* p) {
+void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
     // we calloc instead of malloc to keep valgrind happy
     w->token_embedding_table = new float[p->vocab_size * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@@ -86,9 +89,16 @@ void malloc_weights(TransformerWeights* w, Config* p) {
 
     w->rms_final_weight = new float[p->dim]();
     printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+
+    if (shared_weights) {
+        w->wcls = NULL;
+    } else {
+        w->wcls = new float[p->vocab_size * p->dim]();
+        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+    }
 }
 
-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
     if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
     if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@@ -100,6 +110,22 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
     if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
     if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
     if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+
+    // Skip freq_cis_real & freq_cis_imag
+    int head_size = p->dim / p->n_heads;
+    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
+
+    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+
+    // Check we didn't forget to read anything
+    auto curr = ftell(f);
+    fseek(f, 0, SEEK_END);
+    auto end = ftell(f);
+    if (curr != end) {
+        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
+        return 1;
+    }
+
     return 0;
 }
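The end-of-file check above works because a llama2.c checkpoint is a flat dump of float32 arrays in a fixed order, so its size is fully determined by the header. A minimal sketch of that arithmetic (an illustration, not code from this commit; `Config` mirrors the llama2.c header, and the freq_cis term matches the `fseek` skip above):

```cpp
#include <cstddef>

// Config mirrors the llama2.c checkpoint header (7 int32 fields).
struct Config { int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len; };

// Expected total file size in bytes, following the same tensor order that
// checkpoint_init_weights() reads above.
size_t expected_checkpoint_bytes(const Config & p, bool shared_weights) {
    size_t floats = 0;
    floats += (size_t) p.vocab_size * p.dim;                  // token_embedding_table
    floats += (size_t) p.n_layers * p.dim;                    // rms_att_weight
    floats += (size_t) p.n_layers * p.dim * p.dim * 4;        // wq, wk, wv, wo
    floats += (size_t) p.n_layers * p.dim;                    // rms_ffn_weight
    floats += (size_t) p.n_layers * p.dim * p.hidden_dim * 3; // w1, w2, w3
    floats += (size_t) p.dim;                                 // rms_final_weight
    floats += (size_t) p.seq_len * (p.dim / p.n_heads);       // freq_cis_real + freq_cis_imag (skipped)
    if (!shared_weights) {
        floats += (size_t) p.vocab_size * p.dim;              // wcls
    }
    return sizeof(Config) + floats * sizeof(float);
}
```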

@@ -115,6 +141,7 @@ void free_weights(TransformerWeights* w) {
     delete w->w2;
     delete w->w3;
     delete w->rms_final_weight;
+    if (w->wcls) delete w->wcls;
 }
 
 void print_sample_weights(TransformerWeights *w){
@@ -131,6 +158,7 @@ void print_sample_weights(TransformerWeights *w){
     printf("%f\n", w->w2[0]);
     printf("%f\n", w->w3[0]);
     printf("%f\n", w->rms_att_weight[0]);
+    if (w->wcls) printf("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -509,26 +537,28 @@ bool is_ggml_file(const char *filename) {
 }
 
 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
-    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
-    if (is_ggml_file(filename)) {
-
-        struct llama_context_params llama_params = llama_context_default_params();
-        llama_params.vocab_only = true;
-
-        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
-        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
-
-        const int n_vocab = llama_n_vocab(lctx);
-        vocab->id_to_token.resize(n_vocab);
-        for (int i=0; i<n_vocab; ++i) {
-            vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
-            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
-            vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
-            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
-        }
-        llama_free(lctx);
-        llama_free_model(lmodel);
-    } else { // assume llama2.c vocabulary
+#pragma message("TODO: implement reading vocabulary using gguf")
+    // // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+    // if (is_ggml_file(filename)) {
+    //
+    //     struct llama_context_params llama_params = llama_context_default_params();
+    //     llama_params.vocab_only = true;
+    //
+    //     struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+    //     struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+    //
+    //     const int n_vocab = llama_n_vocab(lctx);
+    //     vocab->id_to_token.resize(n_vocab);
+    //     for (int i=0; i<n_vocab; ++i) {
+    //         vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
+    //         vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+    //         vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
+    //         vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
+    //     }
+    //     llama_free(lctx);
+    //     llama_free_model(lmodel);
+    // } else
+    { // assume llama2.c vocabulary
         printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
         llama_file file(filename, "rb");
         const int n_vocab = config->vocab_size;
@@ -538,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
         float_t score = file.read_f32();
         uint32_t len = file.read_u32();
         std::string text = file.read_string(len);
+        // Special-case handling of <0xXX> single byte tokens.
+        char byte_val;
+        if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
+            char cstr[2] = { byte_val, 0 };
+            text = cstr;
+        }
         vocab->id_to_token[i].text = text;
         vocab->id_to_token[i].score = score;
         vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
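The `sscanf` special case above handles sentencepiece-style byte tokens: tokenizer.bin stores them as literal six-character strings such as `<0x0A>`, which must be folded back into the single raw byte they denote. A standalone illustration of the same conversion (not part of the commit):

```cpp
#include <cstdio>
#include <string>

int main() {
    std::string text = "<0x0A>"; // byte token as stored in tokenizer.bin
    char byte_val;
    // %02hhX parses exactly two hex digits into a single byte.
    if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
        char cstr[2] = { byte_val, 0 };
        text = cstr; // text is now "\n" (byte 0x0A)
    }
    printf("decoded byte: %d\n", (int) (unsigned char) text[0]); // prints 10
    return 0;
}
```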
@@ -589,83 +625,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
 }
 
 #pragma message("TODO: implement file saving using gguf")
-    (void) vocab;
-    (void) model;
-    (void) w;
-    // // write_magic
-    // file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    // file.write_u32(LLAMA_FILE_VERSION); // version
-    // // write_hparams
-    // file.write_u32(model->hparams.n_vocab);
-    // file.write_u32(model->hparams.n_embd);
-    // file.write_u32(model->hparams.n_mult);
-    // file.write_u32(model->hparams.n_head);
-    // file.write_u32(model->hparams.n_layer);
-    // file.write_u32(model->hparams.n_rot);
-    // file.write_u32(LLAMA_FTYPE_ALL_F32);
-    //
-    // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-    // uint32_t n_vocab = model->hparams.n_vocab;
-    // for (uint32_t i = 0; i < n_vocab; i++) {
-    //     const auto & token_data = vocab->id_to_token.at(i);
-    //     file.write_u32((uint32_t) token_data.tok.size());
-    //     file.write_raw(token_data.tok.data(), token_data.tok.size());
-    //     file.write_raw(&token_data.score, sizeof(token_data.score));
-    // }
-    //
-    // // stuff AK weights into GG weights one by one.
-    // // w->token_embedding_table -> model->tok_embeddings
-    // // float*                   -> struct ggml_tensor
-    // stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-    // stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-    //
-    // stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    // //print_row(model->norm, 0);
-    //
-    // // for rms-att-weight
-    // int row_length = model->hparams.n_embd;
-    // const auto & hparams = model->hparams;
-    // //int n_ff = model->hparams.n_embd;
-    // int n_ff = get_n_ff(&hparams);
-    //
-    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-    //     auto & layer = model->layers[i];
-    //     // 1d
-    //     stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-    //
-    //     // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-    //     stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-    //
-    //     stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-    //     stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
-    // }
-    // // write tensors
-    // write_tensor(&file, model->tok_embeddings);
-    // write_tensor(&file, model->norm);
-    // write_tensor(&file, model->output); // ?
-    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-    //     auto & layer = model->layers[i];
-    //
-    //     write_tensor(&file, layer.attention_norm);
-    //     write_tensor(&file, layer.wq);
-    //     write_tensor(&file, layer.wk);
-    //     write_tensor(&file, layer.wv);
-    //     write_tensor(&file, layer.wo);
-    //     write_tensor(&file, layer.ffn_norm);
-    //     write_tensor(&file, layer.w1);
-    //     write_tensor(&file, layer.w2);
-    //     write_tensor(&file, layer.w3);
-    // }
+    // write_magic
+    file.write_u32(LLAMA_FILE_MAGIC_GGJT);      // magic
+    file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
+    // write_hparams
+    file.write_u32(model->hparams.n_vocab);
+    file.write_u32(model->hparams.n_embd);
+    file.write_u32(model->hparams.n_mult);
+    file.write_u32(model->hparams.n_head);
+    file.write_u32(model->hparams.n_layer);
+    file.write_u32(model->hparams.n_rot);
+    file.write_u32(LLAMA_FTYPE_ALL_F32);
+
+    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+    uint32_t n_vocab = model->hparams.n_vocab;
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        const auto & token_data = vocab->id_to_token.at(i);
+        file.write_u32((uint32_t) token_data.text.size());
+        file.write_raw(token_data.text.data(), token_data.text.size());
+        file.write_raw(&token_data.score, sizeof(token_data.score));
+    }
+
+    // stuff AK weights into GG weights one by one.
+    // w->token_embedding_table -> model->tok_embeddings
+    // float*                   -> struct ggml_tensor
+    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+
+    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    //print_row(model->norm, 0);
+
+    // for rms-att-weight
+    int row_length = model->hparams.n_embd;
+    const auto & hparams = model->hparams;
+    //int n_ff = model->hparams.n_embd;
+    int n_ff = get_n_ff(&hparams);
+
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+        auto & layer = model->layers[i];
+        // 1d
+        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+
+        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+    }
+    // write tensors
+    write_tensor(&file, model->tok_embeddings);
+    write_tensor(&file, model->norm);
+    write_tensor(&file, model->output); // ?
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        write_tensor(&file, layer.attention_norm);
+        write_tensor(&file, layer.wq);
+        write_tensor(&file, layer.wk);
+        write_tensor(&file, layer.wv);
+        write_tensor(&file, layer.wo);
+        write_tensor(&file, layer.ffn_norm);
+        write_tensor(&file, layer.w1);
+        write_tensor(&file, layer.w2);
+        write_tensor(&file, layer.w3);
+    }
 }
 
 struct train_params get_default_train_params() {
     struct train_params params;
-    params.fn_vocab_model = "models/ggml-vocab.bin";
+    params.fn_vocab_model = "tokenizer.bin";
     params.fn_llama2c_output_model = "ak_llama_model.bin";
     params.fn_train_data = "shakespeare.txt";
     params.fn_checkpoint_in = "checkpoint.bin";
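For orientation, the GGJTv3 stream that `save_as_llama_model()` now emits is a simple sequential container: magic, version, seven u32 hyperparameters, a length-prefixed vocab table, then the raw tensors. A minimal header-reader sketch (reconstructed from the writes above, not code from this commit):

```cpp
#include <cstdint>
#include <cstdio>

// Checks the magic/version and prints the hyperparameters of a GGJTv3 file,
// matching the write order in save_as_llama_model() above.
int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s model.ggmlv3.bin\n", argv[0]); return 1; }
    FILE * f = fopen(argv[1], "rb");
    if (!f) { perror("fopen"); return 1; }
    uint32_t header[9]; // magic, version, n_vocab, n_embd, n_mult, n_head, n_layer, n_rot, ftype
    if (fread(header, sizeof(uint32_t), 9, f) != 9) { fprintf(stderr, "short read\n"); return 1; }
    if (header[0] != 0x67676a74u || header[1] != 3) { fprintf(stderr, "not a GGJTv3 file\n"); return 1; }
    printf("n_vocab=%u n_embd=%u n_mult=%u n_head=%u n_layer=%u n_rot=%u ftype=%u\n",
           header[2], header[3], header[4], header[5], header[6], header[7], header[8]);
    fclose(f);
    return 0;
}
```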
@@ -718,7 +751,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help                       show this help message and exit\n");
-    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
     fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
     fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
     fprintf(stderr, "\n");
@@ -791,9 +824,12 @@ int main(int argc, char ** argv) {
     if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
     // read in the config header
     if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+    auto shared_weights = config.vocab_size > 0;
+    config.vocab_size = abs(config.vocab_size);
+
     // read in the Transformer weights
-    malloc_weights(&weights, &config);
-    if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
+    malloc_weights(&weights, &config, shared_weights);
+    if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
     fclose(file);
 }
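The `shared_weights` detection above relies on a llama2.c export convention: `vocab_size` is stored negated when the checkpoint carries its own output classifier (`wcls`), and positive when the output projection reuses the token embedding matrix. A small sketch of just that header logic (an illustration under that assumption, using the same `Config` struct):

```cpp
#include <cstdio>
#include <cstdlib>

// Same Config layout as the llama2.c checkpoint header (7 int32 fields).
struct Config { int dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len; };

// Reads the header and derives the shared-weights flag from the sign of
// vocab_size, mirroring the logic in main() above.
bool read_llama2c_header(FILE * f, Config * config, bool * shared_weights) {
    if (fread(config, sizeof(Config), 1, f) != 1) return false;
    *shared_weights = config->vocab_size > 0;     // positive: output reuses tok_embeddings
    config->vocab_size = abs(config->vocab_size); // true vocab size either way
    return true;
}
```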
