diff --git a/.gitignore b/.gitignore
index 5eb1ff1b873f1..25111eb8bbe12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ models/*
 
 arm_neon.h
 compile_commands.json
+*.dSYM/
diff --git a/Makefile b/Makefile
index 8388c290d75ce..35b627be735fa 100644
--- a/Makefile
+++ b/Makefile
@@ -30,9 +30,9 @@ endif
 # Compile flags
 #
 
-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
-LDFLAGS  =
+CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC -g -I/opt/homebrew/include
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -g -I/opt/homebrew/include
+LDFLAGS  = -L/opt/homebrew/lib -lsentencepiece
 
 # OS specific
 # TODO: support Windows
diff --git a/main.cpp b/main.cpp
index 98ccde5dbf63b..a83db0e81acda 100644
--- a/main.cpp
+++ b/main.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include
@@ -84,7 +85,7 @@ struct llama_model {
 };
 
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
+bool llama_model_load(const std::string & fname, llama_model & model, sentencepiece::SentencePieceProcessor & sp, gpt_vocab & vocab, int n_ctx) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
@@ -146,6 +147,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             return false;
         }
 
+        printf("total pieces: %d", sp.GetPieceSize());
+
         std::string word;
         for (int i = 0; i < n_vocab; i++) {
             uint32_t len;
@@ -154,8 +157,9 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             word.resize(len);
             fin.read((char *) word.data(), len);
 
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
+            std::string wordx = sp.IdToPiece(i);
+            vocab.token_to_id[wordx] = i;
+            vocab.id_to_token[i] = wordx;
 
             //if (i < 30000) {
             //    printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
@@ -767,11 +771,15 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
+    params.tokenizer = "models/tokenizer.model";
 
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
 
+    sentencepiece::SentencePieceProcessor sp;
+    sp.Load(params.tokenizer);
+
     if (params.seed < 0) {
         params.seed = time(NULL);
     }
@@ -795,7 +803,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();
 
-        if (!llama_model_load(params.model, model, vocab, 512)) { // TODO: set context from user input ??
+        if (!llama_model_load(params.model, model, sp, vocab, 512)) { // TODO: set context from user input ??
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
             return 1;
         }
@@ -811,12 +819,12 @@ int main(int argc, char ** argv) {
     std::vector<float> logits;
 
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(sp, vocab, params.prompt, true);
 
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
     // tokenize the reverse prompt
-    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
+    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(sp, vocab, params.antiprompt, false);
 
     printf("\n");
     printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -882,6 +890,8 @@ int main(int argc, char ** argv) {
         printf(ANSI_COLOR_YELLOW);
     }
 
+    // buffer multi-byte UTF-8 tokens such as <0xE6><0xAC><0xA2> that span multiple outputs until the sequence is complete
+    std::vector<int> buffids = {};
     while (remaining_tokens > 0) {
         // predict
         if (embd.size() > 0) {
@@ -943,9 +953,8 @@ int main(int argc, char ** argv) {
 
         // display text
         if (!input_noecho) {
-            for (auto id : embd) {
-                printf("%s", vocab.id_to_token[id].c_str());
-            }
+            untokenize(sp, buffids, embd);
+
             // reset color to default if we there is no pending user input
             if (params.use_color && embd_inp.size() <= input_consumed) {
                 printf(ANSI_COLOR_RESET);
@@ -986,7 +995,7 @@ int main(int argc, char ** argv) {
                     buf[n_read+1] = 0;
                 }
 
-                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
+                std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(sp, vocab, buf, false);
                 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
                 remaining_tokens -= line_inp.size();
diff --git a/utils.cpp b/utils.cpp
index b340bd61b39e1..4c2823d920b07 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -4,10 +4,7 @@
 #include
 #include
 #include
-#include
-#include
-#include
-#include
+#include
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include  // using malloc.h with MSC/MINGW
@@ -49,6 +46,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.n_batch = std::stoi(argv[++i]);
         } else if (arg == "-m" || arg == "--model") {
             params.model = argv[++i];
+        } else if (arg == "--tokenizer") {
+            params.tokenizer = argv[++i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--interactive-start") {
@@ -96,6 +95,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  --tokenizer FNAME\n");
+    fprintf(stderr, "                        tokenizer path (default: %s)\n", params.tokenizer.c_str());
     fprintf(stderr, "\n");
 }
 
@@ -272,42 +273,11 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
 
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
-
+std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos) {
     std::vector<gpt_vocab::id> res;
 
-    if (bos) {
-        res.push_back(1); // TODO: replace with vocab.bos
-    }
-
-    //find the longest token that matches the text
-    int pos = 0;
-    while (true) {
-        int l = 0;
-        int t = 0;
-        for (const auto & kv : vocab.id_to_token) {
-            if (kv.second.size() < l) continue;
-            if (kv.second.size() > text.size() - pos) continue;
-            if (text.substr(pos, kv.second.size()) == kv.second) {
-                l = kv.second.size();
-                t = kv.first;
-            }
-        }
-
-        if (l == 0) {
-            break;
-        }
-
-        res.push_back(t);
-        pos += l;
-    }
-
-    return res;
+    std::vector<std::string> pieces;
+    return sp.EncodeAsIds(text);
 }
 
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
@@ -542,3 +512,39 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
 
     return (n/k)*row_size;
 }
+
+void untokenize(sentencepiece::SentencePieceProcessor &sp, std::vector<int> &buffids, std::vector<gpt_vocab::id> &embd)
+{
+    for (auto id : embd)
+    {
+        std::string s = sp.IdToPiece(id); // vocab.id_to_token[id];
+
+        if (s.find("<0x") == 0 && s[s.length() - 1] == '>')
+        {
+            buffids.push_back(id);
+            std::string txt = sp.DecodeIds(buffids);
+            // printf("buffering %s, total buffer: %s\n", s.c_str(), txt.c_str());
+        }
+        else if (s.find("▁") == 0)
+        {
+            if (!buffids.empty())
+            {
+                std::string txt = sp.DecodeIds(buffids);
+                printf("%s", txt.c_str());
+                buffids.clear();
+            }
+            s = std::regex_replace(s, std::regex("▁"), " ");
+            printf("%s", s.c_str());
+        }
+        else
+        {
+            if (!buffids.empty())
+            {
+                std::string txt = sp.DecodeIds(buffids);
+                printf("%s", txt.c_str());
+                buffids.clear();
+            }
+            printf("%s", s.c_str());
+        }
+    }
+}
\ No newline at end of file
diff --git a/utils.h b/utils.h
index 4f98011cf257c..2511c29419be9 100644
--- a/utils.h
+++ b/utils.h
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 
 //
 // CLI argument parsing
@@ -27,6 +28,7 @@ struct gpt_params {
     int32_t n_batch = 8; // batch size for prompt processing
 
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
+    std::string tokenizer = "models/tokenizer.model"; // tokenizer path
     std::string prompt;
 
     bool use_color = false; // use color to distinguish generations and inputs
@@ -73,7 +75,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(sentencepiece::SentencePieceProcessor & sp, const gpt_vocab & vocab, const std::string & text, bool bos);
 
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
@@ -102,3 +104,6 @@ void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int
 
 size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
 size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
+
+void untokenize(sentencepiece::SentencePieceProcessor & sp, std::vector<int> & buffids, std::vector<gpt_vocab::id> & embd);
+
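
Note (not part of the patch): the snippet below is a minimal standalone sketch of how the SentencePiece calls this diff relies on (Load, GetPieceSize, EncodeAsIds, IdToPiece, DecodeIds) fit together. It assumes a tokenizer.model file at the default path added to gpt_params and linking with -lsentencepiece as in the Makefile change; the prompt text is illustrative only.

// Illustrative sketch only -- not part of the diff above.
#include <sentencepiece_processor.h>

#include <cstdio>
#include <string>
#include <vector>

int main() {
    sentencepiece::SentencePieceProcessor sp;

    // Load the standalone tokenizer model (path is an assumption, matching the
    // default added to gpt_params in this patch).
    if (!sp.Load("models/tokenizer.model").ok()) {
        fprintf(stderr, "failed to load tokenizer\n");
        return 1;
    }
    printf("total pieces: %d\n", sp.GetPieceSize());

    // Encode text to token ids, as the new llama_tokenize() does.
    std::vector<int> ids = sp.EncodeAsIds("Hello world");

    // IdToPiece() yields pieces such as "▁Hello" or byte tokens like <0xE6>;
    // untokenize() buffers the byte tokens until DecodeIds() can emit complete UTF-8.
    for (int id : ids) {
        printf("%d -> %s\n", id, sp.IdToPiece(id).c_str());
    }
    printf("%s\n", sp.DecodeIds(ids).c_str());

    return 0;
}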