
Commit 7494c78

llama : sync gguf-llama with llama (#2613)
* llama : sync gguf-llama with llama
* tests : fix build + warnings (test-tokenizer-1 still fails)
* tests : fix wstring_convert
* convert : fix layer names
* llama : sync gguf-llama.cpp
* convert : update HF converter to new tokenizer voodoo magics
1 parent afc4ca2 commit 7494c78

8 files changed: +589 -292 lines changed

convert-llama-h5-to-gguf.py

Lines changed: 6 additions & 14 deletions
@@ -95,7 +95,7 @@ def count_model_parts(dir_model: str) -> int:
 
 gguf_writer.add_architecture(llm_arch)
 gguf_writer.add_name(last_dir)
-gguf_writer.add_file_type( "All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
+gguf_writer.add_file_type("All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
 gguf_writer.add_source_hf_repo(hf_repo)
 gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
 gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
@@ -122,19 +122,11 @@ def count_model_parts(dir_model: str) -> int:
 
     for i in range(tokenizer.vocab_size()):
         text: bytes
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
-        elif tokenizer.is_control(i):
-            text = b""
-        if tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                raise Exception(f"Invalid token: {piece}")
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-        score: float = tokenizer.get_score(i)
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
 
         tokens.append(text)
         scores.append(score)
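
In isolation, the converter's new vocab-export path amounts to the loop sketched below: every SentencePiece piece is exported verbatim (UTF-8 encoded) together with its score, and the old per-token unknown/control/byte special-casing no longer happens in this loop. This is a minimal sketch, assuming the sentencepiece package and a standalone tokenizer.model file; the tokens/scores lists here merely stand in for the data the script later hands to its GGUF writer.

# Minimal sketch of the simplified vocab export (assumes sentencepiece is
# installed and a tokenizer.model file is available; names are illustrative).
from sentencepiece import SentencePieceProcessor

tokenizer = SentencePieceProcessor("tokenizer.model")  # assumed model path

tokens: list[bytes] = []   # stand-ins for the lists passed on to the GGUF writer
scores: list[float] = []

for i in range(tokenizer.vocab_size()):
    piece = tokenizer.id_to_piece(i)       # raw SentencePiece piece, kept as-is
    tokens.append(piece.encode("utf-8"))   # exported verbatim, no byte/control handling here
    scores.append(tokenizer.get_score(i))  # piece score from the tokenizer model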

examples/gguf/gguf-llama-simple.cpp

Lines changed: 126 additions & 126 deletions
@@ -1,126 +1,126 @@
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#include "common.h"
-#include "gguf-llama.h"
-#include "build-info.h"
-
-#include <cmath>
-#include <cstdio>
-#include <string>
-#include <vector>
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
-        return 1 ;
-    }
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        params.prompt = argv[2];
-    }
-
-    if (params.prompt.empty()) {
-        params.prompt = "Hello my name is";
-    }
-
-    // init LLM
-
-    llama_backend_init(params.numa);
-
-    llama_context_params ctx_params = llama_context_default_params();
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
-
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
-    // tokenize the prompt
-
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
-
-    const int max_context_size = llama_n_ctx(ctx);
-    const int max_tokens_list_size = max_context_size - 4;
-
-    if ((int)tokens_list.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
-        return 1;
-    }
-
-    fprintf(stderr, "\n\n");
-
-    for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
-    }
-
-    fflush(stderr);
-
-    // main loop
-
-    // The LLM keeps a contextual cache memory of previous token evaluation.
-    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
-    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
-    // example, we will just stop the loop once this cache is full or once an end of stream is detected.
-
-    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
-        // evaluate the transformer
-
-        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return 1;
-        }
-
-        tokens_list.clear();
-
-        // sample the next token
-
-        llama_token new_token_id = 0;
-
-        auto logits = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-        }
-
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
-
-        // is it an end of stream ?
-        if (new_token_id == llama_token_eos()) {
-            fprintf(stderr, " [end of text]\n");
-            break;
-        }
-
-        // print the new token :
-        printf("%s", llama_token_to_str(ctx, new_token_id));
-        fflush(stdout);
-
-        // push this new token for next evaluation
-        tokens_list.push_back(new_token_id);
-
-    }
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    return 0;
-}
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "common.h"
+#include "gguf-llama.h"
+#include "build-info.h"
+
+#include <cmath>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
+        return 1 ;
+    }
+
+    if (argc >= 2) {
+        params.model = argv[1];
+    }
+
+    if (argc >= 3) {
+        params.prompt = argv[2];
+    }
+
+    if (params.prompt.empty()) {
+        params.prompt = "Hello my name is";
+    }
+
+    // init LLM
+
+    llama_backend_init(params.numa);
+
+    llama_context_params ctx_params = llama_context_default_params();
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    // tokenize the prompt
+
+    std::vector<llama_token> tokens_list;
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+
+    const int max_context_size = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;
+
+    if ((int) tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
+        return 1;
+    }
+
+    fprintf(stderr, "\n\n");
+
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
+    }
+
+    fflush(stderr);
+
+    // main loop
+
+    // The LLM keeps a contextual cache memory of previous token evaluation.
+    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
+    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
+    // example, we will just stop the loop once this cache is full or once an end of stream is detected.
+
+    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
+        // evaluate the transformer
+
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        tokens_list.clear();
+
+        // sample the next token
+
+        llama_token new_token_id = 0;
+
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);
+
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+        }
+
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
+
+        // is it an end of stream ?
+        if (new_token_id == llama_token_eos()) {
+            fprintf(stderr, " [end of text]\n");
+            break;
+        }
+
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
+        fflush(stdout);
+
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+
+    }
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return 0;
+}
