Skip to content

Commit 1544192

Browse files
ngxson authored and pull[bot] committed
Fix gemma2 tokenizer convert (#8244)
* fix gemma2 tokenizer convert * remove scores * improve code, fix new line issue
1 parent 2697bfa commit 1544192

File tree

1 file changed

+27
-10
lines changed

1 file changed

+27
-10
lines changed

convert-hf-to-gguf.py

Lines changed: 27 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -576,7 +576,19 @@ def _set_vocab_qwen(self):
576576
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
577577
special_vocab.add_to_gguf(self.gguf_writer)
578578

def _set_vocab_sentencepiece(self, add_to_gguf=True):
    """Build the SentencePiece vocabulary and write it into the GGUF file.

    The heavy lifting (reading ``tokenizer.model``, collecting tokens,
    scores and token types) lives in ``_create_vocab_sentencepiece`` so
    subclasses can post-process the vocab before emitting it.

    Args:
        add_to_gguf: when True (the default), the token list/scores/types
            and the special-vocab metadata are written to
            ``self.gguf_writer``.  NOTE(review): the original accepted
            this flag but never read it (dead parameter); it now gates
            the writer calls, which is backward-compatible since the
            default reproduces the old unconditional behavior.
    """
    tokens, scores, toktypes = self._create_vocab_sentencepiece()

    if add_to_gguf:
        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        # Special tokens (bos/eos/...) are resolved from the model dir and
        # bounded by the vocab size we just built.
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)
591+
def _create_vocab_sentencepiece(self):
580592
from sentencepiece import SentencePieceProcessor
581593

582594
tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -638,14 +650,7 @@ def _set_vocab_sentencepiece(self):
638650
scores.append(-1000.0)
639651
toktypes.append(SentencePieceTokenTypes.UNUSED)
640652

641-
self.gguf_writer.add_tokenizer_model("llama")
642-
self.gguf_writer.add_tokenizer_pre("default")
643-
self.gguf_writer.add_token_list(tokens)
644-
self.gguf_writer.add_token_scores(scores)
645-
self.gguf_writer.add_token_types(toktypes)
646-
647-
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
648-
special_vocab.add_to_gguf(self.gguf_writer)
653+
return tokens, scores, toktypes
649654

650655
def _set_vocab_llama_hf(self):
651656
vocab = gguf.LlamaHfVocab(self.dir_model)
@@ -2345,7 +2350,19 @@ class Gemma2Model(Model):
23452350
model_arch = gguf.MODEL_ARCH.GEMMA2
23462351

23472352
def set_vocab(self):
    """Write the Gemma-2 SentencePiece vocabulary to the GGUF file.

    Builds the raw vocab, promotes the first 108 entries to CONTROL
    tokens, then emits everything (tokens, scores, types, special-token
    metadata) through the GGUF writer.
    """
    tokens, scores, toktypes = self._create_vocab_sentencepiece()

    # hack: This is required so that we can properly use start/end-of-turn for chat template
    # including <unusedX>, <start_of_turn>, <end_of_turn>
    for idx in range(108):
        toktypes[idx] = SentencePieceTokenTypes.CONTROL

    writer = self.gguf_writer
    writer.add_tokenizer_model("llama")
    writer.add_tokenizer_pre("default")
    writer.add_token_list(tokens)
    writer.add_token_scores(scores)
    writer.add_token_types(toktypes)

    special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
    special_vocab.add_to_gguf(writer)
    # Gemma tokenizers do not prepend a space to the input.
    writer.add_add_space_prefix(False)
23502367

23512368
def set_gguf_parameters(self):

0 commit comments

Comments (0)