@@ -576,7 +576,19 @@ def _set_vocab_qwen(self):
         special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_sentencepiece(self):
+    def _set_vocab_sentencepiece(self, add_to_gguf=True):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _create_vocab_sentencepiece(self):
         from sentencepiece import SentencePieceProcessor

         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -638,14 +650,7 @@ def _set_vocab_sentencepiece(self):
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)

-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens, scores, toktypes

     def _set_vocab_llama_hf(self):
         vocab = gguf.LlamaHfVocab(self.dir_model)
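Taken together, the two hunks above split vocabulary construction out of serialization: _create_vocab_sentencepiece() builds and returns the raw tokens/scores/toktypes lists, while _set_vocab_sentencepiece() shrinks to the write-out path, so a model subclass can patch the lists between the two steps. A minimal, self-contained sketch of that builder/writer split follows (scores elided for brevity); VocabWriter, BaseModel, and PatchedModel are hypothetical stand-ins for gguf.GGUFWriter and the converter's Model classes, not code from this PR.

from enum import IntEnum

class TokTypes(IntEnum):
    # mirrors the SentencePieceTokenTypes values used by the converter
    NORMAL = 1
    CONTROL = 3

class VocabWriter:
    # hypothetical stand-in for gguf.GGUFWriter: just records key/value pairs
    def __init__(self):
        self.fields = {}

    def add_token_list(self, tokens):
        self.fields["tokenizer.ggml.tokens"] = tokens

    def add_token_types(self, toktypes):
        self.fields["tokenizer.ggml.token_type"] = toktypes

class BaseModel:
    def _create_vocab(self):
        # builder step: return raw lists, write nothing
        # (cf. _create_vocab_sentencepiece)
        tokens = ["<pad>", "<bos>", "hello"]
        toktypes = [TokTypes.CONTROL, TokTypes.CONTROL, TokTypes.NORMAL]
        return tokens, toktypes

    def set_vocab(self, writer):
        # default write-out path (cf. _set_vocab_sentencepiece)
        tokens, toktypes = self._create_vocab()
        writer.add_token_list(tokens)
        writer.add_token_types(toktypes)

class PatchedModel(BaseModel):
    def set_vocab(self, writer):
        # subclass hook: patch token types between build and write,
        # the same shape Gemma2Model.set_vocab() takes in the next hunk
        tokens, toktypes = self._create_vocab()
        toktypes[2] = TokTypes.CONTROL  # reclassify a piece as a control token
        writer.add_token_list(tokens)
        writer.add_token_types(toktypes)

writer = VocabWriter()
PatchedModel().set_vocab(writer)
assert writer.fields["tokenizer.ggml.token_type"] == [TokTypes.CONTROL] * 3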
@@ -2345,7 +2350,19 @@ class Gemma2Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA2

     def set_vocab(self):
-        self._set_vocab_llama_hf()
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+        # hack: This is required so that we can properly use start/end-of-turn for chat template
+        for i in range(108):
+            # including <unusedX>, <start_of_turn>, <end_of_turn>
+            toktypes[i] = SentencePieceTokenTypes.CONTROL
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
         self.gguf_writer.add_add_space_prefix(False)

     def set_gguf_parameters(self):
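A note on the range(108) loop in the hunk above: in Gemma's SentencePiece vocabulary the control pieces sit at the head of the id space, with <start_of_turn> and <end_of_turn> at ids 106 and 107, so retyping ids 0-107 as CONTROL lets the chat template's turn markers tokenize as single special tokens instead of being split into plain-text pieces. The bound can be checked directly against the source tokenizer with the same SentencePiece calls the converter already uses; a hedged sketch, assuming tokenizer.model from a Gemma-2 checkpoint sits in the current directory:

from sentencepiece import SentencePieceProcessor

# Inspect a few of the pieces the range(108) loop retypes; LoadFromFile and
# IdToPiece are the same calls _create_vocab_sentencepiece() relies on.
sp = SentencePieceProcessor()
sp.LoadFromFile("tokenizer.model")  # placeholder path to the Gemma-2 tokenizer
for i in (0, 1, 2, 106, 107):
    print(i, sp.IdToPiece(i))  # expect <pad>, <eos>, <bos>, <start_of_turn>, <end_of_turn>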