
Commit 47ca05a

fix: tokenization of special characters (#850)

It should behave like llama.cpp, where most out-of-the-box usages treat special tokens such as </s> accordingly.
Parent: 8bf7fa6
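For context, here is a minimal sketch of the behavior this commit enables, mirroring the new test below. The model path is illustrative; any LLaMA-family model file with the standard vocabulary works:

from llama_cpp import Llama

# vocab_only=True loads only the vocabulary/tokenizer; no weights are
# evaluated, which is enough for a tokenization demo.
llama = Llama(model_path="./models/llama-model.gguf", vocab_only=True)

text = b"Hello World</s>"

# Default behavior: "</s>" is treated as literal text and split into
# ordinary text pieces.
tokens = llama.tokenize(text)
assert tokens[-1] != llama.token_eos()

# With special=True: "</s>" is recognized as the EOS special token,
# matching how llama.cpp treats special tokens out of the box.
tokens = llama.tokenize(text, special=True)
assert tokens[-1] == llama.token_eos()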

File tree: 4 files changed, +13 −4 lines

llama_cpp/llama.py

Lines changed: 3 additions & 3 deletions
@@ -856,7 +856,7 @@ def create_embedding(
         data: List[EmbeddingData] = []
         total_tokens = 0
         for index, input in enumerate(inputs):
-            tokens = self.tokenize(input.encode("utf-8"))
+            tokens = self.tokenize(input.encode("utf-8"), special=True)
             self.reset()
             self.eval(tokens)
             n_tokens = len(tokens)
@@ -928,7 +928,7 @@ def _create_completion(
         completion_tokens: List[int] = []
         # Add blank space to start of prompt to match OG llama tokenizer
         prompt_tokens: List[int] = (
-            self.tokenize(prompt.encode("utf-8"))
+            self.tokenize(prompt.encode("utf-8"), special=True)
             if prompt != ""
             else [self.token_bos()]
         )
@@ -1826,7 +1826,7 @@ def __init__(self, llama: Llama):

     def encode(self, text: str, add_bos: bool = True) -> List[int]:
         return self.llama.tokenize(
-            text.encode("utf-8", errors="ignore"), add_bos=add_bos
+            text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=True
         )

     def decode(self, tokens: List[int]) -> str:
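The completion path now applies the same flag, so a prompt that spells out a special token maps to the real special-token ID. A minimal sketch of the equivalent manual call, reusing the llama instance from the sketch above:

# What _create_completion now does with a non-empty prompt:
prompt = "Hello World</s>"
prompt_tokens = (
    llama.tokenize(prompt.encode("utf-8"), special=True)
    if prompt != ""
    else [llama.token_bos()]
)
assert prompt_tokens[-1] == llama.token_eos()  # "</s>" became the EOS id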

llama_cpp/server/app.py

Lines changed: 1 addition & 1 deletion
@@ -594,7 +594,7 @@ def make_logit_bias_processor(
     elif logit_bias_type == "tokens":
         for token, score in logit_bias.items():
             token = token.encode("utf-8")
-            for input_id in llama.tokenize(token, add_bos=False):
+            for input_id in llama.tokenize(token, add_bos=False, special=True):
                 to_bias[input_id] = score

     def logit_bias_processor(
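The server change makes string-keyed logit_bias entries resolve through special-token parsing as well. A minimal sketch of that mapping step; the logit_bias dict below is a hypothetical example:

# Hypothetical bias: suppress EOS by spelling it as "</s>".
logit_bias = {"</s>": -100.0}
to_bias: dict = {}
for token, score in logit_bias.items():
    for input_id in llama.tokenize(token.encode("utf-8"), add_bos=False, special=True):
        to_bias[input_id] = score

# With special=True, "</s>" maps to the single EOS id, so the bias lands
# on the intended token rather than on its literal text pieces.
assert list(to_bias) == [llama.token_eos()]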

test.py

Whitespace-only changes.

tests/test_llama.py

Lines changed: 9 additions & 0 deletions
@@ -25,6 +25,15 @@ def test_llama_cpp_tokenization():
     detokenized = llama.detokenize(tokens)
     assert detokenized != text

+    text = b"Hello World</s>"
+    tokens = llama.tokenize(text)
+    assert tokens[-1] != llama.token_eos()
+    assert tokens == [1, 15043, 2787, 829, 29879, 29958]
+
+    tokens = llama.tokenize(text, special=True)
+    assert tokens[-1] == llama.token_eos()
+    assert tokens == [1, 10994, 2787, 2]
+

 def test_llama_patch(monkeypatch):
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
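The expected IDs assume the standard LLaMA vocabulary, where token_bos() is 1 and token_eos() is 2: without special=True the trailing </s> is split into the literal pieces 829, 29879 and 29958, while with special=True it collapses to the single EOS id 2.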
