Commit d12a63c

convert : fix incorrect added token dedup in BpeVocab
1 parent b2b63d1 commit d12a63c

File tree

1 file changed: +1 −1 lines changed


convert.py

Lines changed: 1 addition & 1 deletion
@@ -387,7 +387,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None):
             (item['content'], item['id'])
             for item in tokenizer_json.get('added_tokens', [])
             # Added tokens here can be duplicates of the main vocabulary.
-            if item['content'] not in bpe_tokenizer)
+            if item['content'] not in self.vocab)

         vocab_size = len(self.vocab)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
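The fixed condition filters `added_tokens` entries against the main vocabulary (`self.vocab`) rather than against `bpe_tokenizer`, so tokens already present in the main vocab are dropped before the id-range check. Below is a minimal, self-contained sketch of that dedup step; the names `main_vocab` and `added_tokens_json` are stand-ins for the real `convert.py` structures, not the actual code.

```python
# Main vocabulary: token content -> id (stands in for self.vocab).
main_vocab = {'<s>': 0, 'hello': 1, 'world': 2}

# tokenizer.json 'added_tokens' entries can duplicate the main vocabulary.
added_tokens_json = [
    {'content': '<s>', 'id': 0},       # duplicate: already in main_vocab
    {'content': '<custom>', 'id': 3},  # genuinely new token
]

# Keep only added tokens whose content is NOT already in the main vocab,
# mirroring the fixed condition `if item['content'] not in self.vocab`.
added_tokens = dict(
    (item['content'], item['id'])
    for item in added_tokens_json
    if item['content'] not in main_vocab
)
print(added_tokens)  # {'<custom>': 3}

# The sanity check from the diff: surviving added-token ids must
# continue directly from the end of the main vocabulary's id range.
vocab_size = len(main_vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
print(expected_ids)  # [3]
```

Checking membership against the wrong mapping would let duplicate entries survive into `added_tokens`, making its ids overlap the main vocab and fail the `expected_ids` comparison.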
