Skip to content

Commit 6bca95c

Browse files
committed
Add sentencepiece tokenizer
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent 3dcccef commit 6bca95c

File tree

7 files changed

+238
-0
lines changed

7 files changed

+238
-0
lines changed

.gitignore

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
build/
2+
.DS_Store
3+
# Editor temporaries
4+
*.swa
5+
*.swb
6+
*.swc
7+
*.swd
8+
*.swe
9+
*.swf
10+
*.swg
11+
*.swh
12+
*.swi
13+
*.swj
14+
*.swk
15+
*.swl
16+
*.swm
17+
*.swn
18+
*.swo
19+
*.swp
20+
*~
21+
.~lock.*
22+
*.idea

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "third-party/sentencepiece"]
2+
path = third-party/sentencepiece
3+
url = https://github.com/google/sentencepiece.git

CMakeLists.txt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
#
# This source code is licensed under the BSD-style license found in the LICENSE
# file in the root directory of this source tree.

#
# Build tokenizers.
#
# ### Editing this file ###
#
# This file should be formatted with
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~
# It should also be cmake-lint clean.
#
cmake_minimum_required(VERSION 3.24)

project(Tokenizers)

# Options consumed by abseil (vendored under sentencepiece/third_party):
# install its targets and propagate this project's C++ standard to it.
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
# Force PIC while building the sentencepiece subtree so its static library
# can later be linked into shared objects; save and restore the caller's
# setting around the add_subdirectory call.
# NOTE(review): if CMAKE_POSITION_INDEPENDENT_CODE was unset, _pic_flag is
# empty and the restore below leaves it effectively unset — confirm intended.
set(_pic_flag
    ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(third-party/sentencepiece)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

add_library(tokenizers STATIC src/sentencepiece.cpp)

# Using abseil from sentencepiece/third_party
target_include_directories(
  tokenizers PUBLIC third-party/sentencepiece/src
                    third-party/sentencepiece include)

target_link_libraries(tokenizers PUBLIC sentencepiece-static)

include/error.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ enum class Error : error_code_t {
3939

4040
/// Token out of range.
4141
OutOfRange = 0x03,
42+
43+
/// Artifact load failure.
44+
LoadFailure = 0x04,
45+
46+
/// Encode failure.
47+
EncodeFailure = 0x05,
4248
};
4349

4450
} // namespace tokenizers

include/sentencepiece.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// A tokenizer that works with sentencepiece.
10+
#pragma once
11+
12+
#include "sentencepiece_processor.h"
13+
#include "tokenizer.h"
14+
#include <memory>
15+
#include <vector>
16+
namespace tokenizers {
17+
18+
// A (token text, token id) pair.
// NOTE(review): not referenced by SPTokenizer in this file — presumably kept
// for callers that build sorted vocabulary lookup tables; confirm usage.
struct TokenIndex {
  const char *str; // non-owning pointer to the token's text
  int32_t id;      // numeric id of the token
};
22+
23+
// Tokenizer implementation backed by google/sentencepiece: wraps a
// sentencepiece::SentencePieceProcessor and adapts it to the project-wide
// Tokenizer interface.
class SPTokenizer : public Tokenizer {
public:
  explicit SPTokenizer();
  ~SPTokenizer() override;

  // Loads the sentencepiece model artifact at tokenizer_path and caches
  // vocab size and BOS/EOS ids. Must succeed before encode()/decode().
  Error load(const std::string &tokenizer_path) override;

  // Encodes `input` into token ids, prepending `bos` copies of the BOS id
  // and appending `eos` copies of the EOS id.
  Result<std::vector<uint64_t>> encode(const std::string &input, int8_t bos,
                                       int8_t eos) const override;

  // Decodes `token` into its string piece; `prev_token` is consulted to
  // strip the leading space sentencepiece emits right after BOS.
  Result<std::string> decode(uint64_t prev_token,
                             uint64_t token) const override;

private:
  // Owned sentencepiece processor; created in the constructor and valid for
  // the lifetime of this object.
  std::unique_ptr<sentencepiece::SentencePieceProcessor> _processor;
};
39+
40+
} // namespace tokenizers

src/sentencepiece.cpp

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// A tokenizer that works with sentencepiece.
10+
11+
#include "sentencepiece.h"
12+
#include "third_party/absl/strings/str_replace.h"
13+
#include <cinttypes>
14+
#include <string>
15+
namespace tokenizers {
16+
const char kSpaceSymbol[] = "\xe2\x96\x81";
17+
18+
// Constructs an uninitialized tokenizer and allocates the underlying
// sentencepiece processor. load() must be called before encode()/decode().
SPTokenizer::SPTokenizer()
    : Tokenizer(),
      _processor(std::make_unique<sentencepiece::SentencePieceProcessor>()) {}
21+
22+
/**
23+
* @brief Load the tokenizer from a file. The tokenizer file contains the
24+
* vocabulary and scores. The format is: the first integer is the maximum
25+
* token length, followed by a list of (word_len, word) pairs. Here we
26+
* are reading all the vocabulary into memory and keep it sorted for fast
27+
* lookup.
28+
*
29+
* @param tokenizer_path The path to the tokenizer file.
30+
* @return Error
31+
*/
32+
Error SPTokenizer::load(const std::string &tokenizer_path) {
33+
if (initialized_) {
34+
fprintf(stderr, "Tokenizer already initialized.\n");
35+
return Error::Ok;
36+
}
37+
// read in the file
38+
const auto status = _processor->Load(tokenizer_path);
39+
if (!status.ok()) {
40+
fprintf(stderr,
41+
"couldn't load %s\n. If this tokenizer artifact is for llama3, "
42+
"please pass `-l 3`.",
43+
tokenizer_path.c_str());
44+
return Error::LoadFailure;
45+
}
46+
// load vocab_size, bos_tok, eos_tok
47+
vocab_size_ = _processor->GetPieceSize();
48+
bos_tok_ = _processor->bos_id();
49+
eos_tok_ = _processor->eos_id();
50+
initialized_ = true;
51+
return Error::Ok;
52+
}
53+
54+
// Defaulted out-of-line destructor; _processor is released by unique_ptr.
SPTokenizer::~SPTokenizer() = default;
55+
56+
/**
57+
* @brief Decode a token into string.
58+
*
59+
* @param prev_token The previous token.
60+
* @param token The current token.
61+
* @return Result<std::string> The string representation of the
62+
* token.
63+
*/
64+
Result<std::string> SPTokenizer::decode(uint64_t prev_token,
65+
uint64_t token) const {
66+
if (!initialized_) {
67+
fprintf(stderr, "Tokenizer not initialized\n");
68+
return Error::Uninitialized;
69+
}
70+
// get rid of the control ids <s> and </s>
71+
if (_processor->IsControl(token)) {
72+
// NB: returning empty string doesn't work for some reason. It causes
73+
// free(): invalid pointer error.
74+
return std::string(" ");
75+
}
76+
77+
std::string result =
78+
absl::StrReplaceAll(_processor->IdToPiece(token), {{kSpaceSymbol, " "}});
79+
80+
// following BOS token, sentencepiece decoder strips any leading
81+
// whitespace
82+
if (prev_token == bos_tok_ && result[0] == ' ') {
83+
result = result.substr(1);
84+
}
85+
86+
// handle <0x0A>
87+
result = absl::StrReplaceAll(result, {{"<0x0A>", "\n"}});
88+
89+
return result;
90+
}
91+
92+
/**
93+
* @brief Encode a string into a sequence of tokens.
94+
*
95+
* @param text The string to be encoded.
96+
* @param bos The number of BOS to prepend to the token list.
97+
* @param eos The number of EOS to append to the token list.
98+
* @return Result<std::vector<uint64_t>>
99+
*/
100+
Result<std::vector<uint64_t>>
101+
SPTokenizer::encode(const std::string &text, int8_t bos, int8_t eos) const {
102+
if (!initialized_) {
103+
fprintf(stderr, "Tokenizer not initialized\n");
104+
return Error::Uninitialized;
105+
}
106+
// workaround a weird issue that text doesn't have correct size()
107+
std::string input(text.c_str());
108+
// should we reserve memory?
109+
std::vector<int> res;
110+
auto status = _processor->Encode(input, &res);
111+
if (!status.ok()) {
112+
fprintf(stderr, "couldn't encode %s\n", text.c_str());
113+
return Error::EncodeFailure;
114+
}
115+
116+
std::vector<uint64_t> tokens;
117+
for (auto i = 0; i < bos; ++i) {
118+
tokens.push_back(bos_tok_);
119+
}
120+
121+
for (auto i = 0; i < res.size(); ++i) {
122+
tokens.push_back(res[i]);
123+
}
124+
125+
for (auto i = 0; i < eos; ++i) {
126+
tokens.push_back(eos_tok_);
127+
}
128+
return tokens;
129+
}
130+
} // namespace tokenizers

third-party/sentencepiece

Submodule sentencepiece added at d8f7418

0 commit comments

Comments
 (0)