@@ -130,42 +130,25 @@ static std::vector<uint64_t> _byte_pair_merge(
130
130
// ---- Helper utils end -------------------------------------------------------
131
131
// ---- protected start --------------------------------------------------------
132
132
133
- std::pair<std::optional<std::string>, re2::StringPiece >
133
+ std::pair<std::optional<std::string>, std::string >
134
134
BPETokenizerBase::split_with_allowed_special_token_ (
135
- re2::StringPiece& input,
135
+ const std::string& input,
136
+ size_t offset,
136
137
const TokenMap& allowed_special) const {
137
138
if (!special_token_regex_) {
138
- return std::make_pair (std::nullopt , input);
139
+ return std::make_pair (std::nullopt , input. substr (offset) );
139
140
}
140
141
141
- #if __cplusplus >= 202002L
142
- auto start = input.begin ();
143
- #else
144
- const char * start = input.data ();
145
- #endif
142
+ auto matches = special_token_regex_->find_all (input.substr (offset));
146
143
147
- std::string special;
148
- while (true ) {
149
- if (!re2::RE2::FindAndConsume (&input, *special_token_regex_, &special)) {
150
- // No special token.
151
- break ;
144
+ for (const auto & m : matches) {
145
+ std::string matched_text = input.substr (offset + m.start , m.end - m.start );
146
+ if (allowed_special.tryGetInteger (matched_text).has_value ()) {
147
+ return {matched_text, input.substr (offset, m.start )};
152
148
}
153
-
154
- if (allowed_special.tryGetInteger (special).has_value ()) {
155
- // Found an allowed special token, split the text with it.
156
- #if __cplusplus >= 202002L
157
- return std::make_pair (
158
- special,
159
- re2::StringPiece (start, input.begin () - start - special.size ()));
160
- #else
161
- return std::make_pair (
162
- special,
163
- re2::StringPiece (start, (input.data () - start) - special.size ()));
164
- #endif
165
- } // else try to find the next special token
166
149
}
167
150
168
- return std::make_pair (std:: nullopt , input) ;
151
+ return { std::nullopt , input. substr (offset)} ;
169
152
}
170
153
171
154
Result<std::pair<std::vector<uint64_t >, uint64_t >>
@@ -174,33 +157,31 @@ BPETokenizerBase::encode_with_special_token_(
174
157
const TokenMap& allowed_special) const {
175
158
std::vector<uint64_t > tokens;
176
159
uint64_t last_piece_token_len = 0 ;
177
- re2::StringPiece input (text);
178
- while (true ) {
160
+ size_t offset = 0 ;
161
+
162
+ while (offset < text.size ()) {
179
163
auto [special, sub_input] =
180
- split_with_allowed_special_token_ (input , allowed_special);
164
+ split_with_allowed_special_token_ (text, offset , allowed_special);
181
165
182
166
TK_CHECK_OK_OR_RETURN_ERROR (
183
167
_encode (sub_input, tokens, last_piece_token_len));
168
+ offset += sub_input.size ();
184
169
185
170
if (special) {
186
171
const auto result = special_token_map_->tryGetInteger (*special);
187
172
if (!result) {
188
- // Should never go here, since special pattern includes all special
189
- // chars.
190
173
TK_LOG (Error, " unknown special token: %s\n " , special->c_str ());
191
174
return Error::EncodeFailure;
192
175
}
193
176
194
177
tokens.push_back (*result);
195
178
last_piece_token_len = 0 ;
179
+ offset += special->size (); // advance past the matched token
196
180
} else {
197
181
break ;
198
182
}
199
183
}
200
184
201
- // last_piece_token_len is how many tokens came from the last regex split.
202
- // This is used for determining unstable tokens, since you can't merge
203
- // across (stable) regex splits
204
185
return std::make_pair (tokens, last_piece_token_len);
205
186
}
206
187
@@ -273,7 +254,7 @@ Result<std::string> BPETokenizerBase::decode(uint64_t prev, uint64_t cur)
273
254
} else {
274
255
token_bytes = *result;
275
256
}
276
- _decode (token_bytes, ret);
257
+ _decode (std::string ( token_bytes) , ret);
277
258
278
259
return ret;
279
260
}
0 commit comments