Skip to content

Commit 6bca95c

Browse files
committed
Add sentencepiece tokenizer
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent 3dcccef commit 6bca95c

File tree

7 files changed

+238
-0
lines changed

7 files changed

+238
-0
lines changed

.gitignore

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
build/
2+
.DS_Store
3+
# Editor temporaries
4+
*.swa
5+
*.swb
6+
*.swc
7+
*.swd
8+
*.swe
9+
*.swf
10+
*.swg
11+
*.swh
12+
*.swi
13+
*.swj
14+
*.swk
15+
*.swl
16+
*.swm
17+
*.swn
18+
*.swo
19+
*.swp
20+
*~
21+
.~lock.*
22+
*.idea

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "third-party/sentencepiece"]
2+
path = third-party/sentencepiece
3+
url = https://github.com/google/sentencepiece.git

CMakeLists.txt

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
#
# This source code is licensed under the BSD-style license found in the LICENSE
# file in the root directory of this source tree.

#
# Build tokenizers.
#
# ### Editing this file ###
#
# This file should be formatted with
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~
# It should also be cmake-lint clean.
#
cmake_minimum_required(VERSION 3.24)

project(Tokenizers)

# Options consumed by abseil (vendored under sentencepiece/third_party):
# install its targets and propagate this project's C++ standard to it.
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
# Force PIC while building the sentencepiece subtree so its static library
# can later be linked into shared objects; save and restore the caller's
# setting around the add_subdirectory call.
# NOTE(review): if CMAKE_POSITION_INDEPENDENT_CODE was unset, _pic_flag is
# empty and the restore below leaves it effectively unset — confirm intended.
set(_pic_flag
    ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(third-party/sentencepiece)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

add_library(tokenizers STATIC src/sentencepiece.cpp)

# Using abseil from sentencepiece/third_party
target_include_directories(
  tokenizers PUBLIC third-party/sentencepiece/src
                    third-party/sentencepiece include)

target_link_libraries(tokenizers PUBLIC sentencepiece-static)

include/error.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ enum class Error : error_code_t {
3939

4040
/// Token out of range.
4141
OutOfRange = 0x03,
42+
43+
/// Artifact load failure.
44+
LoadFailure = 0x04,
45+
46+
/// Encode failure.
47+
EncodeFailure = 0x05,
4248
};
4349

4450
} // namespace tokenizers

include/sentencepiece.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// A tokenizer that works with sentencepiece.
10+
#pragma once
11+
12+
#include "sentencepiece_processor.h"
13+
#include "tokenizer.h"
14+
#include <memory>
15+
#include <vector>
16+
namespace tokenizers {
17+
18+
// A (token text, token id) pair.
// NOTE(review): not referenced by SPTokenizer in this file — presumably kept
// for callers that build sorted vocabulary lookup tables; confirm usage.
struct TokenIndex {
  const char *str; // non-owning pointer to the token's text
  int32_t id;      // numeric id of the token
};
22+
23+
// Tokenizer implementation backed by google/sentencepiece: wraps a
// sentencepiece::SentencePieceProcessor and adapts it to the project-wide
// Tokenizer interface.
class SPTokenizer : public Tokenizer {
public:
  explicit SPTokenizer();
  ~SPTokenizer() override;

  // Loads the sentencepiece model artifact at tokenizer_path and caches
  // vocab size and BOS/EOS ids. Must succeed before encode()/decode().
  Error load(const std::string &tokenizer_path) override;

  // Encodes `input` into token ids, prepending `bos` copies of the BOS id
  // and appending `eos` copies of the EOS id.
  Result<std::vector<uint64_t>> encode(const std::string &input, int8_t bos,
                                       int8_t eos) const override;

  // Decodes `token` into its string piece; `prev_token` is consulted to
  // strip the leading space sentencepiece emits right after BOS.
  Result<std::string> decode(uint64_t prev_token,
                             uint64_t token) const override;

private:
  // Owned sentencepiece processor; created in the constructor and valid for
  // the lifetime of this object.
  std::unique_ptr<sentencepiece::SentencePieceProcessor> _processor;
};
39+
40+
} // namespace tokenizers

src/sentencepiece.cpp

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// A tokenizer that works with sentencepiece.
10+
11+
#include "sentencepiece.h"
12+
#include "third_party/absl/strings/str_replace.h"
13+
#include <cinttypes>
14+
#include <string>
15+
namespace tokenizers {
16+
const char kSpaceSymbol[] = "\xe2\x96\x81";
17+
18+
// Constructs an uninitialized tokenizer and allocates the underlying
// sentencepiece processor. load() must be called before encode()/decode().
SPTokenizer::SPTokenizer()
    : Tokenizer(),
      _processor(std::make_unique<sentencepiece::SentencePieceProcessor>()) {}
21+
22+
/**
23+
* @brief Load the tokenizer from a file. The tokenizer file contains the
24+
* vocabulary and scores. The format is: the first integer is the maximum
25+
* token length, followed by a list of (word_len, word) pairs. Here we
26+
* are reading all the vocabulary into memory and keep it sorted for fast
27+
* lookup.
28+
*
29+
* @param tokenizer_path The path to the tokenizer file.
30+
* @return Error
31+
*/
32+
Error SPTokenizer::load(const std::string &tokenizer_path) {
33+
if (initialized_) {
34+
fprintf(stderr, "Tokenizer already initialized.\n");
35+
return Error::Ok;
36+
}
37+
// read in the file
38+
const auto status = _processor->Load(tokenizer_path);
39+
if (!status.ok()) {
40+
fprintf(stderr,
41+
"couldn't load %s\n. If this tokenizer artifact is for llama3, "
42+
"please pass `-l 3`.",
43+
tokenizer_path.c_str());
44+
return Error::LoadFailure;
45+
}
46+
// load vocab_size, bos_tok, eos_tok
47+
vocab_size_ = _processor->GetPieceSize();
48+
bos_tok_ = _processor->bos_id();
49+
eos_tok_ = _processor->eos_id();
50+
initialized_ = true;
51+
return Error::Ok;
52+
}
53+
54+
// Defaulted out-of-line destructor; _processor is released by unique_ptr.
SPTokenizer::~SPTokenizer() = default;
55+
56+
/**
57+
* @brief Decode a token into string.
58+
*
59+
* @param prev_token The previous token.
60+
* @param token The current token.
61+
* @return Result<std::string> The string representation of the
62+
* token.
63+
*/
64+
Result<std::string> SPTokenizer::decode(uint64_t prev_token,
65+
uint64_t token) const {
66+
if (!initialized_) {
67+
fprintf(stderr, "Tokenizer not initialized\n");
68+
return Error::Uninitialized;
69+
}
70+
// get rid of the control ids <s> and </s>
71+
if (_processor->IsControl(token)) {
72+
// NB: returning empty string doesn't work for some reason. It causes
73+
// free(): invalid pointer error.
74+
return std::string(" ");
75+
}
76+
77+
std::string result =
78+
absl::StrReplaceAll(_processor->IdToPiece(token), {{kSpaceSymbol, " "}});
79+
80+
// following BOS token, sentencepiece decoder strips any leading
81+
// whitespace
82+
if (prev_token == bos_tok_ && result[0] == ' ') {
83+
result = result.substr(1);
84+
}
85+
86+
// handle <0x0A>
87+
result = absl::StrReplaceAll(result, {{"<0x0A>", "\n"}});
88+
89+
return result;
90+
}
91+
92+
/**
93+
* @brief Encode a string into a sequence of tokens.
94+
*
95+
* @param text The string to be encoded.
96+
* @param bos The number of BOS to prepend to the token list.
97+
* @param eos The number of EOS to append to the token list.
98+
* @return Result<std::vector<uint64_t>>
99+
*/
100+
Result<std::vector<uint64_t>>
101+
SPTokenizer::encode(const std::string &text, int8_t bos, int8_t eos) const {
102+
if (!initialized_) {
103+
fprintf(stderr, "Tokenizer not initialized\n");
104+
return Error::Uninitialized;
105+
}
106+
// workaround a weird issue that text doesn't have correct size()
107+
std::string input(text.c_str());
108+
// should we reserve memory?
109+
std::vector<int> res;
110+
auto status = _processor->Encode(input, &res);
111+
if (!status.ok()) {
112+
fprintf(stderr, "couldn't encode %s\n", text.c_str());
113+
return Error::EncodeFailure;
114+
}
115+
116+
std::vector<uint64_t> tokens;
117+
for (auto i = 0; i < bos; ++i) {
118+
tokens.push_back(bos_tok_);
119+
}
120+
121+
for (auto i = 0; i < res.size(); ++i) {
122+
tokens.push_back(res[i]);
123+
}
124+
125+
for (auto i = 0; i < eos; ++i) {
126+
tokens.push_back(eos_tok_);
127+
}
128+
return tokens;
129+
}
130+
} // namespace tokenizers

third-party/sentencepiece

Submodule sentencepiece added at d8f7418

0 commit comments

Comments
 (0)