|
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
#include "sentencepiece.h"
#include "gtest/gtest.h"

namespace tokenizers {
| 13 | + |
| 14 | +TEST(SPTokenizerTest, TestEncodeWithoutLoad) { |
| 15 | + SPTokenizer tokenizer; |
| 16 | + std::string text = "Hello world!"; |
| 17 | + auto result = tokenizer.encode(text, /*bos*/ 0, /*eos*/ 1); |
| 18 | + EXPECT_EQ(result.error(), Error::Uninitialized); |
| 19 | +} |
| 20 | + |
| 21 | +TEST(SPTokenizerTest, TestDecodeWithoutLoad) { |
| 22 | + SPTokenizer tokenizer; |
| 23 | + auto result = tokenizer.decode(0, 0); |
| 24 | + EXPECT_EQ(result.error(), Error::Uninitialized); |
| 25 | +} |
| 26 | + |
| 27 | +TEST(SPTokenizerTest, TestLoad) { |
| 28 | + SPTokenizer tokenizer; |
| 29 | + auto resources = std::getenv("RESOURCES_PATH"); |
| 30 | + auto path = resources + std::string("/test_sentencepiece.model"); |
| 31 | + auto error = tokenizer.load(path); |
| 32 | + EXPECT_EQ(error, Error::Ok); |
| 33 | +} |
| 34 | + |
| 35 | +TEST(SPTokenizerTest, TestLoadInvalidPath) { |
| 36 | + SPTokenizer tokenizer; |
| 37 | + auto error = tokenizer.load("invalid_path"); |
| 38 | + EXPECT_EQ(error, Error::LoadFailure); |
| 39 | +} |
| 40 | + |
| 41 | +TEST(SPTokenizerTest, TestEncode) { |
| 42 | + SPTokenizer tokenizer; |
| 43 | + auto resources = std::getenv("RESOURCES_PATH"); |
| 44 | + auto path = resources + std::string("/test_sentencepiece.model"); |
| 45 | + auto error = tokenizer.load(path); |
| 46 | + EXPECT_EQ(error, Error::Ok); |
| 47 | + std::string text = "Hello world!"; |
| 48 | + auto result = tokenizer.encode(text, /*bos*/ 1, /*eos*/ 0); |
| 49 | + EXPECT_TRUE(result.ok()); |
| 50 | + EXPECT_EQ(result.get().size(), 4); |
| 51 | + EXPECT_EQ(result.get()[0], 1); |
| 52 | + EXPECT_EQ(result.get()[1], 15043); |
| 53 | + EXPECT_EQ(result.get()[2], 3186); |
| 54 | + EXPECT_EQ(result.get()[3], 29991); |
| 55 | +} |
| 56 | + |
| 57 | +TEST(SPTokenizerTest, TestDecode) { |
| 58 | + SPTokenizer tokenizer; |
| 59 | + auto resources = std::getenv("RESOURCES_PATH"); |
| 60 | + auto path = resources + std::string("/test_sentencepiece.model"); |
| 61 | + auto error = tokenizer.load(path); |
| 62 | + EXPECT_EQ(error, Error::Ok); |
| 63 | + std::vector<uint64_t> tokens = {1, 15043, 3186, 29991}; |
| 64 | + std::vector<std::string> expected = {"", "Hello", " world", "!"}; |
| 65 | + for (auto i = 0; i < 3; ++i) { |
| 66 | + auto result = tokenizer.decode(tokens[i], tokens[i + 1]); |
| 67 | + EXPECT_TRUE(result.ok()); |
| 68 | + EXPECT_EQ(result.get(), expected[i + 1]); |
| 69 | + } |
| 70 | +} |
| 71 | + |
} // namespace tokenizers