12 changes: 10 additions & 2 deletions bindings/node/index.d.ts
@@ -11,7 +11,11 @@ export function ctcDecoder(
   cleanup?: boolean | undefined | null,
 ): Decoder
 export function fuseDecoder(): Decoder
-export function metaspaceDecoder(replacement?: string = '▁', addPrefixSpace?: bool = true): Decoder
+export function metaspaceDecoder(
+  replacement?: string = '▁',
+  prependScheme?: prepend_scheme = 'always',
+  split?: split = true,
+): Decoder
 export function replaceDecoder(pattern: string, content: string): Decoder
 export function sequenceDecoder(decoders: Array<Decoder>): Decoder
 export function stripDecoder(content: string, left: number, right: number): Decoder
@@ -89,7 +93,11 @@ export function byteLevelAlphabet(): Array<string>
 export function whitespacePreTokenizer(): PreTokenizer
 export function whitespaceSplitPreTokenizer(): PreTokenizer
 export function bertPreTokenizer(): PreTokenizer
-export function metaspacePreTokenizer(replacement?: string = '▁', addPrefixSpace?: bool = true): PreTokenizer
+export function metaspacePreTokenizer(
+  replacement?: string = '▁',
+  prependScheme?: prepend_scheme = 'always',
+  split?: split = true,
+): PreTokenizer
 export function splitPreTokenizer(pattern: string, behavior: string, invert?: boolean | undefined | null): PreTokenizer
 export function punctuationPreTokenizer(behavior?: string | undefined | null): PreTokenizer
 export function sequencePreTokenizer(preTokenizers: Array<PreTokenizer>): PreTokenizer
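The new index.d.ts surface replaces the old addPrefixSpace boolean with a prependScheme string plus a split flag. A minimal usage sketch, assuming the package is consumed via CommonJS under the name tokenizers and that the generated typings above are what ships:

const { metaspaceDecoder, metaspacePreTokenizer } = require('tokenizers')

// Defaults per the typings: replacement '▁', prependScheme 'always', split true.
const decoder = metaspaceDecoder()

// 'first' and 'never' are the other accepted schemes (see the Rust match below).
const preTokenizer = metaspacePreTokenizer('▁', 'first', true)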
37 changes: 37 additions & 0 deletions bindings/node/index.js
@@ -219,6 +219,43 @@ switch (platform) {
         loadError = e
       }
       break
+    case 'riscv64':
+      if (isMusl()) {
+        localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-riscv64-musl.node'))
+        try {
+          if (localFileExisted) {
+            nativeBinding = require('./tokenizers.linux-riscv64-musl.node')
+          } else {
+            nativeBinding = require('tokenizers-linux-riscv64-musl')
+          }
+        } catch (e) {
+          loadError = e
+        }
+      } else {
+        localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-riscv64-gnu.node'))
+        try {
+          if (localFileExisted) {
+            nativeBinding = require('./tokenizers.linux-riscv64-gnu.node')
+          } else {
+            nativeBinding = require('tokenizers-linux-riscv64-gnu')
+          }
+        } catch (e) {
+          loadError = e
+        }
+      }
+      break
+    case 's390x':
+      localFileExisted = existsSync(join(__dirname, 'tokenizers.linux-s390x-gnu.node'))
+      try {
+        if (localFileExisted) {
+          nativeBinding = require('./tokenizers.linux-s390x-gnu.node')
+        } else {
+          nativeBinding = require('tokenizers-linux-s390x-gnu')
+        }
+      } catch (e) {
+        loadError = e
+      }
+      break
     default:
       throw new Error(`Unsupported architecture on Linux: ${arch}`)
   }
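Both new cases follow the loader pattern already used for the other Linux architectures: prefer a prebuilt .node file sitting next to index.js, then fall back to the matching per-platform npm package, with isMusl() choosing between musl and glibc builds for riscv64. A condensed sketch of that resolution order (loadNative is a hypothetical helper, not part of this PR; the generated index.js inlines this logic per case):

const { existsSync } = require('fs')
const { join } = require('path')

function loadNative(triple: string) {
  // triple is e.g. 'linux-riscv64-gnu', 'linux-riscv64-musl', or 'linux-s390x-gnu'
  const local = join(__dirname, `tokenizers.${triple}.node`)
  return existsSync(local) ? require(local) : require(`tokenizers-${triple}`)
}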
2 changes: 1 addition & 1 deletion bindings/node/package.json
@@ -1,6 +1,6 @@
 {
   "name": "tokenizers",
-  "version": "0.14.0-dev0",
+  "version": "0.15.3-dev0",
   "repository": {
     "type": "git",
     "url": "git+https://github.com/huggingface/tokenizers.git"
19 changes: 16 additions & 3 deletions bindings/node/src/decoders.rs
@@ -90,19 +90,32 @@ pub fn fuse_decoder() -> Decoder {
 #[napi]
 pub fn metaspace_decoder(
   #[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
-  #[napi(ts_arg_type = "bool = true")] add_prefix_space: Option<bool>,
+  #[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
+  #[napi(ts_arg_type = "split = true")] split: Option<bool>,
 ) -> Result<Decoder> {
-  let add_prefix_space = add_prefix_space.unwrap_or(true);
+  use tk::pre_tokenizers::metaspace::PrependScheme;
+  let split = split.unwrap_or(true);
   let replacement = replacement.unwrap_or("▁".to_string());
   if replacement.chars().count() != 1 {
     return Err(Error::from_reason(
       "replacement is supposed to be a single char",
     ));
   }
   let replacement = replacement.chars().next().unwrap();
+  let prepend_scheme: PrependScheme =
+    match prepend_scheme.unwrap_or(String::from("always")).as_str() {
+      "always" => PrependScheme::Always,
+      "first" => PrependScheme::First,
+      "never" => PrependScheme::Never,
+      _ => {
+        return Err(Error::from_reason(
+          "prepend_scheme is supposed to be either 'always', 'first' or 'never'",
+        ));
+      }
+    };
   Ok(Decoder {
     decoder: Some(Arc::new(RwLock::new(
-      tk::decoders::metaspace::Metaspace::new(replacement, add_prefix_space).into(),
+      tk::decoders::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
     ))),
   })
 }
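Validation failures in the binding surface as thrown JavaScript errors via Error::from_reason. A sketch of the failure modes, assuming the package name tokenizers (error messages copied from the Rust above):

const { metaspaceDecoder } = require('tokenizers')

metaspaceDecoder('▁▁')
// throws: "replacement is supposed to be a single char"

metaspaceDecoder('▁', 'sometimes')
// throws: "prepend_scheme is supposed to be either 'always', 'first' or 'never'"

Note that the single-char check on replacement is pre-existing context here; only the prepend_scheme match is new in this PR.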
19 changes: 16 additions & 3 deletions bindings/node/src/pre_tokenizers.rs
@@ -155,20 +155,33 @@ pub fn bert_pre_tokenizer() -> PreTokenizer {
 #[napi]
 pub fn metaspace_pre_tokenizer(
   #[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
-  #[napi(ts_arg_type = "bool = true")] add_prefix_space: Option<bool>,
+  #[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
+  #[napi(ts_arg_type = "split = true")] split: Option<bool>,
 ) -> Result<PreTokenizer> {
-  let add_prefix_space = add_prefix_space.unwrap_or(true);
+  use tk::pre_tokenizers::metaspace::PrependScheme;
+  let split = split.unwrap_or(true);
   let replacement = replacement.unwrap_or("▁".to_string());
   if replacement.chars().count() != 1 {
     return Err(Error::from_reason(
       "replacement is supposed to be a single char",
     ));
   }
   let replacement = replacement.chars().next().unwrap();
+  let prepend_scheme: PrependScheme =
+    match prepend_scheme.unwrap_or(String::from("always")).as_str() {
+      "always" => PrependScheme::Always,
+      "first" => PrependScheme::First,
+      "never" => PrependScheme::Never,
+      _ => {
+        return Err(Error::from_reason(
+          "prepend_scheme is supposed to be either 'always', 'first' or 'never'",
+        ));
+      }
+    };
 
   Ok(PreTokenizer {
     pretok: Some(Arc::new(RwLock::new(
-      tk::pre_tokenizers::metaspace::Metaspace::new(replacement, add_prefix_space).into(),
+      tk::pre_tokenizers::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
     ))),
   })
 }
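The pre-tokenizer change mirrors the decoder exactly, so both components accept the same (replacement, prependScheme, split) triple and can be kept in sync on a tokenizer. A construction-only sketch; the comments on what split controls are an assumption based on the Metaspace pre-tokenizer's documented behavior, not something this diff shows:

const { metaspacePreTokenizer } = require('tokenizers')

// Replace whitespace with '▁' but keep the sequence as one piece.
const noSplit = metaspacePreTokenizer('▁', 'always', false)

// Never prepend the replacement character to the input.
const never = metaspacePreTokenizer('▁', 'never', true)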