Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
3b0bed2
Add segmenter as test type for nodejs with minimal data
sven-oly May 15, 2025
a1e35ea
Update test and verify data
sven-oly May 15, 2025
b1c3779
Add segmenter test cases
sven-oly May 15, 2025
34f6e00
Updated generator to produce segmentation tests from NodeJS
sven-oly May 16, 2025
e6b07b4
Updating data gen and characterizing differences in lists
sven-oly May 20, 2025
6d7ec26
Remove temporary code
sven-oly May 20, 2025
b980052
Add CPP segmenter to executor
sven-oly May 21, 2025
ec76ece
Fix so segmenter data is recomputed
sven-oly May 21, 2025
628e3e3
Trying modified segmenter test generation for line
sven-oly May 22, 2025
014ed9d
Hacking line break generator to handle some line segmentation
sven-oly May 22, 2025
78b4e28
include new segmenter.cpp function
sven-oly May 22, 2025
6e6b45e
Merge remote-tracking branch 'upstream/main' into segmenter_nodejs
sven-oly May 23, 2025
d01ebe4
Merge remote-tracking branch 'upstream/main' into segmenter_cpp
sven-oly May 23, 2025
8b38eb5
Update as per comments on this PR
sven-oly Jun 2, 2025
4acafa6
Removing unneeded .gitignore items
sven-oly Jun 2, 2025
b0020da
Add classification type
sven-oly Jun 3, 2025
f3c6dad
Merge branch 'segmenter_nodejs' into segmenter_cpp
sven-oly Jun 3, 2025
0db4405
Merge remote-tracking branch 'upstream/main' into segmenter_cpp
sven-oly Jun 3, 2025
3060ca5
Fix generator
sven-oly Jun 3, 2025
c4c3e8a
Removing empty debug lines
sven-oly Jun 4, 2025
7410086
Update executors/cpp/segmenter.cpp
sven-oly Jun 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion executors/cpp/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ TARGET=executor

# All object files (C or C++)

OBJECTS=main.o coll.o datetime_fmt.o localedisplaynames.o likely_subtags.o list_fmt.o message_fmt2.o number_fmt.o plural_rules.o relativedatetime_fmt.o util.o
OBJECTS=main.o coll.o datetime_fmt.o localedisplaynames.o likely_subtags.o list_fmt.o message_fmt2.o number_fmt.o plural_rules.o relativedatetime_fmt.o segmenter.o util.o

#### rules
# Load in standard makefile definitions
Expand Down
6 changes: 5 additions & 1 deletion executors/cpp/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ extern auto TestDatetimeFmt(json_object *json_in) -> const string;
extern auto TestLocaleDisplayNames(json_object *json_in) -> const string;
extern auto TestLikelySubtags(json_object *json_in) -> const string;
extern auto TestListFmt(json_object *json_in) -> const string;
extern auto TestSegmenter(json_object *json_in) -> const string;

// This API was added in ICU75.1
#if U_ICU_VERSION_MAJOR_NUM >= 75
Expand Down Expand Up @@ -71,7 +72,8 @@ auto main(int argc, const char** argv) -> int {
"lang_names",
"number_fmt",
"plural_rules",
"rdt_fmt"
"rdt_fmt",
"segmenter"
};

for (std::string line; std::getline(cin, line);) {
Expand Down Expand Up @@ -132,6 +134,8 @@ auto main(int argc, const char** argv) -> int {
outputLine = TestPluralRules(json_input);
} else if (test_type == "rdt_fmt") {
outputLine = TestRelativeDateTimeFmt(json_input);
} else if (test_type == "segmenter") {
outputLine = TestSegmenter(json_input);
} else {
outputLine = "# BAD TEST " + test_type;
}
Expand Down
128 changes: 128 additions & 0 deletions executors/cpp/segmenter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/******
* testing segmenter / break iterator for locales
*/

#include <json-c/json.h>
#include <json-c/arraylist.h>

#include <unicode/brkiter.h>
#include <unicode/bytestream.h>
#include <unicode/locid.h>
#include <unicode/uclean.h>
#include <unicode/unistr.h>
#include <unicode/utypes.h>


#include <cstdio>
#include <cstdlib>

#include <cstring>
#include <iostream>
#include <memory>
#include <string>

using std::string;

using icu::BreakIterator;
using icu::Locale;
using icu::StringByteSink;
using icu::UnicodeString;

// Releases a buffer with free(). A null pointer is accepted and ignored.
void free_string(void* data) {
  if (data == nullptr) {
    return;
  }
  free(data);
}

auto TestSegmenter(json_object *json_in) -> string {
UErrorCode status = U_ZERO_ERROR;

json_object *label_obj = json_object_object_get(json_in, "label");
string label_string = json_object_get_string(label_obj);

// The locale in which the name is given.
json_object *locale_label_obj = json_object_object_get(json_in, "locale");
string locale_string = json_object_get_string(locale_label_obj);
Locale locale(locale_string.c_str());


// What we are segmenting...
json_object *input_obj = json_object_object_get(json_in, "input");
string input = json_object_get_string(input_obj);

UnicodeString u_input = UnicodeString::fromUTF8(input);

// The type of conversion requested
json_object *options_obj = json_object_object_get(json_in, "options");

json_object *granularity_obj = json_object_object_get(options_obj, "granularity");
string granularity_value = json_object_get_string(granularity_obj);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Observation: granularity_value is an enum, but we're leaving it as a C++ string. We do properly convert enum values into Java enums in the ICU4J executor. Not for this PR but for the future, it would be better to convert enums into C++ enums.


// Create output array to store results
struct json_object* test_result = json_object_new_array();

json_object *return_json = json_object_new_object();
json_object_object_add(return_json, "label", label_obj);

// The default.
BreakIterator* brk_iterator;

if (granularity_value == "grapheme_cluster" ||
granularity_value == "grapheme") {
brk_iterator = BreakIterator::createCharacterInstance(locale, status);
} else if (granularity_value == "word") {
brk_iterator = BreakIterator::createWordInstance(locale, status);
} else if (granularity_value == "sentence") {
brk_iterator = BreakIterator::createSentenceInstance(locale, status);
} else if (granularity_value == "line") {
brk_iterator = BreakIterator::createLineInstance(locale, status);
} else {
// No such granularity
json_object_object_add(
return_json,
"error",
json_object_new_string("Unknown granularity"));
json_object_object_add(
return_json,
"error_detail",
json_object_new_string(granularity_value.c_str()));
return json_object_to_json_string(return_json);
}

// Check if there's an error in the creation of the iterator.
if (U_FAILURE(status) != 0) {
// An error in the call.
json_object_object_add(
return_json,
"error",
json_object_new_string("Failure to create break iterator "));
json_object_object_add(
return_json,
"error_detail",
json_object_new_string(granularity_value.c_str()));
return json_object_to_json_string(return_json);
}

// We must have an interator
brk_iterator->setText(u_input);

int32_t start_pos = brk_iterator->first();
int32_t end_pos = brk_iterator->next();

// Loop until we get DONE or an error.
while (end_pos != BreakIterator::DONE) {
// Extract the Unicode string, converting to a c string.
UnicodeString u_target;
u_input.extractBetween(start_pos, end_pos, u_target);
string target;
u_target.toUTF8String(target);
json_object* j_target = json_object_new_string(target.c_str());
json_object_array_add(test_result, j_target);
start_pos = end_pos;
end_pos = brk_iterator->next();
}

// For each, extract the current part of the input string, adding to the output
json_object_object_add(return_json, "result", test_result);

return json_object_to_json_string(return_json);
}
9 changes: 6 additions & 3 deletions run_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
"message_fmt2",
"number_fmt",
"plural_rules",
"rdt_fmt"
"rdt_fmt",
"segmenter"
],
"per_execution": 10000
}
Expand All @@ -40,7 +41,8 @@
"message_fmt2",
"number_fmt",
"plural_rules",
"rdt_fmt"
"rdt_fmt",
"segmenter"
],
"per_execution": 10000
}
Expand Down Expand Up @@ -151,7 +153,8 @@
"message_fmt2",
"number_fmt",
"plural_rules",
"rdt_fmt"
"rdt_fmt",
"segmenter"
],
"per_execution": 10000
}
Expand Down
24 changes: 10 additions & 14 deletions testgen/generators/segmenter_gen.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,48 +25,48 @@ const locale_text_data = [
// Empty input
"locale": "en-US",
"input": "",
'expected_line_results': []
"expected_line_results": []
},

{
"locale": "en-US",
"input": "The cât, in the hat. There's a dog̈ in the yard?",
'expected_line_results': ["The ","cât, ","in ","the ","hat. ","There's ","a ","dog̈ ","in ","the ","yard?"]
"expected_line_results": ["The ","cât, ","in ","the ","hat. ","There's ","a ","dog̈ ","in ","the ","yard?"]
},
{
"locale": "ja-JP",
"input": "文字に分解しましょう。単語にも。ああ、文にも。",
'expected_line_results': ["文","字","に","分","解","し","ま","しょ","う。","単","語","に","も。","あ","あ、","文","に","も。"]
"expected_line_results": ["文","字","に","分","解","し","ま","しょ","う。","単","語","に","も。","あ","あ、","文","に","も。"]
},
{
"locale": "fr",
"input": "C'est ainsi qu'on décompose les personnages. Les mots aussi. Oh, et les phrases aussi.",
'expected_line_results': ["C'est ","ainsi ","qu'on ","décompose ","les ","personnages. ","Les ","mots ","aussi. ","Oh, ","et ","les ","phrases ","aussi."]
"expected_line_results": ["C'est ","ainsi ","qu'on ","décompose ","les ","personnages. ","Les ","mots ","aussi. ","Oh, ","et ","les ","phrases ","aussi."]
},
{
"locale": "as",
"input": "এইটোৱেই হৈছে চৰিত্ৰত ভাঙি যোৱাৰ উপায়। লগতে শব্দ। অ’, আৰু বাক্যবোৰো।",
'expected_line_results': ["এইটোৱেই ","হৈছে ","চৰিত্ৰত ","ভাঙি ","যোৱাৰ ","উপায়। ","লগতে ","শব্দ। ","অ’, ","আৰু ","বাক্যবোৰো।"]
"expected_line_results": ["এইটোৱেই ","হৈছে ","চৰিত্ৰত ","ভাঙি ","যোৱাৰ ","উপায়। ","লগতে ","শব্দ। ","অ’, ","আৰু ","বাক্যবোৰো।"]
},
{
"locale": "zh-Hans",
"input": "分解成字符。还有单词。哦,还有句子。",
'expected_line_results': ["分","解","成","字","符。","还","有","单","词。","哦,","还","有","句","子。"]
"expected_line_results": ["分","解","成","字","符。","还","有","单","词。","哦,","还","有","句","子。"]
},
{
"locale": "zh-Hant",
"input": "分解成字元。還有文字。哦,還有句子。",
'expected_line_results': ["分","解","成","字","元。","還","有","文","字。","哦,","還","有","句","子。"]
"expected_line_results": ["分","解","成","字","元。","還","有","文","字。","哦,","還","有","句","子。"]
},
{
"locale": "my",
"input": "ဤသည်မှာ ဇာတ်ကောင်များအဖြစ်သို့ ဖောက်ထွက်ရန် နည်းလမ်းဖြစ်သည်။ စကားလည်း ပါတယ်။ သြော် စာကြောင်းတွေလည်း ပါပါတယ်။",
'expected_line_results': ["ဤ","သည်မှာ ","ဇာတ်ကောင်","များ","အဖြစ်","သို့ ","ဖောက်","ထွက်","ရန် ","နည်း","လမ်း","ဖြစ်သည်။ ","စကား","လည်း ","ပါ","တယ်။ ","သြော် ","စာကြောင်း","တွေ","လည်း ","ပါ","ပါ","တယ်။"]
"expected_line_results": ["ဤ","သည်မှာ ","ဇာတ်ကောင်","များ","အဖြစ်","သို့ ","ဖောက်","ထွက်","ရန် ","နည်း","လမ်း","ဖြစ်သည်။ ","စကား","လည်း ","ပါ","တယ်။ ","သြော် ","စာကြောင်း","တွေ","လည်း ","ပါ","ပါ","တယ်။"]
},
{
"locale": "ff-Adlm",
"input": "𞤊𞤭𞤴𞤢𞥄𞤳𞤵 𞤱𞤢𞤴𞤤𞤮𞤪𞤢 𞤳𞤫𞤲𞤫𞤲. 𞤖𞤢𞤳𞥆𞤫𞤪𞤫𞤲 𞤫𞤯𞤫𞤲 𞤸𞤮𞥅𞤤𞤭𞥅 𞤸𞤭𞤧𞤭⹁ 𞤫𞤲𞤢 𞤯𞤫𞤲. 𞤐𞤣𞤫𞤲𞤧𞤢𞤴 𞤼𞤵𞤲⹁ 𞤭𞤱𞤪𞤢𞤼𞤢 𞤱𞤮𞥅⹁ 𞤣𞤫𞥅𞤰𞤵𞤲𞤮𞥅 𞤬𞤮𞤱⹁ 𞤮𞤲.",
'expected_line_results': ["𞤊𞤭𞤴𞤢𞥄𞤳𞤵 ","𞤱𞤢𞤴𞤤𞤮𞤪𞤢 ","𞤳𞤫𞤲𞤫𞤲. ","𞤖𞤢𞤳𞥆𞤫𞤪𞤫𞤲 ","𞤫𞤯𞤫𞤲 ","𞤸𞤮𞥅𞤤𞤭𞥅 ","𞤸𞤭𞤧𞤭⹁ ","𞤫𞤲𞤢 ","𞤯𞤫𞤲. ","𞤐𞤣𞤫𞤲𞤧𞤢𞤴 ","𞤼𞤵𞤲⹁ ","𞤭𞤱𞤪𞤢𞤼𞤢 ","𞤱𞤮𞥅⹁ ","𞤣𞤫𞥅𞤰𞤵𞤲𞤮𞥅 ","𞤬𞤮𞤱⹁ ","𞤮𞤲."]
"expected_line_results": ["𞤊𞤭𞤴𞤢𞥄𞤳𞤵 ","𞤱𞤢𞤴𞤤𞤮𞤪𞤢 ","𞤳𞤫𞤲𞤫𞤲. ","𞤖𞤢𞤳𞥆𞤫𞤪𞤫𞤲 ","𞤫𞤯𞤫𞤲 ","𞤸𞤮𞥅𞤤𞤭𞥅 ","𞤸𞤭𞤧𞤭⹁ ","𞤫𞤲𞤢 ","𞤯𞤫𞤲. ","𞤐𞤣𞤫𞤲𞤧𞤢𞤴 ","𞤼𞤵𞤲⹁ ","𞤭𞤱𞤪𞤢𞤼𞤢 ","𞤱𞤮𞥅⹁ ","𞤣𞤫𞥅𞤰𞤵𞤲𞤮𞥅 ","𞤬𞤮𞤱⹁ ","𞤮𞤲."]
},
{
"locale": "ar",
Expand Down Expand Up @@ -98,8 +98,6 @@ const locale_text_data = [
"input": "ሊሆኑ የሚችሉ የእረፍት ነጥቦችን እንፈልግ። በዚህ ውሂብ ውስጥ?",
"expected_line_results": ["ሊሆኑ ","የሚችሉ ","የእረፍት ","ነጥቦችን ","እንፈልግ። ","በዚህ ","ውሂብ ","ውስጥ?"]
},


];

function generateAll() {
Expand Down Expand Up @@ -179,9 +177,6 @@ function generateAll() {
gen_hash.generate_hash_for_test(test_case);
test_case['label'] = label_string;

if (debug) {
console.log("TEST CASE :", test_case);
}
if (segmentation_type == 'line') {
// To get line data, even though not supported in ECMAIntl
all_options['granularity'] = 'line';
Expand All @@ -192,6 +187,7 @@ function generateAll() {
if (segmentation_type == 'line') {
result = locale_data['expected_line_results'];
}

try{
verify_cases.push({'label': label_string,
'verify': result});
Expand Down
Loading