unicode-org · sven-oly · Jun 4, 2025 · May 15, 2025 · May 15, 2025 · May 15, 2025
diff --git a/executors/cpp/Makefile b/executors/cpp/Makefile
@@ -16,7 +16,7 @@ TARGET=executor
 
 # All object files (C or C++)
 
-OBJECTS=main.o coll.o datetime_fmt.o localedisplaynames.o likely_subtags.o list_fmt.o message_fmt2.o number_fmt.o plural_rules.o relativedatetime_fmt.o util.o
+OBJECTS=main.o coll.o datetime_fmt.o localedisplaynames.o likely_subtags.o list_fmt.o message_fmt2.o number_fmt.o plural_rules.o relativedatetime_fmt.o segmenter.o util.o
 
 #### rules
 # Load in standard makefile definitions

diff --git a/executors/cpp/main.cpp b/executors/cpp/main.cpp
@@ -44,6 +44,7 @@ extern auto TestDatetimeFmt(json_object *json_in) -> const string;
 extern auto TestLocaleDisplayNames(json_object *json_in) -> const string;
 extern auto TestLikelySubtags(json_object *json_in) -> const string;
 extern auto TestListFmt(json_object *json_in) -> const string;
+extern auto TestSegmenter(json_object *json_in) -> const string;
 
 // This API was added in ICU75.1
 #if U_ICU_VERSION_MAJOR_NUM >= 75
@@ -71,7 +72,8 @@ auto main(int argc, const char** argv) -> int {
     "lang_names",
     "number_fmt",
     "plural_rules",
-    "rdt_fmt"
+    "rdt_fmt",
+    "segmenter"
   };
 
   for (std::string line; std::getline(cin, line);) {
@@ -132,6 +134,8 @@ auto main(int argc, const char** argv) -> int {
         outputLine = TestPluralRules(json_input);
       } else if (test_type == "rdt_fmt") {
         outputLine = TestRelativeDateTimeFmt(json_input);
+      } else if (test_type == "segmenter") {
+        outputLine = TestSegmenter(json_input);
       } else {
         outputLine =  "# BAD TEST " + test_type;
       }

diff --git a/executors/cpp/segmenter.cpp b/executors/cpp/segmenter.cpp
@@ -0,0 +1,128 @@
+/******
+ * testing segmenter / break iterator for locales
+ */
+
+#include <json-c/json.h>
+#include <json-c/arraylist.h>
+
+#include <unicode/brkiter.h>
+#include <unicode/bytestream.h>
+#include <unicode/locid.h>
+#include <unicode/uclean.h>
+#include <unicode/unistr.h>
+#include <unicode/utypes.h>
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <iostream>
+#include <memory>
+#include <string>
+
+using std::string;
+
+using icu::BreakIterator;
+using icu::Locale;
+using icu::StringByteSink;
+using icu::UnicodeString;
+
+// Releases a buffer obtained from malloc(); a null pointer is accepted.
+void free_string(void* data) {
+  // free() is a no-op for a null argument, so no explicit guard is needed.
+  free(data);
+}
+
+namespace {
+
+// Returns the string value of member `key` of `parent`, or "" when `parent` or
+// the member is missing, so a std::string is never built from a null pointer.
+auto GetStringMember(json_object* parent, const char* key) -> string {
+  json_object* member = (parent != nullptr) ? json_object_object_get(parent, key) : nullptr;
+  const char* text = json_object_get_string(member);
+  return (text != nullptr) ? string(text) : string();
+}
+
+// Serializes `json`, then drops the caller's reference to it so the response
+// tree is released on every return path.
+auto SerializeAndRelease(json_object* json) -> string {
+  const char* text = json_object_to_json_string(json);
+  string serialized = (text != nullptr) ? text : "";
+  json_object_put(json);
+  return serialized;
+}
+
+// Adds "error" and "error_detail" to `response`, then serializes and releases it.
+auto ErrorResponse(json_object* response, const char* message, const string& detail) -> string {
+  json_object_object_add(response, "error", json_object_new_string(message));
+  json_object_object_add(response, "error_detail", json_object_new_string(detail.c_str()));
+  return SerializeAndRelease(response);
+}
+
+}  // namespace
+
+/**
+ * Segments the "input" text of one test case with an ICU BreakIterator.
+ *
+ * Reads "label", "locale", "input" and options."granularity" from json_in;
+ * a missing string member is treated as "". Accepted granularities are
+ * "grapheme_cluster", "grapheme", "word", "sentence" and "line".
+ *
+ * Returns the serialized JSON response: "label" plus "result" (array of
+ * segments) on success, or "label", "error" and "error_detail" when the
+ * granularity is unknown or the iterator cannot be created.
+ */
+auto TestSegmenter(json_object *json_in) -> string {
+  UErrorCode status = U_ZERO_ERROR;
+
+  const string locale_string = GetStringMember(json_in, "locale");
+  const Locale locale(locale_string.c_str());
+  const UnicodeString u_input = UnicodeString::fromUTF8(GetStringMember(json_in, "input"));
+  json_object *options_obj = json_object_object_get(json_in, "options");
+  const string granularity_value = GetStringMember(options_obj, "granularity");
+
+  // json_object_object_get() hands back a borrowed reference, whereas
+  // json_object_object_add() takes ownership of the value. Take an extra
+  // reference on the label so releasing the response leaves json_in intact.
+  json_object *label_obj = json_object_object_get(json_in, "label");
+  json_object *return_json = json_object_new_object();
+  json_object_object_add(return_json, "label", json_object_get(label_obj));
+
+  // The create*Instance() factories return a heap object owned by the caller.
+  std::unique_ptr<BreakIterator> brk_iterator;
+  if (granularity_value == "grapheme_cluster" || granularity_value == "grapheme") {
+    brk_iterator.reset(BreakIterator::createCharacterInstance(locale, status));
+  } else if (granularity_value == "word") {
+    brk_iterator.reset(BreakIterator::createWordInstance(locale, status));
+  } else if (granularity_value == "sentence") {
+    brk_iterator.reset(BreakIterator::createSentenceInstance(locale, status));
+  } else if (granularity_value == "line") {
+    brk_iterator.reset(BreakIterator::createLineInstance(locale, status));
+  } else {
+    return ErrorResponse(return_json, "Unknown granularity", granularity_value);
+  }
+
+  // Failure is reported through status; also guard against a null iterator.
+  if (U_FAILURE(status) || !brk_iterator) {
+    return ErrorResponse(return_json, "Failure to create break iterator ", granularity_value);
+  }
+
+  brk_iterator->setText(u_input);
+
+  // Each segment is the text between two consecutive boundaries; the loop
+  // stops when next() yields DONE.
+  json_object *test_result = json_object_new_array();
+  int32_t start_pos = brk_iterator->first();
+  for (int32_t end_pos = brk_iterator->next(); end_pos != BreakIterator::DONE;
+       end_pos = brk_iterator->next()) {
+    UnicodeString u_target;
+    u_input.extractBetween(start_pos, end_pos, u_target);
+    string target;
+    u_target.toUTF8String(target);
+    json_object_array_add(test_result, json_object_new_string(target.c_str()));
+    start_pos = end_pos;
+  }
+
+  json_object_object_add(return_json, "result", test_result);
+  return SerializeAndRelease(return_json);
+}
diff --git a/run_config.json b/run_config.json
@@ -17,7 +17,8 @@
         "message_fmt2",
         "number_fmt",
         "plural_rules",
-        "rdt_fmt"
+        "rdt_fmt",
+        "segmenter"
       ],
       "per_execution": 10000
     }
@@ -40,7 +41,8 @@
         "message_fmt2",
         "number_fmt",
         "plural_rules",
-        "rdt_fmt"
+        "rdt_fmt",
+        "segmenter"
       ],
       "per_execution": 10000
     }
@@ -151,7 +153,8 @@
         "message_fmt2",
         "number_fmt",
         "plural_rules",
-        "rdt_fmt"
+        "rdt_fmt",
+        "segmenter"
       ],
       "per_execution": 10000
     }

diff --git a/testgen/generators/segmenter_gen.js b/testgen/generators/segmenter_gen.js
@@ -25,48 +25,48 @@ const locale_text_data = [
     // Empty input
     "locale": "en-US",
     "input": "",
-    'expected_line_results': []
+    "expected_line_results": []
   },
 
   {
     "locale": "en-US",
     "input": "The cât, in the hat. There's a dog̈ in the yard?",
-    'expected_line_results': ["The ","cât, ","in ","the ","hat. ","There's ","a ","dog̈ ","in ","the ","yard?"]
+    "expected_line_results": ["The ","cât, ","in ","the ","hat. ","There's ","a ","dog̈ ","in ","the ","yard?"]
   },
   {
     "locale": "ja-JP",
     "input": "文字に分解しましょう。単語にも。ああ、文にも。",
-    'expected_line_results': ["文","字","に","分","解","し","ま","しょ","う。","単","語","に","も。","あ","あ、","文","に","も。"]
+    "expected_line_results": ["文","字","に","分","解","し","ま","しょ","う。","単","語","に","も。","あ","あ、","文","に","も。"]
   },
   {
     "locale": "fr",
     "input": "C'est ainsi qu'on décompose les personnages. Les mots aussi. Oh, et les phrases aussi.",
-    'expected_line_results': ["C'est ","ainsi ","qu'on ","décompose ","les ","personnages. ","Les ","mots ","aussi. ","Oh, ","et ","les ","phrases ","aussi."]
+    "expected_line_results": ["C'est ","ainsi ","qu'on ","décompose ","les ","personnages. ","Les ","mots ","aussi. ","Oh, ","et ","les ","phrases ","aussi."]
   },
   {
     "locale": "as",
     "input": "এইটোৱেই হৈছে চৰিত্ৰত ভাঙি যোৱাৰ উপায়। লগতে শব্দ। অ’, আৰু বাক্যবোৰো।",
-    'expected_line_results': ["এইটোৱেই ","হৈছে ","চৰিত্ৰত ","ভাঙি ","যোৱাৰ ","উপায়। ","লগতে ","শব্দ। ","অ’, ","আৰু ","বাক্যবোৰো।"]
+    "expected_line_results": ["এইটোৱেই ","হৈছে ","চৰিত্ৰত ","ভাঙি ","যোৱাৰ ","উপায়। ","লগতে ","শব্দ। ","অ’, ","আৰু ","বাক্যবোৰো।"]
   },
   {
     "locale": "zh-Hans",
     "input": "分解成字符。还有单词。哦，还有句子。",
-    'expected_line_results': ["分","解","成","字","符。","还","有","单","词。","哦，","还","有","句","子。"]
+    "expected_line_results": ["分","解","成","字","符。","还","有","单","词。","哦，","还","有","句","子。"]
   },
   {
     "locale": "zh-Hant",
     "input": "分解成字元。還有文字。哦，還有句子。",
-    'expected_line_results': ["分","解","成","字","元。","還","有","文","字。","哦，","還","有","句","子。"]
+    "expected_line_results": ["分","解","成","字","元。","還","有","文","字。","哦，","還","有","句","子。"]
   },
   {
     "locale": "my",
     "input": "ဤသည်မှာ ဇာတ်ကောင်များအဖြစ်သို့ ဖောက်ထွက်ရန် နည်းလမ်းဖြစ်သည်။ စကားလည်း ပါတယ်။ သြော် စာကြောင်းတွေလည်း ပါပါတယ်။",
-    'expected_line_results': ["ဤ","သည်မှာ ","ဇာတ်ကောင်","များ","အဖြစ်","သို့ ","ဖောက်","ထွက်","ရန် ","နည်း","လမ်း","ဖြစ်သည်။ ","စကား","လည်း ","ပါ","တယ်။ ","သြော် ","စာကြောင်း","တွေ","လည်း ","ပါ","ပါ","တယ်။"]
+    "expected_line_results": ["ဤ","သည်မှာ ","ဇာတ်ကောင်","များ","အဖြစ်","သို့ ","ဖောက်","ထွက်","ရန် ","နည်း","လမ်း","ဖြစ်သည်။ ","စကား","လည်း ","ပါ","တယ်။ ","သြော် ","စာကြောင်း","တွေ","လည်း ","ပါ","ပါ","တယ်။"]
   },
   {
     "locale": "ff-Adlm",
     "input": "𞤊𞤭𞤴𞤢𞥄𞤳𞤵 𞤱𞤢𞤴𞤤𞤮𞤪𞤢 𞤳𞤫𞤲𞤫𞤲. 𞤖𞤢𞤳𞥆𞤫𞤪𞤫𞤲 𞤫𞤯𞤫𞤲 𞤸𞤮𞥅𞤤𞤭𞥅  𞤸𞤭𞤧𞤭⹁ 𞤫𞤲𞤢 𞤯𞤫𞤲. 𞤐𞤣𞤫𞤲𞤧𞤢𞤴 𞤼𞤵𞤲⹁ 𞤭𞤱𞤪𞤢𞤼𞤢 𞤱𞤮𞥅⹁ 𞤣𞤫𞥅𞤰𞤵𞤲𞤮𞥅 𞤬𞤮𞤱⹁ 𞤮𞤲.",
-    'expected_line_results': ["𞤊𞤭𞤴𞤢𞥄𞤳𞤵 ","𞤱𞤢𞤴𞤤𞤮𞤪𞤢 ","𞤳𞤫𞤲𞤫𞤲. ","𞤖𞤢𞤳𞥆𞤫𞤪𞤫𞤲 ","𞤫𞤯𞤫𞤲 ","𞤸𞤮𞥅𞤤𞤭𞥅  ","𞤸𞤭𞤧𞤭⹁ ","𞤫𞤲𞤢 ","𞤯𞤫𞤲. ","𞤐𞤣𞤫𞤲𞤧𞤢𞤴 ","𞤼𞤵𞤲⹁ ","𞤭𞤱𞤪𞤢𞤼𞤢 ","𞤱𞤮𞥅⹁ ","𞤣𞤫𞥅𞤰𞤵𞤲𞤮𞥅 ","𞤬𞤮𞤱⹁ ","𞤮𞤲."]
+    "expected_line_results": ["𞤊𞤭𞤴𞤢𞥄𞤳𞤵 ","𞤱𞤢𞤴𞤤𞤮𞤪𞤢 ","𞤳𞤫𞤲𞤫𞤲. ","𞤖𞤢𞤳𞥆𞤫𞤪𞤫𞤲 ","𞤫𞤯𞤫𞤲 ","𞤸𞤮𞥅𞤤𞤭𞥅  ","𞤸𞤭𞤧𞤭⹁ ","𞤫𞤲𞤢 ","𞤯𞤫𞤲. ","𞤐𞤣𞤫𞤲𞤧𞤢𞤴 ","𞤼𞤵𞤲⹁ ","𞤭𞤱𞤪𞤢𞤼𞤢 ","𞤱𞤮𞥅⹁ ","𞤣𞤫𞥅𞤰𞤵𞤲𞤮𞥅 ","𞤬𞤮𞤱⹁ ","𞤮𞤲."]
   },
   {
     "locale": "ar",
@@ -98,8 +98,6 @@ const locale_text_data = [
     "input": "ሊሆኑ የሚችሉ የእረፍት ነጥቦችን እንፈልግ። በዚህ ውሂብ ውስጥ?",
     "expected_line_results": ["ሊሆኑ ","የሚችሉ ","የእረፍት ","ነጥቦችን ","እንፈልግ። ","በዚህ ","ውሂብ ","ውስጥ?"]
   },
-
-
 ];
 
 function generateAll() {
@@ -179,9 +177,6 @@ function generateAll() {
       gen_hash.generate_hash_for_test(test_case);
       test_case['label'] = label_string;
 
-      if (debug) {
-        console.log("TEST CASE :", test_case);
-      }
       if (segmentation_type == 'line') {
         // To get line data, even though not supported in ECMAIntl
         all_options['granularity'] = 'line';
@@ -192,6 +187,7 @@ function generateAll() {
       if (segmentation_type == 'line') {
         result = locale_data['expected_line_results'];
       }
+
       try{
         verify_cases.push({'label': label_string,
                            'verify': result});