From 2a9507094b29f8dca4e363b6a9e52a634b691f08 Mon Sep 17 00:00:00 2001 From: penw0lf Date: Tue, 8 Oct 2024 13:21:29 +0530 Subject: [PATCH 01/11] Added list of string support to sent_tokenize/solved #906 --- pythainlp/tokenize/core.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 993bcff64..2c6f5bd6f 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -323,16 +323,16 @@ def word_tokenize( def sent_tokenize( - text: str, + text: Union[str, List[str]], engine: str = DEFAULT_SENT_TOKENIZE_ENGINE, keep_whitespace: bool = True, ) -> List[str]: """ Sentence tokenizer. - Tokenizes running text into "sentences" + Tokenizes running text into "sentences". Supports both string and list of strings. - :param str text: the text to be tokenized + :param text: the text (string) or list of words (list of strings) to be tokenized :param str engine: choose among *'crfcut'*, *'whitespace'*, \ *'whitespace+newline'* :return: list of split sentences @@ -394,9 +394,15 @@ def sent_tokenize( 'และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค'] """ - if not text or not isinstance(text, str): + if not text or not isinstance(text, (str, list)): return [] + if isinstance(text, list): + try: + text = " ".join(text) + except TypeError: + return [] + segments = [] if engine == "crfcut": From 817b87ab858ecae15d648236e77c810ac88848e3 Mon Sep 17 00:00:00 2001 From: ayaan-qadri Date: Sat, 12 Oct 2024 21:27:18 +0530 Subject: [PATCH 02/11] Added list grouping --- pythainlp/tokenize/core.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 2c6f5bd6f..ef54168d9 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -322,6 +322,22 @@ def word_tokenize( return segments +def groupedText(list, keep_whitespace=True): + output = [] + current_word = [] + for word in list: + if (word.strip()): + current_word.append(word) + else: + if (current_word): + output.append([current_word]) + current_word = [] + if (keep_whitespace): + output.append([word]) + if current_word: + output.append(current_word) + return output + def sent_tokenize( text: Union[str, List[str]], engine: str = DEFAULT_SENT_TOKENIZE_ENGINE, @@ -399,8 +415,8 @@ def sent_tokenize( if isinstance(text, list): try: - text = " ".join(text) - except TypeError: + text = groupedText(text, keep_whitespace) + except AttributeError: return [] segments = [] From 5bbf4108033b1d2f67aba6d0de82cb79a16923da Mon Sep 17 00:00:00 2001 From: ayaan-qadri Date: Sun, 13 Oct 2024 15:59:39 +0530 Subject: [PATCH 03/11] Implemented indices_words & map_indices_to_words in sent_tokenize --- pythainlp/tokenize/core.py | 85 +++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index ef54168d9..620dbac20 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -6,6 +6,7 @@ """ import re from typing import Iterable, List, Union +import copy from pythainlp.tokenize import ( DEFAULT_SENT_TOKENIZE_ENGINE, @@ -198,7 +199,7 @@ def word_tokenize( word_tokenize(text, engine="newmm", keep_whitespace=False) # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว'] - + Join broken formatted numeric (e.g. 
time, decimals, IP addresses):: text = "เงิน1,234บาท19:32น 127.0.0.1" @@ -322,21 +323,44 @@ def word_tokenize( return segments -def groupedText(list, keep_whitespace=True): - output = [] - current_word = [] - for word in list: - if (word.strip()): - current_word.append(word) +def indices_words(words): + indices = [] + start_index = 0 + + for word in words: + if len(word) > 1: + _temp = len(word)-1 else: - if (current_word): - output.append([current_word]) - current_word = [] - if (keep_whitespace): - output.append([word]) - if current_word: - output.append(current_word) - return output + _temp = 1 + indices.append((start_index, start_index + _temp)) + start_index += len(word) + + return indices + + +def map_indices_to_words(index_list, sentences): + result = [] + c = copy.copy(index_list) + n_sum = 0 + + for sentence in sentences: + words = sentence + sentence_result = [] + n = 0 + + for start, end in c: + if start > n_sum+len(words)-1: + break + else: + word = sentence[start-n_sum:end+1-n_sum] + sentence_result.append(word) + n += 1 + + result.append(sentence_result) + n_sum += len(words) + for _ in range(n): + del c[0] + def sent_tokenize( text: Union[str, List[str]], @@ -413,33 +437,40 @@ def sent_tokenize( if not text or not isinstance(text, (str, list)): return [] - if isinstance(text, list): + is_list_input = isinstance(text, list) + + if is_list_input: + try: - text = groupedText(text, keep_whitespace) - except AttributeError: + original_text = "".join(text) + except ValueError: return [] + word_indices = indices_words(text) + else: + original_text = text + segments = [] if engine == "crfcut": from pythainlp.tokenize.crfcut import segment - segments = segment(text) + segments = segment(original_text) elif engine == "whitespace": - segments = re.split(r" +", text, flags=re.U) + segments = re.split(r" +", original_text, flags=re.U) elif engine == "whitespace+newline": - segments = text.split() + segments = original_text.split() elif engine == "tltk": from pythainlp.tokenize.tltk import sent_tokenize as segment - segments = segment(text) + segments = segment(original_text) elif engine == "thaisum": from pythainlp.tokenize.thaisumcut import ( ThaiSentenceSegmentor as segmentor, ) segment = segmentor() - segments = segment.split_into_sentences(text) + segments = segment.split_into_sentences(original_text) elif engine.startswith("wtp"): if "-" not in engine: _size = "mini" @@ -447,7 +478,7 @@ def sent_tokenize( _size = engine.split("-")[-1] from pythainlp.tokenize.wtsplit import tokenize as segment - segments = segment(text, size=_size, tokenize="sentence") + segments = segment(original_text, size=_size, tokenize="sentence") else: raise ValueError( f"""Tokenizer \"{engine}\" not found. 
@@ -457,7 +488,11 @@ def sent_tokenize(
     if not keep_whitespace:
         segments = strip_whitespace(segments)
 
-    return segments
+    if is_list_input:
+        result = map_indices_to_words(word_indices, segments)
+        return result
+    else:
+        return segments
 
 
 def paragraph_tokenize(

From f1b9c117b531da7726172ecc6d53d4cf516407f7 Mon Sep 17 00:00:00 2001
From: ayaan-qadri
Date: Sun, 13 Oct 2024 16:15:30 +0530
Subject: [PATCH 04/11] Added return to map_indices_to_words

---
 pythainlp/tokenize/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 620dbac20..0d5b08234 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -360,7 +360,7 @@ def map_indices_to_words(index_list, sentences):
         n_sum += len(words)
         for _ in range(n):
             del c[0]
-
+    return result
 
 def sent_tokenize(
     text: Union[str, List[str]],

From 77756c43f9b81b01848efe4ff2f3fc2d6e8e92f3 Mon Sep 17 00:00:00 2001
From: ayaan-qadri
Date: Sun, 13 Oct 2024 23:08:30 +0530
Subject: [PATCH 05/11] Resolved bug: whitespace+newline was tokenized by whitespace

---
 pythainlp/tokenize/core.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 0d5b08234..422ead4a4 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -446,7 +446,6 @@ def sent_tokenize(
         except ValueError:
             return []
 
-        word_indices = indices_words(text)
     else:
         original_text = text
 
@@ -458,8 +457,15 @@ def sent_tokenize(
         segments = re.split(r" +", original_text, flags=re.U)
+        if is_list_input:
+            non_whitespace_text = [word for word in text if word.strip()]
+            word_indices = indices_words(non_whitespace_text)
     elif engine == "whitespace+newline":
         segments = original_text.split()
+        if is_list_input:
+            non_whitespace_newline_text = [
+                word for word in text if word.strip() and word != '\n']
+            word_indices = indices_words(non_whitespace_newline_text)
     elif engine == "tltk":
         from pythainlp.tokenize.tltk import sent_tokenize as segment
 
         segments = segment(original_text)

From 011fb6685bed2b82d4f2370c6ff077605170cd68 Mon Sep 17 00:00:00 2001
From: ayaan-qadri
Date: Sun, 27 Oct 2024 16:44:43 +0530
Subject: [PATCH 06/11] fixed bug 1 for crfcut engine

---
 pythainlp/tokenize/core.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 422ead4a4..9186e89a5 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -326,13 +326,9 @@ def word_tokenize(
 def indices_words(words):
     indices = []
     start_index = 0
-
     for word in words:
-        if len(word) > 1:
-            _temp = len(word)-1
-        else:
-            _temp = 1
-        indices.append((start_index, start_index + _temp))
+        end_index = start_index + len(word) - 1
+        indices.append((start_index, end_index))
         start_index += len(word)
 
     return indices
@@ -342,12 +338,10 @@ def map_indices_to_words(index_list, sentences):
     result = []
     c = copy.copy(index_list)
     n_sum = 0
-
     for sentence in sentences:
         words = sentence
         sentence_result = []
         n = 0
-
         for start, end in c:
             if start > n_sum+len(words)-1:
                 break
@@ -455,6 +449,11 @@ def sent_tokenize(
         from pythainlp.tokenize.crfcut import segment
 
         segments = 
segment(original_text) + + if is_list_input: + word_indices = indices_words(text) + result = map_indices_to_words(word_indices, [original_text]) + return result elif engine == "whitespace": segments = re.split(r" +", original_text, flags=re.U) if is_list_input: @@ -468,13 +467,11 @@ def sent_tokenize( word_indices = indices_words(non_whitespace_newline_text) elif engine == "tltk": from pythainlp.tokenize.tltk import sent_tokenize as segment - segments = segment(original_text) elif engine == "thaisum": from pythainlp.tokenize.thaisumcut import ( ThaiSentenceSegmentor as segmentor, ) - segment = segmentor() segments = segment.split_into_sentences(original_text) elif engine.startswith("wtp"): @@ -483,7 +480,6 @@ def sent_tokenize( else: _size = engine.split("-")[-1] from pythainlp.tokenize.wtsplit import tokenize as segment - segments = segment(original_text, size=_size, tokenize="sentence") else: raise ValueError( @@ -494,13 +490,12 @@ def sent_tokenize( if not keep_whitespace: segments = strip_whitespace(segments) - if is_list_input: - if engine not in ["whitespace", "whitespace+newline"]: - word_indices = indices_words(text) + if is_list_input and engine not in ["crfcut", "whitespace"]: + word_indices = indices_words(text) result = map_indices_to_words(word_indices, segments) return result else: - return segments + return [segments] def paragraph_tokenize( From 0e9148d95cc64c4dafc2541567b3871da8ef86d6 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 28 Oct 2024 22:02:55 +0700 Subject: [PATCH 07/11] Fixed list of string support in whitespace engine --- pythainlp/tokenize/core.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 9186e89a5..8e92917d5 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -457,14 +457,35 @@ def sent_tokenize( elif engine == "whitespace": segments = re.split(r" +", original_text, flags=re.U) if is_list_input: - non_whitespace_text = [word for word in text if word.strip()] - word_indices = indices_words(non_whitespace_text) + result = [] + _temp = [] + for i,w in enumerate(text): + if re.findall(r"\s",w) != [] and re.findall(r"\w",w) == []: + if _temp == []: + continue + result.append(_temp) + _temp = [] + else: + _temp.append(w) + if i+1 == len(text): + result.append(_temp) + return result elif engine == "whitespace+newline": segments = original_text.split() if is_list_input: - non_whitespace_newline_text = [ - word for word in text if word.strip() and word != '\n'] - word_indices = indices_words(non_whitespace_newline_text) + result = [] + _temp = [] + for i,w in enumerate(text): + if (re.findall(r"\s",w) != [] or re.findall(r"\n",w) != []) and re.findall(r"\w",w) == []: + if _temp==[]: + continue + result.append(_temp) + _temp=[] + else: + _temp.append(w) + if i+1==len(text): + result.append(_temp) + return result elif engine == "tltk": from pythainlp.tokenize.tltk import sent_tokenize as segment segments = segment(original_text) @@ -490,7 +511,7 @@ def sent_tokenize( if not keep_whitespace: segments = strip_whitespace(segments) - if is_list_input and engine not in ["crfcut", "whitespace"]: + if is_list_input and engine not in ["crfcut"]: word_indices = indices_words(text) result = map_indices_to_words(word_indices, segments) return result From 66c0647a4e76cdafa4324120b4ba9447476e1ba7 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 28 Oct 2024 22:08:52 +0700 Subject: [PATCH 08/11] Fixed pep8 --- 
pythainlp/tokenize/core.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 8e92917d5..bb5ca08b0 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -343,10 +343,10 @@ def map_indices_to_words(index_list, sentences): sentence_result = [] n = 0 for start, end in c: - if start > n_sum+len(words)-1: + if start > n_sum + len(words) - 1: break else: - word = sentence[start-n_sum:end+1-n_sum] + word = sentence[start - n_sum:end + 1 - n_sum] sentence_result.append(word) n += 1 @@ -459,8 +459,8 @@ def sent_tokenize( if is_list_input: result = [] _temp = [] - for i,w in enumerate(text): - if re.findall(r"\s",w) != [] and re.findall(r"\w",w) == []: + for i, w in enumerate(text): + if re.findall(r"\s", w) != [] and re.findall(r"\w", w) == []: if _temp == []: continue result.append(_temp) @@ -476,14 +476,17 @@ def sent_tokenize( result = [] _temp = [] for i,w in enumerate(text): - if (re.findall(r"\s",w) != [] or re.findall(r"\n",w) != []) and re.findall(r"\w",w) == []: - if _temp==[]: + if (( + re.findall(r"\s",w) != [] or + re.findall(r"\n",w) != []) and + re.findall(r"\w",w) == []): + if _temp == []: continue result.append(_temp) - _temp=[] + _temp = [] else: _temp.append(w) - if i+1==len(text): + if i+1 == len(text): result.append(_temp) return result elif engine == "tltk": From 54842ca34abb37614fc24ef483597d7bc7a7fd71 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 28 Oct 2024 22:13:20 +0700 Subject: [PATCH 09/11] Fixed pep8 --- pythainlp/tokenize/core.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index bb5ca08b0..4b53ec340 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -467,7 +467,7 @@ def sent_tokenize( _temp = [] else: _temp.append(w) - if i+1 == len(text): + if i + 1 == len(text): result.append(_temp) return result elif engine == "whitespace+newline": @@ -475,18 +475,19 @@ def sent_tokenize( if is_list_input: result = [] _temp = [] - for i,w in enumerate(text): + for i, w in enumerate(text): if (( - re.findall(r"\s",w) != [] or - re.findall(r"\n",w) != []) and - re.findall(r"\w",w) == []): + re.findall(r"\s", w) != [] + or re.findall(r"\n", w) != []) + and re.findall(r"\w", w) == [] + ): if _temp == []: continue result.append(_temp) _temp = [] else: _temp.append(w) - if i+1 == len(text): + if i + 1 == len(text): result.append(_temp) return result elif engine == "tltk": From 76f33107c4870a7a3262fb552a828ae1d006154e Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 28 Oct 2024 22:17:05 +0700 Subject: [PATCH 10/11] Fixed pep8 --- pythainlp/tokenize/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 4b53ec340..931980532 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -476,9 +476,9 @@ def sent_tokenize( result = [] _temp = [] for i, w in enumerate(text): - if (( - re.findall(r"\s", w) != [] - or re.findall(r"\n", w) != []) + if ( + (re.findall(r"\s", w) != [] or + re.findall(r"\n", w) != []) and re.findall(r"\w", w) == [] ): if _temp == []: From 1a2b457063a604175c9ece8146f1c425920e6d4e Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 28 Oct 2024 22:29:31 +0700 Subject: [PATCH 11/11] Add list of words in sent_tokenize testset --- pythainlp/tokenize/core.py | 6 +++--- tests/test_tokenize.py | 17 +++++++++++++++++ 2 files changed, 
20 insertions(+), 3 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 931980532..57669337d 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -460,7 +460,7 @@ def sent_tokenize( result = [] _temp = [] for i, w in enumerate(text): - if re.findall(r"\s", w) != [] and re.findall(r"\w", w) == []: + if re.findall(r" ", w) != [] and re.findall(r"\w", w) == []: if _temp == []: continue result.append(_temp) @@ -478,8 +478,8 @@ def sent_tokenize( for i, w in enumerate(text): if ( (re.findall(r"\s", w) != [] or - re.findall(r"\n", w) != []) - and re.findall(r"\w", w) == [] + re.findall(r"\n", w) != []) and + re.findall(r"\w", w) == [] ): if _temp == []: continue diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index ad5a1f5e9..0d6026168 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -333,6 +333,23 @@ def test_sent_tokenize(self): # engine="wtp-large", # ), # ) + sent_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"] + self.assertEqual( + sent_tokenize(sent_4, engine="crfcut"), + [["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]], + ) + self.assertEqual( + sent_tokenize(sent_4, engine="whitespace"), + [["ผม", "กิน", "ข้าว"], ["\n", "เธอ", "เล่น", "เกม"]], + ) + self.assertEqual( + sent_tokenize(sent_4, engine="whitespace+newline"), + [["ผม", "กิน", "ข้าว"], ["เธอ", "เล่น", "เกม"]], + ) + self.assertEqual( + sent_tokenize(sent_4, engine="thaisum"), + [["ผม", "กิน", "ข้าว", " ", "เธอ", "เล่น", "เกม"]], + ) self.assertFalse( " " in sent_tokenize(
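
For reference, the behaviour the series converges on can be read off the test cases added in PATCH 11/11: for most engines, each input word is assigned character offsets by indices_words(), the words are joined into one string and segmented as usual, and map_indices_to_words() slices the segments back into the original word pieces; the whitespace engines instead group the input tokens around the whitespace tokens directly. Either way, the caller gets back a list of sentences, each sentence being a list of the input words. A minimal usage sketch, assuming the patched PyThaiNLP is installed; the expected outputs are copied from the new tests::

    from pythainlp.tokenize import sent_tokenize

    words = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]

    # crfcut keeps this input together as a single sentence
    sent_tokenize(words, engine="crfcut")
    # output: [['ผม', 'กิน', 'ข้าว', ' ', '\n', 'เธอ', 'เล่น', 'เกม']]

    # whitespace splits at the space token only
    sent_tokenize(words, engine="whitespace")
    # output: [['ผม', 'กิน', 'ข้าว'], ['\n', 'เธอ', 'เล่น', 'เกม']]

    # whitespace+newline splits at both the space and the newline tokens
    sent_tokenize(words, engine="whitespace+newline")
    # output: [['ผม', 'กิน', 'ข้าว'], ['เธอ', 'เล่น', 'เกม']]

    # thaisum keeps the space but drops the newline
    sent_tokenize(words, engine="thaisum")
    # output: [['ผม', 'กิน', 'ข้าว', ' ', 'เธอ', 'เล่น', 'เกม']]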