From 2a9507094b29f8dca4e363b6a9e52a634b691f08 Mon Sep 17 00:00:00 2001 From: penw0lf Date: Tue, 8 Oct 2024 13:21:29 +0530 Subject: [PATCH 01/11] Added list of string support to sent_tokenize/solved #906 --- pythainlp/tokenize/core.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 993bcff64..2c6f5bd6f 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -323,16 +323,16 @@ def word_tokenize( def sent_tokenize( - text: str, + text: Union[str, List[str]], engine: str = DEFAULT_SENT_TOKENIZE_ENGINE, keep_whitespace: bool = True, ) -> List[str]: """ Sentence tokenizer. - Tokenizes running text into "sentences" + Tokenizes running text into "sentences". Supports both string and list of strings. - :param str text: the text to be tokenized + :param text: the text (string) or list of words (list of strings) to be tokenized :param str engine: choose among *'crfcut'*, *'whitespace'*, \ *'whitespace+newline'* :return: list of split sentences @@ -394,9 +394,15 @@ def sent_tokenize( 'และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค'] """ - if not text or not isinstance(text, str): + if not text or not isinstance(text, (str, list)): return [] + if isinstance(text, list): + try: + text = " ".join(text) + except TypeError: + return [] + segments = [] if engine == "crfcut": From 817b87ab858ecae15d648236e77c810ac88848e3 Mon Sep 17 00:00:00 2001 From: ayaan-qadri Date: Sat, 12 Oct 2024 21:27:18 +0530 Subject: [PATCH 02/11] Added list grouping --- pythainlp/tokenize/core.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 2c6f5bd6f..ef54168d9 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -322,6 +322,22 @@ def word_tokenize( return segments +def groupedText(list, keep_whitespace=True): + output = [] + current_word = [] + for word in list: + if (word.strip()): + current_word.append(word) + else: + if (current_word): + output.append([current_word]) + current_word = [] + if (keep_whitespace): + output.append([word]) + if current_word: + output.append(current_word) + return output + def sent_tokenize( text: Union[str, List[str]], engine: str = DEFAULT_SENT_TOKENIZE_ENGINE, @@ -399,8 +415,8 @@ def sent_tokenize( if isinstance(text, list): try: - text = " ".join(text) - except TypeError: + text = groupedText(text, keep_whitespace) + except AttributeError: return [] segments = [] From 5bbf4108033b1d2f67aba6d0de82cb79a16923da Mon Sep 17 00:00:00 2001 From: ayaan-qadri Date: Sun, 13 Oct 2024 15:59:39 +0530 Subject: [PATCH 03/11] Implemented indices_words & map_indices_to_words in sent_tokenize --- pythainlp/tokenize/core.py | 85 +++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index ef54168d9..620dbac20 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -6,6 +6,7 @@ """ import re from typing import Iterable, List, Union +import copy from pythainlp.tokenize import ( DEFAULT_SENT_TOKENIZE_ENGINE, @@ -198,7 +199,7 @@ def word_tokenize( word_tokenize(text, engine="newmm", keep_whitespace=False) # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว'] - + Join broken formatted numeric (e.g. 
time, decimals, IP addresses):: text = "เงิน1,234บาท19:32น 127.0.0.1" @@ -322,21 +323,44 @@ def word_tokenize( return segments -def groupedText(list, keep_whitespace=True): - output = [] - current_word = [] - for word in list: - if (word.strip()): - current_word.append(word) +def indices_words(words): + indices = [] + start_index = 0 + + for word in words: + if len(word) > 1: + _temp = len(word)-1 else: - if (current_word): - output.append([current_word]) - current_word = [] - if (keep_whitespace): - output.append([word]) - if current_word: - output.append(current_word) - return output + _temp = 1 + indices.append((start_index, start_index + _temp)) + start_index += len(word) + + return indices + + +def map_indices_to_words(index_list, sentences): + result = [] + c = copy.copy(index_list) + n_sum = 0 + + for sentence in sentences: + words = sentence + sentence_result = [] + n = 0 + + for start, end in c: + if start > n_sum+len(words)-1: + break + else: + word = sentence[start-n_sum:end+1-n_sum] + sentence_result.append(word) + n += 1 + + result.append(sentence_result) + n_sum += len(words) + for _ in range(n): + del c[0] + def sent_tokenize( text: Union[str, List[str]], @@ -413,33 +437,40 @@ def sent_tokenize( if not text or not isinstance(text, (str, list)): return [] - if isinstance(text, list): + is_list_input = isinstance(text, list) + + if is_list_input: + try: - text = groupedText(text, keep_whitespace) - except AttributeError: + original_text = "".join(text) + except ValueError: return [] + word_indices = indices_words(text) + else: + original_text = text + segments = [] if engine == "crfcut": from pythainlp.tokenize.crfcut import segment - segments = segment(text) + segments = segment(original_text) elif engine == "whitespace": - segments = re.split(r" +", text, flags=re.U) + segments = re.split(r" +", original_text, flags=re.U) elif engine == "whitespace+newline": - segments = text.split() + segments = original_text.split() elif engine == "tltk": from pythainlp.tokenize.tltk import sent_tokenize as segment - segments = segment(text) + segments = segment(original_text) elif engine == "thaisum": from pythainlp.tokenize.thaisumcut import ( ThaiSentenceSegmentor as segmentor, ) segment = segmentor() - segments = segment.split_into_sentences(text) + segments = segment.split_into_sentences(original_text) elif engine.startswith("wtp"): if "-" not in engine: _size = "mini" @@ -447,7 +478,7 @@ def sent_tokenize( _size = engine.split("-")[-1] from pythainlp.tokenize.wtsplit import tokenize as segment - segments = segment(text, size=_size, tokenize="sentence") + segments = segment(original_text, size=_size, tokenize="sentence") else: raise ValueError( f"""Tokenizer \"{engine}\" not found. 
@@ -457,7 +488,11 @@ def sent_tokenize(
     if not keep_whitespace:
         segments = strip_whitespace(segments)
 
-    return segments
+    if is_list_input:
+        result = map_indices_to_words(word_indices, segments)
+        return result
+    else:
+        return segments
 
 
 def paragraph_tokenize(

From f1b9c117b531da7726172ecc6d53d4cf516407f7 Mon Sep 17 00:00:00 2001
From: ayaan-qadri
Date: Sun, 13 Oct 2024 16:15:30 +0530
Subject: [PATCH 04/11] Added return to map_indices_to_words

---
 pythainlp/tokenize/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 620dbac20..0d5b08234 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -360,7 +360,7 @@ def map_indices_to_words(index_list, sentences):
         n_sum += len(words)
         for _ in range(n):
             del c[0]
-
+    return result
 
 def sent_tokenize(
     text: Union[str, List[str]],

From 77756c43f9b81b01848efe4ff2f3fc2d6e8e92f3 Mon Sep 17 00:00:00 2001
From: ayaan-qadri
Date: Sun, 13 Oct 2024 23:08:30 +0530
Subject: [PATCH 05/11] Resolved bug: whitespace+newline was tokenized by whitespace

---
 pythainlp/tokenize/core.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 0d5b08234..422ead4a4 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -446,7 +446,6 @@ def sent_tokenize(
         except ValueError:
             return []
 
-        word_indices = indices_words(text)
     else:
         original_text = text
 
@@ -458,8 +457,15 @@ def sent_tokenize(
         segments = re.split(r" +", original_text, flags=re.U)
+        if is_list_input:
+            non_whitespace_text = [word for word in text if word.strip()]
+            word_indices = indices_words(non_whitespace_text)
     elif engine == "whitespace+newline":
         segments = original_text.split()
+        if is_list_input:
+            non_whitespace_newline_text = [
+                word for word in text if word.strip() and word != '\n']
+            word_indices = indices_words(non_whitespace_newline_text)
     elif engine == "tltk":
         from pythainlp.tokenize.tltk import sent_tokenize as segment
 
         segments = segment(original_text)

From 011fb6685bed2b82d4f2370c6ff077605170cd68 Mon Sep 17 00:00:00 2001
From: ayaan-qadri
Date: Sun, 27 Oct 2024 16:44:43 +0530
Subject: [PATCH 06/11] fixed bug 1 for crfcut engine

---
 pythainlp/tokenize/core.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 422ead4a4..9186e89a5 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -326,13 +326,9 @@ def word_tokenize(
 def indices_words(words):
     indices = []
     start_index = 0
-
     for word in words:
-        if len(word) > 1:
-            _temp = len(word)-1
-        else:
-            _temp = 1
-        indices.append((start_index, start_index + _temp))
+        end_index = start_index + len(word) - 1
+        indices.append((start_index, end_index))
         start_index += len(word)
 
     return indices
@@ -342,12 +338,10 @@ def map_indices_to_words(index_list, sentences):
     result = []
     c = copy.copy(index_list)
     n_sum = 0
-
     for sentence in sentences:
         words = sentence
         sentence_result = []
         n = 0
-
         for start, end in c:
             if start > n_sum+len(words)-1:
                 break
@@ -455,6 +449,11 @@ def sent_tokenize(
         from pythainlp.tokenize.crfcut import segment
 
         segments = 
segment(original_text) + + if is_list_input: + word_indices = indices_words(text) + result = map_indices_to_words(word_indices, [original_text]) + return result elif engine == "whitespace": segments = re.split(r" +", original_text, flags=re.U) if is_list_input: @@ -468,13 +467,11 @@ def sent_tokenize( word_indices = indices_words(non_whitespace_newline_text) elif engine == "tltk": from pythainlp.tokenize.tltk import sent_tokenize as segment - segments = segment(original_text) elif engine == "thaisum": from pythainlp.tokenize.thaisumcut import ( ThaiSentenceSegmentor as segmentor, ) - segment = segmentor() segments = segment.split_into_sentences(original_text) elif engine.startswith("wtp"): @@ -483,7 +480,6 @@ def sent_tokenize( else: _size = engine.split("-")[-1] from pythainlp.tokenize.wtsplit import tokenize as segment - segments = segment(original_text, size=_size, tokenize="sentence") else: raise ValueError( @@ -494,13 +490,12 @@ def sent_tokenize( if not keep_whitespace: segments = strip_whitespace(segments) - if is_list_input: - if engine not in ["whitespace", "whitespace+newline"]: - word_indices = indices_words(text) + if is_list_input and engine not in ["crfcut", "whitespace"]: + word_indices = indices_words(text) result = map_indices_to_words(word_indices, segments) return result else: - return segments + return [segments] def paragraph_tokenize( From 0e9148d95cc64c4dafc2541567b3871da8ef86d6 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 28 Oct 2024 22:02:55 +0700 Subject: [PATCH 07/11] Fixed list of string support in whitespace engine --- pythainlp/tokenize/core.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 9186e89a5..8e92917d5 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -457,14 +457,35 @@ def sent_tokenize( elif engine == "whitespace": segments = re.split(r" +", original_text, flags=re.U) if is_list_input: - non_whitespace_text = [word for word in text if word.strip()] - word_indices = indices_words(non_whitespace_text) + result = [] + _temp = [] + for i,w in enumerate(text): + if re.findall(r"\s",w) != [] and re.findall(r"\w",w) == []: + if _temp == []: + continue + result.append(_temp) + _temp = [] + else: + _temp.append(w) + if i+1 == len(text): + result.append(_temp) + return result elif engine == "whitespace+newline": segments = original_text.split() if is_list_input: - non_whitespace_newline_text = [ - word for word in text if word.strip() and word != '\n'] - word_indices = indices_words(non_whitespace_newline_text) + result = [] + _temp = [] + for i,w in enumerate(text): + if (re.findall(r"\s",w) != [] or re.findall(r"\n",w) != []) and re.findall(r"\w",w) == []: + if _temp==[]: + continue + result.append(_temp) + _temp=[] + else: + _temp.append(w) + if i+1==len(text): + result.append(_temp) + return result elif engine == "tltk": from pythainlp.tokenize.tltk import sent_tokenize as segment segments = segment(original_text) @@ -490,7 +511,7 @@ def sent_tokenize( if not keep_whitespace: segments = strip_whitespace(segments) - if is_list_input and engine not in ["crfcut", "whitespace"]: + if is_list_input and engine not in ["crfcut"]: word_indices = indices_words(text) result = map_indices_to_words(word_indices, segments) return result From 66c0647a4e76cdafa4324120b4ba9447476e1ba7 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 28 Oct 2024 22:08:52 +0700 Subject: [PATCH 08/11] Fixed pep8 --- 
pythainlp/tokenize/core.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 8e92917d5..bb5ca08b0 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -343,10 +343,10 @@ def map_indices_to_words(index_list, sentences): sentence_result = [] n = 0 for start, end in c: - if start > n_sum+len(words)-1: + if start > n_sum + len(words) - 1: break else: - word = sentence[start-n_sum:end+1-n_sum] + word = sentence[start - n_sum:end + 1 - n_sum] sentence_result.append(word) n += 1 @@ -459,8 +459,8 @@ def sent_tokenize( if is_list_input: result = [] _temp = [] - for i,w in enumerate(text): - if re.findall(r"\s",w) != [] and re.findall(r"\w",w) == []: + for i, w in enumerate(text): + if re.findall(r"\s", w) != [] and re.findall(r"\w", w) == []: if _temp == []: continue result.append(_temp) @@ -476,14 +476,17 @@ def sent_tokenize( result = [] _temp = [] for i,w in enumerate(text): - if (re.findall(r"\s",w) != [] or re.findall(r"\n",w) != []) and re.findall(r"\w",w) == []: - if _temp==[]: + if (( + re.findall(r"\s",w) != [] or + re.findall(r"\n",w) != []) and + re.findall(r"\w",w) == []): + if _temp == []: continue result.append(_temp) - _temp=[] + _temp = [] else: _temp.append(w) - if i+1==len(text): + if i+1 == len(text): result.append(_temp) return result elif engine == "tltk": From 54842ca34abb37614fc24ef483597d7bc7a7fd71 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 28 Oct 2024 22:13:20 +0700 Subject: [PATCH 09/11] Fixed pep8 --- pythainlp/tokenize/core.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index bb5ca08b0..4b53ec340 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -467,7 +467,7 @@ def sent_tokenize( _temp = [] else: _temp.append(w) - if i+1 == len(text): + if i + 1 == len(text): result.append(_temp) return result elif engine == "whitespace+newline": @@ -475,18 +475,19 @@ def sent_tokenize( if is_list_input: result = [] _temp = [] - for i,w in enumerate(text): + for i, w in enumerate(text): if (( - re.findall(r"\s",w) != [] or - re.findall(r"\n",w) != []) and - re.findall(r"\w",w) == []): + re.findall(r"\s", w) != [] + or re.findall(r"\n", w) != []) + and re.findall(r"\w", w) == [] + ): if _temp == []: continue result.append(_temp) _temp = [] else: _temp.append(w) - if i+1 == len(text): + if i + 1 == len(text): result.append(_temp) return result elif engine == "tltk": From 76f33107c4870a7a3262fb552a828ae1d006154e Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 28 Oct 2024 22:17:05 +0700 Subject: [PATCH 10/11] Fixed pep8 --- pythainlp/tokenize/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 4b53ec340..931980532 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -476,9 +476,9 @@ def sent_tokenize( result = [] _temp = [] for i, w in enumerate(text): - if (( - re.findall(r"\s", w) != [] - or re.findall(r"\n", w) != []) + if ( + (re.findall(r"\s", w) != [] or + re.findall(r"\n", w) != []) and re.findall(r"\w", w) == [] ): if _temp == []: From 1a2b457063a604175c9ece8146f1c425920e6d4e Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Mon, 28 Oct 2024 22:29:31 +0700 Subject: [PATCH 11/11] Add list of words in sent_tokenize testset --- pythainlp/tokenize/core.py | 6 +++--- tests/test_tokenize.py | 17 +++++++++++++++++ 2 files changed, 
20 insertions(+), 3 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 931980532..57669337d 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -460,7 +460,7 @@ def sent_tokenize( result = [] _temp = [] for i, w in enumerate(text): - if re.findall(r"\s", w) != [] and re.findall(r"\w", w) == []: + if re.findall(r" ", w) != [] and re.findall(r"\w", w) == []: if _temp == []: continue result.append(_temp) @@ -478,8 +478,8 @@ def sent_tokenize( for i, w in enumerate(text): if ( (re.findall(r"\s", w) != [] or - re.findall(r"\n", w) != []) - and re.findall(r"\w", w) == [] + re.findall(r"\n", w) != []) and + re.findall(r"\w", w) == [] ): if _temp == []: continue diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index ad5a1f5e9..0d6026168 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -333,6 +333,23 @@ def test_sent_tokenize(self): # engine="wtp-large", # ), # ) + sent_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"] + self.assertEqual( + sent_tokenize(sent_4, engine="crfcut"), + [["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]], + ) + self.assertEqual( + sent_tokenize(sent_4, engine="whitespace"), + [["ผม", "กิน", "ข้าว"], ["\n", "เธอ", "เล่น", "เกม"]], + ) + self.assertEqual( + sent_tokenize(sent_4, engine="whitespace+newline"), + [["ผม", "กิน", "ข้าว"], ["เธอ", "เล่น", "เกม"]], + ) + self.assertEqual( + sent_tokenize(sent_4, engine="thaisum"), + [["ผม", "กิน", "ข้าว", " ", "เธอ", "เล่น", "เกม"]], + ) self.assertFalse( " " in sent_tokenize(
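
For reference, the behaviour the series converges on can be read off the test cases added in PATCH 11/11: for most engines, each input word is assigned character offsets by indices_words(), the words are joined into one string and segmented as usual, and map_indices_to_words() slices the segments back into the original word pieces; the whitespace engines instead group the input tokens around the whitespace tokens directly. Either way, the caller gets back a list of sentences, each sentence being a list of the input words. A minimal usage sketch, assuming the patched PyThaiNLP is installed; the expected outputs are copied from the new tests::

    from pythainlp.tokenize import sent_tokenize

    words = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]

    # crfcut keeps this input together as a single sentence
    sent_tokenize(words, engine="crfcut")
    # output: [['ผม', 'กิน', 'ข้าว', ' ', '\n', 'เธอ', 'เล่น', 'เกม']]

    # whitespace splits at the space token only
    sent_tokenize(words, engine="whitespace")
    # output: [['ผม', 'กิน', 'ข้าว'], ['\n', 'เธอ', 'เล่น', 'เกม']]

    # whitespace+newline splits at both the space and the newline tokens
    sent_tokenize(words, engine="whitespace+newline")
    # output: [['ผม', 'กิน', 'ข้าว'], ['เธอ', 'เล่น', 'เกม']]

    # thaisum keeps the space but drops the newline
    sent_tokenize(words, engine="thaisum")
    # output: [['ผม', 'กิน', 'ข้าว', ' ', 'เธอ', 'เล่น', 'เกม']]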