Merge pull request #916 from PyThaiNLP/add-tud

wannaphong · web-flow · commit c08d6eb81457 · 2024-06-11T02:12:29.000+07:00
Add TUD postag
diff --git a/pythainlp/corpus/corpus_license.md b/pythainlp/corpus/corpus_license.md
@@ -46,11 +46,13 @@ https://creativecommons.org/licenses/by/4.0/
 
 | Filename                  | Description                                                                                           |
 | ------------------------- | ----------------------------------------------------------------------------------------------------- |
-| pos_orchid_perceptron.pkl | Part-of-speech tagging model, trained from ORCHID data, using perceptron                              |
+| pos_orchid_perceptron.json | Part-of-speech tagging model, trained from ORCHID data, using perceptron                              |
 | pos_orchid_unigram.json   | Part-of-speech tagging model, trained from ORCHID data, using unigram                                 |
-| pos_ud_perceptron.pkl     | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
-| pos_ud_unigram.json       | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram    |
+| pos_ud_perceptron-v0.2.json     | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
+| pos_ud_unigram-v0.2.json       | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram    |
 | sentenceseg_crfcut.model  | Sentence segmentation model, trained from TED subtitles, using CRF                                    |
+| pos_tud_perceptron.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using perceptron                              |
+| pos_tud_unigram.json   | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using unigram                                 |
 
 
 ## Thai Dictionary for ICU BreakIterator
diff --git a/pythainlp/corpus/pos_tud_perceptron.json b/pythainlp/corpus/pos_tud_perceptron.json
diff --git a/pythainlp/corpus/pos_tud_unigram.json b/pythainlp/corpus/pos_tud_unigram.json
diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py
@@ -18,9 +18,13 @@
 
 _BLACKBOARD_NAME = "blackboard_pt_tagger"
 
+_TUD_FILENAME = "pos_tud_perceptron.json"
+_TUD_PATH = os.path.join(corpus_path(), _TUD_FILENAME)
+
 _ORCHID_TAGGER = None
 _PUD_TAGGER = None
 _BLACKBOARD_TAGGER = None
+_TUD_TAGGER = None
 
 
 def _orchid_tagger():
@@ -44,6 +48,13 @@ def _blackboard_tagger():
     return _LST20_TAGGER
 
 
+def _tud_tagger():  # lazily build and cache the TUD perceptron tagger
+    global _TUD_TAGGER
+    if not _TUD_TAGGER:  # nothing cached yet: load the model at _TUD_PATH
+        _TUD_TAGGER = PerceptronTagger(path=_TUD_PATH)
+    return _TUD_TAGGER
+
+
 def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
     """
     :param list words: a list of tokenized words
@@ -67,6 +78,9 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
         words = blackboard.pre_process(words)
         word_tags = _blackboard_tagger().tag(words)
         word_tags = blackboard.post_process(word_tags, to_ud)
+    elif corpus == "tud":  # exact match; `in ("tud")` was a substring test
+        tagger = _tud_tagger()
+        word_tags = tagger.tag(words)
     else:  # by default, use "pud" for corpus
         tagger = _pud_tagger()
         word_tags = tagger.tag(words)
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
@@ -29,6 +29,8 @@ def pos_tag(
             <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
             treebanks, natively use Universal POS tags
         * *tnc* - Thai National Corpus (support tltk engine only)
+        * *tud* - `Thai Universal Dependency Treebank (TUD)\
+            <https://github.com/nlp-chula/TUD>`_
     :return: a list of tuples (word, POS tag)
     :rtype: list[tuple[str, str]]
 
@@ -96,6 +98,7 @@ def pos_tag(
         "orchid",
         "orchid_ud",
         "pud",
+        "tud",
     ]
 
     if engine == "perceptron" and corpus in _support_corpus:
diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py
@@ -19,9 +19,13 @@
 
 _BLACKBOARD_NAME = "blackboard_unigram_tagger"
 
+_TUD_FILENAME = "pos_tud_unigram.json"
+_TUD_PATH = os.path.join(corpus_path(), _TUD_FILENAME)
+
 _ORCHID_TAGGER = None
 _PUD_TAGGER = None
 _BLACKBOARD_TAGGER = None
+_TUD_TAGGER = None
 
 
 def _orchid_tagger():
@@ -49,6 +53,14 @@ def _blackboard_tagger():
     return _BLACKBOARD_TAGGER
 
 
+def _tud_tagger():  # lazily load and cache the parsed TUD unigram JSON
+    global _TUD_TAGGER
+    if not _TUD_TAGGER:  # falsy cache (None or empty data) triggers a read
+        with open(_TUD_PATH, encoding="utf-8-sig") as fh:  # tolerates a BOM
+            _TUD_TAGGER = json.load(fh)
+    return _TUD_TAGGER
+
+
 def _find_tag(
     words: List[str], dictdata: dict, default_tag: str = ""
 ) -> List[Tuple[str, str]]:
@@ -82,6 +94,8 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
         words = blackboard.pre_process(words)
         word_tags = _find_tag(words, _blackboard_tagger())
         word_tags = blackboard.post_process(word_tags, to_ud)
+    elif corpus == "tud":  # exact match; `in ("tud")` was a substring test
+        word_tags = _find_tag(words, _tud_tagger())
     else:  # by default, use "pud" for corpus
         word_tags = _find_tag(words, _pud_tagger())
 
diff --git a/tests/test_tag.py b/tests/test_tag.py
@@ -51,6 +51,8 @@ def test_pos_tag(self):
         self.assertEqual(unigram.tag([], corpus="orchid"), [])
         self.assertEqual(unigram.tag(None, corpus="blackboard"), [])
         self.assertEqual(unigram.tag([], corpus="blackboard"), [])
+        self.assertEqual(unigram.tag(None, corpus="tud"), [])
+        self.assertEqual(unigram.tag([], corpus="tud"), [])
         self.assertIsNotNone(
             pos_tag(tokens, engine="unigram", corpus="orchid")
         )
@@ -68,6 +70,8 @@ def test_pos_tag(self):
         self.assertIsNotNone(
             pos_tag([""], engine="unigram", corpus="blackboard_ud")
         )
+        self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tud"))
+        self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tud"))
         self.assertEqual(
             pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
             [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
@@ -88,6 +92,8 @@ def test_pos_tag(self):
         self.assertEqual(perceptron.tag([], corpus="pud"), [])
         self.assertEqual(perceptron.tag(None, corpus="blackboard"), [])
         self.assertEqual(perceptron.tag([], corpus="blackboard"), [])
+        self.assertEqual(perceptron.tag(None, corpus="tud"), [])
+        self.assertEqual(perceptron.tag([], corpus="tud"), [])
         self.assertIsNotNone(
             pos_tag(tokens, engine="perceptron", corpus="orchid")
         )
@@ -103,6 +109,9 @@ def test_pos_tag(self):
         self.assertIsNotNone(
             pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
         )
+        self.assertIsNotNone(
+            pos_tag(tokens, engine="perceptron", corpus="tud")
+        )
         self.assertIsNotNone(pos_tag(tokens, engine="tltk"))
 
         self.assertEqual(pos_tag_sents(None), [])