Skip to content

Commit c08d6eb

Browse files
authored
Merge pull request #916 from PyThaiNLP/add-tud
Add TUD postag
2 parents 5971300 + ed3e61e commit c08d6eb

File tree

7 files changed

+47
-3
lines changed

7 files changed

+47
-3
lines changed

pythainlp/corpus/corpus_license.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,13 @@ https://creativecommons.org/licenses/by/4.0/
4646

4747
| Filename | Description |
4848
| ------------------------- | ----------------------------------------------------------------------------------------------------- |
49-
| pos_orchid_perceptron.pkl | Part-of-speech tagging model, trained from ORCHID data, using perceptron |
49+
| pos_orchid_perceptron.json | Part-of-speech tagging model, trained from ORCHID data, using perceptron |
5050
| pos_orchid_unigram.json | Part-of-speech tagging model, trained from ORCHID data, using unigram |
51-
| pos_ud_perceptron.pkl | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
52-
| pos_ud_unigram.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
51+
| pos_ud_perceptron-v0.2.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
52+
| pos_ud_unigram-v0.2.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
5353
| sentenceseg_crfcut.model | Sentence segmentation model, trained from TED subtitles, using CRF |
54+
| pos_tud_perceptron.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using perceptron |
55+
| pos_tud_unigram.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using unigram |
5456

5557

5658
## Thai Dictionary for ICU BreakIterator

pythainlp/corpus/pos_tud_perceptron.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

pythainlp/corpus/pos_tud_unigram.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

pythainlp/tag/perceptron.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,13 @@
1818

1919
_BLACKBOARD_NAME = "blackboard_pt_tagger"
2020

21+
_TUD_FILENAME = "pos_tud_perceptron.json"
22+
_TUD_PATH = os.path.join(corpus_path(), _TUD_FILENAME)
23+
2124
_ORCHID_TAGGER = None
2225
_PUD_TAGGER = None
2326
_BLACKBOARD_TAGGER = None
27+
_TUD_TAGGER = None
2428

2529

2630
def _orchid_tagger():
@@ -44,6 +48,13 @@ def _blackboard_tagger():
4448
return _LST20_TAGGER
4549

4650

51+
def _tud_tagger():
52+
global _TUD_TAGGER
53+
if not _TUD_TAGGER:
54+
_TUD_TAGGER = PerceptronTagger(path=_TUD_PATH)
55+
return _TUD_TAGGER
56+
57+
4758
def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
4859
"""
4960
:param list words: a list of tokenized words
@@ -67,6 +78,9 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
6778
words = blackboard.pre_process(words)
6879
word_tags = _blackboard_tagger().tag(words)
6980
word_tags = blackboard.post_process(word_tags, to_ud)
81+
elif corpus in ("tud"):
82+
tagger = _tud_tagger()
83+
word_tags = tagger.tag(words)
7084
else: # by default, use "pud" for corpus
7185
tagger = _pud_tagger()
7286
word_tags = tagger.tag(words)

pythainlp/tag/pos_tag.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ def pos_tag(
2929
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
3030
treebanks, natively use Universal POS tags
3131
* *tnc* - Thai National Corpus (support tltk engine only)
32+
* *tud* - `Thai Universal Dependency Treebank (TUD)\
33+
<https://github.com/nlp-chula/TUD>`_
3234
:return: a list of tuples (word, POS tag)
3335
:rtype: list[tuple[str, str]]
3436
@@ -96,6 +98,7 @@ def pos_tag(
9698
"orchid",
9799
"orchid_ud",
98100
"pud",
101+
"tud",
99102
]
100103

101104
if engine == "perceptron" and corpus in _support_corpus:

pythainlp/tag/unigram.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,13 @@
1919

2020
_BLACKBOARD_NAME = "blackboard_unigram_tagger"
2121

22+
_TUD_FILENAME = "pos_tud_unigram.json"
23+
_TUD_PATH = os.path.join(corpus_path(), _TUD_FILENAME)
24+
2225
_ORCHID_TAGGER = None
2326
_PUD_TAGGER = None
2427
_BLACKBOARD_TAGGER = None
28+
_TUD_TAGGER = None
2529

2630

2731
def _orchid_tagger():
@@ -49,6 +53,14 @@ def _blackboard_tagger():
4953
return _BLACKBOARD_TAGGER
5054

5155

56+
def _tud_tagger():
57+
global _TUD_TAGGER
58+
if not _TUD_TAGGER:
59+
with open(_TUD_PATH, encoding="utf-8-sig") as fh:
60+
_TUD_TAGGER = json.load(fh)
61+
return _TUD_TAGGER
62+
63+
5264
def _find_tag(
5365
words: List[str], dictdata: dict, default_tag: str = ""
5466
) -> List[Tuple[str, str]]:
@@ -82,6 +94,8 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
8294
words = blackboard.pre_process(words)
8395
word_tags = _find_tag(words, _blackboard_tagger())
8496
word_tags = blackboard.post_process(word_tags, to_ud)
97+
elif corpus in ("tud"):
98+
word_tags = _find_tag(words, _tud_tagger())
8599
else: # by default, use "pud" for corpus
86100
word_tags = _find_tag(words, _pud_tagger())
87101

tests/test_tag.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ def test_pos_tag(self):
5151
self.assertEqual(unigram.tag([], corpus="orchid"), [])
5252
self.assertEqual(unigram.tag(None, corpus="blackboard"), [])
5353
self.assertEqual(unigram.tag([], corpus="blackboard"), [])
54+
self.assertEqual(unigram.tag(None, corpus="tud"), [])
55+
self.assertEqual(unigram.tag([], corpus="tud"), [])
5456
self.assertIsNotNone(
5557
pos_tag(tokens, engine="unigram", corpus="orchid")
5658
)
@@ -68,6 +70,8 @@ def test_pos_tag(self):
6870
self.assertIsNotNone(
6971
pos_tag([""], engine="unigram", corpus="blackboard_ud")
7072
)
73+
self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tud"))
74+
self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tud"))
7175
self.assertEqual(
7276
pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
7377
[("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
@@ -88,6 +92,8 @@ def test_pos_tag(self):
8892
self.assertEqual(perceptron.tag([], corpus="pud"), [])
8993
self.assertEqual(perceptron.tag(None, corpus="blackboard"), [])
9094
self.assertEqual(perceptron.tag([], corpus="blackboard"), [])
95+
self.assertEqual(perceptron.tag(None, corpus="tud"), [])
96+
self.assertEqual(perceptron.tag([], corpus="tud"), [])
9197
self.assertIsNotNone(
9298
pos_tag(tokens, engine="perceptron", corpus="orchid")
9399
)
@@ -103,6 +109,9 @@ def test_pos_tag(self):
103109
self.assertIsNotNone(
104110
pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
105111
)
112+
self.assertIsNotNone(
113+
pos_tag(tokens, engine="perceptron", corpus="tud")
114+
)
106115
self.assertIsNotNone(pos_tag(tokens, engine="tltk"))
107116

108117
self.assertEqual(pos_tag_sents(None), [])

0 commit comments

Comments
 (0)