Skip to content

Commit adc6967

Browse files
authored
Merge pull request #910 from PyThaiNLP/add-th_tdtb
Add postag of Thai Discourse Treebank
2 parents c08d6eb + 753cd6c commit adc6967

File tree

8 files changed

+49
-0
lines changed

8 files changed

+49
-0
lines changed

CONTRIBUTING.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,3 +157,4 @@ Thanks to all [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contr
157157
- **[Thai Character Cluster]** -- T. Teeramunkong, V. Sornlertlamvanich, T. Tanhermhong and W. Chinnan, “Character cluster based Thai information retrieval,” in IRAL '00 Proceedings of the fifth international workshop on on Information retrieval with Asian languages, 2000.
158158
- **[Enhanced Thai Character Cluster]** -- Jeeragone Inrut, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth. “Thai word segmentation using combination of forward and backward longest matching techniques.” In International Symposium on Communications and Information Technology (ISCIT), pp. 37-40. 2001.
159159
- เพ็ญศิริ ลี้ตระกูล. การเลือกประโยคสำคัญในการสรุปความภาษาไทย โดยใช้แบบจำลองแบบลำดับชั้น (Selection of Important Sentences in Thai Text Summarization Using a Hierarchical Model). Retrieved from http://digi.library.tu.ac.th/thesis/st/0192/
160+
- **[Thai Discourse Treebank]** -- Ponrawee Prasertsom, Apiwat Jaroonpol, Attapol T. Rutherford; The Thai Discourse Treebank: Annotating and Classifying Thai Discourse Connectives. Transactions of the Association for Computational Linguistics 2024; 12: 613–629. doi: https://doi.org/10.1162/tacl_a_00650

pythainlp/corpus/corpus_license.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ https://creativecommons.org/licenses/by/4.0/
5151
| pos_ud_perceptron-v0.2.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
5252
| pos_ud_unigram-v0.2.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
5353
| sentenceseg_crfcut.model | Sentence segmentation model, trained from TED subtitles, using CRF |
54+
| tdtb-pt_tagger.json | Part-of-speech tagging model, trained from The Thai Discourse Treebank, using perceptron |
55+
| tdtb-unigram_tagger.json | Part-of-speech tagging model, trained from The Thai Discourse Treebank, using unigram |
5456
| pos_tud_perceptron.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using perceptron |
5557
| pos_tud_unigram.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using unigram |
5658

pythainlp/corpus/tdtb-pt_tagger.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

pythainlp/corpus/tdtb-unigram_tagger.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

pythainlp/tag/perceptron.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
_PUD_FILENAME = "pos_ud_perceptron-v0.2.json"
1717
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)
1818

19+
_TDTB_FILENAME = "tdtb-pt_tagger.json"
20+
_TDTB_PATH = os.path.join(corpus_path(), _TDTB_FILENAME)
21+
1922
_BLACKBOARD_NAME = "blackboard_pt_tagger"
2023

2124
_TUD_FILENAME = "pos_tud_perceptron.json"
@@ -24,6 +27,7 @@
2427
_ORCHID_TAGGER = None
2528
_PUD_TAGGER = None
2629
_BLACKBOARD_TAGGER = None
30+
_TDTB_TAGGER = None
2731
_TUD_TAGGER = None
2832

2933

@@ -48,6 +52,13 @@ def _blackboard_tagger():
4852
return _LST20_TAGGER
4953

5054

55+
def _tdtb():
    """Return the module-wide Thai Discourse Treebank perceptron tagger.

    The tagger is built from ``_TDTB_PATH`` the first time it is needed and
    kept in the ``_TDTB_TAGGER`` global so later calls reuse the same object.
    """
    global _TDTB_TAGGER
    # Reuse the cached tagger when one has already been built.
    if _TDTB_TAGGER:
        return _TDTB_TAGGER
    _TDTB_TAGGER = PerceptronTagger(path=_TDTB_PATH)
    return _TDTB_TAGGER
60+
61+
5162
def _tud_tagger():
5263
global _TUD_TAGGER
5364
if not _TUD_TAGGER:
@@ -78,6 +89,8 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
7889
words = blackboard.pre_process(words)
7990
word_tags = _blackboard_tagger().tag(words)
8091
word_tags = blackboard.post_process(word_tags, to_ud)
92+
elif corpus in ("tdtb"):
93+
word_tags = _tdtb().tag(words)
8194
elif corpus in ("tud"):
8295
tagger = _tud_tagger()
8396
word_tags = tagger.tag(words)

pythainlp/tag/pos_tag.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@ def pos_tag(
2828
* *pud* - `Parallel Universal Dependencies (PUD)\
2929
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
3030
treebanks, natively use Universal POS tags
31+
* *tdtb* - `Thai Discourse Treebank \
32+
<https://github.com/nlp-chula/thai-discourse-treebank/tree/main>`_ \
33+
, natively use Universal POS tags
3134
* *tnc* - Thai National Corpus (support tltk engine only)
35+
* *tdtb* - `Thai Discourse Treebank <https://github.com/nlp-chula/thai-discourse-treebank>`_
3236
* *tud* - `Thai Universal Dependency Treebank (TUD)\
3337
<https://github.com/nlp-chula/TUD>`_ \
3438
:return: a list of tuples (word, POS tag)
@@ -98,6 +102,7 @@ def pos_tag(
98102
"orchid",
99103
"orchid_ud",
100104
"pud",
105+
"tdtb",
101106
"tud",
102107
]
103108

pythainlp/tag/unigram.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
_PUD_FILENAME = "pos_ud_unigram-v0.2.json"
1818
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)
1919

20+
_TDTB_FILENAME = "tdtb-unigram_tagger.json"
21+
_TDTB_PATH = os.path.join(corpus_path(), _TDTB_FILENAME)
22+
2023
_BLACKBOARD_NAME = "blackboard_unigram_tagger"
2124

2225
_TUD_FILENAME = "pos_tud_unigram.json"
@@ -25,6 +28,7 @@
2528
_ORCHID_TAGGER = None
2629
_PUD_TAGGER = None
2730
_BLACKBOARD_TAGGER = None
31+
_TDTB_TAGGER = None
2832
_TUD_TAGGER = None
2933

3034

@@ -53,6 +57,14 @@ def _blackboard_tagger():
5357
return _BLACKBOARD_TAGGER
5458

5559

60+
def _thai_tdtb():
    """Return the Thai Discourse Treebank unigram tagging data.

    The JSON file at ``_TDTB_PATH`` is parsed the first time it is needed and
    the result is kept in the ``_TDTB_TAGGER`` global for later calls.
    """
    global _TDTB_TAGGER
    # Reuse the cached data when the file has already been parsed.
    if _TDTB_TAGGER:
        return _TDTB_TAGGER
    # "utf-8-sig" skips a leading byte-order mark if the file has one.
    with open(_TDTB_PATH, encoding="utf-8-sig") as model_file:
        _TDTB_TAGGER = json.load(model_file)
    return _TDTB_TAGGER
66+
67+
5668
def _tud_tagger():
5769
global _TUD_TAGGER
5870
if not _TUD_TAGGER:
@@ -94,6 +106,8 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
94106
words = blackboard.pre_process(words)
95107
word_tags = _find_tag(words, _blackboard_tagger())
96108
word_tags = blackboard.post_process(word_tags, to_ud)
109+
elif corpus in ("tdtb"):
110+
word_tags = _find_tag(words, _thai_tdtb())
97111
elif corpus in ("tud"):
98112
word_tags = _find_tag(words, _tud_tagger())
99113
else: # by default, use "pud" for corpus

tests/test_tag.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ def test_pos_tag(self):
7070
self.assertIsNotNone(
7171
pos_tag([""], engine="unigram", corpus="blackboard_ud")
7272
)
73+
self.assertIsNotNone(
74+
pos_tag(tokens, engine="unigram", corpus="tdtb")
75+
)
76+
self.assertIsNotNone(
77+
pos_tag([""], engine="unigram", corpus="tdtb")
78+
)
7379
self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="tud"))
7480
self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="tud"))
7581
self.assertEqual(
@@ -109,6 +115,12 @@ def test_pos_tag(self):
109115
self.assertIsNotNone(
110116
pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
111117
)
118+
self.assertIsNotNone(
119+
pos_tag(tokens, engine="perceptron", corpus="tdtb")
120+
)
121+
self.assertIsNotNone(
122+
pos_tag(tokens, engine="perceptron", corpus="tdtb")
123+
)
112124
self.assertIsNotNone(
113125
pos_tag(tokens, engine="perceptron", corpus="tud")
114126
)

0 commit comments

Comments
 (0)