Skip to content

Commit 753cd6c

Browse files
authored
Merge branch 'dev' into add-th_tdtb
2 parents 9819bf0 + c08d6eb commit 753cd6c

17 files changed

+70
-17
lines changed

CITATION.cff

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cff-version: 1.1.0
1+
cff-version: "1.2.0"
22
message: "If you use this software, please cite it as below."
33
authors:
44
- family-names: "Phatthiyaphaibun"
@@ -20,5 +20,6 @@ authors:
2020
given-names: "Pattarawat"
2121
orcid: "https://orcid.org/0000-0000-0000-0000"
2222
title: "PyThaiNLP: Thai Natural Language Processing in Python"
23-
version: v5.0.2
24-
date-released: 2024-04-03
23+
version: v5.0.4
24+
license: Apache-2.0
25+
date-released: 2024-06-02

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ PyThaiNLP เป็นไลบรารีภาษาไพทอนสำหร
2323
2424
| Version | Description | Status |
2525
|:------:|:--:|:------:|
26-
| [5.0.2](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
26+
| [5.0.4](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
2727
| [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 5.1 | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/900) |
2828

2929
## Getting Started

README_TH.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ PyThaiNLP เป็นไลบรารีภาษาไพทอนสำหร
2020
2121
| รุ่น | คำอธิบาย | สถานะ |
2222
|:------:|:--:|:------:|
23-
| [5.0.2](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
23+
| [5.0.4](https://github.com/PyThaiNLP/pythainlp/releases) | Stable | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/788) |
2424
| [`dev`](https://github.com/PyThaiNLP/pythainlp/tree/dev) | Release Candidate for 5.1 | [Change Log](https://github.com/PyThaiNLP/pythainlp/issues/900) |
2525

2626
ติดตามพวกเราบน [PyThaiNLP Facebook page](https://www.facebook.com/pythainlp/) เพื่อรับข่าวสารเพิ่มเติม

pythainlp/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# -*- coding: utf-8 -*-
22
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
33
# SPDX-License-Identifier: Apache-2.0
4-
__version__ = "5.0.2"
4+
__version__ = "5.0.4"
55

66
thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ" # 44 chars
77

pythainlp/corpus/corpus_license.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,15 @@ https://creativecommons.org/licenses/by/4.0/
4646

4747
| Filename | Description |
4848
| ------------------------- | ----------------------------------------------------------------------------------------------------- |
49-
| pos_orchid_perceptron.pkl | Part-of-speech tagging model, trained from ORCHID data, using perceptron |
49+
| pos_orchid_perceptron.json | Part-of-speech tagging model, trained from ORCHID data, using perceptron |
5050
| pos_orchid_unigram.json | Part-of-speech tagging model, trained from ORCHID data, using unigram |
51-
| pos_ud_perceptron.pkl | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
52-
| pos_ud_unigram.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
51+
| pos_ud_perceptron-v0.2.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
52+
| pos_ud_unigram-v0.2.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
5353
| sentenceseg_crfcut.model | Sentence segmentation model, trained from TED subtitles, using CRF |
5454
| tdtb-pt_tagger.json | Part-of-speech tagging model, trained from The Thai Discourse Treebank, using perceptron |
5555
| tdtb-unigram_tagger.json | Part-of-speech tagging model, trained from The Thai Discourse Treebank, using unigram |
56+
| pos_tud_perceptron.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using perceptron |
57+
| pos_tud_unigram.json | Part-of-speech tagging model, trained from Thai Universal Dependency Treebank data, using unigram |
5658

5759

5860
## Thai Dictionary for ICU BreakIterator

pythainlp/corpus/pos_tud_perceptron.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

pythainlp/corpus/pos_tud_unigram.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

pythainlp/tag/perceptron.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,14 @@
2121

2222
_BLACKBOARD_NAME = "blackboard_pt_tagger"
2323

24+
_TUD_FILENAME = "pos_tud_perceptron.json"
25+
_TUD_PATH = os.path.join(corpus_path(), _TUD_FILENAME)
26+
2427
_ORCHID_TAGGER = None
2528
_PUD_TAGGER = None
2629
_BLACKBOARD_TAGGER = None
2730
_TDTB_TAGGER = None
31+
_TUD_TAGGER = None
2832

2933

3034
def _orchid_tagger():
@@ -55,6 +59,13 @@ def _tdtb():
5559
return _TDTB_TAGGER
5660

5761

62+
def _tud_tagger():
63+
global _TUD_TAGGER
64+
if not _TUD_TAGGER:
65+
_TUD_TAGGER = PerceptronTagger(path=_TUD_PATH)
66+
return _TUD_TAGGER
67+
68+
5869
def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
5970
"""
6071
:param list words: a list of tokenized words
@@ -80,6 +91,9 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
8091
word_tags = blackboard.post_process(word_tags, to_ud)
8192
elif corpus in ("tdtb"):
8293
word_tags = _tdtb().tag(words)
94+
elif corpus in ("tud"):
95+
tagger = _tud_tagger()
96+
word_tags = tagger.tag(words)
8397
else: # by default, use "pud" for corpus
8498
tagger = _pud_tagger()
8599
word_tags = tagger.tag(words)

pythainlp/tag/pos_tag.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ def pos_tag(
3232
<https://github.com/nlp-chula/thai-discourse-treebank/tree/main>`_ \
3333
, natively use Universal POS tags
3434
* *tnc* - Thai National Corpus (support tltk engine only)
35+
* *tdtb* - `Thai Discourse Treebank <https://github.com/nlp-chula/thai-discourse-treebank>`_
36+
* *tud* - `Thai Universal Dependency Treebank (TUD)\
37+
<https://github.com/nlp-chula/TUD>`_
3538
:return: a list of tuples (word, POS tag)
3639
:rtype: list[tuple[str, str]]
3740
@@ -100,6 +103,7 @@ def pos_tag(
100103
"orchid_ud",
101104
"pud",
102105
"tdtb",
106+
"tud",
103107
]
104108

105109
if engine == "perceptron" and corpus in _support_corpus:

pythainlp/tag/thainer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ class ThaiNameTagger:
8383
:Example:
8484
::
8585
86-
from pythainlp.tag.named_entity import ThaiNameTagger
86+
from pythainlp.tag.thainer import ThaiNameTagger
8787
8888
thainer14 = ThaiNameTagger(version="1.4")
8989
thainer14.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
@@ -133,7 +133,7 @@ def get_ner(
133133
134134
:Example:
135135
136-
>>> from pythainlp.tag.named_entity import ThaiNameTagger
136+
>>> from pythainlp.tag.thainer import ThaiNameTagger
137137
>>>
138138
>>> ner = ThaiNameTagger()
139139
>>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")

0 commit comments

Comments
 (0)