Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docker_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ sentencepiece==0.1.91
ssg==0.0.8
torch==1.8.1
fastai==1.0.61
transformers==4.8.2
transformers==4.22.1
phunspell==0.1.6
spylls==0.1.5
symspellpy==6.7.6
Expand All @@ -31,3 +31,6 @@ thai-nner==0.3
spacy==2.3.*
wunsen==0.0.3
khanaa==0.0.6
spacy_thai==0.7.1
esupar==1.3.8
ufal.chu-liu-edmonds==1.0.2
10 changes: 10 additions & 0 deletions docs/api/parse.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
.. currentmodule:: pythainlp.parse

pythainlp.parse
===============
The :mod:`pythainlp.parse` module provides dependency parsing for Thai.

Modules
-------

.. autofunction:: dependency_parsing
6 changes: 5 additions & 1 deletion docs/notes/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ where ``extras`` can be
- ``tltk`` (to support tltk)
- ``textaugment`` (to support text augmentation)
- ``oskut`` (to support OSKUT)
- ``nlpo3`` (to support nlpo3 enging)
- ``nlpo3`` (to support nlpo3 engine)
- ``spacy_thai`` (to support spacy_thai engine)
- ``esupar`` (to support esupar engine)
- ``transformers_ud`` (to support transformers_ud engine)
- ``dependency_parsing`` (to support dependency parsing with all engines)
- ``full`` (install everything)

For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.
Expand Down
8 changes: 8 additions & 0 deletions pythainlp/parse/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
"""
PyThaiNLP Parse

Public entry point of the dependency-parsing package; it re-exports
:func:`dependency_parsing` from :mod:`pythainlp.parse.core`.
"""
__all__ = ["dependency_parsing"]

from pythainlp.parse.core import dependency_parsing
92 changes: 92 additions & 0 deletions pythainlp/parse/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-
# Module-level cache of the most recently built parser, so repeated calls
# with the same settings do not reload the underlying model.
_tagger = None
_tagger_name = ""
_tagger_model = None


def dependency_parsing(
    text: str, model: str = None, engine: str = "esupar"
) -> str:
    """
    Dependency Parsing

    The parser object is cached at module level and is rebuilt only when
    the requested engine or model differs from the cached one.

    :param str text: text to do dependency parsing
    :param str model: model for using with engine \
        (for esupar and transformers_ud)
    :param str engine: the name of the dependency parser
    :return: str (conllu)
    :raises NotImplementedError: if ``engine`` is not a supported name

    **Options for engine**
        * *esupar* (default) - Tokenizer POS-tagger and Dependency-parser \
            with BERT/RoBERTa/DeBERTa model. `GitHub \
            <https://github.com/KoichiYasuoka/esupar>`_
        * *spacy_thai* - Tokenizer, POS-tagger, and dependency-parser \
            for Thai language, working on Universal Dependencies. \
            `GitHub <https://github.com/KoichiYasuoka/spacy-thai>`_
        * *transformers_ud* - TransformersUD \
            `GitHub <https://github.com/KoichiYasuoka/>`_

    **Options for model (esupar engine)**
        * *th* (default) - KoichiYasuoka/roberta-base-thai-spm-upos model \
            `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-upos>`_
        * *KoichiYasuoka/deberta-base-thai-upos* - DeBERTa(V2) model \
            pre-trained on Thai Wikipedia texts for POS-tagging and \
            dependency-parsing `Huggingface \
            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-upos>`_
        * *KoichiYasuoka/roberta-base-thai-syllable-upos* - RoBERTa model \
            pre-trained on Thai Wikipedia texts for POS-tagging and \
            dependency-parsing. (syllable level) `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-syllable-upos>`_
        * *KoichiYasuoka/roberta-base-thai-char-upos* - RoBERTa model \
            pre-trained on Thai Wikipedia texts for POS-tagging \
            and dependency-parsing. (char level) `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-char-upos>`_

    If you want to train model for esupar, you can read \
    `GitHub <https://github.com/KoichiYasuoka/esupar>`_

    **Options for model (transformers_ud engine)**
        * *KoichiYasuoka/deberta-base-thai-ud-head* (default) - \
            DeBERTa(V2) model pretrained on Thai Wikipedia texts \
            for dependency-parsing (head-detection on Universal \
            Dependencies) as question-answering, derived from \
            deberta-base-thai. \
            trained by th_blackboard.conll. `Huggingface \
            <https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head>`_
        * *KoichiYasuoka/roberta-base-thai-spm-ud-head* - \
            roberta model pretrained on Thai Wikipedia texts \
            for dependency-parsing. `Huggingface \
            <https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm-ud-head>`_

    :Example:
    ::

        from pythainlp.parse import dependency_parsing

        print(dependency_parsing("ผมเป็นคนดี", engine="esupar"))
        # output:
        # 1 ผม _ PRON _ _ 3 nsubj _ SpaceAfter=No
        # 2 เป็น _ VERB _ _ 3 cop _ SpaceAfter=No
        # 3 คน _ NOUN _ _ 0 root _ SpaceAfter=No
        # 4 ดี _ VERB _ _ 3 acl _ SpaceAfter=No

        print(dependency_parsing("ผมเป็นคนดี", engine="spacy_thai"))
        # output:
        # 1 ผม PRON PPRS _ 2 nsubj _ SpaceAfter=No
        # 2 เป็น VERB VSTA _ 0 ROOT _ SpaceAfter=No
        # 3 คนดี NOUN NCMN _ 2 obj _ SpaceAfter=No
    """
    global _tagger, _tagger_name, _tagger_model

    # The spacy_thai wrapper is built without a model argument, so the
    # ``model`` value must not force a reload for that engine.
    cache_model = None if engine == "spacy_thai" else model

    # Rebuild when either the engine or the model changes. Keying the cache
    # on the engine alone would keep serving the previously loaded model
    # after the caller asks for a different one.
    if (
        _tagger is None
        or _tagger_name != engine
        or _tagger_model != cache_model
    ):
        # Engine modules are imported lazily so that only the optional
        # dependencies of the selected engine need to be installed.
        if engine == "esupar":
            from pythainlp.parse.esupar_engine import Parse
            _tagger = Parse(model=model)
        elif engine == "transformers_ud":
            from pythainlp.parse.transformers_ud import Parse
            _tagger = Parse(model=model)
        elif engine == "spacy_thai":
            from pythainlp.parse.spacy_thai_engine import Parse
            _tagger = Parse()
        else:
            raise NotImplementedError(
                "The engine doesn't support."
            )
        # Record the cache key only after the parser was built successfully.
        _tagger_name = engine
        _tagger_model = cache_model
    return _tagger(text)
17 changes: 17 additions & 0 deletions pythainlp/parse/esupar_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
"""
esupar: Tokenizer POS-tagger and Dependency-parser with BERT/RoBERTa/DeBERTa models for Japanese and other languages

GitHub: https://github.com/KoichiYasuoka/esupar
"""
import esupar


class Parse:
    """Callable wrapper around a pipeline loaded with ``esupar.load``."""

    def __init__(self, model: str = "th") -> None:
        """
        Load the esupar pipeline.

        :param str model: model name handed to ``esupar.load``;
            ``None`` falls back to ``"th"``
        """
        # Compare against the None singleton by identity (PEP 8); ``==``
        # can be overridden by the operand's type.
        if model is None:
            model = "th"
        self.nlp = esupar.load(model)

    def __call__(self, text):
        """
        Run the loaded pipeline on *text*.

        :param str text: text to parse
        :return: whatever the esupar pipeline returns, passed through
            unchanged
        """
        # NOTE(review): callers document a CoNLL-U ``str`` result; confirm
        # whether the esupar result needs an explicit ``str()`` conversion.
        return self.nlp(text)
19 changes: 19 additions & 0 deletions pythainlp/parse/spacy_thai_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
"""
spacy_thai: Tokenizer, POS-tagger, and dependency-parser for Thai language, working on Universal Dependencies.

GitHub: https://github.com/KoichiYasuoka/spacy-thai
"""
import spacy_thai


class Parse:
    """Render the output of a spacy_thai pipeline as CoNLL-U style rows."""

    def __init__(self, model: str = "th") -> None:
        # ``model`` is accepted but unused: the pipeline is always loaded
        # with spacy_thai's defaults.
        self.nlp = spacy_thai.load()

    @staticmethod
    def _row(token) -> str:
        """Build the ten tab-separated columns describing one token."""
        # A token that is its own head is the root and gets head id 0;
        # all ids are 1-based.
        if token.head == token:
            head_id = 0
        else:
            head_id = token.head.i + 1
        misc = "_" if token.whitespace_ else "SpaceAfter=No"
        columns = (
            str(token.i + 1),   # ID
            token.orth_,        # FORM
            token.lemma_,       # LEMMA
            token.pos_,         # UPOS
            token.tag_,         # XPOS
            "_",                # FEATS (not provided)
            str(head_id),       # HEAD
            token.dep_,         # DEPREL
            "_",                # DEPS (not provided)
            misc,               # MISC
        )
        return "\t".join(columns)

    def __call__(self, text: str) -> str:
        """
        Parse *text* and return one row per token.

        :param str text: text to parse
        :return: rows joined with newlines, with no trailing newline
        """
        return "\n".join(self._row(token) for token in self.nlp(text))
81 changes: 81 additions & 0 deletions pythainlp/parse/transformers_ud.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
"""
TransformersUD

Author: Prof. Koichi Yasuoka

This tagger is provided under the terms of the apache-2.0 License.

The source: https://huggingface.co/KoichiYasuoka/deberta-base-thai-ud-head

GitHub: https://github.com/KoichiYasuoka
"""
import os
import numpy
import torch
import ufal.chu_liu_edmonds
from transformers import (
AutoTokenizer,
AutoModelForQuestionAnswering,
AutoModelForTokenClassification,
AutoConfig,
TokenClassificationPipeline
)
from transformers.utils import cached_file


class Parse:
    """
    Dependency parser that treats head detection as question answering.

    It combines three models that share one tokenizer: a question-answering
    model that scores candidate heads, and two token-classification
    pipelines (``deprel`` and ``tagger``) loaded from sub-folders of the
    same model.
    """

    def __init__(
        self, model: str = "KoichiYasuoka/deberta-base-thai-ud-head"
    ) -> None:
        """
        Load the tokenizer, the head-detection model and both pipelines.

        :param str model: local directory or model identifier; ``None``
            selects ``KoichiYasuoka/deberta-base-thai-ud-head``
        """
        # Compare against the None singleton by identity (PEP 8).
        if model is None:
            model = "KoichiYasuoka/deberta-base-thai-ud-head"
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model)
        x = AutoModelForTokenClassification.from_pretrained
        if os.path.isdir(model):
            # Local checkout: the sub-models live in ``deprel/`` and
            # ``tagger/`` below the model directory.
            d, t = x(os.path.join(model, "deprel")), x(os.path.join(model, "tagger"))
        else:
            # Otherwise resolve each sub-model's config and weights file
            # through ``cached_file`` and load them explicitly.
            c = AutoConfig.from_pretrained(cached_file(model, "deprel/config.json"))
            d = x(cached_file(model, "deprel/pytorch_model.bin"), config=c)
            s = AutoConfig.from_pretrained(cached_file(model, "tagger/config.json"))
            t = x(cached_file(model, "tagger/pytorch_model.bin"), config=s)
        self.deprel = TokenClassificationPipeline(
            model=d,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple"
        )
        self.tagger = TokenClassificationPipeline(
            model=t,
            tokenizer=self.tokenizer
        )

    def __call__(self, text: str) -> str:
        """
        Parse *text* and return the result as CoNLL-U style text.

        :param str text: text to parse
        :return: one line of ten tab-separated columns per word, each
            ending in a newline, followed by one extra newline
        """
        # w: (start, end, relation label) for every word span reported by
        # the deprel pipeline.
        w = [(t["start"], t["end"], t["entity_group"]) for t in self.deprel(text)]
        # z: start offset -> tagger label split on "|"; n: number of words.
        z, n = {t["start"]: t["entity"].split("|") for t in self.tagger(text)}, len(w)
        # r: surface form of each word; m: (n+1) x (n+1) score matrix filled
        # with NaN, where index 0 stands for the artificial root.
        r, m = [text[s:e] for s, e, p in w], numpy.full((n + 1, n + 1), numpy.nan)
        # v: token ids of each word, without special tokens.
        v, c = self.tokenizer(r, add_special_tokens=False)["input_ids"], []
        for i, t in enumerate(v):
            # One row per word i: "[CLS] word_i [SEP]" as the question, then
            # the sentence with word i replaced by a mask token, then a
            # closing separator.
            q = [self.tokenizer.cls_token_id] + t + [self.tokenizer.sep_token_id]
            c.append([q] + v[0:i] + [[self.tokenizer.mask_token_id]] + v[i + 1:] + [[q[-1]]])
        # b[i][j]: number of tokens in row i up to and including segment j,
        # i.e. the flattened offset at which segment j + 1 starts.
        b = [[len(sum(x[0:j + 1], [])) for j in range(len(x))] for x in c]
        with torch.no_grad():
            d = self.model(
                input_ids=torch.tensor([sum(x, []) for x in c]),
                # Segment ids: 0 for the question, 1 for the rest.
                token_type_ids=torch.tensor([[0] * x[0] + [1] * (x[-1] - x[0]) for x in b])
            )
        s, e = d.start_logits.tolist(), d.end_logits.tolist()
        for i in range(n):
            for j in range(n):
                # Score of word j as head of word i: start logit at the
                # first token of word j plus end logit at its last token.
                # The masked position (i == j) is stored in column 0, the
                # root column.
                m[i + 1, 0 if i == j else j + 1] = s[i][b[i][j]] + e[i][b[i][j + 1] - 1]
        # The first element of the result is used as the per-word head list.
        h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
        if [0 for i in h if i == 0] != [0]:
            # Not exactly one word attached to the root: pick a single root
            # (the first word labelled "root", otherwise the best root
            # score), blank out every other root score and decode again.
            i = ([p for s, e, p in w] + ["root"]).index("root")
            j = i + 1 if i < n else numpy.nanargmax(m[:, 0])
            m[0:j, 0] = m[j + 1:, 0] = numpy.nan
            h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
        u = ""
        # From here on ``s`` and ``e`` are reused as span offsets; the
        # logits are no longer needed.
        for i, (s, e, p) in enumerate(w, 1):
            # Keep the relation labels consistent with the decoded tree:
            # the word attached to the root is "root", and a word that was
            # labelled "root" but is not the root becomes "dep".
            p = "root" if h[i] == 0 else "dep" if p == "root" else p
            # The first tagger component has its first two characters
            # dropped (presumably a "B-"/"I-" prefix — TODO confirm); the
            # remaining components become the FEATS column. MISC is "_"
            # only when a gap follows the word before the next one.
            u += "\t".join(
                [str(i), r[i - 1], "_", z[s][0][2:], "_", "|".join(z[s][1:]), str(h[i]), p, "_", "_" if i < n and e < w[i][0] else "SpaceAfter=No"]
            ) + "\n"
        return u + "\n"
23 changes: 21 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,22 @@
"onnxruntime>=1.10.0"
],
"thai_nner": ["thai_nner"],
"esupar": [
"esupar>=1.3.8",
"numpy",
"transformers>=4.22.1",
],
"spacy_thai": ["spacy_thai>=0.7.1"],
"transformers_ud": [
"ufal.chu-liu-edmonds>=1.0.2",
"transformers>=4.22.1",
],
"dependency_parsing": [
"esupar>=1.3.8",
"spacy_thai>=0.7.1",
"ufal.chu-liu-edmonds>=1.0.2",
"transformers>=4.22.1",
],
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
Expand All @@ -98,7 +114,7 @@
"torch>=1.0.0",
"fastai<2.0",
"bpemb>=0.3.2",
"transformers>=4.6.0",
"transformers>=4.22.1",
"sefr_cut>=1.1",
"phunspell>=0.1.6",
"spylls>=0.1.5",
Expand All @@ -108,7 +124,10 @@
"nlpo3>=1.2.2",
"onnxruntime>=1.10.0",
"thai_nner",
"wunsen>=0.0.3"
"wunsen>=0.0.3",
"spacy_thai>=0.7.1",
"esupar>=1.3.8",
"ufal.chu-liu-edmonds>=1.0.2",
],
}

Expand Down
11 changes: 11 additions & 0 deletions tests/test_parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

import unittest
from pythainlp.parse import dependency_parsing


class TestParsePackage(unittest.TestCase):
    """Smoke tests for the ``pythainlp.parse`` package."""

    def test_dependency_parsing(self):
        """Every supported engine yields a non-None result for a short text."""
        sentence = "ผมเป็นคนดี"
        # Same order as before; the first failing engine stops the test.
        for engine in ("esupar", "transformers_ud", "spacy_thai"):
            self.assertIsNotNone(dependency_parsing(sentence, engine=engine))