Skip to content

Commit 3600236

Browse files
authored
Merge pull request #802 from PyThaiNLP/add-coref
Add pythainlp.coref
2 parents 33c5b5a + f58d51a commit 3600236

File tree

12 files changed

+175
-6
lines changed

12 files changed

+175
-6
lines changed

.github/workflows/macos-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ jobs:
7373
pip install pytest coverage coveralls
7474
conda install -c conda-forge icu
7575
conda install -c conda-forge pyicu
76-
if [ -f docker_requirements.txt ]; then pip install -r docker_requirements.txt; fi
76+
if [ -f docker_requirements.txt ]; then SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt; fi
7777
pip install deepcut tltk
7878
pip install .[full]
7979
python -m nltk.downloader omw-1.4

.github/workflows/pypi-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jobs:
2222
run: |
2323
python -m pip install --upgrade pip
2424
pip install deepcut tltk
25-
pip install -r https://raw.githubusercontent.com/PyThaiNLP/pythainlp/dev/docker_requirements.txt
25+
SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r https://raw.githubusercontent.com/PyThaiNLP/pythainlp/dev/docker_requirements.txt
2626
pip install pythainlp[full]
2727
python -m nltk.downloader omw-1.4
2828
- name: Test

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ jobs:
3030
run: |
3131
python -m pip install --upgrade pip
3232
pip install pytest coverage coveralls
33-
if [ -f docker_requirements.txt ]; then pip install -r docker_requirements.txt; fi
33+
if [ -f docker_requirements.txt ]; then SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt; fi
3434
pip install deepcut tltk
3535
pip install .[full]
3636
python -m nltk.downloader omw-1.4

docker_requirements.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ epitran==1.9
1010
sacremoses==0.0.41
1111
sentencepiece==0.1.91
1212
ssg==0.0.8
13-
torch==1.8.1
13+
torch==1.13.1
1414
fastai==1.0.61
1515
transformers==4.22.1
1616
phunspell==0.1.6
@@ -24,13 +24,14 @@ deepcut==0.7.0.0
2424
h5py==3.1.0
2525
tensorflow==2.9.3
2626
pandas==1.4.*
27-
tltk==1.3.8
27+
tltk==1.6.8
2828
OSKut==1.3
2929
nlpo3==1.2.6
3030
thai-nner==0.3
31-
spacy==2.3.*
31+
spacy==3.5.*
3232
wunsen==0.0.3
3333
khanaa==0.0.6
3434
spacy_thai==0.7.1
3535
esupar==1.3.8
3636
ufal.chu-liu-edmonds==1.0.2
37+
fastcoref==2.1.6

docs/api/coref.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
.. currentmodule:: pythainlp.coref
2+
3+
pythainlp.coref
4+
===============
5+
The :mod:`pythainlp.coref` module provides coreference resolution for Thai.
6+
7+
Modules
8+
-------
9+
10+
.. autofunction:: coreference_resolution

docs/notes/installation.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ where ``extras`` can be
3535
- ``esupar`` (to support esupar engine)
3636
- ``transformers_ud`` (to support transformers_ud engine)
3737
- ``dependency_parsing`` (to support dependency parsing with all engines)
38+
- ``coreference_resolution`` (to support coreference resolution with all engines)
3839
- ``full`` (install everything)
3940

4041
For dependency details, look at `extras` variable in `setup.py <https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py>`_.

pythainlp/coref/__init__.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (C) 2016-2023 PyThaiNLP Project
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""
16+
PyThaiNLP Coreference Resolution
17+
"""
18+
__all__ = ["coreference_resolution"]
19+
from pythainlp.coref.core import coreference_resolution

pythainlp/coref/_fastcoref.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (C) 2016-2023 PyThaiNLP Project
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
from typing import List
16+
import spacy
17+
18+
19+
class FastCoref:
    """
    Wrapper around a fastcoref model that returns predictions as plain dicts.

    :param model_name: model name or path handed to the fastcoref model class
    :param nlp: spaCy pipeline handed to the fastcoref model; when omitted,
        a new ``spacy.blank("th")`` pipeline is created for this instance
    :param str device: device handed to the fastcoref model (default: cpu)
    :param str type: ``"FCoref"`` selects ``fastcoref.FCoref``; any other
        value selects ``fastcoref.LingMessCoref``
    """

    def __init__(
        self,
        model_name,
        nlp=None,
        device: str = "cpu",
        type: str = "FCoref",
    ) -> None:
        # fastcoref is imported lazily, only when an instance is constructed.
        if type == "FCoref":
            from fastcoref import FCoref as _model
        else:
            from fastcoref import LingMessCoref as _model
        # Build the default pipeline per instance instead of once at
        # class-definition time, so instances do not share one pipeline
        # object and importing this module does not construct it eagerly.
        if nlp is None:
            nlp = spacy.blank("th")
        self.model_name = model_name
        self.nlp = nlp
        self.model = _model(self.model_name, device=device, nlp=self.nlp)

    def _to_json(self, _predict) -> dict:
        """
        Convert one prediction object into a plain dict.

        :param _predict: object exposing ``text`` and
            ``get_clusters(as_strings=...)``
        :return: dict with the keys ``text``, ``clusters_string`` and ``clusters``
        :rtype: dict
        """
        return {
            "text": _predict.text,
            "clusters_string": _predict.get_clusters(as_strings=True),
            "clusters": _predict.get_clusters(as_strings=False),
        }

    def predict(self, texts: List[str]) -> List[dict]:
        """
        Run the wrapped model on *texts* and convert every result to a dict.

        :param List[str] texts: texts to run coreference resolution on
        :return: one converted dict per prediction yielded by the model
        :rtype: List[dict]
        """
        return [self._to_json(i) for i in self.model.predict(texts=texts)]

pythainlp/coref/core.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (C) 2016-2023 PyThaiNLP Project
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
from typing import List
16+
# Module-level cache: the model is created on first use and reused afterwards.
model = None


def coreference_resolution(
    texts: List[str],
    model_name: str = "han-coref-v1.0",
    device: str = "cpu",
):
    """
    Coreference Resolution

    :param List[str] texts: list of texts to do coreference resolution on
        (a single str is also accepted and is wrapped in a list)
    :param str model_name: coreference resolution model
    :param str device: device for running coreference resolution model (cpu, cuda, and other)
    :return: list of coreference resolution results
    :rtype: List[dict]
    :raises ValueError: if no model is loaded yet and *model_name* is not
        a supported model

    :Options for model_name:
        * *han-coref-v1.0* - (default) Han-Coref: Thai coreference resolution by PyThaiNLP v1.0

    :Example:
    ::

        from pythainlp.coref import coreference_resolution

        print(
            coreference_resolution(
                ["Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก"]
            )
        )
        # output:
        # [
        # {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก',
        # 'clusters_string': [['Bill Gates', 'ผม']],
        # 'clusters': [[(0, 10), (50, 52)]]}
        # ]
    """
    global model
    if isinstance(texts, str):
        texts = [texts]
    if model is None and model_name == "han-coref-v1.0":
        # Imported lazily so the heavy dependencies load only on first use.
        from pythainlp.coref.han_coref import HanCoref

        model = HanCoref(device=device)
    if model is None:
        # Without this guard an unknown model name fails later with an
        # opaque AttributeError on ``None.predict``.
        raise ValueError(
            f"Unsupported coreference resolution model: {model_name!r}"
        )
    # NOTE: an already-loaded model is reused as-is; ``model_name`` and
    # ``device`` passed on later calls do not trigger a reload.
    return model.predict(texts)

pythainlp/coref/han_coref.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (C) 2016-2023 PyThaiNLP Project
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
from pythainlp.coref._fastcoref import FastCoref
16+
import spacy
17+
18+
19+
class HanCoref(FastCoref):
    """
    FastCoref wrapper preconfigured with the ``pythainlp/han-coref-v1.0`` model.

    :param str device: device handed to the underlying model (default: cpu)
    :param nlp: spaCy pipeline handed to the underlying model; when omitted,
        a new ``spacy.blank("th")`` pipeline is created for this instance
    """

    def __init__(self, device: str = "cpu", nlp=None) -> None:
        # Build the default pipeline per instance instead of once at
        # class-definition time, so instances do not share one object.
        if nlp is None:
            nlp = spacy.blank("th")
        # Zero-argument super(): ``super(self.__class__, self)`` recurses
        # forever as soon as this class is subclassed.
        super().__init__(
            model_name="pythainlp/han-coref-v1.0",
            device=device,
            nlp=nlp,
        )

0 commit comments

Comments
 (0)