|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +from typing import List, Tuple, Union |
| 3 | +from tltk import nlp |
| 4 | +from pythainlp.tokenize import word_tokenize |
| 5 | + |
# Module-level side effect: both loaders run once, when this module is imported.
# NOTE(review): presumably these load TLTK's POS and NER models so the tagging
# functions below can use them -- confirm against the TLTK documentation.
nlp.pos_load()
nlp.ner_load()
| 8 | + |
| 9 | + |
def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
    """
    Tag a list of words with part-of-speech labels using **TLTK**.

    :param list[str] words: words to be tagged (already tokenized)
    :param str corpus: name of the corpus; only ``"tnc"`` is accepted
    :return: whatever ``nlp.pos_tag_wordlist`` returns for `words`
        (annotated as a list of ``(word, POS tag)`` tuples)
    :rtype: list[tuple[str, str]]
    :raises ValueError: if `corpus` is anything other than ``"tnc"``
    """
    if corpus != "tnc":
        # Name the corpus the caller actually asked for; the message used to
        # interpolate the literal 0 instead of the argument.
        raise ValueError("tltk not support {0} corpus.".format(corpus))
    return nlp.pos_tag_wordlist(words)
| 14 | + |
| 15 | + |
| 16 | +def _post_process(text: str) -> str: |
| 17 | + return text.replace("<s/>", " ") |
| 18 | + |
| 19 | + |
def _wrap_entities(sent_ner: List[Tuple[str, str, str]]) -> str:
    """Render ``(word, POS, NER)`` tuples as text with ``<X>...</X>`` entity tags."""
    pieces = []
    open_label = ""  # label of the entity currently open; "" when none is open
    for word, _, ner_tag in sent_ner:
        if ner_tag.startswith("B-"):
            # A new entity begins: close the one still open (if any) first.
            if open_label != "":
                pieces.append("</" + open_label + ">")
            open_label = ner_tag[2:]
            pieces.append("<" + open_label + ">")
        elif ner_tag == "O" and open_label != "":
            # First token outside an entity closes it.
            pieces.append("</" + open_label + ">")
            open_label = ""
        pieces.append(word)
    # An entity that runs to the end of the text still needs its closing tag.
    if open_label != "":
        pieces.append("</" + open_label + ">")
    return "".join(pieces)


def get_ner(
    text: str,
    pos: bool = True,
    tag: bool = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
    """
    Named-entity recognizer from **TLTK**

    This function tags named entities in text, in IOB format.

    :param str text: text in Thai to be tagged
    :param bool pos: To include POS tags in the results (`True`) or
        exclude (`False`). The default value is `True`
    :param bool tag: output like html tag.
    :return: a list of tuples of tokenized word, POS tag and NER tag
        (if the parameter `pos` is `True`), or a string marked up
        with html-like tags (if the parameter `tag` is `True`;
        this takes precedence over `pos`).
        Otherwise, a list of tuples of tokenized word and NER tag.
        An empty `text` always gives an empty list, even when `tag`
        is `True`.
    :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]]], str

    :Example:

        >>> from pythainlp.tag.tltk import get_ner
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง")
        [('เขา', 'PRON', 'O'),
        ('เรียน', 'VERB', 'O'),
        ('ที่', 'SCONJ', 'O'),
        ('โรงเรียน', 'NOUN', 'B-L'),
        ('นางรอง', 'VERB', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", pos=False)
        [('เขา', 'O'),
        ('เรียน', 'O'),
        ('ที่', 'O'),
        ('โรงเรียน', 'B-L'),
        ('นางรอง', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", tag=True)
        'เขาเรียนที่<L>โรงเรียนนางรอง</L>'
    """
    if not text:
        return []
    # Space tokens are passed to the taggers as "<s/>"; _post_process turns
    # them back into spaces in the output words.
    tokens = [
        "<s/>" if token == " " else token
        for token in word_tokenize(text, engine="tltk")
    ]
    tagged = nlp.pos_tag_wordlist(tokens)
    sent_ner = [
        (_post_process(word), word_pos, word_ner)
        for word, word_pos, word_ner in nlp.ner(tagged)
    ]
    if tag:
        return _wrap_entities(sent_ner)
    # Identity check kept on purpose: only an explicit False drops the POS
    # column, exactly as before.
    if pos is False:
        return [(word, word_ner) for word, _, word_ner in sent_ner]
    return sent_ner
0 commit comments