2 changes: 1 addition & 1 deletion pythainlp/augment/word2vec/bpemb_wv.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from pythainlp.augment.word2vec.core import Word2VecAug
-from bpemb import BPEmb
 from typing import List, Tuple


@@ -12,6 +11,7 @@ class BPEmbAug:
     `github.com/bheinzerling/bpemb <https://github.com/bheinzerling/bpemb>`_
     """
     def __init__(self, lang: str = "th", vs: int = 100000, dim: int = 300):
+        from bpemb import BPEmb
         self.bpemb_temp = BPEmb(lang=lang, dim=dim, vs=vs)
         self.model = self.bpemb_temp.emb
         self.load_w2v()
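Every hunk in this pull request applies the same refactor: a heavy import moves from module scope into the function or method that actually uses it, so "import pythainlp" no longer pays for dependencies such as bpemb or gensim up front. A minimal sketch of the pattern, using the standard-library hashlib as a stand-in for a heavy dependency (illustrative, not code from this PR):

import sys

def file_md5(path: str) -> str:
    # The import runs only when a checksum is actually requested.
    # The first call pays the import cost; later calls resolve the
    # module through a cheap sys.modules lookup.
    import hashlib
    with open(path, "rb") as f:
        return hashlib.md5(f.read()).hexdigest()

# Typically False until file_md5() first runs (assuming nothing
# else in the process has imported hashlib yet).
print("hashlib" in sys.modules)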
2 changes: 1 addition & 1 deletion pythainlp/augment/word2vec/core.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from typing import List, Tuple
-import gensim.models.keyedvectors as word2vec
 import itertools


@@ -13,6 +12,7 @@ def __init__(
         :param object tokenize: tokenize function
         :param str type: model type (file, binary)
         """
+        import gensim.models.keyedvectors as word2vec
         self.tokenizer = tokenize
         if type == "file":
             self.model = word2vec.KeyedVectors.load_word2vec_format(model)
19 changes: 9 additions & 10 deletions pythainlp/corpus/core.py
@@ -2,36 +2,29 @@
 """
 Corpus related functions.
 """
-
-import hashlib
 import os
 from typing import Union
-from urllib.request import urlopen
 import json
 
-import requests
 from pythainlp.corpus import corpus_db_path, corpus_db_url, corpus_path
 from pythainlp.tools import get_full_data_path
-from requests.exceptions import HTTPError
-import tarfile
-import zipfile
-import shutil
 from pythainlp import __version__
 
 
 _CHECK_MODE = os.getenv("PYTHAINLP_READ_MODE")
 
 
-def get_corpus_db(url: str) -> requests.Response:
+def get_corpus_db(url: str):
     """
     Get corpus catalog from server.
 
     :param str url: URL corpus catalog
     """
+    import requests
     corpus_db = None
     try:
         corpus_db = requests.get(url)
-    except HTTPError as http_err:
+    except requests.exceptions.HTTPError as http_err:
         print(f"HTTP error occurred: {http_err}")
     except Exception as err:
         print(f"Non-HTTP error occurred: {err}")
@@ -231,6 +224,8 @@ def _download(url: str, dst: str) -> int:
     """
     _CHUNK_SIZE = 64 * 1024  # 64 KiB
 
+    import requests
+    from urllib.request import urlopen
     file_size = int(urlopen(url).info().get("Content-Length", -1))
     r = requests.get(url, stream=True)
     with open(get_full_data_path(dst), "wb") as f:
@@ -262,6 +257,7 @@ def _check_hash(dst: str, md5: str) -> None:
     @param: md5 place to hash the file (MD5)
     """
    if md5 and md5 != "-":
+        import hashlib
         with open(get_full_data_path(dst), "rb") as f:
             content = f.read()
             file_md5 = hashlib.md5(content).hexdigest()
@@ -423,13 +419,15 @@ def download(
     foldername = None
 
     if corpus_versions["is_tar_gz"] == "True":
+        import tarfile
         is_folder = True
         foldername = name+"_"+str(version)
         if not os.path.exists(get_full_data_path(foldername)):
             os.mkdir(get_full_data_path(foldername))
         with tarfile.open(get_full_data_path(file_name)) as tar:
             tar.extractall(path=get_full_data_path(foldername))
     elif corpus_versions["is_zip"] == "True":
+        import zipfile
         is_folder = True
         foldername = name+"_"+str(version)
         if not os.path.exists(get_full_data_path(foldername)):
@@ -520,6 +518,7 @@ def remove(name: str) -> bool:
     if data:
         path = get_corpus_path(name)
         if data[0].get("is_folder"):
+            import shutil
             os.remove(get_full_data_path(data[0].get("filename")))
             shutil.rmtree(path, ignore_errors=True)
         else:
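A quick way to confirm that these deferred imports shrink startup cost is CPython's -X importtime flag, which prints a per-module import-time report to stderr. A rough harness, assuming pythainlp is installed in the current environment:

import subprocess
import sys

# Run a bare "import pythainlp" under -X importtime. After this change,
# modules like requests, tarfile, and zipfile should be absent from the
# report until a corpus function is actually called.
result = subprocess.run(
    [sys.executable, "-X", "importtime", "-c", "import pythainlp"],
    capture_output=True,
    text=True,
)
for line in result.stderr.splitlines()[-10:]:
    print(line)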
5 changes: 1 addition & 4 deletions pythainlp/tag/_tag_perceptron.py
@@ -14,11 +14,7 @@
 
 This tagger is provided under the terms of the MIT License.
 """
-
-from __future__ import absolute_import
-
 import json
-import random
 from collections import defaultdict
 from typing import Dict, Iterable, List, Tuple, Union
 
@@ -160,6 +156,7 @@ def train(
             location.
         :param nr_iter: Number of training iterations.
         """
+        import random
         self._make_tagdict(sentences)
         self.model.classes = self.classes
         for _ in range(nr_iter):
2 changes: 1 addition & 1 deletion pythainlp/tag/thainer.py
@@ -7,7 +7,6 @@
 
 from typing import Dict, List, Tuple, Union
 
-from pycrfsuite import Tagger as CRFTagger
 from pythainlp.corpus import get_corpus_path, thai_stopwords
 from pythainlp.tag import pos_tag
 from pythainlp.tokenize import word_tokenize
@@ -98,6 +97,7 @@ def __init__(self, version: str = "1.5") -> None:
         It supports Thai NER 1.4 & 1.5.
         The default value is `1.5`.
         """
+        from pycrfsuite import Tagger as CRFTagger
         self.crf = CRFTagger()
 
         if version == "1.4":
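One trade-off of deferring pycrfsuite: a missing optional dependency now surfaces when the tagger is first constructed rather than at "import pythainlp". If a friendlier failure is wanted, the lazy import can be wrapped to raise an actionable message; a sketch of that option (not part of this PR, helper name is hypothetical):

def _load_crf_tagger():
    # Importing inside the helper keeps module import cheap, while the
    # try/except turns a bare ImportError into an installation hint.
    try:
        from pycrfsuite import Tagger
    except ImportError as err:
        raise ImportError(
            "Thai NER requires python-crfsuite: pip install python-crfsuite"
        ) from err
    return Tagger()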
9 changes: 5 additions & 4 deletions pythainlp/tag/wangchanberta_onnx.py
@@ -1,16 +1,17 @@
 # -*- coding: utf-8 -*-
 from typing import List
 import json
-import sentencepiece as spm
 
 import numpy as np
-from onnxruntime import (
-    InferenceSession, SessionOptions, GraphOptimizationLevel
-)
 from pythainlp.corpus import get_path_folder_corpus
 
 
 class WngchanBerta_ONNX:
     def __init__(self, model_name: str, model_version: str, file_onnx: str, providers: List[str] = ['CPUExecutionProvider']) -> None:
+        import sentencepiece as spm
+        from onnxruntime import (
+            InferenceSession, SessionOptions, GraphOptimizationLevel
+        )
         self.model_name = model_name
         self.model_version = model_version
         self.options = SessionOptions()
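Note that names imported inside __init__, such as spm here, are local to that method and vanish when it returns; anything other methods need must be bound to the instance during construction. A sketch of the idiom, with hypothetical class, path, and attribute names:

class OnnxWrapper:
    def __init__(self, model_path: str, spm_path: str) -> None:
        # The locally imported modules are gone after __init__ returns,
        # so the objects built from them are kept as attributes.
        import sentencepiece as spm
        from onnxruntime import InferenceSession

        self.tokenizer = spm.SentencePieceProcessor(model_file=spm_path)
        self.session = InferenceSession(model_path)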
3 changes: 1 addition & 2 deletions pythainlp/tokenize/core.py
@@ -4,7 +4,6 @@
 """
 import re
 from typing import Iterable, List, Union
-import warnings
 
 from pythainlp.tokenize import (
     DEFAULT_SENT_TOKENIZE_ENGINE,
@@ -14,7 +13,6 @@
     DEFAULT_WORD_DICT_TRIE,
     DEFAULT_WORD_TOKENIZE_ENGINE,
 )
-from pythainlp import thai_characters
 from pythainlp.util.trie import Trie, dict_trie


@@ -63,6 +61,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "
     _list_all = []
     if isinstance(segments[0], str):
         segments = [segments]
+    from pythainlp import thai_characters
     for i, s in enumerate(segments):
         _list_sents = []
         _add_index = []
1 change: 0 additions & 1 deletion pythainlp/util/normalize.py
@@ -4,7 +4,6 @@
 """
 import re
 from typing import List, Union
-import warnings
 
 from pythainlp import thai_above_vowels as above_v
 from pythainlp import thai_below_vowels as below_v
1 change: 0 additions & 1 deletion pythainlp/util/time.py
@@ -4,7 +4,6 @@
 
 Convert time string or time object to Thai words.
 """
-import warnings
 from datetime import datetime, time
 from typing import Union
 
1 change: 0 additions & 1 deletion pythainlp/wangchanberta/postag.py
@@ -4,7 +4,6 @@
 import warnings
 from transformers import (
     CamembertTokenizer,
-    AutoTokenizer,
     pipeline,
 )
