diff --git a/docs/api/cls.rst b/docs/api/cls.rst new file mode 100644 index 000000000..3a3f13426 --- /dev/null +++ b/docs/api/cls.rst @@ -0,0 +1,7 @@ +.. currentmodule:: pythainlp.cls + +pythainlp.cls +============= + +.. autoclass:: GzipModel + :members: diff --git a/notebooks/test_gzip_cls.ipynb b/notebooks/test_gzip_cls.ipynb new file mode 100644 index 000000000..86666b355 --- /dev/null +++ b/notebooks/test_gzip_cls.ipynb @@ -0,0 +1,86 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1480d345", + "metadata": {}, + "outputs": [], + "source": [ + "import pythainlp.cls.param_free" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "78fca9fa", + "metadata": {}, + "outputs": [], + "source": [ + "training_data = [\n", + " (\"รายละเอียดตามนี้เลยค่าา ^^\", \"Neutral\"),\n", + " (\"กลัวพวกมึงหาย อดกินบาบิก้อน\", \"Neutral\"),\n", + " (\"บริการแย่มากก เป็นหมอได้ไง😤\", \"Negative\"),\n", + " (\"ขับรถแย่มาก\", \"Negative\"),\n", + " (\"ดีนะครับ\", \"Positive\"),\n", + " (\"ลองแล้วรสนี้อร่อย... 
ชอบๆ\", \"Positive\"),\n", + " (\"ฉันรู้สึกโกรธ เวลามือถือแบตหมด\", \"Negative\"),\n", + " (\"เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ\", \"Positive\"),\n", + " (\"นี่เป็นบทความหนึ่ง\", \"Neutral\")\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "94c1495f", + "metadata": {}, + "outputs": [], + "source": [ + "model = pythainlp.cls.param_free.GzipModel(training_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f4d047db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Positive'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(\"ฉันดีใจ\", k=1)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pythainlp/cls/__init__.py b/pythainlp/cls/__init__.py new file mode 100644 index 000000000..f48462c7f --- /dev/null +++ b/pythainlp/cls/__init__.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class GzipModel:
    """
    Parameter-free text classifier using gzip compression and k-nearest
    neighbours.

    This class is a reimplementation of “Low-Resource” Text Classification:
    A Parameter-Free Classification Method with Compressors
    (Jiang et al., Findings 2023)

    :param list training_data: list [(text_sample,label)]
    """

    def __init__(self, training_data: List[Tuple[str, str]]):
        # Stored as a 2-D array: column 0 holds the texts, column 1 the labels.
        self.training_data = np.array(training_data)
        # Compressed length of every training text, computed once up front.
        self.Cx2_list = self.train()

    def train(self) -> List[int]:
        """
        Compute the gzip-compressed length of every training text.

        :return: compressed lengths, in the same order as the training data
        :rtype: List[int]
        """
        return [
            len(gzip.compress(sample[0].encode("utf-8")))
            for sample in self.training_data
        ]

    def predict(self, x1: str, k: int = 1) -> str:
        """
        Predict the label of a text by majority vote among its k nearest
        training samples under the normalized compression distance.

        :param str x1: text to classify
        :param int k: number of nearest neighbours that vote (at least 1)
        :return: the most frequent label among the k nearest samples
        :rtype: str
        :raises ValueError: if k is smaller than 1 or there is no training data
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        if not self.Cx2_list:
            raise ValueError("training_data is empty")

        Cx1 = len(gzip.compress(x1.encode("utf-8")))
        distance_from_x1 = []
        for sample, Cx2 in zip(self.training_data, self.Cx2_list):
            x1x2 = "".join([x1, sample[0]])
            Cx1x2 = len(gzip.compress(x1x2.encode("utf-8")))
            # normalized compression distance
            ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
            distance_from_x1.append(ncd)

        sorted_idx = np.argsort(np.array(distance_from_x1))
        top_k_class = self.training_data[sorted_idx[:k], 1]
        # np.unique returns the sorted unique labels with their counts, so the
        # winning position must index the unique labels, not the
        # distance-ordered neighbour list.
        labels, counts = np.unique(top_k_class, return_counts=True)
        return str(labels[counts.argmax()])