From 47f3f67118e1263f1208b2487f264e6bfd55f5c2 Mon Sep 17 00:00:00 2001 From: c4n Date: Mon, 17 Jul 2023 14:42:12 +0700 Subject: [PATCH 1/8] Add parameter-free classification model --- notebooks/test_gzip_cls.ipynb | 94 +++++++++++++++++++++++++++++++++++ pythainlp/cls/__init__.py | 21 ++++++++ pythainlp/cls/param_free.py | 51 +++++++++++++++++++ 3 files changed, 166 insertions(+) create mode 100644 notebooks/test_gzip_cls.ipynb create mode 100644 pythainlp/cls/__init__.py create mode 100644 pythainlp/cls/param_free.py diff --git a/notebooks/test_gzip_cls.ipynb b/notebooks/test_gzip_cls.ipynb new file mode 100644 index 000000000..82641b656 --- /dev/null +++ b/notebooks/test_gzip_cls.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1480d345", + "metadata": {}, + "outputs": [], + "source": [ + "import pythainlp.cls.param_free" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "78fca9fa", + "metadata": {}, + "outputs": [], + "source": [ + "training_data = [\n", + " (\"รายละเอียดตามนี้เลยค่าา ^^\", \"Neutral\"),\n", + " (\"กลัวพวกมึงหาย อดกินบาบิก้อน\", \"Neutral\"),\n", + " (\"บริการแย่มากก เป็นหมอได้ไง😤\", \"Negative\"),\n", + " (\"ขับรถแย่มาก\", \"Negative\"),\n", + " (\"ดีนะครับ\", \"Positive\"),\n", + " (\"ลองแล้วรสนี้อร่อย... 
ชอบๆ\", \"Positive\"),\n", + " (\"ฉันรู้สึกโกรธ เวลามือถือแบตหมด\", \"Negative\"),\n", + " (\"เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ\", \"Positive\"),\n", + " (\"นี่เป็นบทความหนึ่ง\", \"Neutral\")\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "94c1495f", + "metadata": {}, + "outputs": [], + "source": [ + "model = pythainlp.cls.param_free.GzipModel(training_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f4d047db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Positive'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(\"ควย\", k=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6529c3e9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pythainlp/cls/__init__.py b/pythainlp/cls/__init__.py new file mode 100644 index 000000000..4666b39b0 --- /dev/null +++ b/pythainlp/cls/__init__.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +pythainlp.chat +""" + +__all__ = ["GzipModel"] + +from pythainlp.cls.param_free import GzipModel diff --git a/pythainlp/cls/param_free.py b/pythainlp/cls/param_free.py new file mode 100644 index 000000000..f28a20a1d --- /dev/null +++ b/pythainlp/cls/param_free.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import gzip
import numpy as np
from typing import Dict, List, Tuple, Union


class GzipModel:
    """
    Parameter-free k-nearest-neighbour text classifier built on gzip.

    This class is a reimplementation of “Low-Resource” Text Classification:
    A Parameter-Free Classification Method with Compressors
    (Jiang et al., Findings 2023).

    Every training text is compressed once when the model is created.
    A prediction compresses the query concatenated with each training
    text, turns the compressed lengths into a normalized compression
    distance (NCD) and takes a majority vote over the ``k`` closest
    training samples.

    :param list training_data: list [(text_sample,label)]
    """

    def __init__(self, training_data: List[Tuple[str, str]]):
        # Stored as a 2-D string array: column 0 holds texts, column 1 labels.
        self.training_data = np.array(training_data)
        self.Cx2_list = self.train()

    def train(self) -> List[int]:
        """
        Compute the gzip-compressed length of every training text.

        :return: compressed length (in bytes) of each training text,
                 in the same order as the training data
        :rtype: List[int]
        """
        return [
            len(gzip.compress(sample[0].encode("utf-8")))
            for sample in self.training_data
        ]

    def predict(self, x1: str, k: int = 1) -> str:
        """
        Predict the label of a text from its ``k`` nearest training samples.

        :param str x1: text to classify
        :param int k: number of nearest neighbours that vote; values larger
                      than the number of training samples use all of them
        :return: the most frequent label among the ``k`` nearest samples;
                 a tie is resolved in favour of the tied label whose sample
                 is closest to ``x1``
        :rtype: str
        :raises ValueError: if ``k`` is smaller than 1
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")

        # Encode the query once; for UTF-8, concatenating the encoded bytes
        # is equivalent to encoding the concatenated strings.
        x1_bytes = x1.encode("utf-8")
        Cx1 = len(gzip.compress(x1_bytes))

        distance_from_x1 = []
        for sample, Cx2 in zip(self.training_data, self.Cx2_list):
            Cx1x2 = len(gzip.compress(x1_bytes + sample[0].encode("utf-8")))
            # normalized compression distance
            ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
            distance_from_x1.append(ncd)

        # A stable sort keeps equally distant samples in training order,
        # which makes the result deterministic.
        sorted_idx = np.argsort(np.array(distance_from_x1), kind="stable")
        top_k_class = self.training_data[sorted_idx[:k], 1]

        # ``counts`` is aligned with ``labels`` (the sorted unique values),
        # NOT with ``top_k_class``, so the winner must be looked up in
        # ``labels``.
        labels, counts = np.unique(top_k_class, return_counts=True)
        winners = set(labels[counts == counts.max()].tolist())

        # ``top_k_class`` is ordered nearest-first, so the first winning
        # label met here also settles ties by proximity.
        for label in top_k_class:
            if label in winners:
                return str(label)
""" -pythainlp.chat +pythainlp.cls """ __all__ = ["GzipModel"] From e0bd22d2bc88ceccb01ba22130574330cd3c5c02 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 17 Jul 2023 16:53:30 +0700 Subject: [PATCH 4/8] Create test_cls.py --- tests/test_cls.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 tests/test_cls.py diff --git a/tests/test_cls.py b/tests/test_cls.py new file mode 100644 index 000000000..0cb8b0b0c --- /dev/null +++ b/tests/test_cls.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +import unittest +from pythainlp.cls import GzipModel + + +class TestClsPackage(unittest.TestCase): + def test_GzipModel(self): + training_data = [ + ("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"), + ("กลัวพวกมึงหาย อดกินบาบิก้อน", "Neutral"), + ("บริการแย่มากก เป็นหมอได้ไง😤", "Negative"), + ("ขับรถแย่มาก", "Negative"), + ("ดีนะครับ", "Positive"), + ("ลองแล้วรสนี้อร่อย... ชอบๆ", "Positive"), + ("ฉันรู้สึกโกรธ เวลามือถือแบตหมด", "Negative"), + ("เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ", "Positive"), + ("นี่เป็นบทความหนึ่ง", "Neutral") + ] + model = GzipModel(training_data) + self.assertIsNotNone(model.predict("รู้สึกดีจัง", k=3)) From 057b9dfb1aa316ad2d5959580a2e10b18ebbf9af Mon Sep 17 00:00:00 2001 From: c4n Date: Mon, 17 Jul 2023 19:52:57 +0700 Subject: [PATCH 5/8] change example --- notebooks/test_gzip_cls.ipynb | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/notebooks/test_gzip_cls.ipynb b/notebooks/test_gzip_cls.ipynb index 82641b656..86666b355 100644 --- a/notebooks/test_gzip_cls.ipynb +++ b/notebooks/test_gzip_cls.ipynb @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "id": "f4d047db", "metadata": {}, "outputs": [ @@ -52,22 +52,14 @@ "'Positive'" ] }, - "execution_count": 8, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "model.predict(\"ควย\", k=3)" + "model.predict(\"ฉันดีใจ\", k=1)" ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "6529c3e9", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From f9e5e235a7c68f717f2b9064ae650d983cd61555 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 17 Jul 2023 21:19:01 +0700 Subject: [PATCH 6/8] Update test_cls.py --- tests/test_cls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cls.py b/tests/test_cls.py index 0cb8b0b0c..70f56b676 100644 --- a/tests/test_cls.py +++ b/tests/test_cls.py @@ -17,4 +17,4 @@ def test_GzipModel(self): ("นี่เป็นบทความหนึ่ง", "Neutral") ] model = GzipModel(training_data) - self.assertIsNotNone(model.predict("รู้สึกดีจัง", k=3)) + self.assertEqual(model.predict("รู้สึกดีจัง", k=1), "Positive") From 4a7076b036efa1f135e111b329f82b3936f09294 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 17 Jul 2023 21:23:13 +0700 Subject: [PATCH 7/8] Create cls.rst --- docs/api/cls.rst | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 docs/api/cls.rst diff --git a/docs/api/cls.rst b/docs/api/cls.rst new file mode 100644 index 000000000..3a3f13426 --- /dev/null +++ b/docs/api/cls.rst @@ -0,0 +1,7 @@ +.. currentmodule:: pythainlp.cls + +pythainlp.cls +============= + +.. autoclass:: GzipModel + :members: From acf4533d9229ae31665f406f545adb30066798ad Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Mon, 17 Jul 2023 21:55:00 +0700 Subject: [PATCH 8/8] Update test_cls.py --- tests/test_cls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cls.py b/tests/test_cls.py index 70f56b676..984b5f4a4 100644 --- a/tests/test_cls.py +++ b/tests/test_cls.py @@ -17,4 +17,4 @@ def test_GzipModel(self): ("นี่เป็นบทความหนึ่ง", "Neutral") ] model = GzipModel(training_data) - self.assertEqual(model.predict("รู้สึกดีจัง", k=1), "Positive") + self.assertEqual(model.predict("ฉันดีใจ", k=1), "Positive")