PyThaiNLP · wannaphong · Jul 18, 2023 · Jul 17, 2023 · Jul 17, 2023 · Jul 17, 2023
diff --git a/docs/api/cls.rst b/docs/api/cls.rst
@@ -0,0 +1,7 @@
+.. currentmodule:: pythainlp.cls
+
+pythainlp.cls
+=============
+
+.. autoclass:: GzipModel
+   :members:
diff --git a/notebooks/test_gzip_cls.ipynb b/notebooks/test_gzip_cls.ipynb
@@ -0,0 +1,86 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "1480d345",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pythainlp.cls.param_free"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "78fca9fa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_data =  [\n",
+    "        (\"รายละเอียดตามนี้เลยค่าา ^^\", \"Neutral\"),\n",
+    "        (\"กลัวพวกมึงหาย อดกินบาบิก้อน\", \"Neutral\"),\n",
+    "        (\"บริการแย่มากก เป็นหมอได้ไง😤\", \"Negative\"),\n",
+    "        (\"ขับรถแย่มาก\", \"Negative\"),\n",
+    "        (\"ดีนะครับ\", \"Positive\"),\n",
+    "        (\"ลองแล้วรสนี้อร่อย... ชอบๆ\", \"Positive\"),\n",
+    "        (\"ฉันรู้สึกโกรธ เวลามือถือแบตหมด\", \"Negative\"),\n",
+    "        (\"เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ\", \"Positive\"),\n",
+    "        (\"นี่เป็นบทความหนึ่ง\", \"Neutral\")\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "94c1495f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = pythainlp.cls.param_free.GzipModel(training_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "f4d047db",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Positive'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.predict(\"ฉันดีใจ\", k=1)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pythainlp/cls/__init__.py b/pythainlp/cls/__init__.py
@@ -0,0 +1,21 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+pythainlp.cls
+"""
+
+__all__ = ["GzipModel"]
+
+from pythainlp.cls.param_free import GzipModel
diff --git a/pythainlp/cls/param_free.py b/pythainlp/cls/param_free.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gzip
+from collections import Counter
+from typing import Dict, List, Tuple, Union
+
+import numpy as np
+
+
+class GzipModel:
+    """
+    k-nearest-neighbour text classifier based on gzip compression.
+
+    This class is a reimplementation of “Low-Resource” Text Classification:
+    A Parameter-Free Classification Method with Compressors
+    (Jiang et al., Findings 2023).
+
+    :param list training_data: list [(text_sample,label)]
+    """
+
+    def __init__(self, training_data: List[Tuple[str, str]]):
+        # Kept as a NumPy array so predict() can pick rows with an index array.
+        self.training_data = np.array(training_data)
+        # Compressed size of each training text, computed once up front.
+        self.Cx2_list = self.train()
+
+    @staticmethod
+    def _compressed_len(text: str) -> int:
+        """Return the size in bytes of ``text`` after gzip compression."""
+        return len(gzip.compress(text.encode("utf-8")))
+
+    def train(self) -> List[int]:
+        """
+        Compute the gzip-compressed size of every training text.
+
+        :return: compressed sizes, in the same order as ``training_data``
+        :rtype: List[int]
+        """
+        return [
+            self._compressed_len(sample[0]) for sample in self.training_data
+        ]
+
+    def predict(self, x1: str, k: int = 1):
+        """
+        Predict the label of ``x1`` by voting among its nearest neighbours.
+
+        Neighbours are ranked by the normalized compression distance (NCD)
+        between ``x1`` and each training text.
+
+        :param str x1: text to classify
+        :param int k: number of nearest training samples that vote
+        :return: most frequent label among the ``k`` nearest samples; on a
+                 tie, the tied label whose sample is nearest to ``x1``
+        :rtype: str
+        :raises ValueError: if ``k`` is smaller than 1
+        """
+        if k < 1:
+            raise ValueError("k must be a positive integer")
+        Cx1 = self._compressed_len(x1)
+        distance_from_x1 = []
+        for sample, Cx2 in zip(self.training_data, self.Cx2_list):
+            Cx1x2 = self._compressed_len("".join([x1, sample[0]]))
+            # normalized compression distance
+            ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
+            distance_from_x1.append(ncd)
+        sorted_idx = np.argsort(np.array(distance_from_x1))
+        top_k_class = self.training_data[sorted_idx[:k], 1]
+        # Count the label values themselves: np.unique(...) counts are
+        # ordered by sorted unique label, so they cannot be used to index
+        # top_k_class. Counter keeps nearest-first order, which makes the
+        # nearest neighbour win ties.
+        return Counter(top_k_class).most_common(1)[0][0]
diff --git a/tests/test_cls.py b/tests/test_cls.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+import unittest
+from pythainlp.cls import GzipModel
+
+
+class TestClsPackage(unittest.TestCase):
+    """Tests for the ``pythainlp.cls`` package."""
+
+    def test_GzipModel(self):
+        """GzipModel labels a short text from nine labelled samples."""
+        samples = [
+            ("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"),
+            ("กลัวพวกมึงหาย อดกินบาบิก้อน", "Neutral"),
+            ("บริการแย่มากก เป็นหมอได้ไง😤", "Negative"),
+            ("ขับรถแย่มาก", "Negative"),
+            ("ดีนะครับ", "Positive"),
+            ("ลองแล้วรสนี้อร่อย... ชอบๆ", "Positive"),
+            ("ฉันรู้สึกโกรธ เวลามือถือแบตหมด", "Negative"),
+            ("เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ", "Positive"),
+            ("นี่เป็นบทความหนึ่ง", "Neutral"),
+        ]
+        classifier = GzipModel(samples)
+        predicted = classifier.predict("ฉันดีใจ", k=1)
+        self.assertEqual(predicted, "Positive")