Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/api/summarize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,14 @@ Modules
-------

.. autofunction:: summarize
.. autofunction:: extract_keywords

Keyword Extraction Engines
--------------------------

KeyBERT
+++++++

.. automodule:: pythainlp.summarize.keybert
.. autoclass:: pythainlp.summarize.keybert.KeyBERT
:members:
7 changes: 3 additions & 4 deletions pythainlp/summarize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
Text summarization
"""

__all__ = [
"summarize",
]
__all__ = ["summarize", "extract_keywords"]

DEFAULT_SUMMARIZE_ENGINE = "frequency"
CPE_KMUTT_THAI_SENTENCE_SUM = "mt5-cpe-kmutt-thai-sentence-sum"
DEFAULT_KEYWORD_EXTRACTION_ENGINE = "keybert"

from pythainlp.summarize.core import summarize
from pythainlp.summarize.core import summarize, extract_keywords
141 changes: 138 additions & 3 deletions pythainlp/summarize/core.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
"""
Text summarization
Text summarization and Keyword extraction
"""


from typing import List
from typing import List, Iterable, Optional, Tuple

from pythainlp.summarize import (
DEFAULT_SUMMARIZE_ENGINE,
CPE_KMUTT_THAI_SENTENCE_SUM,
DEFAULT_KEYWORD_EXTRACTION_ENGINE,
)
from pythainlp.summarize.freq import FrequencySummarizer
from pythainlp.tokenize import sent_tokenize
Expand Down Expand Up @@ -112,3 +112,138 @@ def summarize(
sents = sent_tokenize(text, engine="whitespace+newline")[:n]

return sents


def extract_keywords(
text: str,
keyphrase_ngram_range: Tuple[int, int] = (1, 2),
max_keywords: int = 5,
min_df: int = 1,
engine: str = DEFAULT_KEYWORD_EXTRACTION_ENGINE,
tokenizer: str = "newmm",
stop_words: Optional[Iterable[str]] = None,
) -> List[str]:
"""
This function returns most-relevant keywords (and/or keyphrases) from the input document.
Each algorithm may produce completely different keywords from each other,
so please be careful when choosing the algorithm.

*Note*: Calling :func: `extract_keywords()` is expensive. For repetitive use of KeyBERT (the default engine),
creating KeyBERT object is highly recommended.

:param str text: text to be summarized
:param Tuple[int, int] keyphrase_ngram_range: Number of token units to be defined as keyword.
The token unit varies w.r.t. `tokenizer_engine`.
For instance, (1, 1) means each token (unigram) can be a keyword (e.g. "เสา", "ไฟฟ้า"),
(1, 2) means one and two consecutive tokens (unigram and bigram) can be keywords
(e.g. "เสา", "ไฟฟ้า", "เสาไฟฟ้า") (default: (1, 2))
:param int max_keywords: Number of maximum keywords to be returned. (default: 5)
:param int min_df: Minimum frequency required to be a keyword. (default: 1)
:param str engine: Name of algorithm to use for keyword extraction. (default: 'keybert')
:param str tokenizer: Name of tokenizer engine to use.
Refer to options in :func: `pythainlp.tokenize.word_tokenizer() (default: 'newmm')
:param Optional[Iterable[str]] stop_words: A list of stop words (a.k.a words to be ignored).
If not specified, :func:`pythainlp.corpus.thai_stopwords` is used. (default: None)

:return: list of keywords

**Options for engine**
* *keybert* (default) - KeyBERT keyword extraction algorithm
* *frequency* - frequency of words

:Example:
::

from pythainlp.summarize import extract_keywords

text = '''
อาหาร หมายถึง ของแข็งหรือของเหลว
ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว
จะทำให้เกิดพลังงานและความร้อนแก่ร่างกาย
ทำให้ร่างกายเจริญเติบโต
ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย
ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ
อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย
'''

keywords = extract_keywords(text)

# output: ['อวัยวะต่างๆ',
# 'ซ่อมแซมส่วน',
# 'เจริญเติบโต',
# 'ควบคุมการเปลี่ยนแปลง',
# 'มีพิษ']

keywords = extract_keywords(text, max_keywords=10)

# output: ['อวัยวะต่างๆ',
# 'ซ่อมแซมส่วน',
# 'เจริญเติบโต',
# 'ควบคุมการเปลี่ยนแปลง',
# 'มีพิษ',
# 'ทำให้ร่างกาย',
# 'ร่างกายเจริญเติบโต',
# 'จะทำให้เกิด',
# 'มีพิษและ',
# 'เกิดโทษ']

"""

def rank_by_frequency(
text: str,
max_keywords: int = 5,
min_df: int = 5,
tokenizer: str = "newmm",
stop_words: Optional[Iterable[str]] = None,
):
from pythainlp.util.keywords import rank
from pythainlp.tokenize import word_tokenize

tokens = word_tokenize(text, engine=tokenizer, keep_whitespace=False)

use_custom_stop_words = stop_words is not None

if use_custom_stop_words:
tokens = [token for token in tokens if token not in stop_words]

word_rank = rank(tokens, exclude_stopwords=not use_custom_stop_words)

keywords = [
kw
for kw, cnt in word_rank.most_common(max_keywords)
if cnt >= min_df
]

return keywords

engines = ["keybert", "frequency"]

if engine == "keybert":
from .keybert import KeyBERT

keywords = KeyBERT().extract_keywords(
text,
keyphrase_ngram_range=keyphrase_ngram_range,
max_keywords=max_keywords,
min_df=min_df,
tokenizer=tokenizer,
return_similarity=False,
stop_words=stop_words,
)
elif engine == "frequency":
return rank_by_frequency(
text,
max_keywords=max_keywords,
min_df=min_df,
tokenizer=tokenizer,
stop_words=stop_words,
)

else:
# currently not supported
raise ValueError(
f"Keyword extractor {repr(engine)} is currently not supported. "
f"Use one of {engines}."
)

return keywords
Loading