diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 73b98a88a..587c15eb9 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -439,7 +439,7 @@ def sent_tokenize( return segments -def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]: +def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold:float=0.5) -> List[List[str]]: """ Paragraph tokenizer. @@ -485,7 +485,8 @@ def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]: else: _size = engine.split("-")[-1] from pythainlp.tokenize.wtsplit import tokenize as segment - segments = segment(text,size=_size,tokenize="paragraph") + segments = segment(text,size=_size,tokenize="paragraph",paragraph_threshold=paragraph_threshold) + else: raise ValueError( f"""Tokenizer \"{engine}\" not found. diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py index 20c8a8eb1..6364aeffa 100644 --- a/pythainlp/tokenize/wtsplit.py +++ b/pythainlp/tokenize/wtsplit.py @@ -28,7 +28,8 @@ def _tokenize( text:str, lang_code:str="th", model:str="wtp-bert-mini", - tokenize:str="sentence" + tokenize:str="sentence", + paragraph_threshold:float=0.5, )-> List[str]: global _MODEL_NAME,_MODEL if _MODEL_NAME != model: @@ -40,11 +41,12 @@ def _tokenize( return _MODEL.split( text, lang_code=lang_code, - do_paragraph_segmentation=True + do_paragraph_segmentation=True, + paragraph_threshold=paragraph_threshold ) -def tokenize(text:str, size:str="mini", tokenize:str="sentence")-> List[str]: +def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_threshold:float=0.5)-> List[str]: _model_load="" if size=="tiny": _model_load="wtp-bert-tiny" @@ -54,4 +56,4 @@ def tokenize(text:str, size:str="mini", tokenize:str="sentence")-> List[str]: _model_load="wtp-canine-s-12l" else: # mini _model_load="wtp-bert-mini" - return _tokenize(text, model=_model_load,tokenize=tokenize) + return _tokenize(text, model=_model_load,tokenize=tokenize,paragraph_threshold=paragraph_threshold)