diff --git a/pythainlp/tokenize/_utils.py b/pythainlp/tokenize/_utils.py
new file mode 100644
index 000000000..63d54bf8c
--- /dev/null
+++ b/pythainlp/tokenize/_utils.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+"""
+Utility functions for tokenize module.
+"""
+
+import re
+from typing import List, Callable
+
+_DIGITS_WITH_SEPARATOR = re.compile(r"(\d+[\.\,:])+\d+")
+
+
+def apply_postprocessors(
+    segments: List[str], postprocessors: List[Callable[[List[str]], List[str]]]
+) -> List[str]:
+    """
+    Apply a chain of postprocessors to a raw segmentation result.
+    """
+    for func in postprocessors:
+        segments = func(segments)
+
+    return segments
+
+
+def rejoin_formatted_num(segments: List[str]) -> List[str]:
+    """
+    Rejoin well-known formatted numeric strings that are over-tokenized.
+    They are sequences of digits separated by ":", ",", or ".",
+    such as times, decimals, comma-separated numbers, and IP addresses.
+
+    :param List[str] segments: result from word tokenizer
+    :return: a list of fixed tokens
+    :rtype: List[str]
+
+    :Example:
+        tokens = ['ขณะ', 'นี้', 'เวลา', ' ', '12', ':', '00น', ' ', 'อัตรา',
+            'แลกเปลี่ยน', ' ', '1', ',', '234', '.', '5', ' ', 'baht/zeny']
+        rejoin_formatted_num(tokens)
+        # output:
+        # ['ขณะ', 'นี้', 'เวลา', ' ', '12:00น', ' ', 'อัตรา', 'แลกเปลี่ยน', ' ', '1,234.5', ' ', 'baht/zeny']
+
+        tokens = ['IP', ' ', 'address', ' ', 'ของ', 'คุณ', 'คือ', ' ', '127', '.', '0', '.', '0', '.', '1', ' ', 'ครับ']
+        rejoin_formatted_num(tokens)
+        # output:
+        # ['IP', ' ', 'address', ' ', 'ของ', 'คุณ', 'คือ', ' ', '127.0.0.1', ' ', 'ครับ']
+    """
+    original = "".join(segments)
+    matching_results = _DIGITS_WITH_SEPARATOR.finditer(original)
+    tokens_joined = []
+    pos = 0
+    segment_idx = 0
+
+    match = next(matching_results, None)
+    while segment_idx < len(segments) and match:
+        is_span_beginning = pos >= match.start()
+        token = segments[segment_idx]
+        if is_span_beginning:
+            connected_token = ""
+            while pos < match.end() and segment_idx < len(segments):
+                connected_token += segments[segment_idx]
+                pos += len(segments[segment_idx])
+                segment_idx += 1
+
+            tokens_joined.append(connected_token)
+            match = next(matching_results, None)
+        else:
+            tokens_joined.append(token)
+            segment_idx += 1
+            pos += len(token)
+    tokens_joined += segments[segment_idx:]
+    return tokens_joined
+
+
+def strip_whitespace(segments: List[str]) -> List[str]:
+    """
+    Strip whitespace(s) off each token and remove whitespace tokens.
+    :param List[str] segments: result from word tokenizer
+    :return: a list of tokens
+    :rtype: List[str]
+
+    :Example:
+    tokens = [" ", "วันนี้ ", "เวลา ", "19.00น"]
+    strip_whitespace(tokens)
+    # ["วันนี้", "เวลา", "19.00น"]
+
+    """
+    segments = [token.strip(" ") for token in segments if token.strip(" ")]
+    return segments
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 6b918be12..2846e7a90 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -13,6 +13,11 @@
     DEFAULT_WORD_DICT_TRIE,
     DEFAULT_WORD_TOKENIZE_ENGINE,
 )
+from pythainlp.tokenize._utils import (
+    apply_postprocessors,
+    rejoin_formatted_num,
+    strip_whitespace,
+)
 from pythainlp.util.trie import Trie, dict_trie


@@ -47,7 +52,9 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
     return segment(doc)


-def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = "str") -> Union[str, List[str]]:
+def word_detokenize(
+    segments: Union[List[List[str]], List[str]], output: str = "str"
+) -> Union[str, List[str]]:
     """
     Word detokenizer.
@@ -62,6 +69,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = " if isinstance(segments[0], str): segments = [segments] from pythainlp import thai_characters + for i, s in enumerate(segments): _list_sents = [] _add_index = [] @@ -70,7 +78,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = " for j, w in enumerate(s): if j > 0: # previous word - p_w = s[j-1] + p_w = s[j - 1] # if w is number or other language and not be space if ( w[0] not in thai_characters @@ -88,9 +96,9 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = " if not p_w.isspace(): _list_sents.append(" ") _mark_index.append(j) - elif w.isspace() and j-1 not in _space_index: + elif w.isspace() and j - 1 not in _space_index: _space_index.append(j) - elif j-1 in _mark_index: + elif j - 1 in _mark_index: _list_sents.append(" ") _list_sents.append(w) _list_all.append(_list_sents) @@ -103,7 +111,7 @@ def word_detokenize(segments: Union[List[List[str]], List[str]], output: str = " for j in i: _temp += j _text.append(_temp) - return ' '.join(_text) + return " ".join(_text) def word_tokenize( @@ -111,6 +119,7 @@ def word_tokenize( custom_dict: Trie = None, engine: str = DEFAULT_WORD_TOKENIZE_ENGINE, keep_whitespace: bool = True, + join_broken_num: bool = True, ) -> List[str]: """ Word tokenizer. @@ -123,37 +132,47 @@ def word_tokenize( :param bool keep_whitespace: True to keep whitespaces, a common mark for end of phrase in Thai. Otherwise, whitespaces are omitted. + :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated. + Otherwise, formatted numeric could be wrongly separated. + :return: list of words :rtype: List[str] **Options for engine** - * *newmm* (default) - dictionary-based, Maximum Matching + - Thai Character Cluster - * *newmm-safe* - newmm, with a mechanism to help avoid long - processing time for text with continuous ambiguous breaking points - * *mm* or *multi_cut* - dictionary-based, Maximum Matching. - * *nlpo3* - Python binding for nlpO3. It is newmm engine in Rust. - * *longest* - dictionary-based, Longest Matching - * *icu* - wrapper for ICU (International Components for Unicode, - using PyICU), dictionary-based * *attacut* - wrapper for `AttaCut `_., learning-based approach * *deepcut* - wrapper for `DeepCut `_, learning-based approach - * *nercut* - Dictionary-based maximal matching word segmentation, + * *icu* - wrapper for a word tokenizer in + `PyICU `_., + from ICU (International Components for Unicode), + dictionary-based + * *longest* - dictionary-based, longest matching + * *mm* - "multi-cut", dictionary-based, maximum matching + * *nercut* - dictionary-based, maximal matching, constrained with Thai Character Cluster (TCC) boundaries, - and combining tokens that are parts of the same named-entity. 
+ combining tokens that are parts of the same named-entity + * *newmm* (default) - "new multi-cut", + dictionary-based, maximum matching, + constrained with Thai Character Cluster (TCC) boundaries + * *newmm-safe* - newmm, with a mechanism to avoid long + processing time for text with continuous ambiguous breaking points + * *nlpo3* - wrapper for a word tokenizer in + `nlpO3 `_., + newmm adaptation in Rust (2.5x faster) + * *oskut* - wrapper for + `OSKut `_., + Out-of-domain StacKed cut for Word Segmentation * *sefr_cut* - wrapper for `SEFR CUT `_., + Stacked Ensemble Filter and Refine for Word Segmentation * *tltk* - wrapper for `TLTK `_., - * *oskut* - wrapper for - `OSKut `_., - + maximum collocation approach :Note: - - The parameter **custom_dict** can be provided as an argument \ - only for *newmm*, *longest*, and *deepcut* engine. + - The **custom_dict** parameter only works for \ + *deepcut*, *longest*, *newmm*, and *newmm-safe* engines. :Example: Tokenize text with different tokenizer:: @@ -178,6 +197,19 @@ def word_tokenize( word_tokenize(text, engine="newmm", keep_whitespace=False) # output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว'] + + Join broken formatted numeric (e.g. time, decimals, IP address):: + + text = "เงิน1,234บาท19:32น 127.0.0.1" + + word_tokenize(text, engine="attacut", join_broken_num=False) + # output: + # ['เงิน', '1', ',', '234', 'บาท', '19', ':', '32น', ' ', + # '127', '.', '0', '.', '0', '.', '1'] + + word_tokenize(text, engine="attacut", join_broken_num=True) + # output: + # ['เงิน', '1,234', 'บาท', '19:32น', ' ', '127.0.0.1'] Tokenize with default and custom dictionary:: @@ -199,8 +231,8 @@ def word_tokenize( word_tokenize(text, engine="newmm", custom_dict=trie)) # output: - # ['ชินโซ', ' ', 'อาเบะ', - # ' ', 'เกิด', ' ', '21', ' ', 'กันยายน'] + # ['ชินโซ', ' ', 'อาเบะ', ' ', + # 'เกิด', ' ', '21', ' ', 'กันยายน'] """ if not text or not isinstance(text, str): return [] @@ -257,6 +289,7 @@ def word_tokenize( segments = segment(text) elif engine == "nlpo3": from pythainlp.tokenize.nlpo3 import segment + if isinstance(custom_dict, str): segments = segment(text, custom_dict=custom_dict) elif not isinstance(custom_dict, str) and custom_dict is not None: @@ -274,8 +307,14 @@ def word_tokenize( It might be a typo; if not, please consult our document.""" ) + postprocessors = [] + if join_broken_num: + postprocessors.append(rejoin_formatted_num) + if not keep_whitespace: - segments = [token.strip(" ") for token in segments if token.strip(" ")] + postprocessors.append(strip_whitespace) + + segments = apply_postprocessors(segments, postprocessors) return segments @@ -297,12 +336,12 @@ def sent_tokenize( :rtype: list[str] **Options for engine** * *crfcut* - (default) split by CRF trained on TED dataset + * *thaisum* - The implementation of sentence segmentator from \ + Nakhun Chumpolsathien, 2020 + * *tltk* - split by `TLTK `_., * *whitespace+newline* - split by whitespaces and newline. * *whitespace* - split by whitespaces. 
Specifiaclly, with \ :class:`regex` pattern ``r" +"`` - * *tltk* - split by `TLTK `_., - * *thaisum* - The implementation of sentence segmentator from \ - Nakhun Chumpolsathien, 2020 :Example: Split the text based on *whitespace*:: @@ -364,7 +403,10 @@ def sent_tokenize( segments = segment(text) elif engine == "thaisum": - from pythainlp.tokenize.thaisumcut import ThaiSentenceSegmentor as segmentor + from pythainlp.tokenize.thaisumcut import ( + ThaiSentenceSegmentor as segmentor, + ) + segment = segmentor() segments = segment.split_into_sentences(text) else: @@ -374,7 +416,7 @@ def sent_tokenize( ) if not keep_whitespace: - segments = [token.strip(" ") for token in segments if token.strip(" ")] + segments = strip_whitespace(segments) return segments @@ -405,13 +447,12 @@ def subword_tokenize( :return: list of subwords :rtype: list[str] **Options for engine** - * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) - * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) - * *wangchanberta* - SentencePiece from wangchanberta model. * *dict* - newmm word tokenizer with a syllable dictionary + * *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001) * *ssg* - CRF syllable segmenter for Thai + * *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000) * *tltk* - syllable tokenizer from tltk - + * *wangchanberta* - SentencePiece from wangchanberta model :Example: Tokenize text into subword based on *tcc*:: @@ -485,7 +526,7 @@ def subword_tokenize( segments = segment(text) if not keep_whitespace: - segments = [token.strip(" ") for token in segments if token.strip(" ")] + segments = strip_whitespace(segments) return segments @@ -562,6 +603,7 @@ def __init__( custom_dict: Union[Trie, Iterable[str], str] = None, engine: str = "newmm", keep_whitespace: bool = True, + join_broken_num: bool = True, ): """ Initialize tokenizer object. 
@@ -584,9 +626,11 @@ def __init__( raise NotImplementedError( """ The Tokenizer class is not support %s for custom tokenizer - """ % self.__engine + """ + % self.__engine ) self.__keep_whitespace = keep_whitespace + self.__join_broken_num = join_broken_num def word_tokenize(self, text: str) -> List[str]: """ @@ -601,6 +645,7 @@ def word_tokenize(self, text: str) -> List[str]: custom_dict=self.__trie_dict, engine=self.__engine, keep_whitespace=self.__keep_whitespace, + join_broken_num=self.__join_broken_num, ) def set_tokenize_engine(self, engine: str) -> None: diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index c1cf41340..d0999d771 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -242,48 +242,78 @@ def test_sent_tokenize(self): ] self.assertEqual( - sent_tokenize(sent_1, engine="crfcut"), sent_1_toks, + sent_tokenize(sent_1, engine="crfcut"), + sent_1_toks, ) self.assertEqual( - sent_tokenize(sent_2, engine="crfcut"), sent_2_toks, + sent_tokenize(sent_2, engine="crfcut"), + sent_2_toks, ) self.assertEqual( - sent_tokenize(sent_3, engine="crfcut"), sent_3_toks, + sent_tokenize(sent_3, engine="crfcut"), + sent_3_toks, ) self.assertEqual( - sent_tokenize(sent_1), sent_1_toks, + sent_tokenize(sent_1), + sent_1_toks, ) self.assertEqual( - sent_tokenize(sent_2), sent_2_toks, + sent_tokenize(sent_2), + sent_2_toks, ) self.assertEqual( - sent_tokenize(sent_3), sent_3_toks, + sent_tokenize(sent_3), + sent_3_toks, ) self.assertIsNotNone( - sent_tokenize(sent_1, keep_whitespace=False, engine="whitespace",), + sent_tokenize( + sent_1, + keep_whitespace=False, + engine="whitespace", + ), ) self.assertIsNotNone( - sent_tokenize(sent_1, engine="tltk",), + sent_tokenize( + sent_1, + engine="tltk", + ), ) self.assertIsNotNone( - sent_tokenize(sent_2, engine="tltk",), + sent_tokenize( + sent_2, + engine="tltk", + ), ) self.assertIsNotNone( - sent_tokenize(sent_3, engine="tltk",), + sent_tokenize( + sent_3, + engine="tltk", + ), ) self.assertIsNotNone( - sent_tokenize(sent_1, engine="thaisum",), + sent_tokenize( + sent_1, + engine="thaisum", + ), ) self.assertIsNotNone( - sent_tokenize(sent_2, engine="thaisum",), + sent_tokenize( + sent_2, + engine="thaisum", + ), ) self.assertIsNotNone( - sent_tokenize(sent_3, engine="thaisum",), + sent_tokenize( + sent_3, + engine="thaisum", + ), ) self.assertFalse( " " in sent_tokenize( - sent_1, engine="whitespace", keep_whitespace=False, + sent_1, + engine="whitespace", + keep_whitespace=False, ) ) with self.assertRaises(ValueError): @@ -322,9 +352,12 @@ def test_subword_tokenize(self): " " in subword_tokenize("พันธมิตร ชา นม", keep_whitespace=False) ) self.assertEqual( - subword_tokenize("สวัสดีชาวโลก", engine="dict"), ["สวัส", "ดี", "ชาว", "โลก"] + subword_tokenize("สวัสดีชาวโลก", engine="dict"), + ["สวัส", "ดี", "ชาว", "โลก"], + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีชาวโลก", engine="dict") ) - self.assertFalse("า" in subword_tokenize("สวัสดีชาวโลก", engine="dict")) self.assertEqual(subword_tokenize(None, engine="ssg"), []) self.assertEqual( subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"] @@ -346,9 +379,7 @@ def test_subword_tokenize(self): self.assertFalse( "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tltk") ) - self.assertIsInstance( - subword_tokenize("โควิด19", engine="tltk"), list - ) + self.assertIsInstance(subword_tokenize("โควิด19", engine="tltk"), list) with self.assertRaises(ValueError): subword_tokenize("นกแก้ว", engine="XX") # engine does not exist @@ -436,20 +467,18 @@ def 
test_tltk(self): self.assertEqual(tltk.segment(None), []) self.assertEqual(tltk.segment(""), []) self.assertEqual( - tltk.syllable_tokenize( - "ฉันรักภาษาไทยเพราะฉันเป็นคนไทย" - ), + tltk.syllable_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย"), [ - 'ฉัน', - 'รัก', - 'ภา', - 'ษา', - 'ไทย', - 'เพราะ', - 'ฉัน', - 'เป็น', - 'คน', - 'ไทย' + "ฉัน", + "รัก", + "ภา", + "ษา", + "ไทย", + "เพราะ", + "ฉัน", + "เป็น", + "คน", + "ไทย", ], ) self.assertEqual(tltk.syllable_tokenize(None), []) @@ -471,7 +500,8 @@ def test_longest(self): ["ปวด", "เฉียบพลัน"], ) self.assertEqual( - longest_tokenizer.word_tokenize("เฉียบพลัน"), ["เฉียบพลัน"], + longest_tokenizer.word_tokenize("เฉียบพลัน"), + ["เฉียบพลัน"], ) def test_mm(self): @@ -486,15 +516,15 @@ def test_mm(self): ) self.assertEqual( word_tokenize("19...", engine="mm"), - ['19', '...'], + ["19", "..."], ) self.assertEqual( word_tokenize("19.", engine="mm"), - ['19', '.'], + ["19", "."], ) self.assertEqual( word_tokenize("19.84", engine="mm"), - ['19.84'], + ["19.84"], ) self.assertEqual( word_tokenize("127.0.0.1", engine="mm"), @@ -502,7 +532,7 @@ def test_mm(self): ) self.assertEqual( word_tokenize("USD1,984.42", engine="mm"), - ['USD', '1,984.42'], + ["USD", "1,984.42"], ) self.assertIsNotNone(multi_cut.mmcut("ทดสอบ")) @@ -521,15 +551,15 @@ def test_newmm(self): ) self.assertEqual( word_tokenize("19...", engine="newmm"), - ['19', '...'], + ["19", "..."], ) self.assertEqual( word_tokenize("19.", engine="newmm"), - ['19', '.'], + ["19", "."], ) self.assertEqual( word_tokenize("19.84", engine="newmm"), - ['19.84'], + ["19.84"], ) self.assertEqual( word_tokenize("127.0.0.1", engine="newmm"), @@ -537,7 +567,7 @@ def test_newmm(self): ) self.assertEqual( word_tokenize("USD1,984.42", engine="newmm"), - ['USD', '1,984.42'], + ["USD", "1,984.42"], ) self.assertEqual( word_tokenize( @@ -561,7 +591,11 @@ def test_newmm(self): ["จุ๋ม", "ง่วง"], ) self.assertFalse( - " " in word_tokenize("จุ๋มง่วง", keep_whitespace=False,) + " " + in word_tokenize( + "จุ๋มง่วง", + keep_whitespace=False, + ) ) def test_newmm_longtext(self): @@ -596,13 +630,12 @@ def test_nercut(self): self.assertEqual(nercut.segment(None), []) self.assertEqual(nercut.segment(""), []) self.assertIsNotNone(nercut.segment("ทดสอบ")) - self.assertEqual(nercut.segment("ทันแน่ๆ"), ['ทัน', 'แน่ๆ']) - self.assertEqual(nercut.segment("%1ครั้ง"), ['%', '1', 'ครั้ง']) - self.assertEqual(nercut.segment("ทุ๊กกโคนน"), ['ทุ๊กกโคนน']) - self.assertEqual(nercut.segment("อือหือ"), ['อือหือ']) + self.assertEqual(nercut.segment("ทันแน่ๆ"), ["ทัน", "แน่ๆ"]) + self.assertEqual(nercut.segment("%1ครั้ง"), ["%", "1", "ครั้ง"]) + self.assertEqual(nercut.segment("ทุ๊กกโคนน"), ["ทุ๊กกโคนน"]) + self.assertEqual(nercut.segment("อือหือ"), ["อือหือ"]) self.assertEqual( - nercut.segment("อย่าลืมอัพการ์ดนะจ๊ะ"), - ['อย่าลืมอัพการ์ดนะจ๊ะ'] + nercut.segment("อย่าลืมอัพการ์ดนะจ๊ะ"), ["อย่าลืมอัพการ์ดนะจ๊ะ"] ) self.assertIsNotNone(word_tokenize("ทดสอบ", engine="nercut")) @@ -644,29 +677,74 @@ def test_oskut(self): def test_word_detokenize(self): self.assertEqual( - word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), - "ผมเลี้ยง 5 ตัว" + word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), "ผมเลี้ยง 5 ตัว" ) - self.assertEqual(word_detokenize( - ["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"), - [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]] + self.assertEqual( + word_detokenize(["ผม", "เลี้ยง", " ", "5", "ตัว"], "list"), + [["ผม", "เลี้ยง", " ", "5", " ", "ตัว"]], ) self.assertEqual( word_detokenize( ["ผม", "เลี้ยง", "5", "10", "ตัว", "ๆ", "คน", "ดี"] ), - 
"ผมเลี้ยง 5 10 ตัว ๆ คนดี" + "ผมเลี้ยง 5 10 ตัว ๆ คนดี", ) self.assertEqual( word_detokenize( ["ผม", "เลี้ยง", "5", "ตัว", " ", "ๆ", "คน", "ดี"] ), - "ผมเลี้ยง 5 ตัว ๆ คนดี" + "ผมเลี้ยง 5 ตัว ๆ คนดี", ) self.assertTrue( isinstance(word_detokenize(["ผม", "เลี้ยง", "5", "ตัว"]), str) ) self.assertEqual( word_detokenize(["ม่ายย", " ", "ผม", "เลี้ยง", "5", "ตัว"]), - "ม่ายย ผมเลี้ยง 5 ตัว" + "ม่ายย ผมเลี้ยง 5 ตัว", + ) + + def test_numeric_data_format(self): + engines = ["attacut", "deepcut", "newmm", "sefr_cut"] + + for engine in engines: + self.assertIn( + "127.0.0.1", + word_tokenize("ไอพีของคุณคือ 127.0.0.1 ครับ", engine=engine), + ) + + tokens = word_tokenize( + "เวลา 12:12pm มีโปรโมชั่น 11.11", engine=engine + ) + self.assertTrue( + any([value in tokens for value in ["12:12pm", "12:12"]]), + msg=f"{engine}: {tokens}", + ) + self.assertIn("11.11", tokens) + + self.assertIn( + "1,234,567.89", + word_tokenize("รางวัลมูลค่า 1,234,567.89 บาท", engine=engine), + ) + + tokens = word_tokenize("อัตราส่วน 2.5:1 คือ 5:2", engine=engine) + self.assertIn("2.5:1", tokens) + self.assertIn("5:2", tokens) + + # try turning off `join_broken_num` + engine = "attacut" + self.assertNotIn( + "127.0.0.1", + word_tokenize( + "ไอพีของคุณคือ 127.0.0.1 ครับ", + engine=engine, + join_broken_num=False, + ), + ) + self.assertNotIn( + "1,234,567.89", + word_tokenize( + "รางวัลมูลค่า 1,234,567.89 บาท", + engine=engine, + join_broken_num=False, + ), )