FIX html escape regex (#394)

Wauplin · web-flow · commit c8152d4d1cb4 · 2023-08-29T15:34:48.000+02:00
* FIX html escape regex

* doc
diff --git a/src/doc_builder/convert_md_to_mdx.py b/src/doc_builder/convert_md_to_mdx.py
@@ -22,6 +22,11 @@
 
 
 _re_doctest_flags = re.compile(r"^(>>>.*\S)(\s+)# doctest:\s+\+[A-Z_]+\s*$", flags=re.MULTILINE)
+_re_lt_html = re.compile(r"<(((!(DOCTYPE|--))|((\/\s*)?[a-z]+))[^>]*?)>", re.IGNORECASE)
+_re_lcub_svelte = re.compile(
+    r"<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)(((?!<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)).)*)>|&amp;lcub;(#if|:else}|/if})",
+    re.DOTALL,
+)
 
 
 def convert_md_to_mdx(md_text, page_info):
@@ -68,18 +73,13 @@ def convert_special_chars(text):
     """
     Convert { and < that have special meanings in MDX.
     """
-    _re_lcub_svelte = re.compile(
-        r"<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)(((?!<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)).)*)>|&amp;lcub;(#if|:else}|/if})",
-        re.DOTALL,
-    )
     text = text.replace("{", "&amp;lcub;")
     # We don't want to escape `{` that are part of svelte syntax
     text = _re_lcub_svelte.sub(lambda match: match[0].replace("&amp;lcub;", "{"), text)
     # We don't want to replace those by the HTML code, so we temporarily set them at LTHTML
     # source is a special tag, it can be standalone (html tag) or closing (doc tag)
 
     # Temporarily replace all valid HTML tags with LTHTML
-    _re_lt_html = re.compile(r"<(((!(DOCTYPE|--))|((\/\s*)?\w+))[^>]*?)>", re.DOTALL)
     text = re.sub(_re_lt_html, r"LTHTML\1>", text)
     # Encode remaining < symbols
     text = text.replace("<", "&amp;lt;")
diff --git a/tests/test_convert_md_to_mdx.py b/tests/test_convert_md_to_mdx.py
@@ -122,6 +122,16 @@ def test_convert_special_chars(self):
         comment = "<!-- comment -->"
         self.assertEqual(convert_special_chars(comment), comment)
 
+        comment = "<!-- multi line\ncomment -->"
+        self.assertEqual(convert_special_chars(comment), comment)
+
+        # Regression test for https://github.com/huggingface/doc-builder/pull/394
+        # A '<' directly followed by a digit (e.g. "<5MB") is not an HTML tag and must be escaped as text
+        self.assertEqual(
+            convert_special_chars("something <5MB something else -> here"),
+            "something &amp;lt;5MB something else -> here",
+        )
+
     def test_convert_img_links(self):
         page_info = {"package_name": "transformers", "version": "v4.10.0", "language": "fr"}