Skip to content

Commit c8152d4

Browse files
authored
FIX html escape regex (#394)
* FIX html escape regex
* doc
1 parent efc982f commit c8152d4

File tree

2 files changed

+15
-5
lines changed

2 files changed

+15
-5
lines changed

src/doc_builder/convert_md_to_mdx.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,11 @@
2222

2323

2424
_re_doctest_flags = re.compile(r"^(>>>.*\S)(\s+)# doctest:\s+\+[A-Z_]+\s*$", flags=re.MULTILINE)
25+
_re_lt_html = re.compile(r"<(((!(DOCTYPE|--))|((\/\s*)?[a-z]+))[^>]*?)>", re.IGNORECASE)
26+
_re_lcub_svelte = re.compile(
27+
r"<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)(((?!<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)).)*)>|&amp;lcub;(#if|:else}|/if})",
28+
re.DOTALL,
29+
)
2530

2631

2732
def convert_md_to_mdx(md_text, page_info):
@@ -68,18 +73,13 @@ def convert_special_chars(text):
6873
"""
6974
Convert { and < that have special meanings in MDX.
7075
"""
71-
_re_lcub_svelte = re.compile(
72-
r"<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)(((?!<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)).)*)>|&amp;lcub;(#if|:else}|/if})",
73-
re.DOTALL,
74-
)
7576
text = text.replace("{", "&amp;lcub;")
7677
# We don't want to escape `{` that are part of svelte syntax
7778
text = _re_lcub_svelte.sub(lambda match: match[0].replace("&amp;lcub;", "{"), text)
7879
# We don't want to replace those by the HTML code, so we temporarily set them at LTHTML
7980
# source is a special tag, it can be standalone (html tag) or closing (doc tag)
8081

8182
# Temporarily replace all valid HTML tags with LTHTML
82-
_re_lt_html = re.compile(r"<(((!(DOCTYPE|--))|((\/\s*)?\w+))[^>]*?)>", re.DOTALL)
8383
text = re.sub(_re_lt_html, r"LTHTML\1>", text)
8484
# Encode remaining < symbols
8585
text = text.replace("<", "&amp;lt;")

tests/test_convert_md_to_mdx.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -122,6 +122,16 @@ def test_convert_special_chars(self):
122122
comment = "<!-- comment -->"
123123
self.assertEqual(convert_special_chars(comment), comment)
124124

125+
comment = "<!-- multi line\ncomment -->"
126+
self.assertEqual(convert_special_chars(comment), comment)
127+
128+
# Regression test for https://github.com/huggingface/doc-builder/pull/394
129+
# '<' must not be considered an HTML tag before a number
130+
self.assertEqual(
131+
convert_special_chars("something <5MB something else -> here"),
132+
"something &amp;lt;5MB something else -> here",
133+
)
134+
125135
def test_convert_img_links(self):
126136
page_info = {"package_name": "transformers", "version": "v4.10.0", "language": "fr"}
127137

0 commit comments

Comments
 (0)