huggingface · mishig25 · Sep 24, 2023 · Sep 23, 2023 · Sep 23, 2023 · Sep 23, 2023
diff --git a/kit/package-lock.json b/kit/package-lock.json
diff --git a/kit/package.json b/kit/package.json
@@ -27,6 +27,7 @@
 		"eslint-config-prettier": "^8.3.0",
 		"eslint-plugin-svelte3": "^3.2.1",
 		"highlight.js": "^11.4.0",
+		"html-tags": "^3.3.1",
 		"htmlparser2": "^7.2.0",
 		"katex": "^0.15.2",
 		"mdsvex": "^0.10.5",
@@ -38,7 +39,8 @@
 		"svelte-preprocess": "^4.10.1",
 		"tailwindcss": "^3.0.22",
 		"tslib": "^2.3.1",
-		"typescript": "~4.5.4"
+		"typescript": "~4.5.4",
+		"unist-util-visit": "^5.0.0"
 	},
 	"type": "module",
 	"dependencies": {

diff --git a/kit/preprocess.js b/kit/preprocess.js
@@ -3,6 +3,10 @@ import htmlparser2 from "htmlparser2";
 import hljs from "highlight.js";
 import { mdsvex } from "mdsvex";
 import katex from "katex";
+import { visit } from 'unist-util-visit';
+import htmlTags from 'html-tags';
+import { readdir } from 'fs/promises';
+import path from 'path';
 
 // Preprocessor that converts markdown into Docstring
 // svelte component using mdsvexPreprocess
@@ -391,7 +395,7 @@ function unescapeUnderscores(content) {
  * @param {Record<any, any>} markedKatex
  */
 function markKatex(content, markedKatex) {
-	const REGEX_LATEX_DISPLAY = /\n\$\$([\s\S]+?)\$\$/g;
+	const REGEX_LATEX_DISPLAY = /\$\$([\s\S]+?)\$\$/g;
 	const REGEX_LATEX_INLINE = /\\\\\(([\s\S]+?)\\\\\)/g;
 	let counter = 0;
 	return content
@@ -411,19 +415,72 @@ function markKatex(content, markedKatex) {
 
 function renderKatex(code, markedKatex) { // Swap each KATEXPARSE<n>MARKER found in code for the KaTeX-rendered HTML of the entry stored under that marker in markedKatex.
 	return code.replace(/KATEXPARSE[0-9]+MARKER/g, (marker) => {
-		const { tex, displayMode } = markedKatex[marker];
-		const html = katex.renderToString(renderSvelteChars(tex), {
+		let { tex, displayMode } = markedKatex[marker]; // let, not const: tex is rewritten on the next two lines
+		tex = tex.replaceAll('&#123;', "{"); // turn the &#123; entity back into a literal { before handing the TeX to KaTeX
+		tex = tex.replaceAll('&#60;', "<"); // likewise &#60; back into <
+		let html = katex.renderToString(renderSvelteChars(tex), {
 			displayMode,
 			throwOnError: false
 		});
+		html = html.replace("katex-html", "katex-html hidden"); // string pattern, so only the first "katex-html" occurrence gets " hidden" appended
 		if (html.includes(`katex-error`)) { // throwOnError is false above, so failures are detected here from the rendered output instead
 			throw new Error(`[KaTeX] Error while parsing markdown\n ${html}`);
 		}
 		return `{@html ${JSON.stringify(html)}}`; // JSON.stringify emits the HTML as a quoted, escaped string literal inside the {@html ...} tag
 	});
 }
 
+async function findSvelteComponentNames(startDir) { // Recursively collect the base name (extension removed) of every .svelte file under startDir; resolves to an array of strings.
+    let svelteFiles = []; // filled in by searchDir as matches are found
+
+    async function searchDir(directory) { // walks one directory, recursing into each subdirectory
+        const files = await readdir(directory, { withFileTypes: true }); // withFileTypes makes readdir return entries that expose isDirectory()
+
+        for (const file of files) {
+            const filePath = path.join(directory, file.name);
+            if (file.isDirectory()) {
+                await searchDir(filePath); // awaited inside the loop, so subdirectories are walked one at a time
+            } else if (path.extname(file.name) === '.svelte') {
+                svelteFiles.push(path.basename(file.name, '.svelte')); // keep only the component name: drop the .svelte extension
+            }
+        }
+    }
+
+    await searchDir(startDir);
+    return svelteFiles; // no de-duplication: the same file name in two directories appears twice
+}
+
+const dirPath = './src/lib'; // relative path handed as-is to readdir, so the result depends on the process working directory
+const svelteTags = await findSvelteComponentNames(dirPath); // top-level await: the directory scan runs once, when this module is loaded
+const validTags = [...htmlTags, ...svelteTags]; // names accepted as real tags by escapeSvelteSpecialChars: the html-tags list plus the local .svelte component names
+
+function escapeSvelteSpecialChars() { // Plugin listed in the mdsvex remarkPlugins option; returns a tree transformer that entity-escapes { and < in text nodes and in html nodes whose tag is not in validTags.
+	return transform; // the function declarations below are hoisted, so returning before them is valid
+
+	function transform(tree) { // two passes over the same tree: text nodes first, then html nodes
+		visit(tree, 'text', onText);
+		visit(tree, 'html', onHtml);
+	}
+
+	function onText(node) { // text nodes: escape every { and every <, mutating node.value in place
+		node.value = node.value.replaceAll("{", '&#123;');
+		node.value = node.value.replaceAll("<", '&#60;');
+	}
+
+	function onHtml(node) { // html nodes: left unchanged when the first tag name is in validTags, otherwise every < is escaped
+		const RE_TAG_NAME = /<\/?(\w+)/; // first opening or closing tag in the node; captures the leading word characters of its name
+		const match = node.value.match(RE_TAG_NAME);
+		if(match){ // nodes where the regex finds no tag are left untouched
+			const tagName = match[1];
+			if(!validTags.includes(tagName)){ // exact, case-sensitive comparison against the list
+				node.value = node.value.replaceAll("<", '&#60;'); // escapes every < in the node, not only the one that matched
+			}
+		}
+	}
+}
+
 const _mdsvexPreprocess = mdsvex({
+	remarkPlugins: [escapeSvelteSpecialChars],
 	extensions: ["mdx"],
 	highlight: {
 		highlighter: function (code, lang) {

diff --git a/src/doc_builder/convert_md_to_mdx.py b/src/doc_builder/convert_md_to_mdx.py
@@ -22,25 +22,6 @@
 
 
 _re_doctest_flags = re.compile(r"^(>>>.*\S)(\s+)# doctest:\s+\+[A-Z_]+\s*$", flags=re.MULTILINE)
-_re_lt_html = re.compile(
-    r"""# This regex is meant to detect any HTML tag or comment, but not standalone '<' characters.
-    <(                   # HTML tag with...
-    (
-        !DOCTYPE         # ... !DOCTYPE
-    |
-        ((\/\s*)?[a-z]+) # ... or any regular tag (i.e. starts with [a-z]
-    )
-    [^><]*?              # ... followed by anything until next closing ">"
-    )>
-    |
-    <(!--[^>]*?--)>      # Or an HTML comment
-    """,
-    re.IGNORECASE | re.VERBOSE,
-)
-_re_lcub_svelte = re.compile(
-    r"<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)(((?!<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)).)*)>|&amp;lcub;(#if|:else}|/if})",
-    re.DOTALL,
-)
 
 
 def convert_md_to_mdx(md_text, page_info):
@@ -83,28 +64,6 @@ def convert_md_to_mdx(md_text, page_info):
     )
 
 
-def convert_special_chars(text):
-    """
-    Convert { and < that have special meanings in MDX.
-    """
-    text = text.replace("{", "&amp;lcub;")
-    # We don't want to escape `{` that are part of svelte syntax
-    text = _re_lcub_svelte.sub(lambda match: match[0].replace("&amp;lcub;", "{"), text)
-    # We don't want to replace those by the HTML code, so we temporarily set them at LTHTML
-    # source is a special tag, it can be standalone (html tag) or closing (doc tag)
-
-    # Temporarily replace all valid HTML tags with LTHTML
-    # Replace with '\1\5' => 2 possible groups to catch the tag but in practice only one is not empty.
-    text = re.sub(_re_lt_html, r"LTHTML\1\5>", text)
-
-    # Encode remaining < symbols
-    text = text.replace("<", "&amp;lt;")
-    # Put back the HTML tags
-    text = text.replace("LTHTML", "<")
-
-    return text
-
-
 def convert_img_links(text, page_info):
     """
     Convert image links to correct URL paths.
@@ -199,13 +158,11 @@ def process_md(text, page_info):
     Processes markdown by:
         1. Converting include
         2. Converting literalinclude
-        3. Converting special characters
-        4. Clean doctest syntax
-        5. Converting image links
+        3. Clean doctest syntax
+        4. Converting image links
     """
     text = convert_include(text, page_info)
     text = convert_literalinclude(text, page_info)
-    text = convert_special_chars(text)
     text = clean_doctest_syntax(text)
     text = convert_img_links(text, page_info)
     return text
diff --git a/tests/test_convert_md_to_mdx.py b/tests/test_convert_md_to_mdx.py
@@ -22,7 +22,6 @@
     convert_include,
     convert_literalinclude,
     convert_md_to_mdx,
-    convert_special_chars,
     process_md,
 )
 
@@ -65,80 +64,6 @@ def test_convert_md_to_mdx(self):
 Lorem ipsum dolor sit amet, consectetur adipiscing elit"""
         self.assertEqual(convert_md_to_mdx(md_text, page_info), expected_conversion)
 
-    def test_convert_special_chars(self):
-        self.assertEqual(convert_special_chars("{ lala }"), "&amp;lcub; lala }")
-        self.assertEqual(convert_special_chars("< blo"), "&amp;lt; blo")
-        self.assertEqual(convert_special_chars("<source></source>"), "<source></source>")
-        self.assertEqual(convert_special_chars("<br>"), "<br>")
-        self.assertEqual(convert_special_chars("<hr>"), "<hr>")
-        self.assertEqual(convert_special_chars("<source>"), "<source>")
-        self.assertEqual(convert_special_chars("<Youtube id='my_vid' />"), "<Youtube id='my_vid' />")
-        self.assertEqual(convert_special_chars("</br>"), "</br>")
-        self.assertEqual(convert_special_chars("<br />"), "<br />")
-        self.assertEqual(convert_special_chars("<p>5 <= 10</p>"), "<p>5 &amp;lt;= 10</p>")
-        self.assertEqual(
-            convert_special_chars("<p align='center'>5 <= 10</p>"), "<p align='center'>5 &amp;lt;= 10</p>"
-        )
-        self.assertEqual(convert_special_chars("<p>5 <= 10"), "<p>5 &amp;lt;= 10")  # no closing tag
-        self.assertEqual(convert_special_chars("5 <= 10</p>"), "5 &amp;lt;= 10</p>")  # no opening tag
-        self.assertEqual(convert_special_chars("<a>test</b>"), "<a>test</b>")  # mismatched tags
-        self.assertEqual(convert_special_chars("<p>5 < 10</p>"), "<p>5 &amp;lt; 10</p>")
-        self.assertEqual(convert_special_chars("<p>5 > 10</p>"), "<p>5 > 10</p>")
-        self.assertEqual(convert_special_chars("<!--...-->"), "<!--...-->")  # comment
-        self.assertEqual(convert_special_chars("<!-- < -->"), "<!-- &amp;lt; -->")  # comment
-        self.assertEqual(convert_special_chars("<!DOCTYPE html> 1 < 2"), "<!DOCTYPE html> 1 &amp;lt; 2")
-
-        longer_test = """<script lang="ts">
-import Tip from "$lib/Tip.svelte";
-import Youtube from "$lib/Youtube.svelte";
-import Docstring from "$lib/Docstring.svelte";
-import CodeBlock from "$lib/CodeBlock.svelte";
-export let fw: "pt" | "tf"
-</script>"""
-        self.assertEqual(convert_special_chars(longer_test), longer_test)
-
-        nested_test = """<blockquote>
-   sometext
-   <blockquote>
-        sometext
-   </blockquote>
-</blockquote>"""
-        self.assertEqual(convert_special_chars(nested_test), nested_test)
-
-        html_code = '<a href="Some URl">some_text</a>'
-        self.assertEqual(convert_special_chars(html_code), html_code)
-
-        inner_less = """<blockquote>
-   sometext 4 &amp;lt; 5
-</blockquote>"""
-        self.assertEqual(convert_special_chars(inner_less), inner_less)
-
-        img_code = '<img src="someSrc">'
-        self.assertEqual(convert_special_chars(img_code), img_code)
-
-        video_code = '<video src="someSrc">'
-        self.assertEqual(convert_special_chars(video_code), video_code)
-
-        comment = "<!-- comment -->"
-        self.assertEqual(convert_special_chars(comment), comment)
-
-        comment = "<!-- multi line\ncomment -->"
-        self.assertEqual(convert_special_chars(comment), comment)
-
-        # Regression test for https://github.com/huggingface/doc-builder/pull/394
-        # '<' must not be considered an HTML tag before a number
-        self.assertEqual(
-            convert_special_chars("something <5MB something else -> here"),
-            "something &amp;lt;5MB something else -> here",
-        )
-
-        # Regression test for https://github.com/huggingface/doc-builder/pull/398
-        # '10K<n<100K' must be caught correctly and not grouped with the next HTML tag.
-        self.assertEqual(
-            convert_special_chars("""10K<n<100K\n<Tip>\nThis is a tip.\n</Tip>"""),
-            "10K&amp;lt;n&amp;lt;100K\n<Tip>\nThis is a tip.\n</Tip>",
-        )
-
     def test_convert_img_links(self):
         page_info = {"package_name": "transformers", "version": "v4.10.0", "language": "fr"}
 
@@ -157,8 +82,8 @@ def test_process_md(self):
 {}
 <>"""
         expected_conversion = """[img](/docs/transformers/v4.10.0/fr/imgs/img.gif)
-&amp;lcub;}
-&amp;lt;>"""
+{}
+<>"""
         self.assertEqual(process_md(text, page_info), expected_conversion)
 
     def test_convert_include(self):