Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 75 additions & 1 deletion kit/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion kit/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"eslint-config-prettier": "^8.3.0",
"eslint-plugin-svelte3": "^3.2.1",
"highlight.js": "^11.4.0",
"html-tags": "^3.3.1",
"htmlparser2": "^7.2.0",
"katex": "^0.15.2",
"mdsvex": "^0.10.5",
Expand All @@ -38,7 +39,8 @@
"svelte-preprocess": "^4.10.1",
"tailwindcss": "^3.0.22",
"tslib": "^2.3.1",
"typescript": "~4.5.4"
"typescript": "~4.5.4",
"unist-util-visit": "^5.0.0"
},
"type": "module",
"dependencies": {
Expand Down
63 changes: 60 additions & 3 deletions kit/preprocess.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ import htmlparser2 from "htmlparser2";
import hljs from "highlight.js";
import { mdsvex } from "mdsvex";
import katex from "katex";
import { visit } from 'unist-util-visit';
import htmlTags from 'html-tags';
import { readdir } from 'fs/promises';
import path from 'path';

// Preprocessor that converts markdown into Docstring
// svelte component using mdsvexPreprocess
Expand Down Expand Up @@ -391,7 +395,7 @@ function unescapeUnderscores(content) {
* @param {Record<any, any>} markedKatex
*/
function markKatex(content, markedKatex) {
const REGEX_LATEX_DISPLAY = /\n\$\$([\s\S]+?)\$\$/g;
const REGEX_LATEX_DISPLAY = /\$\$([\s\S]+?)\$\$/g;
const REGEX_LATEX_INLINE = /\\\\\(([\s\S]+?)\\\\\)/g;
let counter = 0;
return content
Expand All @@ -411,19 +415,72 @@ function markKatex(content, markedKatex) {

/**
 * Replaces every `KATEXPARSE<digits>MARKER` placeholder found in `code` with
 * a Svelte `{@html ...}` expression holding the KaTeX-rendered markup for the
 * entry stored under that placeholder in `markedKatex`.
 *
 * @param {string} code - text containing the placeholders
 * @param {Record<any, any>} markedKatex - placeholder -> `{ tex, displayMode }`
 * @returns {string} `code` with each placeholder replaced
 * @throws {Error} when the rendered markup contains `katex-error`
 */
function renderKatex(code, markedKatex) {
return code.replace(/KATEXPARSE[0-9]+MARKER/g, (marker) => {
// NOTE(review): `tex`/`displayMode` and `html` are each declared twice below
// (a `const` form, then a `let` form) and the first `renderToString(` call is
// never closed. This looks like both sides of a diff captured together;
// confirm which pair is the intended one.
const { tex, displayMode } = markedKatex[marker];
const html = katex.renderToString(renderSvelteChars(tex), {
let { tex, displayMode } = markedKatex[marker];
// Turn the `&#123;` / `&#60;` entities back into `{` / `<` before the TeX is
// rendered (presumably the entities written by escapeSvelteSpecialChars).
tex = tex.replaceAll('&#123;', "{");
tex = tex.replaceAll('&#60;', "<");
let html = katex.renderToString(renderSvelteChars(tex), {
displayMode,
throwOnError: false
});
// String pattern, so only the first `katex-html` occurrence gets `hidden` appended.
html = html.replace("katex-html", "katex-html hidden");
// throwOnError is off above, so failures are detected from the output instead.
if (html.includes(`katex-error`)) {
throw new Error(`[KaTeX] Error while parsing markdown\n ${html}`);
}
// JSON.stringify yields a quoted JS string literal for the `{@html}` expression.
return `{@html ${JSON.stringify(html)}}`;
});
}

/**
 * Recursively walks `startDir` and collects the base name (without the
 * `.svelte` extension) of every `.svelte` file found at any depth.
 *
 * @param {string} startDir - directory to start scanning from
 * @returns {Promise<string[]>} component names, in directory-traversal order
 */
async function findSvelteComponentNames(startDir) {
	const SVELTE_EXT = '.svelte';

	// Returns the names found under `dir`, descending into sub-directories
	// at the position where they appear in the listing.
	const collect = async (dir) => {
		const names = [];
		const entries = await readdir(dir, { withFileTypes: true });

		for (const entry of entries) {
			if (entry.isDirectory()) {
				const nested = await collect(path.join(dir, entry.name));
				names.push(...nested);
				continue;
			}
			if (path.extname(entry.name) === SVELTE_EXT) {
				// keep only the file name, minus its .svelte extension
				names.push(path.basename(entry.name, SVELTE_EXT));
			}
		}

		return names;
	};

	return collect(startDir);
}

// Directory scanned for Svelte components; a relative path, so it is resolved
// against the working directory of the process that loads this module.
const dirPath = './src/lib';
// Names of the `.svelte` files found under dirPath. Awaited at module top
// level, so loading this module waits for the directory scan to finish.
const svelteTags = await findSvelteComponentNames(dirPath);
// Tag names that the `html` visitor in escapeSvelteSpecialChars leaves
// unescaped: every entry of htmlTags plus the discovered component names.
const validTags = [...htmlTags, ...svelteTags];

/**
 * Remark plugin (registered through `remarkPlugins`) that replaces characters
 * with HTML entities in the markdown syntax tree:
 *  - in `text` nodes, every `{` becomes `&#123;` and every `<` becomes `&#60;`;
 *  - in `html` nodes whose first tag name is not listed in `validTags`,
 *    every `<` becomes `&#60;`.
 * `html` nodes with no tag-like match (or with a listed tag) are left as is.
 *
 * @returns {(tree: any) => void} transformer that mutates the tree in place
 */
function escapeSvelteSpecialChars() {
	// Captures the name of the first opening or closing tag in a string.
	const LEADING_TAG = /<\/?(\w+)/;
	const escapeLessThan = (value) => value.replaceAll("<", '&#60;');

	return (tree) => {
		visit(tree, 'text', (node) => {
			const withoutBraces = node.value.replaceAll("{", '&#123;');
			node.value = escapeLessThan(withoutBraces);
		});

		visit(tree, 'html', (node) => {
			const found = node.value.match(LEADING_TAG);
			if (!found) {
				return;
			}
			if (validTags.includes(found[1])) {
				return;
			}
			// Unknown tag name: neutralise every `<` in this node.
			node.value = escapeLessThan(node.value);
		});
	};
}

const _mdsvexPreprocess = mdsvex({
remarkPlugins: [escapeSvelteSpecialChars],
extensions: ["mdx"],
highlight: {
highlighter: function (code, lang) {
Expand Down
47 changes: 2 additions & 45 deletions src/doc_builder/convert_md_to_mdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,6 @@


# Matches a line that starts with `>>>` and ends with a `# doctest: +FLAG`
# directive. Group 1 is the code part, group 2 the whitespace before the directive.
_re_doctest_flags = re.compile(r"^(>>>.*\S)(\s+)# doctest:\s+\+[A-Z_]+\s*$", flags=re.MULTILINE)
# Matches a complete tag (`<!DOCTYPE ...>`, `<tag ...>`, `</tag>`) or a
# `<!-- ... -->` comment. convert_special_chars uses it to shield those spans
# before escaping the remaining `<` characters.
_re_lt_html = re.compile(
r"""# This regex is meant to detect any HTML tag or comment, but not standalone '<' characters.
<( # HTML tag with...
(
!DOCTYPE # ... !DOCTYPE
|
((\/\s*)?[a-z]+) # ... or any regular tag (i.e. starts with [a-z]
)
[^><]*? # ... followed by anything until next closing ">"
)>
|
<(!--[^>]*?--)> # Or an HTML comment
""",
re.IGNORECASE | re.VERBOSE,
)
# Matches either text that starts at the opening of one of the listed
# components and runs to a `>` without crossing another listed-component
# opening, or an escaped brace (`&amp;lcub;`) followed by `#if`, `:else}` or
# `/if}`. convert_special_chars turns `&amp;lcub;` back into `{` inside these matches.
_re_lcub_svelte = re.compile(
r"<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)(((?!<(Question|Tip|Added|Changed|Deprecated|DocNotebookDropdown|CourseFloatingBanner|FrameworkSwitch|audio|PipelineIcon|PipelineTag)).)*)>|&amp;lcub;(#if|:else}|/if})",
re.DOTALL,
)


def convert_md_to_mdx(md_text, page_info):
Expand Down Expand Up @@ -83,28 +64,6 @@ def convert_md_to_mdx(md_text, page_info):
)


def convert_special_chars(text):
    """
    Escape the `{` and `<` characters that have a special meaning in MDX.

    Every `{` is replaced by `&amp;lcub;`, except inside spans matched by
    `_re_lcub_svelte`, where it is restored. Every `<` that is not part of a
    span matched by `_re_lt_html` is replaced by `&amp;lt;`.

    Args:
        text: the markdown text to escape.

    Returns:
        The escaped text.
    """
    # Escape all braces, then undo the escaping inside svelte syntax.
    escaped = text.replace("{", "&amp;lcub;")
    escaped = _re_lcub_svelte.sub(lambda found: found.group(0).replace("&amp;lcub;", "{"), escaped)

    # Shield valid tags and comments behind an LTHTML placeholder so the
    # blanket `<` replacement below does not touch them. Of the two groups in
    # the template, only one is non-empty for any given match.
    escaped = _re_lt_html.sub(r"LTHTML\1\5>", escaped)

    # Encode the remaining `<`, then put the shielded tags back.
    return escaped.replace("<", "&amp;lt;").replace("LTHTML", "<")


def convert_img_links(text, page_info):
"""
Convert image links to correct URL paths.
Expand Down Expand Up @@ -199,13 +158,11 @@ def process_md(text, page_info):
Processes markdown by:
1. Converting include
2. Converting literalinclude
3. Converting special characters
4. Clean doctest syntax
5. Converting image links
3. Clean doctest syntax
4. Converting image links
"""
text = convert_include(text, page_info)
text = convert_literalinclude(text, page_info)
text = convert_special_chars(text)
text = clean_doctest_syntax(text)
text = convert_img_links(text, page_info)
return text
79 changes: 2 additions & 77 deletions tests/test_convert_md_to_mdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
convert_include,
convert_literalinclude,
convert_md_to_mdx,
convert_special_chars,
process_md,
)

Expand Down Expand Up @@ -65,80 +64,6 @@ def test_convert_md_to_mdx(self):
Lorem ipsum dolor sit amet, consectetur adipiscing elit"""
self.assertEqual(convert_md_to_mdx(md_text, page_info), expected_conversion)

def test_convert_special_chars(self):
    """Check `convert_special_chars` on bare characters, tags, comments and two regression inputs."""

    def check(source, expected):
        # One assertion per case, in the order the cases are listed.
        self.assertEqual(convert_special_chars(source), expected)

    def unchanged(source):
        check(source, source)

    check("{ lala }", "&amp;lcub; lala }")
    check("< blo", "&amp;lt; blo")
    unchanged("<source></source>")
    unchanged("<br>")
    unchanged("<hr>")
    unchanged("<source>")
    unchanged("<Youtube id='my_vid' />")
    unchanged("</br>")
    unchanged("<br />")
    check("<p>5 <= 10</p>", "<p>5 &amp;lt;= 10</p>")
    check("<p align='center'>5 <= 10</p>", "<p align='center'>5 &amp;lt;= 10</p>")
    check("<p>5 <= 10", "<p>5 &amp;lt;= 10")  # no closing tag
    check("5 <= 10</p>", "5 &amp;lt;= 10</p>")  # no opening tag
    unchanged("<a>test</b>")  # mismatched tags
    check("<p>5 < 10</p>", "<p>5 &amp;lt; 10</p>")
    unchanged("<p>5 > 10</p>")
    unchanged("<!--...-->")  # comment
    check("<!-- < -->", "<!-- &amp;lt; -->")  # comment
    check("<!DOCTYPE html> 1 < 2", "<!DOCTYPE html> 1 &amp;lt; 2")

    longer_test = """<script lang="ts">
import Tip from "$lib/Tip.svelte";
import Youtube from "$lib/Youtube.svelte";
import Docstring from "$lib/Docstring.svelte";
import CodeBlock from "$lib/CodeBlock.svelte";
export let fw: "pt" | "tf"
</script>"""
    unchanged(longer_test)

    nested_test = """<blockquote>
sometext
<blockquote>
sometext
</blockquote>
</blockquote>"""
    unchanged(nested_test)

    html_code = '<a href="Some URl">some_text</a>'
    unchanged(html_code)

    inner_less = """<blockquote>
sometext 4 &amp;lt; 5
</blockquote>"""
    unchanged(inner_less)

    img_code = '<img src="someSrc">'
    unchanged(img_code)

    video_code = '<video src="someSrc">'
    unchanged(video_code)

    unchanged("<!-- comment -->")
    unchanged("<!-- multi line\ncomment -->")

    # Regression test for https://github.com/huggingface/doc-builder/pull/394
    # '<' must not be considered an HTML tag before a number
    check(
        "something <5MB something else -> here",
        "something &amp;lt;5MB something else -> here",
    )

    # Regression test for https://github.com/huggingface/doc-builder/pull/398
    # '10K<n<100K' must be caught correctly and not grouped with the next HTML tag.
    check(
        """10K<n<100K\n<Tip>\nThis is a tip.\n</Tip>""",
        "10K&amp;lt;n&amp;lt;100K\n<Tip>\nThis is a tip.\n</Tip>",
    )

def test_convert_img_links(self):
page_info = {"package_name": "transformers", "version": "v4.10.0", "language": "fr"}

Expand All @@ -157,8 +82,8 @@ def test_process_md(self):
{}
<>"""
expected_conversion = """[img](/docs/transformers/v4.10.0/fr/imgs/img.gif)
&amp;lcub;}
&amp;lt;>"""
{}
<>"""
self.assertEqual(process_md(text, page_info), expected_conversion)

def test_convert_include(self):
Expand Down