
Commit b4ab50d

danilojsl and DevinTDHa authored
[SPARKNLP-1213] Introducing MarkdownReader (#14618)
* [SPARKNLP-1213] Introducing MarkdownReader
* [SPARKNLP-1213] Adding python wrapper for Markdown reader
* [SPARKNLP-1213] Adding demo notebook for Markdown reader and adds Partition support for md files
* [SPARKNLP-1213] Addressing copilot suggestions
* [SPARKNLP-1213] Corrects typo in partition demo notebook [skip test]
* [SPARKNLP-1213] Adding direct input text, URL and table options to MarkdownReader
* move README.md to reader folder

---------

Co-authored-by: Devin Ha <[email protected]>
1 parent e6262b1 commit b4ab50d

File tree

15 files changed (+865, -134 lines)


build.sbt

Lines changed: 2 additions & 1 deletion
@@ -72,7 +72,8 @@ lazy val utilDependencies = Seq(
     exclude ("org.apache.logging.log4j", "log4j-api"),
   scratchpad
     exclude ("org.apache.logging.log4j", "log4j-api"),
-  pdfBox)
+  pdfBox,
+  flexmark)
 
 lazy val typedDependencyParserDependencies = Seq(junit)
 
examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb

Lines changed: 225 additions & 131 deletions
Large diffs are not rendered by default.
Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tzcU5p2gdak9"
      },
      "source": [
        "# Introducing Markdown reader in SparkNLP\n",
        "This notebook showcases the newly added `sparknlp.read().md()` method in Spark NLP that parses Markdown content from both local files and real-time URLs into a Spark DataFrame.\n",
        "\n",
        "**Key Features:**\n",
        "- Ability to parse Markdown from local directories and URLs.\n",
        "- Versatile support for varied data ingestion scenarios."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RFOFhaEedalB"
      },
      "source": [
        "## Setup and Initialization\n",
        "Let's keep in mind a few things before we start 😊\n",
        "\n",
        "Support for reading markdown files was introduced in Spark NLP 6.0.5. Please make sure you have upgraded to the latest Spark NLP release."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Y3hWfT5q-npM"
      },
      "source": [
        "- Let's install and setup Spark NLP in Google Colab\n",
        "- This part is pretty easy via our simple script"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "u3ORYVyb-pRI"
      },
      "outputs": [],
      "source": [
        "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oIbFQyEo-tat"
      },
      "source": [
        "For local files example we will download a markdown file from Spark NLP Github repo:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ya8qZe00dalC",
        "outputId": "0311f02e-10cf-4037-8ea3-7cbd5d8820f7"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "mkdir: cannot create directory ‘md-files’: File exists\n",
            "--2025-07-02 14:27:11--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1213-Adding-MarkdownReader/src/test/resources/reader/md/simple.md\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 181 [text/plain]\n",
            "Saving to: ‘md-files/simple.md’\n",
            "\n",
            "simple.md           100%[===================>]     181  --.-KB/s    in 0s      \n",
            "\n",
            "2025-07-02 14:27:11 (2.39 MB/s) - ‘md-files/simple.md’ saved [181/181]\n",
            "\n"
          ]
        }
      ],
      "source": [
        "!mkdir md-files\n",
        "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/md/simple.md -P md-files"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EoFI66NAdalE"
      },
      "source": [
        "## Parsing Markdown from Local Files\n",
        "Use the `md()` method to parse Markdown content from local directories."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bAkMjJ1vdalE",
        "outputId": "b674b476-8afc-4966-9b21-508e1441e6a8"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Warning::Spark Session already created, some configs may not take.\n",
            "+--------------------+--------------------+\n",
            "|                path|                  md|\n",
            "+--------------------+--------------------+\n",
            "|file:/content/md-...|[{Title, Introduc...|\n",
            "+--------------------+--------------------+\n",
            "\n"
          ]
        }
      ],
      "source": [
        "import sparknlp\n",
        "md_df = sparknlp.read().md(\"./md-files\")\n",
        "\n",
        "md_df.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oBj0cHPXSD1m",
        "outputId": "128cb731-6b90-4856-9903-3af8245c6af7"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "root\n",
            " |-- path: string (nullable = true)\n",
            " |-- md: array (nullable = true)\n",
            " |    |-- element: struct (containsNull = true)\n",
            " |    |    |-- elementType: string (nullable = true)\n",
            " |    |    |-- content: string (nullable = true)\n",
            " |    |    |-- metadata: map (nullable = true)\n",
            " |    |    |    |-- key: string\n",
            " |    |    |    |-- value: string (valueContainsNull = true)\n",
            "\n"
          ]
        }
      ],
      "source": [
        "md_df.printSchema()"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 1
}
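A quick follow-up to the schema the notebook prints (a sketch, not part of the committed notebook): since each row's `md` column is an array of element structs, a standard PySpark explode flattens it into one row per parsed element, which is easier to inspect than the raw array. Everything here is stock PySpark; `md_df` is assumed to come from the `sparknlp.read().md("./md-files")` call above.

# Sketch: flatten the `md` element array into one row per element.
from pyspark.sql.functions import col, explode

elements_df = (
    md_df.select(explode(col("md")).alias("element"))  # one row per element struct
    .select(
        col("element.elementType").alias("elementType"),  # e.g. "Title", per the demo output
        col("element.content").alias("content"),
        col("element.metadata").alias("metadata"),
    )
)
elements_df.show(truncate=False)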

examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
     "## Setup and Initialization\n",
     "Let's keep in mind a few things before we start 😊\n",
     "\n",
-    "Support for reading xml files was introduced in Spark NLP 6.1.0. Please make sure you have upgraded to the latest Spark NLP release."
+    "Support for reading xml files was introduced in Spark NLP 6.0.3. Please make sure you have upgraded to the latest Spark NLP release."
    ]
   },
   {

project/Dependencies.scala

Lines changed: 3 additions & 0 deletions
@@ -149,5 +149,8 @@ object Dependencies {
 
   val pdfBoxVersion = "2.0.28"
   val pdfBox = "org.apache.pdfbox" % "pdfbox" % pdfBoxVersion
+
+  val flexmarkVersion = "0.61.34"
+  val flexmark = "com.vladsch.flexmark" % "flexmark-all" % flexmarkVersion
   /** ------- Dependencies end ------- */
 }

python/sparknlp/reader/sparknlp_reader.py

Lines changed: 46 additions & 0 deletions
@@ -367,4 +367,50 @@ def xml(self, docPath):
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")
         jdf = self._java_obj.xml(docPath)
+        return self.getDataFrame(self.spark, jdf)
+
+
+    def md(self, filePath):
+        """Reads Markdown files and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        filePath : str
+            Path to a Markdown file or a directory containing Markdown files.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed Markdown content.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> md_df = SparkNLPReader(spark).md("home/user/markdown-directory")
+
+        You can use SparkNLP for one line of code
+
+        >>> import sparknlp
+        >>> md_df = sparknlp.read().md("home/user/markdown-directory")
+        >>> md_df.show(truncate=False)
+        +-----------------------------------------------------------+
+        |md                                                         |
+        +-----------------------------------------------------------+
+        |[{Title, Sample Markdown Document, {elementId -> ..., tag -> title}}]|
+        +-----------------------------------------------------------+
+
+        >>> md_df.printSchema()
+        root
+         |-- path: string (nullable = true)
+         |-- md: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        if not isinstance(filePath, str):
+            raise TypeError("filePath must be a string")
+        jdf = self._java_obj.md(filePath)
         return self.getDataFrame(self.spark, jdf)
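For context, a minimal usage sketch of the wrapper added above (the paths are illustrative placeholders, not from the commit): `md()` takes a single Markdown file or a directory, and the isinstance check rejects non-string input before anything reaches the JVM reader.

import sparknlp

spark = sparknlp.start()

md_df = sparknlp.read().md("./md-files")  # a .md file or a directory of .md files
md_df.select("md").show(truncate=False)

# The wrapper validates its input before delegating to the JVM:
try:
    sparknlp.read().md(123)  # not a string
except TypeError as err:
    print(err)  # "filePath must be a string"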

python/test/sparknlp_test.py

Lines changed: 14 additions & 1 deletion
@@ -139,4 +139,17 @@ def runTest(self):
         xml_df = sparknlp.read().xml(self.xml_files)
         xml_df.show()
 
-        self.assertTrue(xml_df.select("xml").count() > 0)
+        self.assertTrue(xml_df.select("xml").count() > 0)
+
+@pytest.mark.fast
+class SparkNLPTestMdFilesSpec(unittest.TestCase):
+
+    def setUp(self):
+        self.data = SparkContextForTest.data
+        self.md_file = f"file:///{os.getcwd()}/../src/test/resources/reader/md/simple.md"
+
+    def runTest(self):
+        md_df = sparknlp.read().md(self.md_file)
+        md_df.show()
+
+        self.assertTrue(md_df.select("md").count() > 0)

src/main/scala/com/johnsnowlabs/partition/Partition.scala

Lines changed: 3 additions & 0 deletions
@@ -189,6 +189,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
         sparkNLPReader.ppt
       case "application/pdf" => sparkNLPReader.pdf
       case "application/xml" => sparkNLPReader.xml
+      case "text/markdown" => sparkNLPReader.md
       case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
     }
   }
@@ -201,6 +202,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case "text/html" => sparkNLPReader.htmlToHTMLElement
       case "url" => sparkNLPReader.urlToHTMLElement
       case "application/xml" => sparkNLPReader.xmlToHTMLElement
+      case "text/markdown" => sparkNLPReader.mdToHTMLElement
       case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
     }
   }
@@ -237,6 +239,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case "ppt" | "pptx" => sparkNLPReader.ppt
       case "pdf" => sparkNLPReader.pdf
       case "xml" => sparkNLPReader.xml
+      case "md" => sparkNLPReader.md
       case _ => throw new IllegalArgumentException(s"Unsupported file type: $extension")
     }
   }
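These three hunks route the "text/markdown" content type and the "md" file extension to the new reader. A hedged sketch of driving that from Python, assuming the Partition wrapper exercised in the updated SparkNLP_Partition_Demo.ipynb mirrors this Scala dispatch (the import path and the content_type parameter name are assumptions, not shown in this diff):

from sparknlp.partition.partition import Partition  # assumed import path

# Dispatch by explicit content type (the "text/markdown" case above)...
md_elements = Partition(content_type="text/markdown").partition("./md-files")

# ...or let Partition pick the reader from the ".md" extension (the "md" case above).
md_elements = Partition().partition("./md-files/simple.md")
md_elements.show(truncate=False)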

src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala

Lines changed: 23 additions & 0 deletions
@@ -268,6 +268,29 @@ class HTMLReader(
             content = tableText,
             metadata = pageMetadata)
         }
+      case "li" =>
+        val itemText = element.text().trim
+        if (itemText.nonEmpty && !visitedNode) {
+          trackingNodes(element).visited = true
+          elements += HTMLElement(
+            ElementType.LIST_ITEM,
+            content = itemText,
+            metadata = pageMetadata)
+        }
+      case "pre" =>
+        // A <pre> tag typically contains a <code> child
+        val codeElem = element.getElementsByTag("code").first()
+        val codeText =
+          if (codeElem != null) codeElem.text().trim
+          else element.text().trim
+        if (codeText.nonEmpty && !visitedNode) {
+          trackingNodes(element).visited = true
+          elements += HTMLElement(
+            ElementType.UNCATEGORIZED_TEXT, // or ElementType.CODE if you have it
+            content = codeText,
+            metadata = pageMetadata
+          )
+        }
       case "p" =>
         if (!visitedNode) {
           classifyParagraphElement(element) match {
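The new `li` and `pre` cases matter for Markdown presumably because the reader converts Markdown to HTML (via the flexmark dependency added in this PR) before element extraction: list items arrive as <li> and fenced code blocks as <pre>/<code>, now surfacing as LIST_ITEM and UNCATEGORIZED_TEXT elements. A small end-to-end sketch, not from the commit, with the expected element types inferred from the cases above:

import os
import sparknlp

# Write a tiny Markdown file containing a list and a fenced code block.
os.makedirs("md-sample", exist_ok=True)
with open("md-sample/lists_and_code.md", "w") as f:
    f.write("# Demo\n\n- first item\n- second item\n\n```\nprint('hello')\n```\n")

spark = sparknlp.start()
md_df = sparknlp.read().md("md-sample")
md_df.show(truncate=False)  # expect LIST_ITEM and UNCATEGORIZED_TEXT entries in `md`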

0 commit comments
