
Commit b4ab50d

danilojsl and DevinTDHa authored
[SPARKNLP-1213] Introducing MarkdownReader (#14618)
* [SPARKNLP-1213] Introducing MarkdownReader
* [SPARKNLP-1213] Adding python wrapper for Markdown reader
* [SPARKNLP-1213] Adding demo notebook for Markdown reader and adds Partition support for md files
* [SPARKNLP-1213] Addressing copilot suggestions
* [SPARKNLP-1213] Corrects typo in partition demo notebook [skip test]
* [SPARKNLP-1213] Adding direct input text, URL and table options to MarkdownReader
* move README.md to reader folder

---------

Co-authored-by: Devin Ha <[email protected]>
1 parent e6262b1 commit b4ab50d

File tree

15 files changed (+865, -134 lines)


build.sbt

Lines changed: 2 additions & 1 deletion
@@ -72,7 +72,8 @@ lazy val utilDependencies = Seq(
     exclude ("org.apache.logging.log4j", "log4j-api"),
   scratchpad
     exclude ("org.apache.logging.log4j", "log4j-api"),
-  pdfBox)
+  pdfBox,
+  flexmark)
 
 lazy val typedDependencyParserDependencies = Seq(junit)
 
examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb

Lines changed: 225 additions & 131 deletions
Large diffs are not rendered by default.
Lines changed: 193 additions & 0 deletions
@@ -0,0 +1,193 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tzcU5p2gdak9"
      },
      "source": [
        "# Introducing Markdown reader in SparkNLP\n",
        "This notebook showcases the newly added `sparknlp.read().md()` method in Spark NLP that parses Markdown content from both local files and real-time URLs into a Spark DataFrame.\n",
        "\n",
        "**Key Features:**\n",
        "- Ability to parse Markdown from local directories and URLs.\n",
        "- Versatile support for varied data ingestion scenarios."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RFOFhaEedalB"
      },
      "source": [
        "## Setup and Initialization\n",
        "Let's keep in mind a few things before we start 😊\n",
        "\n",
        "Support for reading markdown files was introduced in Spark NLP 6.0.5. Please make sure you have upgraded to the latest Spark NLP release."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Y3hWfT5q-npM"
      },
      "source": [
        "- Let's install and setup Spark NLP in Google Colab\n",
        "- This part is pretty easy via our simple script"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "u3ORYVyb-pRI"
      },
      "outputs": [],
      "source": [
        "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oIbFQyEo-tat"
      },
      "source": [
        "For local files example we will download a markdown file from Spark NLP Github repo:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ya8qZe00dalC",
        "outputId": "0311f02e-10cf-4037-8ea3-7cbd5d8820f7"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "mkdir: cannot create directory ‘md-files’: File exists\n",
            "--2025-07-02 14:27:11--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1213-Adding-MarkdownReader/src/test/resources/reader/md/simple.md\n",
            "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...\n",
            "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 181 [text/plain]\n",
            "Saving to: ‘md-files/simple.md’\n",
            "\n",
            "simple.md           100%[===================>]     181  --.-KB/s    in 0s      \n",
            "\n",
            "2025-07-02 14:27:11 (2.39 MB/s) - ‘md-files/simple.md’ saved [181/181]\n",
            "\n"
          ]
        }
      ],
      "source": [
        "!mkdir md-files\n",
        "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/md/simple.md -P md-files"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EoFI66NAdalE"
      },
      "source": [
        "## Parsing Markdown from Local Files\n",
        "Use the `md()` method to parse Markdown content from local directories."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bAkMjJ1vdalE",
        "outputId": "b674b476-8afc-4966-9b21-508e1441e6a8"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Warning::Spark Session already created, some configs may not take.\n",
            "+--------------------+--------------------+\n",
            "|                path|                  md|\n",
            "+--------------------+--------------------+\n",
            "|file:/content/md-...|[{Title, Introduc...|\n",
            "+--------------------+--------------------+\n",
            "\n"
          ]
        }
      ],
      "source": [
        "import sparknlp\n",
        "md_df = sparknlp.read().md(\"./md-files\")\n",
        "\n",
        "md_df.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oBj0cHPXSD1m",
        "outputId": "128cb731-6b90-4856-9903-3af8245c6af7"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "root\n",
            " |-- path: string (nullable = true)\n",
            " |-- md: array (nullable = true)\n",
            " |    |-- element: struct (containsNull = true)\n",
            " |    |    |-- elementType: string (nullable = true)\n",
            " |    |    |-- content: string (nullable = true)\n",
            " |    |    |-- metadata: map (nullable = true)\n",
            " |    |    |    |-- key: string\n",
            " |    |    |    |-- value: string (valueContainsNull = true)\n",
            "\n"
          ]
        }
      ],
      "source": [
        "md_df.printSchema()"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 1
}
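A quick follow-up to the schema the notebook prints (a sketch, not part of the committed notebook): since each row's `md` column is an array of element structs, a standard PySpark explode flattens it into one row per parsed element, which is easier to inspect than the raw array. Everything here is stock PySpark; `md_df` is assumed to come from the `sparknlp.read().md("./md-files")` call above.

# Sketch: flatten the `md` element array into one row per element.
from pyspark.sql.functions import col, explode

elements_df = (
    md_df.select(explode(col("md")).alias("element"))  # one row per element struct
    .select(
        col("element.elementType").alias("elementType"),  # e.g. "Title", per the demo output
        col("element.content").alias("content"),
        col("element.metadata").alias("metadata"),
    )
)
elements_df.show(truncate=False)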

examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
     "## Setup and Initialization\n",
     "Let's keep in mind a few things before we start 😊\n",
     "\n",
-    "Support for reading xml files was introduced in Spark NLP 6.1.0. Please make sure you have upgraded to the latest Spark NLP release."
+    "Support for reading xml files was introduced in Spark NLP 6.0.3. Please make sure you have upgraded to the latest Spark NLP release."
    ]
   },
   {

project/Dependencies.scala

Lines changed: 3 additions & 0 deletions
@@ -149,5 +149,8 @@ object Dependencies {
 
   val pdfBoxVersion = "2.0.28"
   val pdfBox = "org.apache.pdfbox" % "pdfbox" % pdfBoxVersion
+
+  val flexmarkVersion = "0.61.34"
+  val flexmark = "com.vladsch.flexmark" % "flexmark-all" % flexmarkVersion
   /** ------- Dependencies end ------- */
 }

python/sparknlp/reader/sparknlp_reader.py

Lines changed: 46 additions & 0 deletions
@@ -367,4 +367,50 @@ def xml(self, docPath):
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")
         jdf = self._java_obj.xml(docPath)
+        return self.getDataFrame(self.spark, jdf)
+
+
+    def md(self, filePath):
+        """Reads Markdown files and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        filePath : str
+            Path to a Markdown file or a directory containing Markdown files.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed Markdown content.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> md_df = SparkNLPReader(spark).md("home/user/markdown-directory")
+
+        You can use SparkNLP for one line of code
+
+        >>> import sparknlp
+        >>> md_df = sparknlp.read().md("home/user/markdown-directory")
+        >>> md_df.show(truncate=False)
+        +-----------------------------------------------------------+
+        |md                                                         |
+        +-----------------------------------------------------------+
+        |[{Title, Sample Markdown Document, {elementId -> ..., tag -> title}}]|
+        +-----------------------------------------------------------+
+
+        >>> md_df.printSchema()
+        root
+         |-- path: string (nullable = true)
+         |-- md: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        if not isinstance(filePath, str):
+            raise TypeError("filePath must be a string")
+        jdf = self._java_obj.md(filePath)
         return self.getDataFrame(self.spark, jdf)
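For context, a minimal usage sketch of the wrapper added above (the paths are illustrative placeholders, not from the commit): `md()` takes a single Markdown file or a directory, and the isinstance check rejects non-string input before anything reaches the JVM reader.

import sparknlp

spark = sparknlp.start()

md_df = sparknlp.read().md("./md-files")  # a .md file or a directory of .md files
md_df.select("md").show(truncate=False)

# The wrapper validates its input before delegating to the JVM:
try:
    sparknlp.read().md(123)  # not a string
except TypeError as err:
    print(err)  # "filePath must be a string"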

python/test/sparknlp_test.py

Lines changed: 14 additions & 1 deletion
@@ -139,4 +139,17 @@ def runTest(self):
         xml_df = sparknlp.read().xml(self.xml_files)
         xml_df.show()
 
-        self.assertTrue(xml_df.select("xml").count() > 0)
+        self.assertTrue(xml_df.select("xml").count() > 0)
+
+@pytest.mark.fast
+class SparkNLPTestMdFilesSpec(unittest.TestCase):
+
+    def setUp(self):
+        self.data = SparkContextForTest.data
+        self.md_file = f"file:///{os.getcwd()}/../src/test/resources/reader/md/simple.md"
+
+    def runTest(self):
+        md_df = sparknlp.read().md(self.md_file)
+        md_df.show()
+
+        self.assertTrue(md_df.select("md").count() > 0)

src/main/scala/com/johnsnowlabs/partition/Partition.scala

Lines changed: 3 additions & 0 deletions
@@ -189,6 +189,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
         sparkNLPReader.ppt
       case "application/pdf" => sparkNLPReader.pdf
       case "application/xml" => sparkNLPReader.xml
+      case "text/markdown" => sparkNLPReader.md
       case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
     }
   }
@@ -201,6 +202,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case "text/html" => sparkNLPReader.htmlToHTMLElement
       case "url" => sparkNLPReader.urlToHTMLElement
       case "application/xml" => sparkNLPReader.xmlToHTMLElement
+      case "text/markdown" => sparkNLPReader.mdToHTMLElement
       case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
     }
   }
@@ -237,6 +239,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case "ppt" | "pptx" => sparkNLPReader.ppt
       case "pdf" => sparkNLPReader.pdf
       case "xml" => sparkNLPReader.xml
+      case "md" => sparkNLPReader.md
       case _ => throw new IllegalArgumentException(s"Unsupported file type: $extension")
     }
   }
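These three hunks route the "text/markdown" content type and the "md" file extension to the new reader. A hedged sketch of driving that from Python, assuming the Partition wrapper exercised in the updated SparkNLP_Partition_Demo.ipynb mirrors this Scala dispatch (the import path and the content_type parameter name are assumptions, not shown in this diff):

from sparknlp.partition.partition import Partition  # assumed import path

# Dispatch by explicit content type (the "text/markdown" case above)...
md_elements = Partition(content_type="text/markdown").partition("./md-files")

# ...or let Partition pick the reader from the ".md" extension (the "md" case above).
md_elements = Partition().partition("./md-files/simple.md")
md_elements.show(truncate=False)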

src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala

Lines changed: 23 additions & 0 deletions
@@ -268,6 +268,29 @@ class HTMLReader(
             content = tableText,
             metadata = pageMetadata)
         }
+      case "li" =>
+        val itemText = element.text().trim
+        if (itemText.nonEmpty && !visitedNode) {
+          trackingNodes(element).visited = true
+          elements += HTMLElement(
+            ElementType.LIST_ITEM,
+            content = itemText,
+            metadata = pageMetadata)
+        }
+      case "pre" =>
+        // A <pre> tag typically contains a <code> child
+        val codeElem = element.getElementsByTag("code").first()
+        val codeText =
+          if (codeElem != null) codeElem.text().trim
+          else element.text().trim
+        if (codeText.nonEmpty && !visitedNode) {
+          trackingNodes(element).visited = true
+          elements += HTMLElement(
+            ElementType.UNCATEGORIZED_TEXT, // or ElementType.CODE if you have it
+            content = codeText,
+            metadata = pageMetadata
+          )
+        }
       case "p" =>
         if (!visitedNode) {
           classifyParagraphElement(element) match {
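The new `li` and `pre` cases matter for Markdown presumably because the reader converts Markdown to HTML (via the flexmark dependency added in this PR) before element extraction: list items arrive as <li> and fenced code blocks as <pre>/<code>, now surfacing as LIST_ITEM and UNCATEGORIZED_TEXT elements. A small end-to-end sketch, not from the commit, with the expected element types inferred from the cases above:

import os
import sparknlp

# Write a tiny Markdown file containing a list and a fenced code block.
os.makedirs("md-sample", exist_ok=True)
with open("md-sample/lists_and_code.md", "w") as f:
    f.write("# Demo\n\n- first item\n- second item\n\n```\nprint('hello')\n```\n")

spark = sparknlp.start()
md_df = sparknlp.read().md("md-sample")
md_df.show(truncate=False)  # expect LIST_ITEM and UNCATEGORIZED_TEXT entries in `md`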

0 commit comments
