Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
900 changes: 497 additions & 403 deletions examples/python/data-preprocessing/SparkNLP_Reader2Doc_Demo.ipynb

Large diffs are not rendered by default.

1,102 changes: 1,102 additions & 0 deletions examples/python/data-preprocessing/SparkNLP_Reader2Table_Demo.ipynb

Large diffs are not rendered by default.

34 changes: 25 additions & 9 deletions python/sparknlp/reader/reader2doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class Reader2Doc(
HasExcelReaderProperties,
HasHTMLReaderProperties,
HasPowerPointProperties,
HasTextReaderProperties,
HasTextReaderProperties
):
"""
The Reader2Doc annotator allows you to use reading files more smoothly within existing
Expand All @@ -36,7 +36,7 @@ class Reader2Doc(
output as a structured Spark DataFrame.

Supported formats include:

- Plain text
- HTML
- Word (.doc/.docx)
Expand Down Expand Up @@ -77,42 +77,49 @@ class Reader2Doc(
Params._dummy(),
"contentPath",
"contentPath path to files to read",
typeConverter=TypeConverters.toString,
typeConverter=TypeConverters.toString
)

outputCol = Param(
Params._dummy(),
"outputCol",
"output column name",
typeConverter=TypeConverters.toString,
typeConverter=TypeConverters.toString
)

contentType = Param(
Params._dummy(),
"contentType",
"Set the content type to load following MIME specification",
typeConverter=TypeConverters.toString,
typeConverter=TypeConverters.toString
)

explodeDocs = Param(
Params._dummy(),
"explodeDocs",
"whether to explode the documents into separate rows",
typeConverter=TypeConverters.toBoolean,
typeConverter=TypeConverters.toBoolean
)

flattenOutput = Param(
Params._dummy(),
"flattenOutput",
"If true, output is flattened to plain text with minimal metadata",
typeConverter=TypeConverters.toBoolean,
typeConverter=TypeConverters.toBoolean
)

titleThreshold = Param(
Params._dummy(),
"titleThreshold",
"Minimum font size threshold for title detection in PDF docs",
typeConverter=TypeConverters.toFloat,
typeConverter=TypeConverters.toFloat
)

outputFormat = Param(
Params._dummy(),
"outputFormat",
"Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
typeConverter=TypeConverters.toString
)

@keyword_only
Expand All @@ -126,7 +133,6 @@ def __init__(self):
titleThreshold=18
)
@keyword_only

def setParams(self):
kwargs = self._input_kwargs
return self._set(**kwargs)
Expand Down Expand Up @@ -192,3 +198,13 @@ def setTitleThreshold(self, value):
Minimum font size threshold for title detection in PDF docs
"""
return self._set(titleThreshold=value)

def setOutputFormat(self, value):
    """Sets the output format for the table content.

    NOTE(review): the description mentions "table content" although this is
    ``Reader2Doc`` — the wording looks copied from ``Reader2Table``; confirm the
    intended semantics. Also, the stated default 'json-table' is not among the
    listed options ('plain-text', 'html-table') — verify the accepted values.

    Parameters
    ----------
    value : str
        Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
    """
    return self._set(outputFormat=value)
163 changes: 163 additions & 0 deletions python/sparknlp/reader/reader2table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param

from sparknlp.common import AnnotatorType
from sparknlp.internal import AnnotatorTransformer
from sparknlp.partition.partition_properties import *

class Reader2Table(
    AnnotatorTransformer,
    HasEmailReaderProperties,
    HasExcelReaderProperties,
    HasHTMLReaderProperties,
    HasPowerPointProperties,
    HasTextReaderProperties
):
    """Reads files from a content path and outputs their tabular content as
    annotations in a Spark DataFrame column, backed by the Scala transformer
    ``com.johnsnowlabs.reader.Reader2Table``.

    The Param surface mirrors :class:`Reader2Doc` (contentPath, contentType,
    explodeDocs, flattenOutput, titleThreshold, outputFormat), with
    reader-specific options inherited from the ``Has*ReaderProperties`` mixins.

    Notes
    -----
    NOTE(review): unlike ``Reader2Doc``, ``__init__`` here only defaults
    ``outputCol``; defaults such as ``titleThreshold=18`` are not applied on the
    Python side — confirm whether the Scala transformer supplies them.
    """

    name = 'Reader2Table'

    # Output is emitted as standard DOCUMENT annotations.
    outputAnnotatorType = AnnotatorType.DOCUMENT

    # Path to the input file(s) to read (file, folder, or pattern).
    contentPath = Param(
        Params._dummy(),
        "contentPath",
        "contentPath path to files to read",
        typeConverter=TypeConverters.toString
    )

    # Name of the output annotation column (defaults to "document" in __init__).
    outputCol = Param(
        Params._dummy(),
        "outputCol",
        "output column name",
        typeConverter=TypeConverters.toString
    )

    # MIME type of the input, e.g. "text/html"; drives reader selection.
    contentType = Param(
        Params._dummy(),
        "contentType",
        "Set the content type to load following MIME specification",
        typeConverter=TypeConverters.toString
    )

    # When true, each document becomes its own DataFrame row.
    explodeDocs = Param(
        Params._dummy(),
        "explodeDocs",
        "whether to explode the documents into separate rows",
        typeConverter=TypeConverters.toBoolean
    )

    # When true, emit plain text with minimal metadata instead of structured output.
    flattenOutput = Param(
        Params._dummy(),
        "flattenOutput",
        "If true, output is flattened to plain text with minimal metadata",
        typeConverter=TypeConverters.toBoolean
    )

    # PDF-only heuristic: font sizes at or above this value are treated as titles.
    titleThreshold = Param(
        Params._dummy(),
        "titleThreshold",
        "Minimum font size threshold for title detection in PDF docs",
        typeConverter=TypeConverters.toFloat
    )

    # NOTE(review): the stated default 'json-table' is not among the listed
    # options ('plain-text', 'html-table') — confirm the accepted values.
    outputFormat = Param(
        Params._dummy(),
        "outputFormat",
        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
        typeConverter=TypeConverters.toString
    )

    @keyword_only
    def __init__(self):
        # Bind to the JVM-side transformer; only outputCol gets a Python default.
        super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
        self._setDefault(outputCol="document")

    @keyword_only
    def setParams(self):
        """Sets params captured by the @keyword_only decorator in one call."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setContentPath(self, value):
        """Sets content path.

        Parameters
        ----------
        value : str
            contentPath path to files to read
        """
        return self._set(contentPath=value)

    def setContentType(self, value):
        """
        Set the content type to load following MIME specification

        Parameters
        ----------
        value : str
            content type to load following MIME specification
        """
        return self._set(contentType=value)

    def setExplodeDocs(self, value):
        """Sets whether to explode the documents into separate rows.

        Parameters
        ----------
        value : boolean
            Whether to explode the documents into separate rows
        """
        return self._set(explodeDocs=value)

    def setOutputCol(self, value):
        """Sets output column name.

        Parameters
        ----------
        value : str
            Name of the Output Column
        """
        return self._set(outputCol=value)

    def setFlattenOutput(self, value):
        """Sets whether to flatten the output to plain text with minimal metadata.

        Parameters
        ----------
        value : bool
            If true, output is flattened to plain text with minimal metadata
        """
        return self._set(flattenOutput=value)

    def setTitleThreshold(self, value):
        """Sets the minimum font size threshold for title detection in PDF documents.

        Parameters
        ----------
        value : float
            Minimum font size threshold for title detection in PDF docs
        """
        return self._set(titleThreshold=value)

    def setOutputFormat(self, value):
        """Sets the output format for the table content.

        Parameters
        ----------
        value : str
            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
        """
        return self._set(outputFormat=value)
63 changes: 63 additions & 0 deletions python/test/reader/reader2table_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@

# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import pytest
import os

from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.reader.reader2table import Reader2Table
from test.util import SparkContextForTest
from pyspark.ml import Pipeline

@pytest.mark.fast
class Reader2TableTest(unittest.TestCase):
    """Smoke test: Reader2Table over a single HTML file yields at least one row."""

    def setUp(self):
        # An empty single-column ("text") DataFrame to drive the pipeline.
        session = SparkContextForTest.spark
        self.empty_df = session.createDataFrame([], "string").toDF("text")

    def runTest(self):
        html_path = f"file:///{os.getcwd()}/../src/test/resources/reader/html/example-mix-tags.html"
        annotator = Reader2Table() \
            .setContentType("text/html") \
            .setContentPath(html_path) \
            .setOutputCol("document")

        model = Pipeline(stages=[annotator]).fit(self.empty_df)
        output = model.transform(self.empty_df)
        output.show(truncate=False)

        self.assertTrue(output.select("document").count() > 0)

@pytest.mark.fast
class Reader2TableMixedFilesTest(unittest.TestCase):
    """Smoke test: Reader2Table over a folder of mixed files yields multiple rows."""

    def setUp(self):
        # An empty single-column ("text") DataFrame to drive the pipeline.
        session = SparkContextForTest.spark
        self.empty_df = session.createDataFrame([], "string").toDF("text")

    def runTest(self):
        annotator = Reader2Table() \
            .setContentPath(f"{os.getcwd()}/../src/test/resources/reader") \
            .setOutputCol("document")

        model = Pipeline(stages=[annotator]).fit(self.empty_df)
        output = model.transform(self.empty_df)

        self.assertTrue(output.select("document").count() > 1)
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ trait HasHTMLReaderProperties extends ParamsAndFeaturesWritable {

def setIncludeTitleTag(value: Boolean): this.type = set(includeTitleTag, value)

val outputFormat = new Param[String](
this,
"outputFormat",
"Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.")

def setOutputFormat(value: String): this.type = set(outputFormat, value)

setDefault(timeout -> 0, includeTitleTag -> false, headers -> Map.empty[String, String])

}
2 changes: 2 additions & 0 deletions src/main/scala/com/johnsnowlabs/partition/Partition.scala
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
case "application/pdf" => sparkNLPReader.pdf
case "application/xml" => sparkNLPReader.xml
case "text/markdown" => sparkNLPReader.md
case "text/csv" => sparkNLPReader.csv
case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
}
}
Expand Down Expand Up @@ -241,6 +242,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
case "pdf" => sparkNLPReader.pdf
case "xml" => sparkNLPReader.xml
case "md" => sparkNLPReader.md
case "csv" => sparkNLPReader.csv
case _ => throw new IllegalArgumentException(s"Unsupported file type: $extension")
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ object PartitionHelper {

def isStringContent(contentType: String): Boolean = {
  // MIME types (plus the pseudo-type "url") whose payload is read as raw text
  // rather than binary content.
  val textualTypes =
    Set("text/plain", "text/html", "text/markdown", "application/xml", "text/csv", "url")
  textualTypes.contains(contentType)
}
Expand Down
Loading