Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
900 changes: 497 additions & 403 deletions examples/python/data-preprocessing/SparkNLP_Reader2Doc_Demo.ipynb

Large diffs are not rendered by default.

1,102 changes: 1,102 additions & 0 deletions examples/python/data-preprocessing/SparkNLP_Reader2Table_Demo.ipynb

Large diffs are not rendered by default.

34 changes: 25 additions & 9 deletions python/sparknlp/reader/reader2doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class Reader2Doc(
HasExcelReaderProperties,
HasHTMLReaderProperties,
HasPowerPointProperties,
HasTextReaderProperties,
HasTextReaderProperties
):
"""
The Reader2Doc annotator allows you to use reading files more smoothly within existing
Expand All @@ -36,7 +36,7 @@ class Reader2Doc(
output as a structured Spark DataFrame.

Supported formats include:

- Plain text
- HTML
- Word (.doc/.docx)
Expand Down Expand Up @@ -77,42 +77,49 @@ class Reader2Doc(
Params._dummy(),
"contentPath",
"contentPath path to files to read",
typeConverter=TypeConverters.toString,
typeConverter=TypeConverters.toString
)

outputCol = Param(
Params._dummy(),
"outputCol",
"output column name",
typeConverter=TypeConverters.toString,
typeConverter=TypeConverters.toString
)

contentType = Param(
Params._dummy(),
"contentType",
"Set the content type to load following MIME specification",
typeConverter=TypeConverters.toString,
typeConverter=TypeConverters.toString
)

explodeDocs = Param(
Params._dummy(),
"explodeDocs",
"whether to explode the documents into separate rows",
typeConverter=TypeConverters.toBoolean,
typeConverter=TypeConverters.toBoolean
)

flattenOutput = Param(
Params._dummy(),
"flattenOutput",
"If true, output is flattened to plain text with minimal metadata",
typeConverter=TypeConverters.toBoolean,
typeConverter=TypeConverters.toBoolean
)

titleThreshold = Param(
Params._dummy(),
"titleThreshold",
"Minimum font size threshold for title detection in PDF docs",
typeConverter=TypeConverters.toFloat,
typeConverter=TypeConverters.toFloat
)

outputFormat = Param(
Params._dummy(),
"outputFormat",
"Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
typeConverter=TypeConverters.toString
)

@keyword_only
Expand All @@ -126,7 +133,6 @@ def __init__(self):
titleThreshold=18
)
@keyword_only

def setParams(self):
kwargs = self._input_kwargs
return self._set(**kwargs)
Expand Down Expand Up @@ -192,3 +198,13 @@ def setTitleThreshold(self, value):
Minimum font size threshold for title detection in PDF docs
"""
return self._set(titleThreshold=value)

def setOutputFormat(self, value):
    """Sets the output format for the table content.

    NOTE(review): the description mentions "table content" although this is
    ``Reader2Doc`` — the wording looks copied from ``Reader2Table``; confirm the
    intended semantics. Also, the stated default 'json-table' is not among the
    listed options ('plain-text', 'html-table') — verify the accepted values.

    Parameters
    ----------
    value : str
        Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
    """
    return self._set(outputFormat=value)
163 changes: 163 additions & 0 deletions python/sparknlp/reader/reader2table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param

from sparknlp.common import AnnotatorType
from sparknlp.internal import AnnotatorTransformer
from sparknlp.partition.partition_properties import *

class Reader2Table(
    AnnotatorTransformer,
    HasEmailReaderProperties,
    HasExcelReaderProperties,
    HasHTMLReaderProperties,
    HasPowerPointProperties,
    HasTextReaderProperties
):
    """Reads files from a content path and outputs their tabular content as
    annotations in a Spark DataFrame column, backed by the Scala transformer
    ``com.johnsnowlabs.reader.Reader2Table``.

    The Param surface mirrors :class:`Reader2Doc` (contentPath, contentType,
    explodeDocs, flattenOutput, titleThreshold, outputFormat), with
    reader-specific options inherited from the ``Has*ReaderProperties`` mixins.

    Notes
    -----
    NOTE(review): unlike ``Reader2Doc``, ``__init__`` here only defaults
    ``outputCol``; defaults such as ``titleThreshold=18`` are not applied on the
    Python side — confirm whether the Scala transformer supplies them.
    """

    name = 'Reader2Table'

    # Output is emitted as standard DOCUMENT annotations.
    outputAnnotatorType = AnnotatorType.DOCUMENT

    # Path to the input file(s) to read (file, folder, or pattern).
    contentPath = Param(
        Params._dummy(),
        "contentPath",
        "contentPath path to files to read",
        typeConverter=TypeConverters.toString
    )

    # Name of the output annotation column (defaults to "document" in __init__).
    outputCol = Param(
        Params._dummy(),
        "outputCol",
        "output column name",
        typeConverter=TypeConverters.toString
    )

    # MIME type of the input, e.g. "text/html"; drives reader selection.
    contentType = Param(
        Params._dummy(),
        "contentType",
        "Set the content type to load following MIME specification",
        typeConverter=TypeConverters.toString
    )

    # When true, each document becomes its own DataFrame row.
    explodeDocs = Param(
        Params._dummy(),
        "explodeDocs",
        "whether to explode the documents into separate rows",
        typeConverter=TypeConverters.toBoolean
    )

    # When true, emit plain text with minimal metadata instead of structured output.
    flattenOutput = Param(
        Params._dummy(),
        "flattenOutput",
        "If true, output is flattened to plain text with minimal metadata",
        typeConverter=TypeConverters.toBoolean
    )

    # PDF-only heuristic: font sizes at or above this value are treated as titles.
    titleThreshold = Param(
        Params._dummy(),
        "titleThreshold",
        "Minimum font size threshold for title detection in PDF docs",
        typeConverter=TypeConverters.toFloat
    )

    # NOTE(review): the stated default 'json-table' is not among the listed
    # options ('plain-text', 'html-table') — confirm the accepted values.
    outputFormat = Param(
        Params._dummy(),
        "outputFormat",
        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
        typeConverter=TypeConverters.toString
    )

    @keyword_only
    def __init__(self):
        # Bind to the JVM-side transformer; only outputCol gets a Python default.
        super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
        self._setDefault(outputCol="document")

    @keyword_only
    def setParams(self):
        """Sets params captured by the @keyword_only decorator in one call."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setContentPath(self, value):
        """Sets content path.

        Parameters
        ----------
        value : str
            contentPath path to files to read
        """
        return self._set(contentPath=value)

    def setContentType(self, value):
        """
        Set the content type to load following MIME specification

        Parameters
        ----------
        value : str
            content type to load following MIME specification
        """
        return self._set(contentType=value)

    def setExplodeDocs(self, value):
        """Sets whether to explode the documents into separate rows.

        Parameters
        ----------
        value : boolean
            Whether to explode the documents into separate rows
        """
        return self._set(explodeDocs=value)

    def setOutputCol(self, value):
        """Sets output column name.

        Parameters
        ----------
        value : str
            Name of the Output Column
        """
        return self._set(outputCol=value)

    def setFlattenOutput(self, value):
        """Sets whether to flatten the output to plain text with minimal metadata.

        Parameters
        ----------
        value : bool
            If true, output is flattened to plain text with minimal metadata
        """
        return self._set(flattenOutput=value)

    def setTitleThreshold(self, value):
        """Sets the minimum font size threshold for title detection in PDF documents.

        Parameters
        ----------
        value : float
            Minimum font size threshold for title detection in PDF docs
        """
        return self._set(titleThreshold=value)

    def setOutputFormat(self, value):
        """Sets the output format for the table content.

        Parameters
        ----------
        value : str
            Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.
        """
        return self._set(outputFormat=value)
63 changes: 63 additions & 0 deletions python/test/reader/reader2table_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@

# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import pytest
import os

from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.reader.reader2table import Reader2Table
from test.util import SparkContextForTest
from pyspark.ml import Pipeline

@pytest.mark.fast
class Reader2TableTest(unittest.TestCase):
    """Smoke test: Reader2Table over a single HTML file yields at least one row."""

    def setUp(self):
        # An empty single-column ("text") DataFrame to drive the pipeline.
        session = SparkContextForTest.spark
        self.empty_df = session.createDataFrame([], "string").toDF("text")

    def runTest(self):
        html_path = f"file:///{os.getcwd()}/../src/test/resources/reader/html/example-mix-tags.html"
        annotator = Reader2Table() \
            .setContentType("text/html") \
            .setContentPath(html_path) \
            .setOutputCol("document")

        model = Pipeline(stages=[annotator]).fit(self.empty_df)
        output = model.transform(self.empty_df)
        output.show(truncate=False)

        self.assertTrue(output.select("document").count() > 0)

@pytest.mark.fast
class Reader2TableMixedFilesTest(unittest.TestCase):
    """Smoke test: Reader2Table over a folder of mixed files yields multiple rows."""

    def setUp(self):
        # An empty single-column ("text") DataFrame to drive the pipeline.
        session = SparkContextForTest.spark
        self.empty_df = session.createDataFrame([], "string").toDF("text")

    def runTest(self):
        annotator = Reader2Table() \
            .setContentPath(f"{os.getcwd()}/../src/test/resources/reader") \
            .setOutputCol("document")

        model = Pipeline(stages=[annotator]).fit(self.empty_df)
        output = model.transform(self.empty_df)

        self.assertTrue(output.select("document").count() > 1)
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ trait HasHTMLReaderProperties extends ParamsAndFeaturesWritable {

def setIncludeTitleTag(value: Boolean): this.type = set(includeTitleTag, value)

val outputFormat = new Param[String](
this,
"outputFormat",
"Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.")

def setOutputFormat(value: String): this.type = set(outputFormat, value)

setDefault(timeout -> 0, includeTitleTag -> false, headers -> Map.empty[String, String])

}
2 changes: 2 additions & 0 deletions src/main/scala/com/johnsnowlabs/partition/Partition.scala
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
case "application/pdf" => sparkNLPReader.pdf
case "application/xml" => sparkNLPReader.xml
case "text/markdown" => sparkNLPReader.md
case "text/csv" => sparkNLPReader.csv
case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
}
}
Expand Down Expand Up @@ -241,6 +242,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
case "pdf" => sparkNLPReader.pdf
case "xml" => sparkNLPReader.xml
case "md" => sparkNLPReader.md
case "csv" => sparkNLPReader.csv
case _ => throw new IllegalArgumentException(s"Unsupported file type: $extension")
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ object PartitionHelper {

def isStringContent(contentType: String): Boolean = {
  // MIME types (plus the pseudo-type "url") whose payload is read as raw text
  // rather than binary content.
  val textualTypes =
    Set("text/plain", "text/html", "text/markdown", "application/xml", "text/csv", "url")
  textualTypes.contains(contentType)
}
Expand Down
Loading