1818from sparknlp .internal import AnnotatorTransformer
1919from sparknlp .partition .partition_properties import *
2020
21+
2122class Reader2Doc (
2223 AnnotatorTransformer ,
2324 HasEmailReaderProperties ,
@@ -26,16 +27,16 @@ class Reader2Doc(
2627 HasPowerPointProperties ,
2728 HasTextReaderProperties ,
2829):
29-
3030 """
31- The Reader2Doc annotator allows you to use reading files more smoothly within existing
32- Spark NLP workflows, enabling seamless reuse of your pipelines.
31+ The Reader2Doc annotator makes it easier to read files within existing
32+ Spark NLP workflows, enabling seamless reuse of your pipelines.
3333
34- Reader2Doc can be used for extracting structured content from various document types
35- using Spark NLP readers. It supports reading from many file types and returns parsed
36- output as a structured Spark DataFrame.
34+ Reader2Doc can be used for extracting structured content from various document types
35+ using Spark NLP readers. It supports reading from many file types and returns parsed
36+ output as a structured Spark DataFrame.
3737
38- Supported formats include:
38+ Supported formats include:
39+
3940 - Plain text
4041 - HTML
4142 - Word (.doc/.docx)
@@ -44,79 +45,74 @@ class Reader2Doc(
4445 - Email files (.eml, .msg)
4546 - PDFs
4647
47- Example:
48- from johnsnowlabs.reader import Reader2Doc
49- from johnsnowlabs.nlp.base import DocumentAssembler
50- from pyspark.ml import Pipeline
51-
52- # Initialize Reader2Doc for PDF files
53- reader2doc = Reader2Doc() \
54- .setContentType("application/pdf") \
55- .setContentPath(f"{pdf_directory}/")
56-
57- # Build the pipeline with the Reader2Doc stage
58- pipeline = Pipeline(stages=[reader2doc])
59-
60- # Fit the pipeline to an empty DataFrame
61- pipeline_model = pipeline.fit(empty_data_set)
62- result_df = pipeline_model.transform(empty_data_set)
63-
64- # Show the resulting DataFrame
65- result_df.show()
66-
67- # Output Example:
68- # +------------------------------------------------------------------------------------------------------------------------------------+
69- # |document |
70- # +------------------------------------------------------------------------------------------------------------------------------------+
71- # |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}] |
72- # |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
73- # |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
74- # +------------------------------------------------------------------------------------------------------------------------------------+
48+ Examples
49+ --------
50+ >>> from johnsnowlabs.reader import Reader2Doc
51+ >>> from johnsnowlabs.nlp.base import DocumentAssembler
52+ >>> from pyspark.ml import Pipeline
53+ >>> # Initialize Reader2Doc for PDF files
54+ >>> reader2doc = Reader2Doc() \\
55+ ... .setContentType("application/pdf") \\
56+ ... .setContentPath(f"{pdf_directory}/")
57+ >>> # Build the pipeline with the Reader2Doc stage
58+ >>> pipeline = Pipeline(stages=[reader2doc])
59+ >>> # Fit the pipeline to an empty DataFrame
60+ >>> pipeline_model = pipeline.fit(empty_data_set)
61+ >>> result_df = pipeline_model.transform(empty_data_set)
62+ >>> # Show the resulting DataFrame
63+ >>> result_df.show()
64+ +------------------------------------------------------------------------------------------------------------------------------------+
65+ |document |
66+ +------------------------------------------------------------------------------------------------------------------------------------+
67+ |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}] |
68+ |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
69+ |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
70+ +------------------------------------------------------------------------------------------------------------------------------------+
7571"""
7672
77- name = ' Reader2Doc'
73+ name = " Reader2Doc"
7874 outputAnnotatorType = AnnotatorType .DOCUMENT
7975
8076 contentPath = Param (
8177 Params ._dummy (),
8278 "contentPath" ,
8379 "contentPath path to files to read" ,
84- typeConverter = TypeConverters .toString
80+ typeConverter = TypeConverters .toString ,
8581 )
8682
8783 outputCol = Param (
8884 Params ._dummy (),
8985 "outputCol" ,
9086 "output column name" ,
91- typeConverter = TypeConverters .toString
87+ typeConverter = TypeConverters .toString ,
9288 )
9389
9490 contentType = Param (
9591 Params ._dummy (),
9692 "contentType" ,
9793 "Set the content type to load following MIME specification" ,
98- typeConverter = TypeConverters .toString
94+ typeConverter = TypeConverters .toString ,
9995 )
10096
10197 explodeDocs = Param (
10298 Params ._dummy (),
10399 "explodeDocs" ,
104100 "whether to explode the documents into separate rows" ,
105- typeConverter = TypeConverters .toBoolean
101+ typeConverter = TypeConverters .toBoolean ,
106102 )
107103
108104 flattenOutput = Param (
109105 Params ._dummy (),
110106 "flattenOutput" ,
111107 "If true, output is flattened to plain text with minimal metadata" ,
112- typeConverter = TypeConverters .toBoolean
108+ typeConverter = TypeConverters .toBoolean ,
113109 )
114110
115111 titleThreshold = Param (
116112 Params ._dummy (),
117113 "titleThreshold" ,
118114 "Minimum font size threshold for title detection in PDF docs" ,
119- typeConverter = TypeConverters .toFloat
115+ typeConverter = TypeConverters .toFloat ,
120116 )
121117
122118 @keyword_only
@@ -189,4 +185,4 @@ def setTitleThreshold(self, value):
189185 value : float
190186 Minimum font size threshold for title detection in PDF docs
191187 """
192- return self ._set (titleThreshold = value )
188+ return self ._set (titleThreshold = value )
0 commit comments