@@ -42,7 +42,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
4242 val pipelineModel = pipeline.fit(emptyDataSet)
4343 val resultDf = pipelineModel.transform(emptyDataSet)
4444
45- assert(resultDf.count() > 1 )
45+ assert(resultDf.count() == 1 )
4646 }
4747
4848 it should " output clean flatten text without any structured metadata" taggedAs FastTest in {
@@ -52,7 +52,6 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
5252 .setContentPath(s " $htmlFilesDirectory/example-div.html " )
5353 .setOutputCol(" document" )
5454 .setFlattenOutput(true )
55- .setExplodeDocs(false )
5655
5756 val pipeline = new Pipeline ().setStages(Array (reader2Doc))
5857
@@ -91,20 +90,20 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
9190 }
9291 }
9392
94- it should " convert Reader output to Document format with one row per document " taggedAs FastTest in {
93+ it should " convert Reader output to Document format with exploded documents " taggedAs FastTest in {
9594
9695 val reader2Doc = new Reader2Doc ()
9796 .setContentType(" text/html" )
9897 .setContentPath(s " $htmlFilesDirectory/example-div.html " )
9998 .setOutputCol(" document" )
100- .setExplodeDocs(false )
99+ .setExplodeDocs(true )
101100
102101 val pipeline = new Pipeline ().setStages(Array (reader2Doc))
103102
104103 val pipelineModel = pipeline.fit(emptyDataSet)
105104 val resultDf = pipelineModel.transform(emptyDataSet)
106105
107- assert(resultDf.count() == 1 )
106+ assert(resultDf.count() > 1 )
108107 }
109108
110109 it should " work with Tokenizer" taggedAs FastTest in {
@@ -116,9 +115,8 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
116115
117116 val pipelineModel = pipeline.fit(emptyDataSet)
118117 val resultDf = pipelineModel.transform(emptyDataSet)
119- resultDf.select(" document" ).show(truncate = false )
120118
121- assert(resultDf.count() > 1 )
119+ assert(resultDf.count() == 1 )
122120 }
123121
124122 it should " work for Text documents" taggedAs FastTest in {
@@ -132,7 +130,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
132130 val pipelineModel = pipeline.fit(emptyDataSet)
133131 val resultDf = pipelineModel.transform(emptyDataSet)
134132
135- assert(resultDf.count() > 1 )
133+ assert(resultDf.count() == 1 )
136134 }
137135
138136 it should " work for Word documents" taggedAs FastTest in {
@@ -146,7 +144,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
146144 val pipelineModel = pipeline.fit(emptyDataSet)
147145 val resultDf = pipelineModel.transform(emptyDataSet)
148146
149- assert(resultDf.count() > 1 )
147+ assert(resultDf.count() == 1 )
150148 }
151149
152150 it should " work with PDF documents" taggedAs FastTest in {
@@ -160,7 +158,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
160158 val pipelineModel = pipeline.fit(emptyDataSet)
161159 val resultDf = pipelineModel.transform(emptyDataSet)
162160
163- assert(resultDf.count() > 1 )
161+ assert(resultDf.count() == 1 )
164162 }
165163
166164 it should " work with Markdown" taggedAs FastTest in {
@@ -174,7 +172,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
174172 val pipelineModel = pipeline.fit(emptyDataSet)
175173 val resultDf = pipelineModel.transform(emptyDataSet)
176174
177- assert(resultDf.count() > 1 )
175+ assert(resultDf.count() == 1 )
178176 }
179177
180178 it should " work with XML" taggedAs FastTest in {
@@ -188,7 +186,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
188186 val pipelineModel = pipeline.fit(emptyDataSet)
189187 val resultDf = pipelineModel.transform(emptyDataSet)
190188
191- assert(resultDf.count() > 1 )
189+ assert(resultDf.count() == 1 )
192190 }
193191
194192 it should " throw if contentPath is not set" taggedAs FastTest in {
0 commit comments