diff --git a/examples/python/data-preprocessing/SparkNLP_Reader2Doc_Demo.ipynb b/examples/python/data-preprocessing/SparkNLP_Reader2Doc_Demo.ipynb
new file mode 100644
index 00000000000000..d625c8bcf56a14
--- /dev/null
+++ b/examples/python/data-preprocessing/SparkNLP_Reader2Doc_Demo.ipynb
@@ -0,0 +1,1395 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "tzcU5p2gdak9"
+   },
+   "source": [
+    "# Introducing Reader2Doc in SparkNLP\n",
+    "This notebook showcases the newly added `Reader2Doc` annotator in Spark NLP,\n",
+    "which provides a streamlined and user-friendly interface for reading files. It is useful for preprocessing data for NLP pipelines."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "DczWop6QeE8F",
+    "outputId": "63c45993-626d-4b75-b4d4-57efe43b8a84"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Apache Spark version: 3.5.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sparknlp\n",
+    "# let's start Spark with Spark NLP\n",
+    "spark = sparknlp.start()\n",
+    "\n",
+    "print(\"Apache Spark version: {}\".format(spark.version))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RFOFhaEedalB"
+   },
+   "source": [
+    "## Setup and Initialization\n",
+    "Let's keep in mind a few things before we start 😊\n",
+    "\n",
+    "Support for the **Reader2Doc** annotator was introduced in Spark NLP 6.1.0. Please make sure you have upgraded to the latest Spark NLP release."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1vLFuCnVnVd8"
+   },
+   "source": [
+    "- Let's install and set up Spark NLP in Google Colab. This part is pretty easy via our simple script"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "id": "JVUu3mJXnXmm"
+   },
+   "outputs": [],
+   "source": [
+    "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-QXcOttbnmsI"
+   },
+   "source": [
+    "The output of Reader2Doc uses the same Annotation schema as other Spark NLP annotators. This means you can seamlessly integrate it into any Spark NLP pipeline or process that expects annotated data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "id": "0WIcptZ7nhp5"
+   },
+   "outputs": [],
+   "source": [
+    "from sparknlp.reader.reader2doc import Reader2Doc\n",
+    "from pyspark.ml import Pipeline\n",
+    "\n",
+    "empty_df = spark.createDataFrame([], \"string\").toDF(\"text\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ah8RigOanazZ"
+   },
+   "source": [
+    "For the local-file examples, we will download several files from the Spark NLP GitHub repo:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "E3bCFJZn8TS0"
+   },
+   "source": [
+    "## Reading PDF Documents"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "CZP7vz-gn6Rl"
+   },
+   "source": [
+    "**Downloading PDF files**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ya8qZe00dalC",
+    "outputId": "7b0ed5d2-aa8a-493f-fe32-ce9b1cf9c581"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2025-07-20 23:50:48--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/image_3_pages.pdf\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 15629 (15K) [application/octet-stream]\n", + "Saving to: ‘pdf-files/image_3_pages.pdf’\n", + "\n", + "image_3_pages.pdf 100%[===================>] 15.26K --.-KB/s in 0.001s \n", + "\n", + "2025-07-20 23:50:49 (13.4 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n", + "\n", + "--2025-07-20 23:50:49-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/pdf-title.pdf\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 25803 (25K) [application/octet-stream]\n", + "Saving to: ‘pdf-files/pdf-title.pdf’\n", + "\n", + "pdf-title.pdf 100%[===================>] 25.20K --.-KB/s in 0.002s \n", + "\n", + "2025-07-20 23:50:49 (16.0 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n", + "\n", + "--2025-07-20 23:50:49-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/text_3_pages.pdf\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 9487 (9.3K) [application/octet-stream]\n", + "Saving to: ‘pdf-files/text_3_pages.pdf’\n", + "\n", + "text_3_pages.pdf 100%[===================>] 9.26K --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:50:49 (60.9 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir pdf-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/image_3_pages.pdf -P pdf-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/pdf-title.pdf -P pdf-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/text_3_pages.pdf -P pdf-files" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3vz48AHQHyON", + "outputId": "cf838fd4-fdf7-47c8-c641-2dbbf2e021e2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 14...|\n", + "|[{document, 15, 3...|\n", + "|[{document, 36, 5...|\n", + "|[{document, 0, 14...|\n", + "|[{document, 15, 3...|\n", + "|[{document, 39, 6...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"application/pdf\") \\\n", + " .setContentPath(\"./pdf-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = 
model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "02qrQWIWP89R" + }, + "source": [ + "## Reading HTML Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "joUGu23jq4m4" + }, + "source": [ + "**Downloading HTML files**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bo7s-jZVrE7W", + "outputId": "01cd6445-85e7-4632-fddc-0276de3d2ce3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:04-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/html/example-10k.html\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 2456707 (2.3M) [text/plain]\n", + "Saving to: ‘html-files/example-10k.html’\n", + "\n", + "\r", + "example-10k.html 0%[ ] 0 --.-KB/s \r", + "example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.05s \n", + "\n", + "2025-07-20 23:51:04 (43.6 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n", + "\n", + "--2025-07-20 23:51:04-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/html/fake-html.html\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 665 [text/plain]\n", + "Saving to: ‘html-files/fake-html.html’\n", + "\n", + "fake-html.html 100%[===================>] 665 --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:04 (28.5 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir html-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/html/example-10k.html -P html-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/html/fake-html.html -P html-files" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dg13MEz2NDzE", + "outputId": "1097dfc0-dd7b-4ac6-e172-c883e10f7bf8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 12...|\n", + "|[{document, 13, 4...|\n", + "|[{document, 47, 6...|\n", + "|[{document, 69, 7...|\n", + "|[{document, 78, 1...|\n", + "|[{document, 164, ...|\n", + "|[{document, 207, ...|\n", + "|[{document, 297, ...|\n", + "|[{document, 330, ...|\n", + "|[{document, 363, ...|\n", + "|[{document, 382, ...|\n", + "|[{document, 447, ...|\n", + "|[{document, 702, ...|\n", + "|[{document, 755, ...|\n", + "|[{document, 862, ...|\n", + "|[{document, 992, ...|\n", + "|[{document, 1127,...|\n", + "|[{document, 1481,...|\n", + "|[{document, 1796,...|\n", + "|[{document, 2143,...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"text/html\") \\\n", + " .setContentPath(\"./html-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uMyqJX-K7dss" + }, + "source": [ + "## Reading MS Office Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dpYvufV2qgbB" + }, + "source": [ + "### Reading Word Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l-uW6gV8pUYM" + }, + "source": [ + "**Downloading Word files**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zLLEUl3KpYZ6", + "outputId": "b22f1af6-6bea-4c59-df27-53c829439928" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/contains-pictures.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 95087 (93K) [application/octet-stream]\n", + "Saving to: ‘word-files/contains-pictures.docx’\n", + "\n", + "contains-pictures.d 100%[===================>] 92.86K --.-KB/s in 0.02s \n", + "\n", + "2025-07-20 23:51:07 (4.77 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n", + "\n", + "--2025-07-20 23:51:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/fake_table.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 12392 (12K) [application/octet-stream]\n", + "Saving to: ‘word-files/fake_table.docx’\n", + "\n", + "fake_table.docx 100%[===================>] 12.10K --.-KB/s in 0.001s \n", + "\n", + "2025-07-20 23:51:07 (21.8 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n", + "\n", + "--2025-07-20 23:51:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/page-breaks.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 14584 (14K) [application/octet-stream]\n", + "Saving to: ‘word-files/page-breaks.docx’\n", + "\n", + "page-breaks.docx 100%[===================>] 14.24K --.-KB/s in 0.001s \n", + "\n", + "2025-07-20 23:51:08 (14.8 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir word-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/contains-pictures.docx -P word-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/fake_table.docx -P word-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/page-breaks.docx -P word-files" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YId4UG1rOVQq", + "outputId": "868114c4-6605-423f-864e-dbf00875225c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 31...|\n", + "|[{document, 32, 4...|\n", + "|[{document, 430, ...|\n", + "|[{document, 504, ...|\n", + "|[{document, 586, ...|\n", + "|[{document, 0, 11...|\n", + "|[{document, 114, ...|\n", + "|[{document, 263, ...|\n", + "|[{document, 294, ...|\n", + "|[{document, 325, ...|\n", + "|[{document, 354, ...|\n", + "|[{document, 411, ...|\n", + "|[{document, 0, 11...|\n", + "|[{document, 12, 2...|\n", + "|[{document, 24, 3...|\n", + "|[{document, 35, 4...|\n", + "|[{document, 49, 6...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"application/msword\") \\\n", + " .setContentPath(\"./word-files\") \\\n", + 
" .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "E8ockED4NxLi" + }, + "source": [ + "### Reading PowerPoint Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A3lF0_7qqlZB" + }, + "source": [ + "**Downloading PowerPoint files**" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1jDRFmcHqpxn", + "outputId": "5cd0ee8d-417f-42c5-fff6-e70dcd281468" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/fake-power-point.pptx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 38412 (38K) [application/octet-stream]\n", + "Saving to: ‘ppt-files/fake-power-point.pptx’\n", + "\n", + "\r", + "fake-power-point.pp 0%[ ] 0 --.-KB/s \r", + "fake-power-point.pp 100%[===================>] 37.51K --.-KB/s in 0.008s \n", + "\n", + "2025-07-20 23:51:11 (4.88 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n", + "\n", + "--2025-07-20 23:51:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/fake-power-point-table.pptx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 39894 (39K) [application/octet-stream]\n", + "Saving to: ‘ppt-files/fake-power-point-table.pptx’\n", + "\n", + "fake-power-point-ta 100%[===================>] 38.96K --.-KB/s in 0.008s \n", + "\n", + "2025-07-20 23:51:11 (4.60 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n", + "\n", + "--2025-07-20 23:51:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/speaker-notes.pptx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 39414 (38K) [application/octet-stream]\n", + "Saving to: ‘ppt-files/speaker-notes.pptx’\n", + "\n", + "speaker-notes.pptx 100%[===================>] 38.49K --.-KB/s in 0.007s \n", + "\n", + "2025-07-20 23:51:11 (5.49 MB/s) - ‘ppt-files/speaker-notes.pptx’ saved [39414/39414]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir ppt-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/fake-power-point.pptx -P ppt-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/fake-power-point-table.pptx -P ppt-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/speaker-notes.pptx -P ppt-files" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fPCpk7RTGRjo", + "outputId": "daeb374c-44c6-42cd-adbf-a42610456a61" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 20...|\n", + "|[{document, 21, 5...|\n", + "|[{document, 51, 8...|\n", + "|[{document, 89, 1...|\n", + "|[{document, 144, ...|\n", + "|[{document, 166, ...|\n", + "|[{document, 0, 20...|\n", + "|[{document, 21, 5...|\n", + "|[{document, 51, 8...|\n", + "|[{document, 89, 1...|\n", + "|[{document, 144, ...|\n", + "|[{document, 166, ...|\n", + "|[{document, 0, 19...|\n", + "|[{document, 20, 2...|\n", + "|[{document, 28, 3...|\n", + "|[{document, 36, 4...|\n", + "|[{document, 44, 4...|\n", + "|[{document, 47, 5...|\n", + "|[{document, 52, 5...|\n", + "|[{document, 56, 6...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"application/vnd.ms-powerpoint\") \\\n", + " .setContentPath(\"./ppt-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yHsnpNNmrWtR" + }, + "source": [ + "### Reading Excel Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "40ts9-MmqNHp" + }, + "source": [ + "**Downloading Excel files**" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "G3-BCYP6qQ4x", + "outputId": "11775571-4dd6-47f2-f1f2-b075f13a608c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 12541 (12K) [application/octet-stream]\n", + "Saving to: ‘excel-files/vodafone.xlsx’\n", + "\n", + "\r", + "vodafone.xlsx 0%[ ] 0 --.-KB/s \r", + "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0.001s \n", + "\n", + "2025-07-20 23:51:15 (18.2 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n", + "\n", + "--2025-07-20 23:51:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 38442 (38K) [application/octet-stream]\n", + "Saving to: ‘excel-files/2023-half-year-analyses-by-segment.xlsx’\n", + "\n", + "2023-half-year-anal 100%[===================>] 37.54K --.-KB/s in 0.007s \n", + "\n", + "2025-07-20 23:51:15 (5.15 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n", + "\n", + "--2025-07-20 23:51:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10676 (10K) [application/octet-stream]\n", + "Saving to: ‘excel-files/page-break-example.xlsx’\n", + "\n", + "page-break-example. 100%[===================>] 10.43K --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:16 (42.5 MB/s) - ‘excel-files/page-break-example.xlsx’ saved [10676/10676]\n", + "\n", + "--2025-07-20 23:51:16-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 9210 (9.0K) [application/octet-stream]\n", + "Saving to: ‘excel-files/xlsx-subtable-cases.xlsx’\n", + "\n", + "xlsx-subtable-cases 100%[===================>] 8.99K --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:16 (73.0 MB/s) - ‘excel-files/xlsx-subtable-cases.xlsx’ saved [9210/9210]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir excel-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx -P excel-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx -P excel-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx -P excel-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx -P excel-files" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PQ4MpGw6xCko", + "outputId": "84664ce1-20ff-4237-8f2d-10b75c6b4c87" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 21...|\n", + "|[{document, 22, 4...|\n", + "|[{document, 44, 6...|\n", + "|[{document, 63, 1...|\n", + "|[{document, 107, ...|\n", + "|[{document, 339, ...|\n", + "|[{document, 395, ...|\n", + "|[{document, 452, ...|\n", + "|[{document, 508, ...|\n", + "|[{document, 566, ...|\n", + "|[{document, 615, ...|\n", + "|[{document, 682, ...|\n", + "|[{document, 734, ...|\n", + "|[{document, 793, ...|\n", + "|[{document, 858, ...|\n", + "|[{document, 949, ...|\n", + "|[{document, 993, ...|\n", + "|[{document, 1225,...|\n", + "|[{document, 1282,...|\n", + "|[{document, 1339,...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"application/vnd.ms-excel\") \\\n", + " .setContentPath(\"./excel-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_GyL6D4N75i-" + }, + "source": [ + "## Reading Text Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ATDLz3Gws5ob" + }, + "source": [ + "**Downloading Text files**" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AV-krG6Ps8pq", + "outputId": "3d6080b7-ad02-4c2d-930a-1ce36743de74" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:19-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 300 [text/plain]\n", + "Saving to: ‘txt-files/simple-text.txt’\n", + "\n", + "\r", + "simple-text.txt 0%[ ] 0 --.-KB/s \r", + "simple-text.txt 100%[===================>] 300 --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:19 (11.3 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir txt-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt -P txt-files" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mutwZUFj720X", + "outputId": "0063e1e5-f7d9-481a-a26a-ade5633ad172" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 17...|\n", + "|[{document, 18, 1...|\n", + "|[{document, 145, ...|\n", + "|[{document, 161, ...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"text/plain\") \\\n", + " .setContentPath(\"./txt-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "epCp5DnQ8E7o" + }, + "source": [ + "## Reading XML Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QVq5C0Uqs4wU" + }, + "source": [ + "**Downloading XML files**" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Gip5P7Ess63U", + "outputId": "c47de770-fc8d-4e74-bc80-fe4bc4c86b83" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:20-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n",
+      "Length: 538 [text/plain]\n",
+      "Saving to: ‘xml-files/multi-level.xml’\n",
+      "\n",
+      "\r",
+      "multi-level.xml       0%[                    ]       0  --.-KB/s               \r",
+      "multi-level.xml     100%[===================>]     538  --.-KB/s    in 0s      \n",
+      "\n",
+      "2025-07-20 23:51:20 (26.1 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!mkdir xml-files\n",
+    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "AViMSzKQtP-o",
+    "outputId": "b1723d38-dfd8-4090-e135-726ca1cfef4f"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+\n",
+      "|            document|\n",
+      "+--------------------+\n",
+      "|[{document, 0, 12...|\n",
+      "|[{document, 13, 2...|\n",
+      "|[{document, 25, 2...|\n",
+      "|[{document, 29, 5...|\n",
+      "|[{document, 52, 6...|\n",
+      "|[{document, 67, 7...|\n",
+      "+--------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "reader2doc = Reader2Doc() \\\n",
+    "    .setContentType(\"application/xml\") \\\n",
+    "    .setContentPath(\"./xml-files\") \\\n",
+    "    .setOutputCol(\"document\")\n",
+    "\n",
+    "pipeline = Pipeline(stages=[reader2doc])\n",
+    "model = pipeline.fit(empty_df)\n",
+    "\n",
+    "result_df = model.transform(empty_df)\n",
+    "result_df.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "8qB4uXOFiqO0"
+   },
+   "source": [
+    "## Reading Markdown Documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "J4OpCThUiriY",
+    "outputId": "0f2c06cf-2d8f-42eb-97d5-2aafde97899b"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2025-07-20 23:51:21--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1213-Adding-MarkdownReader/src/test/resources/reader/md/simple.md\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+      "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 184 [text/plain]\n", + "Saving to: ‘md-files/simple.md’\n", + "\n", + "\r", + "simple.md 0%[ ] 0 --.-KB/s \r", + "simple.md 100%[===================>] 184 --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:21 (2.67 MB/s) - ‘md-files/simple.md’ saved [184/184]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir md-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1213-Adding-MarkdownReader/src/test/resources/reader/md/simple.md -P md-files" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZjpuIEatz5yt", + "outputId": "9e401ac0-0f71-489b-a69d-8a1a66d28458" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 11...|\n", + "|[{document, 12, 7...|\n", + "|[{document, 80, 8...|\n", + "|[{document, 88, 1...|\n", + "|[{document, 102, ...|\n", + "|[{document, 115, ...|\n", + "|[{document, 129, ...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"text/markdown\") \\\n", + " .setContentPath(\"./md-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_CuYYlw8tGQO" + }, + "source": [ + "## Reading Email Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K3Fyab6wret-" + }, + "source": [ + "**Downloading Email files**" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yYMVpVQurk7G", + "outputId": "ea24ce84-276d-4085-bc38-0381d5bd470e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:22-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 3175 (3.1K) [text/plain]\n", + "Saving to: ‘email-files/email-text-attachments.eml’\n", + "\n", + "\r", + " email-tex 0%[ ] 0 --.-KB/s \r", + "email-text-attachme 100%[===================>] 3.10K --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:22 (35.7 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n", + "\n", + "--2025-07-20 23:51:22-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n",
+      "Length: 1324361 (1.3M) [text/plain]\n",
+      "Saving to: ‘email-files/test-several-attachments.eml’\n",
+      "\n",
+      "test-several-attach 100%[===================>]   1.26M  --.-KB/s    in 0.05s   \n",
+      "\n",
+      "2025-07-20 23:51:23 (27.1 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!mkdir email-files\n",
+    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml -P email-files\n",
+    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml -P email-files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "gK-Te-BWtIxQ",
+    "outputId": "8001ce16-d240-4c8c-e2b2-5bca89348f6e"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+\n",
+      "|            document|\n",
+      "+--------------------+\n",
+      "|[{document, 0, 23...|\n",
+      "|[{document, 24, 1...|\n",
+      "|[{document, 162, ...|\n",
+      "|[{document, 1419,...|\n",
+      "|[{document, 1431,...|\n",
+      "|[{document, 1456,...|\n",
+      "|[{document, 0, 21...|\n",
+      "|[{document, 22, 7...|\n",
+      "|[{document, 74, 1...|\n",
+      "|[{document, 1045,...|\n",
+      "|[{document, 1057,...|\n",
+      "+--------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "reader2doc = Reader2Doc() \\\n",
+    "    .setContentType(\"message/rfc822\") \\\n",
+    "    .setContentPath(\"./email-files\") \\\n",
+    "    .setOutputCol(\"document\")\n",
+    "\n",
+    "pipeline = Pipeline(stages=[reader2doc])\n",
+    "model = pipeline.fit(empty_df)\n",
+    "\n",
+    "result_df = model.transform(empty_df)\n",
+    "result_df.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "GMxLm81mLv_c"
+   },
+   "source": [
+    "## Parameters"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "z4YGo70IMA9q"
+   },
+   "source": [
+    "By default, each element becomes its own row. We can instead output all the elements of a file in a single row by setting `explodeDocs` to `False`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "zFdH0OV2L96F",
+    "outputId": "32afaed2-dd27-418e-b78f-a11014e0bc6f"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+\n",
+      "|            document|\n",
+      "+--------------------+\n",
+      "|[{document, 0, 23...|\n",
+      "|[{document, 0, 21...|\n",
+      "+--------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "reader2doc = Reader2Doc() \\\n",
+    "    .setContentType(\"message/rfc822\") \\\n",
+    "    .setContentPath(\"./email-files\") \\\n",
+    "    .setOutputCol(\"document\") \\\n",
+    "    .setExplodeDocs(False)\n",
+    "\n",
+    "pipeline = Pipeline(stages=[reader2doc])\n",
+    "model = pipeline.fit(empty_df)\n",
+    "\n",
+    "result_df = model.transform(empty_df)\n",
+    "result_df.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "bDF87g1eM2L0"
+   },
+   "source": [
+    "We can output plain text with minimal metadata by setting `flattenOutput` to `True`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ACuEwYIuM74C",
+    "outputId": "c4da0a00-9e1a-47f0-e2a9-55ad0df7b57e"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+\n",
+      "|            document|\n",
+      "+--------------------+\n",
+      "|[{document, 0, 12...|\n",
+      "|[{document, 13, 4...|\n",
+      "|[{document, 47, 6...|\n",
+      "|[{document, 69, 7...|\n",
+      "|[{document, 78, 1...|\n",
+      "|[{document, 164, ...|\n",
+      "|[{document, 207, ...|\n",
+      "|[{document, 297, ...|\n",
+      "|[{document, 330, ...|\n",
+      "|[{document, 363, ...|\n",
+      "|[{document, 382, ...|\n",
+      "|[{document, 447, ...|\n",
+      "|[{document, 702, ...|\n",
+      "|[{document, 755, ...|\n",
+      "|[{document, 862, ...|\n",
+      "|[{document, 992, ...|\n",
+      "|[{document, 1127,...|\n",
+      "|[{document, 1481,...|\n",
+      "|[{document, 1796,...|\n",
+      "|[{document, 2143,...|\n",
+      "+--------------------+\n",
+      "only showing top 20 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "reader2doc = Reader2Doc() \\\n",
+    "    .setContentType(\"text/html\") \\\n",
+    "    .setContentPath(\"./html-files\") \\\n",
+    "    .setOutputCol(\"document\") \\\n",
+    "    .setFlattenOutput(True)\n",
+    "\n",
+    "pipeline = Pipeline(stages=[reader2doc])\n",
+    "model = pipeline.fit(empty_df)\n",
+    "\n",
+    "result_df = model.transform(empty_df)\n",
+    "result_df.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "aucJ6Aa9Ne4k"
+   },
+   "source": [
+    "## Pipeline Integration"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "4NlyM21qNir5"
+   },
+   "source": [
+    "We can integrate Reader2Doc with other annotators in a pipeline. For example, with a simple `RegexTokenizer`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "id": "KXkLK7WWNgS4"
+   },
+   "outputs": [],
+   "source": [
+    "from sparknlp.annotator import *\n",
+    "from sparknlp.base import *\n",
+    "\n",
+    "empty_df = spark.createDataFrame([], \"string\").toDF(\"text\")\n",
+    "\n",
+    "regex_tok = RegexTokenizer() \\\n",
+    "    .setInputCols([\"document\"]) \\\n",
+    "    .setOutputCol(\"regex_token\")\n",
+    "\n",
+    "pipeline = Pipeline(stages=[reader2doc, regex_tok])\n",
+    "model = pipeline.fit(empty_df)\n",
+    "\n",
+    "result_df = model.transform(empty_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "mc3RPhLROAg8",
+    "outputId": "e8d28875-769c-4a6b-cfdc-8820f81d7a7e"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+--------------------+\n",
+      "|            document|         regex_token|\n",
+      "+--------------------+--------------------+\n",
+      "|[{document, 0, 12...|[{token, 0, 5, UN...|\n",
+      "|[{document, 13, 4...|[{token, 13, 22, ...|\n",
+      "|[{document, 47, 6...|[{token, 47, 57, ...|\n",
+      "|[{document, 69, 7...|[{token, 69, 72, ...|\n",
+      "|[{document, 78, 1...|[{token, 78, 78, ...|\n",
+      "|[{document, 164, ...|[{token, 164, 166...|\n",
+      "|[{document, 207, ...|[{token, 207, 207...|\n",
+      "|[{document, 297, ...|[{token, 297, 299...|\n",
+      "|[{document, 330, ...|[{token, 330, 339...|\n",
+      "|[{document, 363, ...|[{token, 363, 368...|\n",
+      "|[{document, 382, ...|[{token, 382, 387...|\n",
+      "|[{document, 447, ...|[{token, 447, 452...|\n",
+      "|[{document, 702, ...|[{token, 702, 711...|\n",
+      "|[{document, 755, ...|[{token, 755, 759...|\n",
+      "|[{document, 862, ...|[{token, 862, 869...|\n",
+      "|[{document, 992, ...|[{token, 992, 999...|\n",
+      "|[{document, 1127,...|[{token, 1127, 11...|\n",
+      "|[{document, 1481,...|[{token, 1481, 14...|\n",
+      "|[{document, 1796,...|[{token, 1796, 18...|\n",
+      "|[{document, 2143,...|[{token, 2143, 21...|\n",
+      "+--------------------+--------------------+\n",
+      "only showing top 20 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "result_df.show()"
+   ]
+  },
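+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Because each row follows the standard Annotation schema, the extracted text can be pulled out of the `document` column with plain DataFrame operations. The cell below is a small, unexecuted sketch of one way to inspect it:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Explode the annotation array and keep only the extracted text\n",
+    "result_df.selectExpr(\"explode(document.result) as text\").show(truncate=False)"
+   ]
+  }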
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/python/sparknlp/reader/reader2doc.py b/python/sparknlp/reader/reader2doc.py
new file mode 100644
index 00000000000000..e6782c046e5d7c
--- /dev/null
+++ b/python/sparknlp/reader/reader2doc.py
@@ -0,0 +1,188 @@
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from pyspark import keyword_only
+from pyspark.ml.param import TypeConverters, Params, Param
+
+from sparknlp.common import AnnotatorType
+from sparknlp.internal import AnnotatorTransformer
+from sparknlp.partition.partition_properties import *
+
+
+class Reader2Doc(
+    AnnotatorTransformer,
+    HasEmailReaderProperties,
+    HasExcelReaderProperties,
+    HasHTMLReaderProperties,
+    HasPowerPointProperties,
+    HasTextReaderProperties,
+):
+    """
+    The Reader2Doc annotator lets you read files directly within existing
+    Spark NLP workflows, enabling seamless reuse of your pipelines.
+
+    Reader2Doc can be used for extracting structured content from various document types
+    using Spark NLP readers. It supports reading from many file types and returns parsed
+    output as a structured Spark DataFrame.
+
+    Supported formats include:
+
+    - Plain text
+    - HTML
+    - Word (.doc/.docx)
+    - Excel (.xls/.xlsx)
+    - PowerPoint (.ppt/.pptx)
+    - Email files (.eml, .msg)
+    - PDFs
+
+    Examples
+    --------
+    >>> from sparknlp.reader.reader2doc import Reader2Doc
+    >>> from pyspark.ml import Pipeline
+    >>> # Initialize Reader2Doc for PDF files
+    >>> reader2doc = Reader2Doc() \\
+    ...     .setContentType("application/pdf") \\
+    ...     .setContentPath(f"{pdf_directory}/")
+    >>> # Build the pipeline with the Reader2Doc stage
+    >>> pipeline = Pipeline(stages=[reader2doc])
+    >>> # Fit the pipeline to an empty DataFrame
+    >>> pipeline_model = pipeline.fit(empty_data_set)
+    >>> result_df = pipeline_model.transform(empty_data_set)
+    >>> # Show the resulting DataFrame
+    >>> result_df.show()
+    +------------------------------------------------------------------------------------------------------------------------------------+
+    |document                                                                                                                            |
+    +------------------------------------------------------------------------------------------------------------------------------------+
+    |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}]                |
+    |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
+    |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]|
+    +------------------------------------------------------------------------------------------------------------------------------------+
+"""
+
+    name = "Reader2Doc"
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    contentPath = Param(
+        Params._dummy(),
+        "contentPath",
+        "Path to the file or directory to read",
+        typeConverter=TypeConverters.toString,
+    )
+
+    outputCol = Param(
+        Params._dummy(),
+        "outputCol",
+        "output column name",
+        typeConverter=TypeConverters.toString,
+    )
+
+    contentType = Param(
+        Params._dummy(),
+        "contentType",
+        "Content type to load, following the MIME specification",
+        typeConverter=TypeConverters.toString,
+    )
+
+    explodeDocs = Param(
+        Params._dummy(),
+        "explodeDocs",
+        "whether to explode the documents into separate rows",
+        typeConverter=TypeConverters.toBoolean,
+    )
+
+    flattenOutput = Param(
+        Params._dummy(),
+        "flattenOutput",
+        "If true, output is flattened to plain text with minimal metadata",
+        typeConverter=TypeConverters.toBoolean,
+    )
+
+    titleThreshold = Param(
+        Params._dummy(),
+        "titleThreshold",
+        "Minimum font size threshold for title detection in PDF docs",
+        typeConverter=TypeConverters.toFloat,
+    )
+
+    @keyword_only
+    def __init__(self):
+        super(Reader2Doc, self).__init__(classname="com.johnsnowlabs.reader.Reader2Doc")
+        self._setDefault(outputCol="document")
+
+    @keyword_only
+    def setParams(self):
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    def setContentPath(self, value):
+        """Sets the content path.
+
+        Parameters
+        ----------
+        value : str
+            Path to the file or directory to read
+        """
+        return self._set(contentPath=value)
+
+    def setContentType(self, value):
+        """
+        Sets the content type to load, following the MIME specification.
+
+        Parameters
+        ----------
+        value : str
+            Content type to load, following the MIME specification
+        """
+        return self._set(contentType=value)
+
+    def setExplodeDocs(self, value):
+        """Sets whether to explode the documents into separate rows.
+
+        Parameters
+        ----------
+        value : boolean
+            Whether to explode the documents into separate rows
+        """
+        return self._set(explodeDocs=value)
+
+    def setOutputCol(self, value):
+        """Sets output column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of the Output Column
+        """
+        return self._set(outputCol=value)
+
+    def setFlattenOutput(self, value):
+        """Sets whether to flatten the output to plain text with minimal metadata.
+
+        Parameters
+        ----------
+        value : bool
+            If true, output is flattened to plain text with minimal metadata
+        """
+        return self._set(flattenOutput=value)
+
+    def setTitleThreshold(self, value):
+        """Sets the minimum font size threshold for title detection in PDF documents.
+
+        Parameters
+        ----------
+        value : float
+            Minimum font size threshold for title detection in PDF docs
+        """
+        return self._set(titleThreshold=value)
diff --git a/python/sparknlp/reader/sparknlp_reader.py b/python/sparknlp/reader/sparknlp_reader.py
index 6dc744279c19f8..d2c5d82954d894 100644
--- a/python/sparknlp/reader/sparknlp_reader.py
+++ b/python/sparknlp/reader/sparknlp_reader.py
@@ -413,4 +413,49 @@ def md(self, filePath):
         if not isinstance(filePath, str):
             raise TypeError("filePath must be a string")
         jdf = self._java_obj.md(filePath)
+        return self.getDataFrame(self.spark, jdf)
+
+    def csv(self, csvPath):
+        """Reads CSV files and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        csvPath : str
+            Path to a CSV file or a directory containing CSV files.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed CSV content.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> csv_df = SparkNLPReader(spark).csv("home/user/csv-directory")
+
+        You can also use Spark NLP with one line of code
+
+        >>> import sparknlp
+        >>> csv_df = sparknlp.read().csv("home/user/csv-directory")
+        >>> csv_df.show(truncate=False)
+        +------------------------------------------------------------------------------------------------------------------------------------------+
+        |csv                                                                                                                                       |
+        +------------------------------------------------------------------------------------------------------------------------------------------+
+        |[{NarrativeText, Alice 100 Bob 95, {}}, {Table, <table><tr><td>Alice</td><td>100</td></tr><tr><td>Bob</td><td>95</td></tr></table>, {}}]   |
+        +------------------------------------------------------------------------------------------------------------------------------------------+
+
+        >>> csv_df.printSchema()
+        root
+         |-- path: string (nullable = true)
+         |-- csv: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        if not isinstance(csvPath, str):
+            raise TypeError("csvPath must be a string")
+        jdf = self._java_obj.csv(csvPath)
         return self.getDataFrame(self.spark, jdf)
\ No newline at end of file
diff --git a/python/test/partition/partition_test.py b/python/test/partition/partition_test.py
index b8caca4bc8c3e5..3f11a650c9acb7 100644
--- a/python/test/partition/partition_test.py
+++ b/python/test/partition/partition_test.py
@@ -101,7 +101,7 @@ def runTest(self):
         self.assertTrue(html_file_df.select("html").count() > 0)
 
 
-@pytest.mark.fast
+@pytest.mark.slow
 class PartitionUrlTesSpec(unittest.TestCase):
 
     def runTest(self):
@@ -122,8 +122,8 @@ def runTest(self):
         pdf_df = Partition(content_type = "application/pdf").partition(self.html_directory)
         pdf_file_df = Partition().partition(f"{self.html_directory}/text_3_pages.pdf")
 
-        self.assertTrue(pdf_df.select("text").count() > 0)
-        self.assertTrue(pdf_file_df.select("text").count() > 0)
+        self.assertTrue(pdf_df.select("pdf").count() > 0)
+        self.assertTrue(pdf_file_df.select("pdf").count() > 0)
 
 @pytest.mark.fast
 class PartitionTextInMemoryTesSpec(unittest.TestCase):
@@ -139,6 +139,5 @@ def setUp(self):
 
     def runTest(self):
         text_df = Partition(group_broken_paragraphs=True).partition_text(text = self.raw_text )
-        text_df.show(truncate=False)
 
         self.assertTrue(text_df.select("txt").count() > 0)
\ No newline at end of file
diff --git a/python/test/partition/partition_transformer_test.py b/python/test/partition/partition_transformer_test.py
index 270486c561eece..86e810273b52e0 100644
--- a/python/test/partition/partition_transformer_test.py
+++ b/python/test/partition/partition_transformer_test.py
@@ -79,7 +79,6 @@ def runTest(self):
         pipelineModel = pipeline.fit(emptyDataSet)
 
         resultDf = pipelineModel.transform(self.testDataSet)
-        resultDf.show(truncate=False)
 
         self.assertTrue(resultDf.select("partition").count() > 0)
 
@@ -108,6 +107,5 @@ def runTest(self):
         pipelineModel = pipeline.fit(self.emptyDataSet)
 
         resultDf = pipelineModel.transform(self.emptyDataSet)
-        resultDf.show(truncate=False)
 
         self.assertTrue(resultDf.select("partition").count() >= 0)
\ No newline at end of file
diff --git a/python/test/reader/reader2doc_test.py b/python/test/reader/reader2doc_test.py
new file mode 100644
index 00000000000000..0346f87a14e227
--- /dev/null
+++ b/python/test/reader/reader2doc_test.py
@@ -0,0 +1,93 @@
+
+# Copyright 2017-2025 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import pytest
+import os
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from sparknlp.reader.reader2doc import Reader2Doc
+from test.util import SparkContextForTest
+from pyspark.ml import Pipeline
+
+@pytest.mark.fast
+class Reader2DocTest(unittest.TestCase):
+
+    def setUp(self):
+        spark = SparkContextForTest.spark
+        self.empty_df = spark.createDataFrame([], "string").toDF("text")
+
+    def runTest(self):
+        reader2doc = Reader2Doc() \
+            .setContentType("text/html") \
+            .setContentPath(f"file:///{os.getcwd()}/../src/test/resources/reader/html/title-test.html") \
+            .setOutputCol("document")
+
+        pipeline = Pipeline(stages=[reader2doc])
+        model = pipeline.fit(self.empty_df)
+
+        result_df = model.transform(self.empty_df)
+
+        self.assertTrue(result_df.select("document").count() > 0)
+
+
+@pytest.mark.fast
+class Reader2DocTokenTest(unittest.TestCase):
+
+    def setUp(self):
+        spark = SparkContextForTest.spark
+        self.empty_df = spark.createDataFrame([], "string").toDF("text")
+
+    def runTest(self):
+        reader2doc = Reader2Doc() \
+            .setContentType("text/html") \
+            .setContentPath(f"file:///{os.getcwd()}/../src/test/resources/reader/html/example-div.html") \
+            .setOutputCol("document") \
+            .setTitleThreshold(18.5)
+
+        regex_tok = RegexTokenizer() \
+            .setInputCols(["document"]) \
+            .setOutputCol("regex_token")
+
+        pipeline = Pipeline(stages=[reader2doc, regex_tok])
+        model = pipeline.fit(self.empty_df)
+
+        result_df = model.transform(self.empty_df)
+
+        self.assertTrue(result_df.select("document").count() > 0)
+
+
+@pytest.mark.fast
+class Reader2DocPdfTest(unittest.TestCase):
+
+    def setUp(self):
+        spark = SparkContextForTest.spark
+        self.empty_df = spark.createDataFrame([], "string").toDF("text")
+
+    def runTest(self):
+        reader2doc = Reader2Doc() \
+            .setContentType("application/pdf") \
+            .setContentPath(f"file:///{os.getcwd()}/../src/test/resources/reader/pdf/pdf-title.pdf") \
+            .setOutputCol("document") \
+            .setTitleThreshold(18.5)
+
+        pipeline = Pipeline(stages=[reader2doc])
+        model = pipeline.fit(self.empty_df)
+
+        result_df = model.transform(self.empty_df)
+
+        self.assertTrue(result_df.select("document").count() > 0)
\ No newline at end of file
diff --git a/python/test/sparknlp_test.py b/python/test/sparknlp_test.py
index 28cf62a77a0da5..82a6732271c54d 100644
--- a/python/test/sparknlp_test.py
+++ b/python/test/sparknlp_test.py
@@ -29,7 +29,6 @@ def runTest(self):
         html_df = sparknlp.read().html("https://www.wikipedia.org")
         html_df.show()
         assert html_df.select("html").count() > 0
-
         params = {"titleFontSize": "12"}
         html_params_df = sparknlp.read(params).html("https://www.wikipedia.org")
         html_params_df.show()
@@ -152,4 +151,31 @@ def runTest(self):
         md_df = sparknlp.read().md(self.md_file)
         md_df.show()
 
-        self.assertTrue(md_df.select("md").count() > 0)
\ No newline at end of file
+        self.assertTrue(md_df.select("md").count() > 0)
+
+
+@pytest.mark.fast
+class SparkNLPTestCSVFilesSpec(unittest.TestCase):
+
+    def setUp(self):
+        self.data = SparkContextForTest.data
+        self.csv_files = f"file:///{os.getcwd()}/../src/test/resources/reader/csv/stanley-cups.csv"
+
+    def runTest(self):
+        csv_df = sparknlp.read().csv(self.csv_files)
+        csv_df.show()
+
+        self.assertTrue(csv_df.select("csv").count() > 0)
+
+@pytest.mark.fast
+class SparkNLPTestPDFFilesSpec(unittest.TestCase):
+
+    def setUp(self):
+        self.data = SparkContextForTest.data
+        self.pdf_file = f"file:///{os.getcwd()}/../src/test/resources/reader/pdf/pdf-title.pdf"
+
+    def runTest(self):
sparknlp.read().pdf(self.pdf_file) + csv_df.show() + + self.assertTrue(csv_df.select("pdf").count() > 0) \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/partition/HasHTMLReaderProperties.scala b/src/main/scala/com/johnsnowlabs/partition/HasHTMLReaderProperties.scala index cfa9bc7022b5ac..1253fb5000a939 100644 --- a/src/main/scala/com/johnsnowlabs/partition/HasHTMLReaderProperties.scala +++ b/src/main/scala/com/johnsnowlabs/partition/HasHTMLReaderProperties.scala @@ -37,6 +37,13 @@ trait HasHTMLReaderProperties extends ParamsAndFeaturesWritable { setHeaders(headers.asScala.toMap) } - setDefault(timeout -> 0, headers -> Map.empty[String, String]) + val includeTitleTag = new Param[Boolean]( + this, + "includeTitleTag", + "Whether to include the title tag in the HTML output. Default is false.") + + def setIncludeTitleTag(value: Boolean): this.type = set(includeTitleTag, value) + + setDefault(timeout -> 0, includeTitleTag -> false, headers -> Map.empty[String, String]) } diff --git a/src/main/scala/com/johnsnowlabs/partition/Partition.scala b/src/main/scala/com/johnsnowlabs/partition/Partition.scala index e72b2bd90bb735..8bc9c8eba5250b 100644 --- a/src/main/scala/com/johnsnowlabs/partition/Partition.scala +++ b/src/main/scala/com/johnsnowlabs/partition/Partition.scala @@ -221,6 +221,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap()) case "application/vnd.ms-powerpoint" | "application/vnd.openxmlformats-officedocument.presentationml.presentation" => sparkNLPReader.ppt + case "application/pdf" => sparkNLPReader.pdf case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType") } diff --git a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala index 281af53931d72c..463170920181d1 100644 --- a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala @@ -246,19 +246,6 @@ class PartitionTransformer(override val uid: String) } } - private def findHTMLElementColumn(dataFrame: DataFrame): Option[String] = { - val htmlElementSchema = Encoders.product[HTMLElement].schema - dataFrame.schema.fields - .find { field => - field.dataType match { - case ArrayType(structType: StructType, _) => - structType == htmlElementSchema - case _ => false - } - } - .map(_.name) - } - private def findHTMLElementColumns(dataFrame: DataFrame): Seq[String] = { val htmlElementSchema = Encoders.product[HTMLElement].schema diff --git a/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala b/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala index 2f7c959f86cf58..78a2e19e1e2413 100644 --- a/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala +++ b/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala @@ -17,6 +17,9 @@ package com.johnsnowlabs.partition.util import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} +import java.nio.charset.Charset +import java.nio.file.Files + object PartitionHelper { def datasetWithBinaryFile(sparkSession: SparkSession, contentPath: String): DataFrame = { @@ -38,9 +41,26 @@ object PartitionHelper { def isStringContent(contentType: String): Boolean = { contentType match { - case "text/plain" | "text/html" | "url" => true + case "text/plain" | "text/html" | "text/markdown" | "application/xml" | "url" => true case _ => false } } + def datasetWithTextFileEncoding( + 
sparkSession: SparkSession, + contentPath: String, + encoding: String): DataFrame = { + import sparkSession.implicits._ + val fs = new java.io.File(contentPath) + val files = + if (fs.isDirectory) fs.listFiles.filter(_.isFile).map(_.getPath) + else Array(contentPath) + val fileContents = files.map { path => + val content = + new String(Files.readAllBytes(java.nio.file.Paths.get(path)), Charset.forName(encoding)) + (path, content) + } + sparkSession.sparkContext.parallelize(fileContents).toDF("path", "content") + } + } diff --git a/src/main/scala/com/johnsnowlabs/reader/CSVReader.scala b/src/main/scala/com/johnsnowlabs/reader/CSVReader.scala new file mode 100644 index 00000000000000..112a5b1ca55f6b --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/reader/CSVReader.scala @@ -0,0 +1,149 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.reader + +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.partition.util.PartitionHelper.{ + datasetWithTextFile, + datasetWithTextFileEncoding +} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions._ + +import java.util.regex.Pattern + +/** CSVReader partitions CSV files into structured elements with metadata, similar to ExcelReader. + * + * @param encoding + * Character encoding for reading CSV files (default: UTF-8). + * @param includeHeader + * If true, includes the header as the first row in content and HTML. + * @param inferTableStructure + * If true, additionally emits the parsed rows as an HTML table in a TABLE element. + * @param delimiter + * Column separator used to split each row (default: ","). + * @param storeContent + * If true, keeps the raw file content in a separate 'content' column (default: false). + */ +class CSVReader( + encoding: String = "UTF-8", + includeHeader: Boolean = false, + inferTableStructure: Boolean = true, + delimiter: String = ",", + storeContent: Boolean = false) + extends Serializable { + + private lazy val spark = ResourceHelper.spark + + private var outputColumn = "csv" + + def setOutputColumn(value: String): this.type = { + require(value.nonEmpty, "Output column name cannot be empty.") + outputColumn = value + this + } + + def getOutputColumn: String = outputColumn + + /** Main entry point: partitions CSV files at the given path into structured elements.
*/ + def csv(filePath: String): DataFrame = { + if (ResourceHelper.validFile(filePath)) { + val textDf = + if (encoding.equalsIgnoreCase("utf-8")) + datasetWithTextFile(spark, filePath) + else + datasetWithTextFileEncoding(spark, filePath, encoding) + val csvDf = buildStructuredCSV(textDf) + + if (storeContent) csvDf.select("path", outputColumn, "content") + else csvDf.select("path", outputColumn) + } else { + throw new IllegalArgumentException(s"Invalid filePath: $filePath") + } + } + + def buildStructuredCSV(textDF: DataFrame): DataFrame = { + import spark.implicits._ + val delimiterPattern = Pattern.quote(delimiter) + + val normalizedDF = textDF.withColumn( + "lines_array", + split(regexp_replace(regexp_replace($"content", "\r\n", "\n"), "\r", "\n"), "\n")) + + val linesProcessedDF = if (includeHeader) { + normalizedDF.withColumn("lines_array_processed", $"lines_array") + } else { + normalizedDF.withColumn( + "lines_array_processed", + slice($"lines_array", lit(2), size($"lines_array") - 1)) + } + + val nonEmptyLinesDF = linesProcessedDF.withColumn( + "non_empty_lines", + filter($"lines_array_processed", x => trim(x) =!= "")) + + val tokensFlattenedDF = nonEmptyLinesDF.withColumn( + "all_tokens", + flatten( + transform( + col("non_empty_lines"), + line => filter(transform(split(line, delimiterPattern), trim(_)), t => length(t) > 0)))) + + // Reconstruct normalized_content (excluding header if needed) + val normalizedContentDF = + tokensFlattenedDF.withColumn("normalized_content", concat_ws(" ", col("all_tokens"))) + + if (inferTableStructure) { + + val rowsArrayDF = normalizedContentDF.withColumn( + "rows_array", + transform(col("non_empty_lines"), line => split(line, delimiterPattern))) + val rowsWithTdDF = rowsArrayDF.withColumn( + "rows_with_td", + transform( + $"rows_array", + row => transform(row, c => concat(lit("<td>"), trim(c), lit("</td>"))))) + val trRowsDF = rowsWithTdDF.withColumn( + "tr_rows", + transform($"rows_with_td", row => concat(lit("<tr>"), concat_ws("", row), lit("</tr>")))) + val htmlTableDF = trRowsDF.withColumn( + "html_table", + concat(lit("<table>"), concat_ws("", $"tr_rows"), lit("</table>
"))) + + htmlTableDF.withColumn( + outputColumn, + array( + struct( + lit(ElementType.NARRATIVE_TEXT).as("elementType"), + $"normalized_content".as("content"), + map_from_arrays(array(), array()).as("metadata")), + struct( + lit(ElementType.TABLE).as("elementType"), + $"html_table".as("content"), + map_from_arrays(array(), array()).as("metadata")))) + + } else { + normalizedContentDF.withColumn( + outputColumn, + array( + struct( + lit(ElementType.NARRATIVE_TEXT).as("elementType"), + $"normalized_content".as("content"), + map_from_arrays(array(), array()).as("metadata")))) + + } + } + +} diff --git a/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala b/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala index 3ceae9a42851fe..c3184706432397 100644 --- a/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala +++ b/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala @@ -18,6 +18,7 @@ package com.johnsnowlabs.reader import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.nlp.util.io.ResourceHelper.{isValidURL, validFile} import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile +import com.johnsnowlabs.reader.util.HTMLParser import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.jsoup.Jsoup @@ -30,8 +31,9 @@ import scala.collection.mutable.ArrayBuffer /** Class to parse and read HTML files. * * @param titleFontSize - * Minimum font size threshold used as part of heuristic rules to detect title elements based - * on formatting (e.g., bold, centered, capitalized). By default, it is set to 16. + * Minimum font size threshold in pixels used as part of heuristic rules to detect title + * elements based on formatting (e.g., bold, centered, capitalized). By default, it is set to + * 16. * @param storeContent * Whether to include the raw file content in the output DataFrame as a separate 'content' * column, alongside the structured output. By default, it is set to false. 
@@ -65,7 +67,7 @@ import scala.collection.mutable.ArrayBuffer * * htmlDf.printSchema() * root - * |-- url: string (nullable = true) + * |-- path: string (nullable = true) * |-- html: array (nullable = true) * | |-- element: struct (containsNull = true) * | | |-- elementType: string (nullable = true) @@ -82,6 +84,7 @@ class HTMLReader( titleFontSize: Int = 16, storeContent: Boolean = false, timeout: Int = 0, + includeTitleTag: Boolean = false, headers: Map[String, String] = Map.empty) extends Serializable { @@ -159,7 +162,18 @@ class HTMLReader( private def startTraversalFromBody(document: Document): Array[HTMLElement] = { val body = document.body() - extractElements(body) + val elements = extractElements(body) + val docTitle = document.title().trim + + if (docTitle.nonEmpty && includeTitleTag) { + val titleElem = HTMLElement( + ElementType.TITLE, + content = docTitle, + metadata = mutable.Map.empty[String, String]) + Array(titleElem) ++ elements + } else { + elements + } } def htmlToHTMLElement(html: String): Array[HTMLElement] = { @@ -179,6 +193,7 @@ class HTMLReader( private case class NodeMetadata(tagName: Option[String], hidden: Boolean, var visited: Boolean) private def extractElements(root: Node): Array[HTMLElement] = { + var sentenceIndex = 0 val elements = ArrayBuffer[HTMLElement]() val trackingNodes = mutable.Map[Node, NodeMetadata]() var pageNumber = 1 @@ -248,8 +263,11 @@ class HTMLReader( val visitedNode = trackingNodes(element).visited val pageMetadata: mutable.Map[String, String] = mutable.Map("pageNumber" -> pageNumber.toString) + element.tagName() match { case "a" => + pageMetadata("sentence") = sentenceIndex.toString + sentenceIndex += 1 val href = element.attr("href").trim val linkText = element.text().trim if (href.nonEmpty && linkText.nonEmpty && !visitedNode) { @@ -260,6 +278,8 @@ class HTMLReader( metadata = pageMetadata) } case "table" => + pageMetadata("sentence") = sentenceIndex.toString + sentenceIndex += 1 val tableText = extractNestedTableContent(element).trim if (tableText.nonEmpty && !visitedNode) { trackingNodes(element).visited = true @@ -269,6 +289,8 @@ class HTMLReader( metadata = pageMetadata) } case "li" => + pageMetadata("sentence") = sentenceIndex.toString + sentenceIndex += 1 val itemText = element.text().trim if (itemText.nonEmpty && !visitedNode) { trackingNodes(element).visited = true @@ -284,20 +306,25 @@ class HTMLReader( if (codeElem != null) codeElem.text().trim else element.text().trim if (codeText.nonEmpty && !visitedNode) { + pageMetadata("sentence") = sentenceIndex.toString + sentenceIndex += 1 trackingNodes(element).visited = true elements += HTMLElement( ElementType.UNCATEGORIZED_TEXT, // or ElementType.CODE if you have it content = codeText, metadata = pageMetadata) } - case "p" => + case tag if isParagraphLikeElement(element) => if (!visitedNode) { - classifyParagraphElement(element) match { + val classType = classifyParagraphElement(element) + classType match { case ElementType.NARRATIVE_TEXT => trackingNodes(element).visited = true val childNodes = element.childNodes().asScala.toList val aggregatedText = collectTextFromNodes(childNodes) if (aggregatedText.nonEmpty) { + pageMetadata("sentence") = sentenceIndex.toString + sentenceIndex += 1 elements += HTMLElement( ElementType.NARRATIVE_TEXT, content = aggregatedText, @@ -307,6 +334,8 @@ class HTMLReader( trackingNodes(element).visited = true val titleText = element.text().trim if (titleText.nonEmpty) { + pageMetadata("sentence") = sentenceIndex.toString + sentenceIndex += 1 elements += 
HTMLElement( ElementType.TITLE, content = titleText, @@ -314,11 +343,13 @@ class HTMLReader( } case ElementType.UNCATEGORIZED_TEXT => trackingNodes(element).visited = true - val titleText = element.text().trim - if (titleText.nonEmpty) { + val text = element.text().trim + if (text.nonEmpty) { + pageMetadata("sentence") = sentenceIndex.toString + sentenceIndex += 1 elements += HTMLElement( ElementType.UNCATEGORIZED_TEXT, - content = titleText, + content = text, metadata = pageMetadata) } } @@ -327,6 +358,8 @@ class HTMLReader( trackingNodes(element).visited = true val titleText = element.text().trim if (titleText.nonEmpty) { + pageMetadata("sentence") = sentenceIndex.toString + sentenceIndex += 1 elements += HTMLElement( ElementType.TITLE, content = titleText, @@ -352,6 +385,19 @@ class HTMLReader( elements.toArray } + private def isParagraphLikeElement(elem: Element): Boolean = { + val tag = elem.tagName().toLowerCase + val style = elem.attr("style").toLowerCase + (tag == "p") || + (tag == "div" && ( + style.contains("font-size") || + style.contains("line-height") || + style.contains("margin") || + elem.getElementsByTag("b").size() > 0 || + elem.getElementsByTag("strong").size() > 0 + )) + } + private def getTagName(node: Node): Option[String] = { node match { case element: Element => Some(element.tagName()) @@ -360,7 +406,7 @@ class HTMLReader( } private def classifyParagraphElement(element: Element): String = { - if (isTitleElement(element)) { + if (isFormattedAsTitle(element)) { ElementType.TITLE } else if (isTextElement(element)) { ElementType.NARRATIVE_TEXT @@ -369,38 +415,25 @@ class HTMLReader( } } - private def isTextElement(elem: Element): Boolean = { - !isFormattedAsTitle(elem) && - (elem.attr("style").toLowerCase.contains("text") || elem.tagName().toLowerCase == "p") + private def isTitleElement(element: Element): Boolean = { + val tag = element.tagName().toLowerCase + val style = element.attr("style").toLowerCase + val role = element.attr("role").toLowerCase + HTMLParser.isTitleElement(tag, style, role, titleFontSize) } - private def isTitleElement(elem: Element): Boolean = { - val tag = elem.tagName().toLowerCase - - // Recognize titles from common title-related tags or formatted
<p>
elements - tag match { - case "title" | "h1" | "h2" | "h3" | "header" => true - case "p" => isFormattedAsTitle(elem) // Check if
<p>
behaves like a title - case _ => elem.attr("role").toLowerCase == "heading" // ARIA role="heading" - } + private def isTextElement(elem: Element): Boolean = { + !isFormattedAsTitle(elem) && + (elem.attr("style").toLowerCase.contains("text") || + elem.tagName().toLowerCase == "p" || + (elem.tagName().toLowerCase == "div" && isParagraphLikeElement(elem))) } private def isFormattedAsTitle(elem: Element): Boolean = { - // Check for bold text, large font size, or centered alignment val style = elem.attr("style").toLowerCase - val isBold = style.contains("font-weight:bold") - val isLargeFont = style.contains("font-size") && extractFontSize(style) >= titleFontSize - val isCentered = style.contains("text-align:center") - - isBold || isLargeFont || (isCentered && isBold) || (isCentered && isLargeFont) - } - - private def extractFontSize(style: String): Int = { - val sizePattern = """font-size:(\d+)pt""".r - sizePattern.findFirstMatchIn(style) match { - case Some(m) => m.group(1).toInt - case None => 0 - } + val hasBoldTag = + elem.getElementsByTag("b").size() > 0 || elem.getElementsByTag("strong").size() > 0 + hasBoldTag || HTMLParser.isFormattedAsTitle(style, titleFontSize) } private def extractNestedTableContent(elem: Element): String = { diff --git a/src/main/scala/com/johnsnowlabs/reader/PdfReader.scala b/src/main/scala/com/johnsnowlabs/reader/PdfReader.scala new file mode 100644 index 00000000000000..610b20dd609de7 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/reader/PdfReader.scala @@ -0,0 +1,159 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.reader + +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithBinaryFile +import org.apache.pdfbox.pdmodel.PDDocument +import org.apache.pdfbox.text.{PDFTextStripper, TextPosition} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, udf} +import java.io.ByteArrayInputStream +import scala.collection.JavaConverters._ +import scala.collection.mutable + +/** Class to parse and read PDF files. + * + * @param titleThreshold + * Minimum font size threshold used as part of heuristic rules to detect title elements based + * on formatting (e.g., bold, centered, capitalized). By default, it is set to 18. + * @param storeContent + * Whether to include the raw file content in the output DataFrame as a separate 'content' + * column, alongside the structured output. By default, it is set to false. + * + * pdfPath: this is a path to a directory of PDF files or a path to a PDF file E.g.
+ * "path/pdf/files" + * + * ==Example== + * {{{ + * val path = "./pdf-files/pdf-doc.pdf" + * val PdfReader = new PdfReader() + * val pdfDF = PdfReader.read(url) + * }}} + * + * {{{ + * pdfDF.show() + * +--------------------+--------------------+ + * | path| html| + * +--------------------+--------------------+ + * |file:/content/htm...|[{Title, My First...| + * +--------------------+--------------------+ + * + * pdfDF.printSchema() + * root + * |-- path: string (nullable = true) + * |-- pdf: array (nullable = true) + * | |-- element: struct (containsNull = true) + * | | |-- elementType: string (nullable = true) + * | | |-- content: string (nullable = true) + * | | |-- metadata: map (nullable = true) + * | | | |-- key: string + * | | | |-- value: string (valueContainsNull = true) + * }}} + * For more examples please refer to this + * [[https://github.com/JohnSnowLabs/spark-nlp/examples/python/reader/SparkNLP_PDF_Reader_Demo.ipynb notebook]]. + */ +class PdfReader(storeContent: Boolean = false, titleThreshold: Double = 18.0) + extends Serializable { + + private lazy val spark = ResourceHelper.spark + private var outputColumn = "pdf" + + def setOutputColumn(name: String): this.type = { + require(name.nonEmpty, "Output column name cannot be empty.") + outputColumn = name + this + } + def getOutputColumn: String = outputColumn + + def pdf(filePath: String): DataFrame = { + if (!ResourceHelper.validFile(filePath)) + throw new IllegalArgumentException(s"Invalid filePath: $filePath") + + val binaryDF = datasetWithBinaryFile(spark, filePath) + val withElements = binaryDF.withColumn(outputColumn, parsePdfUDF(col("content"))) + if (storeContent) withElements.select("path", outputColumn, "content") + else withElements.select("path", outputColumn) + } + + private val parsePdfUDF = udf((data: Array[Byte]) => pdfToHTMLElement(data)) + + def pdfToHTMLElement(content: Array[Byte]): Seq[HTMLElement] = { + val docInputStream = new ByteArrayInputStream(content) + try { + val pdfDoc = PDDocument.load(docInputStream) + val elements = extractElementsFromPdf(pdfDoc) + pdfDoc.close() + elements + } catch { + case e: Exception => + Seq( + HTMLElement( + ElementType.UNCATEGORIZED_TEXT, + s"Could not parse PDF: ${e.getMessage}", + mutable.Map())) + } finally { + docInputStream.close() + } + } + + private def extractElementsFromPdf(pdfDoc: PDDocument): Seq[HTMLElement] = { + val collectedElements = mutable.ListBuffer[HTMLElement]() + val textStripper = new PDFTextStripper() { + override def writeString( + text: String, + textPositions: java.util.List[TextPosition]): Unit = { + val lineGroups = groupTextPositionsByLine(textPositions) + val lineElements = lineGroups.flatMap { case (_, linePositions) => + classifyLineElement(linePositions, getCurrentPageNo) + } + collectedElements ++= lineElements + } + } + textStripper.setSortByPosition(true) + textStripper.setStartPage(1) + textStripper.setEndPage(pdfDoc.getNumberOfPages) + textStripper.getText(pdfDoc) + collectedElements + } + + private def groupTextPositionsByLine( + textPositions: java.util.List[TextPosition]): Map[Int, Seq[TextPosition]] = { + val yTolerance = 2f // Potential parameter, since needs to experiment to fit your PDFs + textPositions.asScala.groupBy(tp => (tp.getY / yTolerance).round) + } + + private def classifyLineElement( + linePositions: Seq[TextPosition], + pageNumber: Int): Option[HTMLElement] = { + val lineText = linePositions.map(_.getUnicode).mkString.trim + if (lineText.isEmpty) return None + + val averageFontSize = 
linePositions.map(_.getFontSize).sum / linePositions.size + val mostCommonFontName = linePositions.groupBy(_.getFont.getName).maxBy(_._2.size)._1 + + val elementType = + if (isTitle(averageFontSize, mostCommonFontName)) ElementType.TITLE + else ElementType.NARRATIVE_TEXT + + val metadata = mutable.Map("pageNumber" -> pageNumber.toString) + Some(HTMLElement(elementType, lineText, metadata)) + } + + private def isTitle(fontSize: Double, fontName: String): Boolean = { + fontSize >= titleThreshold || fontName.toLowerCase.contains("bold") + } + +} diff --git a/src/main/scala/com/johnsnowlabs/reader/Reader2Doc.scala b/src/main/scala/com/johnsnowlabs/reader/Reader2Doc.scala new file mode 100644 index 00000000000000..86b956eee89caf --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/reader/Reader2Doc.scala @@ -0,0 +1,261 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.reader + +import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT +import com.johnsnowlabs.nlp.{Annotation, HasOutputAnnotationCol, HasOutputAnnotatorType} +import com.johnsnowlabs.partition.util.PartitionHelper.{ + datasetWithBinaryFile, + datasetWithTextFile, + isStringContent +} +import com.johnsnowlabs.partition.{ + HasEmailReaderProperties, + HasExcelReaderProperties, + HasHTMLReaderProperties, + HasPowerPointProperties, + HasReaderProperties, + HasTextReaderProperties, + HasXmlReaderProperties, + Partition +} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} +import org.apache.spark.sql.functions.{array, col, explode, udf} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} +
+import scala.jdk.CollectionConverters.mapAsJavaMapConverter + +/** The Reader2Doc annotator lets you plug file reading directly into existing + * Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Doc can be used for + * extracting structured content from various document types using Spark NLP readers. It supports + * reading from many file types and returns parsed output as a structured Spark DataFrame. + * + * Supported formats include plain text, HTML, Word (.doc/.docx), Excel (.xls/.xlsx), PowerPoint + * (.ppt/.pptx), email files (.eml, .msg), and PDFs. + * + * ==Example== + * {{{ + * import com.johnsnowlabs.reader.Reader2Doc + * import com.
johnsnowlabs.nlp.base.DocumentAssembler + * import org.apache.spark.ml.Pipeline + * + * val reader2Doc = new Reader2Doc() + * .setContentType("application/pdf") + * .setContentPath(s"$pdfDirectory/") + * + * val pipeline = new Pipeline() + * .setStages(Array(reader2Doc)) + * + * val pipelineModel = pipeline.fit(emptyDataSet) + * val resultDf = pipelineModel.transform(emptyDataSet) + * + * resultDf.show() + * +------------------------------------------------------------------------------------------------------------------------------------+ + * |document | + * +------------------------------------------------------------------------------------------------------------------------------------+ + * |[{document, 0, 14, This is a Title, {pageNumber -> 1, elementType -> Title, fileName -> pdf-title.pdf}, []}] | + * |[{document, 15, 38, This is a narrative text, {pageNumber -> 1, elementType -> NarrativeText, fileName -> pdf-title.pdf}, []}] | + * |[{document, 39, 68, This is another narrative text, {pageNumber -> 1, elementType -> NarrativeText, fileName -> pdf-title.pdf}, []}]| + * +------------------------------------------------------------------------------------------------------------------------------------+ + * }}} + */ +class Reader2Doc(override val uid: String) + extends Transformer + with DefaultParamsWritable + with HasOutputAnnotatorType + with HasOutputAnnotationCol + with HasReaderProperties + with HasEmailReaderProperties + with HasExcelReaderProperties + with HasHTMLReaderProperties + with HasPowerPointProperties + with HasTextReaderProperties + with HasXmlReaderProperties { + + def this() = this(Identifiable.randomUID("Reader2Doc")) + + val explodeDocs: BooleanParam = + new BooleanParam(this, "explodeDocs", "whether to explode the documents into separate rows") + + def setExplodeDocs(value: Boolean): this.type = set(explodeDocs, value) + + val flattenOutput: BooleanParam = + new BooleanParam( + this, + "flattenOutput", + "If true, output is flattened to plain text with minimal metadata") + + def setFlattenOutput(value: Boolean): this.type = set(flattenOutput, value) + + val titleThreshold: Param[Float] = + new Param[Float]( + this, + "titleThreshold", + "Minimum font size threshold for title detection in PDF docs") + + def setTitleThreshold(value: Float): this.type = { + set(titleThreshold, value) + } + + setDefault( + this.explodeDocs -> true, + contentType -> "", + flattenOutput -> false, + titleThreshold -> 18) + + override def transform(dataset: Dataset[_]): DataFrame = { + validateRequiredParameters() + + val partitionDf = partitionContent(partitionBuilder, dataset) + + val annotatedDf = partitionDf + .withColumn( + getOutputCol, + wrapColumnMetadata( + partitionToAnnotation($(flattenOutput))(col("partition"), col("fileName")))) + .select(getOutputCol) + + afterAnnotate(annotatedDf) + } + + private def partitionBuilder: Partition = { + val params = Map( + "contentType" -> $(contentType), + "storeContent" -> $(storeContent).toString, + "titleFontSize" -> $(titleFontSize).toString, + "inferTableStructure" -> $(inferTableStructure).toString, + "includePageBreaks" -> $(includePageBreaks).toString, + "addAttachmentContent" -> $(addAttachmentContent).toString, + "cellSeparator" -> $(cellSeparator), + "appendCells" -> $(appendCells).toString, + "timeout" -> $(timeout).toString, + "includeSlideNotes" -> $(includeSlideNotes).toString, + "titleLengthSize" -> $(titleLengthSize).toString, + "groupBrokenParagraphs" -> $(groupBrokenParagraphs).toString, + "paragraphSplit" ->
$(paragraphSplit), + "shortLineWordThreshold" -> $(shortLineWordThreshold).toString, + "maxLineCount" -> $(maxLineCount).toString, + "threshold" -> $(threshold).toString, + "xmlKeepTags" -> $(xmlKeepTags).toString, + "onlyLeafNodes" -> $(onlyLeafNodes).toString, + "titleThreshold" -> $(titleThreshold).toString) + new Partition(params.asJava) + } + + private def partitionContent(partition: Partition, dataset: Dataset[_]): DataFrame = { + + if (isStringContent($(contentType))) { + val partitionUDF = + udf((text: String) => partition.partitionStringContent(text, $(this.headers).asJava)) + val stringContentDF = datasetWithTextFile(dataset.sparkSession, $(contentPath)) + stringContentDF + .withColumn(partition.getOutputColumn, partitionUDF(col("content"))) + .withColumn("fileName", getFileName(col("path"))) + } else { + val binaryContentDF = datasetWithBinaryFile(dataset.sparkSession, $(contentPath)) + val partitionUDF = + udf((input: Array[Byte]) => partition.partitionBytesContent(input)) + binaryContentDF + .withColumn(partition.getOutputColumn, partitionUDF(col("content"))) + .withColumn("fileName", getFileName(col("path"))) + } + } + + private def afterAnnotate(dataset: DataFrame): DataFrame = { + if ($(explodeDocs)) { + dataset + .select(dataset.columns.filterNot(_ == getOutputCol).map(col) :+ explode( + col(getOutputCol)).as("_tmp"): _*) + .withColumn( + getOutputCol, + array(col("_tmp")) + .as(getOutputCol, dataset.schema.fields.find(_.name == getOutputCol).get.metadata)) + .drop("_tmp") + } else dataset + } + + private def validateRequiredParameters(): Unit = { + require( + $(contentPath) != null && $(contentPath).trim.nonEmpty, + "contentPath must be set and not empty") + require( + $(contentType) != null && $(contentType).trim.nonEmpty, + "contentType must be set and not empty") + } + + private val getFileName = udf { path: String => + if (path != null) path.split("/").last else "" + } + + private def partitionToAnnotation(flatten: Boolean) = udf { + (partitions: Seq[Row], fileName: String) => + if (partitions == null) Nil + else { + var currentOffset = 0 + partitions.map { part => + val elementType = part.getAs[String]("elementType") + val content = part.getAs[String]("content") + val metadata = part.getAs[Map[String, String]]("metadata") + val begin = currentOffset + val end = currentOffset + (if (content != null) content.length else 0) - 1 + currentOffset = end + 1 + + // Compute new metadata + val baseMeta = if (metadata != null) metadata else Map.empty[String, String] + val withExtras = baseMeta + + ("elementType" -> elementType) + + ("fileName" -> fileName) + val finalMeta = + if (flatten) withExtras.filterKeys(_ == "sentence") + else withExtras + + Annotation( + annotatorType = outputAnnotatorType, + begin = begin, + end = end, + result = content, + metadata = finalMeta, + embeddings = Array.emptyFloatArray) + } + } + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override val outputAnnotatorType: AnnotatorType = DOCUMENT + + private lazy val columnMetadata: Metadata = { + val metadataBuilder: MetadataBuilder = new MetadataBuilder() + metadataBuilder.putString("annotatorType", outputAnnotatorType) + metadataBuilder.build + } + + override def transformSchema(schema: StructType): StructType = { + val outputFields = schema.fields :+ + StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, columnMetadata) + StructType(outputFields) + } + + private def wrapColumnMetadata(col: Column): Column = { + col.as(getOutputCol, 
columnMetadata) + } + +} + +object Reader2Doc extends DefaultParamsReadable[Reader2Doc] diff --git a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala index 4b45b039e5b1c1..c7fc5ccafb3691 100644 --- a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala +++ b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala @@ -19,14 +19,13 @@ import com.johnsnowlabs.nlp.annotators.cleaners.util.CleanerHelper.{ BLOCK_SPLIT_PATTERN, DOUBLE_PARAGRAPH_PATTERN } -import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.reader.util.pdf.TextStripperType import com.johnsnowlabs.reader.util.PartitionOptions.{ getDefaultBoolean, getDefaultInt, - getDefaultString + getDefaultString, + getDefaultDouble } -import org.apache.spark.ml.Pipeline import org.apache.spark.sql.DataFrame import scala.collection.JavaConverters._ @@ -93,35 +92,60 @@ class SparkNLPReader( def html(htmlPath: String): DataFrame = { val htmlReader = - new HTMLReader(getTitleFontSize, getStoreContent, getTimeout, headers = htmlHeaders) + new HTMLReader( + getTitleFontSize, + getStoreContent, + getTimeout, + getIncludeTitleTag, + headers = htmlHeaders) setOutputColumn(htmlReader.getOutputColumn) htmlReader.read(htmlPath) } def htmlToHTMLElement(html: String): Seq[HTMLElement] = { val htmlReader = - new HTMLReader(getTitleFontSize, getStoreContent, getTimeout, headers = htmlHeaders) + new HTMLReader( + getTitleFontSize, + getStoreContent, + getTimeout, + getIncludeTitleTag, + headers = htmlHeaders) setOutputColumn(htmlReader.getOutputColumn) htmlReader.htmlToHTMLElement(html) } def urlToHTMLElement(url: String): Seq[HTMLElement] = { val htmlReader = - new HTMLReader(getTitleFontSize, getStoreContent, getTimeout, headers = htmlHeaders) + new HTMLReader( + getTitleFontSize, + getStoreContent, + getTimeout, + getIncludeTitleTag, + headers = htmlHeaders) setOutputColumn(htmlReader.getOutputColumn) htmlReader.urlToHTMLElement(url) } def html(urls: Array[String]): DataFrame = { val htmlReader = - new HTMLReader(getTitleFontSize, getStoreContent, getTimeout, headers = htmlHeaders) + new HTMLReader( + getTitleFontSize, + getStoreContent, + getTimeout, + getIncludeTitleTag, + headers = htmlHeaders) setOutputColumn(htmlReader.getOutputColumn) htmlReader.read(urls) } def html(urls: java.util.List[String]): DataFrame = { val htmlReader = - new HTMLReader(getTitleFontSize, getStoreContent, getTimeout, headers = htmlHeaders) + new HTMLReader( + getTitleFontSize, + getStoreContent, + getTimeout, + getIncludeTitleTag, + headers = htmlHeaders) setOutputColumn(htmlReader.getOutputColumn) htmlReader.read(urls.asScala.toArray) } @@ -142,6 +166,13 @@ class SparkNLPReader( getDefaultInt(params.asScala.toMap, Seq("timeout"), default = 30) } + private def getIncludeTitleTag: Boolean = { + getDefaultBoolean( + params.asScala.toMap, + Seq("includeTitleTag", "include_title_tag"), + default = false) + } + /** Instantiates class to read email files. * * emailPath: this is a path to a directory of email files or a path to an email file E.g. 
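The overloads above all thread the new includeTitleTag flag into HTMLReader. A hedged usage sketch (assuming the params-map constructor of SparkNLPReader and one of the test files referenced elsewhere in this change; not part of the patch itself):

```scala
import scala.collection.JavaConverters._
import com.johnsnowlabs.reader.SparkNLPReader

// Both camelCase and snake_case keys resolve via getIncludeTitleTag above.
val params = Map(
  "includeTitleTag" -> "true", // emit the document's <title> as a leading Title element
  "titleFontSize" -> "14" // pixel threshold for title detection
).asJava

val reader = new SparkNLPReader(params)
val htmlDf = reader.html("src/test/resources/reader/html/title-test.html")
htmlDf.select("html").show(truncate = false)
```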
@@ -302,22 +333,20 @@ class SparkNLPReader( * Parameter with custom configuration */ def pdf(pdfPath: String): DataFrame = { - val spark = ResourceHelper.spark - spark.conf.set("spark.sql.legacy.allowUntypedScalaUDF", "true") - val pdfToText = new PdfToText() - .setStoreSplittedPdf(getStoreSplittedPdf) - .setSplitPage(getSplitPage) - .setOnlyPageNum(getOnlyPageNum) - .setTextStripper(getTextStripper) - .setSort(getSort) - .setExtractCoordinates(getExtractCoordinates) - .setNormalizeLigatures(getNormalizeLigatures) - val binaryPdfDF = spark.read.format("binaryFile").load(pdfPath) - val pipelineModel = new Pipeline() - .setStages(Array(pdfToText)) - .fit(binaryPdfDF) - - pipelineModel.transform(binaryPdfDF) + val pdfReader = new PdfReader(getStoreContent, getTitleThreshold) + pdfReader.pdf(pdfPath) + } + + def pdf(content: Array[Byte]): Seq[HTMLElement] = { + val pdfReader = new PdfReader(getStoreContent, getTitleThreshold) + pdfReader.pdfToHTMLElement(content) + } + + private def getTitleThreshold: Double = { + getDefaultDouble( + params.asScala.toMap, + Seq("titleThreshold", "title_threshold"), + default = 18.0) } private def getStoreSplittedPdf: Boolean = { @@ -757,4 +786,73 @@ class SparkNLPReader( markdownReader.parseMarkdownWithTables(mdContent) } + /** Instantiates class to read CSV files. + * + * csvPath: this is a path to a directory of CSV files or a path to a CSV file. E.g., + * "path/csv/files" + * + * ==Example== + * {{{ + * val csvPath = "home/user/csv-directory" + * val sparkNLPReader = new SparkNLPReader() + * val csvDf = sparkNLPReader.csv(csvPath) + * }}} + * + * ==Example 2== + * You can use SparkNLP in one line of code + * {{{ + * val csvDf = SparkNLP.read.csv(csvPath) + * }}} + * + * {{{ + * csvDf.select("csv").show(false) + * +-----------------------------------------------------------------------------------------------------------------------------------------+ + * |csv | + * +-----------------------------------------------------------------------------------------------------------------------------------------+ + * |[{NarrativeText, Alice 100 Bob 95, {}}, {Table,
<table><tr><td>Alice</td><td>100</td></tr><tr><td>Bob</td><td>95</td></tr></table>
, {}}] | + * +-----------------------------------------------------------------------------------------------------------------------------------------+ + * + * csvDf.printSchema() + * root + * |-- path: string (nullable = true) + * |-- csv: array (nullable = true) + * | |-- element: struct (containsNull = true) + * | | |-- elementType: string (nullable = true) + * | | |-- content: string (nullable = true) + * | | |-- metadata: map (nullable = true) + * | | | |-- key: string + * | | | |-- value: string (valueContainsNull = true) + * }}} + * + * @param csvPath + * Path to the CSV file or directory + * @return + * A DataFrame with parsed CSV as structured elements + */ + def csv(csvPath: String): DataFrame = { + val csvReader = new CSVReader( + encoding = getEncoding, + includeHeader = getIncludeHeader, + inferTableStructure = getInferTableStructure, + delimiter = getDelimiter, + storeContent = getStoreContent) + setOutputColumn(csvReader.getOutputColumn) + csvReader.csv(csvPath) + } + + private def getEncoding: String = { + getDefaultString(params.asScala.toMap, Seq("encoding"), default = "UTF-8") + } + + private def getIncludeHeader: Boolean = { + getDefaultBoolean( + params.asScala.toMap, + Seq("includeHeader", "include_header"), + default = true) + } + + private def getDelimiter: String = { + getDefaultString(params.asScala.toMap, Seq("delimiter"), default = ",") + } + } diff --git a/src/main/scala/com/johnsnowlabs/reader/TextReader.scala b/src/main/scala/com/johnsnowlabs/reader/TextReader.scala index ea0598a05940da..6f0a01161cc5e9 100644 --- a/src/main/scala/com/johnsnowlabs/reader/TextReader.scala +++ b/src/main/scala/com/johnsnowlabs/reader/TextReader.scala @@ -158,24 +158,29 @@ class TextReader( text } - // Split the processed text into blocks using two or more newlines. val blocks = processedText.split(blockSplit).map(_.trim).filter(_.nonEmpty) val elements = mutable.ArrayBuffer[HTMLElement]() var i = 0 + var sentenceIdx = 0 + while (i < blocks.length) { val currentBlock = blocks(i) + val paragraphIdx = (i / 2).toString + if (isTitleCandidate(currentBlock)) { elements += HTMLElement( ElementType.TITLE, currentBlock, - mutable.Map("paragraph" -> (i / 2).toString)) + mutable.Map("paragraph" -> paragraphIdx, "sentence" -> sentenceIdx.toString)) + sentenceIdx += 1 if (i + 1 < blocks.length && !isTitleCandidate(blocks(i + 1))) { val narrative = blocks(i + 1) if (narrative.nonEmpty) { elements += HTMLElement( ElementType.NARRATIVE_TEXT, narrative, - mutable.Map("paragraph" -> (i / 2).toString)) + mutable.Map("paragraph" -> paragraphIdx, "sentence" -> sentenceIdx.toString)) + sentenceIdx += 1 } i += 2 } else { @@ -185,7 +190,8 @@ class TextReader( elements += HTMLElement( ElementType.NARRATIVE_TEXT, currentBlock, - mutable.Map("paragraph" -> (i / 2).toString)) + mutable.Map("paragraph" -> paragraphIdx, "sentence" -> sentenceIdx.toString)) + sentenceIdx += 1 i += 1 } } diff --git a/src/main/scala/com/johnsnowlabs/reader/util/HTMLParser.scala b/src/main/scala/com/johnsnowlabs/reader/util/HTMLParser.scala new file mode 100644 index 00000000000000..7a96bc166887da --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/reader/util/HTMLParser.scala @@ -0,0 +1,65 @@ +package com.johnsnowlabs.reader.util + +object HTMLParser { + + private val PtToPx = 1.333 // 1pt ≈ 1.333px (CSS standard conversion) + + /** Extracts font size from a CSS style string, supporting 'px', 'pt', 'em', 'rem', and '%'. + * Returns pixel size as integer. 
+ * + * @param style + * CSS style string (e.g., "font-size: 1.2em; ...") + * @param baseEmPx + * Base pixel size for 1em (default 16) + * @param baseRemPx + * Base pixel size for 1rem (default 16) + * @param parentPx + * Parent font size for '%' (default 16) + * @return + * Font size in pixels, or 0 if not found + */ + def extractFontSize( + style: String, + baseEmPx: Int = 16, + baseRemPx: Int = 16, + parentPx: Int = 16): Int = { + val sizePattern = """(?i)font-size\s*:\s*([0-9.]+)\s*(px|pt|em|rem|%)""".r + sizePattern.findFirstMatchIn(style) match { + case Some(m) => + val value = m.group(1).toDouble + m.group(2).toLowerCase match { + case "px" => Math.round(value).toInt + case "pt" => Math.round(value * PtToPx).toInt + case "em" => Math.round(value * baseEmPx).toInt + case "rem" => Math.round(value * baseRemPx).toInt + case "%" => Math.round(parentPx * value / 100).toInt + case _ => 0 + } + case None => 0 + } + } + + def isFormattedAsTitle(style: String, titleFontSize: Int): Boolean = { + val lowerStyle = style.toLowerCase + + // Matches 'font-weight:bold', 'font-weight:bolder', 'font-weight:700', 'font-weight:800', 'font-weight:900' + val boldPattern = """font-weight\s*:\s*(bold(er)?|[7-9]00)\b""".r + val isBold = boldPattern.findFirstIn(lowerStyle).isDefined + + val isLargeFont = + lowerStyle.contains("font-size") && extractFontSize(lowerStyle) >= titleFontSize + val isCentered = lowerStyle.contains("text-align:center") + + isBold || isLargeFont || (isCentered && isBold) || (isCentered && isLargeFont) + } + + def isTitleElement(tag: String, style: String, role: String, titleFontSize: Int): Boolean = { + // Recognize titles from common title-related tags or formatted
<p>
elements + tag match { + case "title" | "h1" | "h2" | "h3" | "header" => true + case "p" | "div" => isFormattedAsTitle(style, titleFontSize) + case _ => role == "heading" // ARIA role="heading" + } + } + +} diff --git a/src/main/scala/com/johnsnowlabs/reader/util/PartitionOptions.scala b/src/main/scala/com/johnsnowlabs/reader/util/PartitionOptions.scala index 0d5ec60a6a7120..77255d8f29ac5a 100644 --- a/src/main/scala/com/johnsnowlabs/reader/util/PartitionOptions.scala +++ b/src/main/scala/com/johnsnowlabs/reader/util/PartitionOptions.scala @@ -50,4 +50,15 @@ object PartitionOptions { .getOrElse(default) } + def getDefaultDouble( + params: Map[String, String], + options: Seq[String], + default: Double): Double = { + options + .flatMap(params.get) + .flatMap(value => Try(value.toDouble).toOption) + .headOption + .getOrElse(default) + } + } diff --git a/src/test/resources/reader/csv/semicolon-delimited.csv b/src/test/resources/reader/csv/semicolon-delimited.csv new file mode 100644 index 00000000000000..5b3d9cf16c9371 --- /dev/null +++ b/src/test/resources/reader/csv/semicolon-delimited.csv @@ -0,0 +1,5 @@ +Lorem, ipsum; dolor sit; amet +consectetur; adipiscing; elit +sed, do; eiusmod; tempor incididunt +ut labore; et, dolore; magna aliqua +Ut enim; ad minim; veniam, quis diff --git a/src/test/resources/reader/csv/stanley-cups-utf-16.csv b/src/test/resources/reader/csv/stanley-cups-utf-16.csv new file mode 100644 index 00000000000000..b152e27aac8e0a Binary files /dev/null and b/src/test/resources/reader/csv/stanley-cups-utf-16.csv differ diff --git a/src/test/resources/reader/csv/stanley-cups.csv b/src/test/resources/reader/csv/stanley-cups.csv new file mode 100644 index 00000000000000..ab6de889333da4 --- /dev/null +++ b/src/test/resources/reader/csv/stanley-cups.csv @@ -0,0 +1,5 @@ +Stanley Cups,, +Team,Location,Stanley Cups +Blues,STL,1 +Flyers,PHI,2 +Maple Leafs,TOR,13 diff --git a/src/test/resources/reader/html/example-bold-strong.html b/src/test/resources/reader/html/example-bold-strong.html new file mode 100644 index 00000000000000..5a92267ed76560 --- /dev/null +++ b/src/test/resources/reader/html/example-bold-strong.html @@ -0,0 +1,18 @@ +<!DOCTYPE html> +<html> +<head> + <title>Bold and Strong Tag Test</title> +</head> +<body> +<div> + <b>This should be detected as a title</b> +</div> +<div> + <strong>This is also a title using strong</strong> +</div> +<div> + This is just narrative text. +</div> +</body> +</html> diff --git a/src/test/resources/reader/html/example-caption-th.html b/src/test/resources/reader/html/example-caption-th.html new file mode 100644 index 00000000000000..6c1a4870a9fec0 --- /dev/null +++ b/src/test/resources/reader/html/example-caption-th.html @@ -0,0 +1,27 @@ +<!DOCTYPE html> +<html> +<head> + <title>Table Example with Caption and Headers</title> +</head> +<body> +<table> + <caption>Student Grades</caption> + <thead> + <tr><th>Name</th><th>Subject</th><th>Grade</th></tr> + </thead> + <tbody> + <tr><td>Alice</td><td>Math</td><td>A</td></tr> + <tr><td>Bob</td><td>Science</td><td>B+</td></tr> + </tbody> +</table> +</body> +</html> diff --git a/src/test/resources/reader/html/example-div.html b/src/test/resources/reader/html/example-div.html new file mode 100644 index 00000000000000..12b3f854e9719d --- /dev/null +++ b/src/test/resources/reader/html/example-div.html @@ -0,0 +1,15 @@ +<!DOCTYPE html> +<html> +<head> + <title>Text in Title</title> +</head> +<body> +<div style="font-size:20px; font-weight:bold;"> + This Text Is Consider Title +</div> +<div style="font-size:14px; margin:4px;"> + The text here is consider as narrative text, so it's content data. +</div> +</body> +</html> diff --git a/src/test/resources/reader/html/xml-example.xml b/src/test/resources/reader/html/xml-example.xml deleted file mode 100644 index 83b100580081a9..00000000000000 --- a/src/test/resources/reader/html/xml-example.xml +++ /dev/null @@ -1,7 +0,0 @@ - - 101 - Jane Doe - jane.doe@example.com - true - 29 - \ No newline at end of file diff --git a/src/test/resources/reader/md/README.md b/src/test/resources/reader/md/README.md new file mode 100644 index 00000000000000..47639003186f6d --- /dev/null +++ b/src/test/resources/reader/md/README.md @@ -0,0 +1,23 @@ +## Example Docs + +The sample docs directory contains the following files: + +- `example-10k.html` - A 10-K SEC filing in HTML format +- `layout-parser-paper.pdf` - A PDF copy of the layout parser paper +- `factbook.xml`/`factbook.xsl` - Example XML/XSL files that you can use to test stylesheets + +These documents can be used to test out the parsers in the library. In addition, here are +instructions for pulling in some sample docs that are too big to store in the repo. + +#### XBRL 10-K + +You can get an example 10-K in inline XBRL format using the following `curl`. Note, you need +to have the user agent set in the header or the SEC site will reject your request. + +```bash +curl -O \ + -A '${organization} ${email}' \ + https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt +``` + +You can parse this document using the HTML parser. diff --git a/src/test/scala/com/johnsnowlabs/nlp/SparkNLPTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/SparkNLPTestSpec.scala index 0f19b96c975cf3..26222afc780159 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/SparkNLPTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/SparkNLPTestSpec.scala @@ -15,7 +15,7 @@ */ package com.johnsnowlabs.nlp -import com.johnsnowlabs.tags.FastTest +import com.johnsnowlabs.tags.{FastTest, SlowTest} import com.johnsnowlabs.util.ConfigHelper.{awsJavaSdkVersion, hadoopAwsVersion} import org.apache.spark.sql.functions.col import org.scalatest.flatspec.AnyFlatSpec @@ -60,18 +60,16 @@ class SparkNLPTestSpec extends AnyFlatSpec { assert(!htmlDF.select(col("html").getItem(0)).isEmpty) } - it should "structured HTML in real time" taggedAs FastTest in { + it should "structured HTML in real time" taggedAs SlowTest in { val url = "https://www.wikipedia.org" val htmlDF = SparkNLP.read.html(url) - htmlDF.show() assert(!htmlDF.select(col("html").getItem(0)).isEmpty) } - it should "structured HTML in real time for a set of URLs" taggedAs FastTest in { + it should "structured HTML in real time for a set of URLs" taggedAs SlowTest in { val urls = Array("https://www.wikipedia.org", "https://example.com/") val htmlDF = SparkNLP.read.html(urls) - htmlDF.show() assert(!htmlDF.select(col("html").getItem(0)).isEmpty) } @@ -89,7 +87,6 @@ class SparkNLPTestSpec extends AnyFlatSpec { it should "structured Email files" taggedAs FastTest in { val emailDirectory = "src/test/resources/reader/email" val emailDF = SparkNLP.read.email(emailDirectory) - emailDF.show() assert(!emailDF.select(col("email").getItem(0)).isEmpty) } diff --git a/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala b/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala index 05c5916c843424..94a8ff20b56f82 100644 --- a/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala +++ b/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala @@ -17,7 +17,7 @@ package com.johnsnowlabs.partition import
com.johnsnowlabs.reader.{ElementType, HTMLElement} -import com.johnsnowlabs.tags.FastTest +import com.johnsnowlabs.tags.{FastTest, SlowTest} import org.apache.spark.sql.functions.col import org.scalatest.flatspec.AnyFlatSpec @@ -36,28 +36,24 @@ class PartitionTest extends AnyFlatSpec { "Partition" should "work with text content_type" taggedAs FastTest in { val textDf = Partition(Map("content_type" -> "text/plain")).partition(txtDirectory) - textDf.show() assert(!textDf.select(col("txt").getItem(0)).isEmpty) } it should "identify text file" taggedAs FastTest in { val textDf = Partition().partition(s"$txtDirectory/simple-text.txt") - textDf.show() assert(!textDf.select(col("txt").getItem(0)).isEmpty) } it should "work with word content_type" taggedAs FastTest in { val wordDf = Partition(Map("content_type" -> "application/msword")).partition(wordDirectory) - wordDf.show() assert(!wordDf.select(col("doc").getItem(0)).isEmpty) } it should "identify word file" taggedAs FastTest in { val wordDf = Partition().partition(s"$wordDirectory/fake_table.docx") - wordDf.show() assert(!wordDf.select(col("doc").getItem(0)).isEmpty) } @@ -65,28 +61,24 @@ class PartitionTest extends AnyFlatSpec { it should "work with excel content_type" taggedAs FastTest in { val excelDf = Partition(Map("content_type" -> "application/vnd.ms-excel")).partition(excelDirectory) - excelDf.show() assert(!excelDf.select(col("xls").getItem(0)).isEmpty) } it should "identify excel file" taggedAs FastTest in { val excelDf = Partition().partition(s"$excelDirectory/vodafone.xlsx") - excelDf.show() assert(!excelDf.select(col("xls").getItem(0)).isEmpty) } it should "work with email content_type" taggedAs FastTest in { val emailDf = Partition(Map("content_type" -> "message/rfc822")).partition(emailDirectory) - emailDf.show() assert(!emailDf.select(col("email").getItem(0)).isEmpty) } it should "wok with email file" taggedAs FastTest in { val emailDf = Partition().partition(s"$emailDirectory/test-several-attachments.eml") - emailDf.show() assert(!emailDf.select(col("email").getItem(0)).isEmpty) } @@ -94,59 +86,51 @@ class PartitionTest extends AnyFlatSpec { it should "work with powerpoint content_type" taggedAs FastTest in { val pptDf = Partition(Map("content_type" -> "application/vnd.ms-powerpoint")) .partition(powerPointDirectory) - pptDf.show() assert(!pptDf.select(col("ppt").getItem(0)).isEmpty) } it should "identify powerpoint file" taggedAs FastTest in { val pptDf = Partition().partition(s"$powerPointDirectory/fake-power-point.pptx") - pptDf.show() assert(!pptDf.select(col("ppt").getItem(0)).isEmpty) } it should "work with html content_type" taggedAs FastTest in { val htmlDf = Partition(Map("content_type" -> "text/html")).partition(htmlDirectory) - htmlDf.show() assert(!htmlDf.select(col("html").getItem(0)).isEmpty) } it should "identify html file" taggedAs FastTest in { val htmlDf = Partition().partition(s"$htmlDirectory/fake-html.html") - htmlDf.show() assert(!htmlDf.select(col("html").getItem(0)).isEmpty) } - it should "work with an URL" taggedAs FastTest in { + it should "work with an URL" taggedAs SlowTest in { val htmlDf = Partition().partition("https://www.wikipedia.org") - htmlDf.show() assert(!htmlDf.select(col("html").getItem(0)).isEmpty) } - it should "work with a set of URLS" taggedAs FastTest in { + it should "work with a set of URLS" taggedAs SlowTest in { val htmlDf = Partition().partitionUrls(Array("https://www.wikipedia.org", "https://example.com/")) - htmlDf.show() 
assert(!htmlDf.select(col("html").getItem(0)).isEmpty) } it should "identify a PDF file" taggedAs FastTest in { val pdfDf = Partition().partition(s"$pdfDirectory/text_3_pages.pdf") - pdfDf.show() - assert(!pdfDf.select(col("text")).isEmpty) + assert(!pdfDf.select(col("pdf")).isEmpty) } it should "work with PDF content_type" taggedAs FastTest in { val pdfDf = Partition(Map("content_type" -> "application/pdf")).partition(pdfDirectory) - pdfDf.show() - assert(!pdfDf.select(col("text")).isEmpty) + assert(!pdfDf.select(col("pdf")).isEmpty) } it should "work with text in memory" taggedAs FastTest in { @@ -161,7 +145,6 @@ class PartitionTest extends AnyFlatSpec { |""".stripMargin val textDf = Partition(Map("groupBrokenParagraphs" -> "true")).partitionText(content) - textDf.show() val elements: Seq[HTMLElement] = textDf .select("txt") @@ -179,12 +162,12 @@ class PartitionTest extends AnyFlatSpec { "At the end of the lane, the fox met a bear.", mutable.Map("paragraph" -> "0"))) - assert(elements == expectedElements) + assert(elements.head.elementType == expectedElements.head.elementType) + assert(elements.head.content == expectedElements.head.content) } it should "work with XML content_type" taggedAs FastTest in { val pdfDf = Partition(Map("content_type" -> "application/xml")).partition(xmlDirectory) - pdfDf.show() assert(!pdfDf.select(col("xml")).isEmpty) } diff --git a/src/test/scala/com/johnsnowlabs/reader/CSVReaderTest.scala b/src/test/scala/com/johnsnowlabs/reader/CSVReaderTest.scala new file mode 100644 index 00000000000000..8b08dc07e36e6e --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/reader/CSVReaderTest.scala @@ -0,0 +1,106 @@ +package com.johnsnowlabs.reader + +import org.apache.spark.sql.{DataFrame, Row} +import org.scalatest.flatspec.AnyFlatSpec + +class CSVReaderTest extends AnyFlatSpec { + + val csvFilesDirectory = "src/test/resources/reader/csv" + + "CSVReader.csv" should "include TABLE element in csv array only if inferTableStructure=true" in { + val filePath = s"$csvFilesDirectory/stanley-cups.csv" + + val csvReader = new CSVReader(inferTableStructure = true) + val csvDf = csvReader.csv(filePath) + val elements = getFirstElementsArray(csvDf, csvReader.getOutputColumn) + + val tableElement = elements.find(_.elementType == ElementType.TABLE) + val textElement = elements.find(_.elementType == ElementType.NARRATIVE_TEXT) + + assert(tableElement.isDefined) + assert(tableElement.get.content.trim.nonEmpty) + assert(textElement.isDefined) + assert(textElement.get.content.trim.nonEmpty) + } + + "CSVReader.csv" should "include only text element in csv array if inferTableStructure=false" in { + val filePath = s"$csvFilesDirectory/stanley-cups.csv" + + val csvReader = new CSVReader(inferTableStructure = false) + val csvDf = csvReader.csv(filePath) + + val elements = getFirstElementsArray(csvDf, csvReader.getOutputColumn) + + val tableElement = elements.find(_.elementType == ElementType.TABLE) + val textElement = elements.find(_.elementType == ElementType.NARRATIVE_TEXT) + + assert(tableElement.isEmpty) + assert(textElement.isDefined) + assert(textElement.get.content.trim.nonEmpty) + } + + def getFirstElementsArray(df: DataFrame, outputCol: String): Array[HTMLElement] = { + import df.sparkSession.implicits._ + df.select(outputCol).as[Array[HTMLElement]].head() + } + + "CSVReader" should "produce normalized content including header when includeHeader = true" in { + val filePath = s"$csvFilesDirectory/stanley-cups-utf-16.csv" + + val reader = new CSVReader(encoding = "UTF-16", includeHeader = true) + val csvDf = reader.csv(filePath) + + val elements = csvDf.head.getAs[Seq[Row]]("csv") + val plainText = elements.head.getAs[String]("content") + val EXPECTED_TEXT = + "Stanley Cups Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" + val result = cleanExtraWhitespace(plainText) + val expected = cleanExtraWhitespace(EXPECTED_TEXT) + + assert(result == expected) + } + + "CSVReader" should "produce normalized content without header when includeHeader = false" in { + val filePath = s"$csvFilesDirectory/stanley-cups-utf-16.csv" + + val reader = new CSVReader(encoding = "UTF-16", includeHeader = false) + val csvDf = reader.csv(filePath) + + val elements = csvDf.head.getAs[Seq[Row]]("csv") + val plainText = elements.head.getAs[String]("content") + val EXPECTED_TEXT = "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" + val result = cleanExtraWhitespace(plainText) + val expected = cleanExtraWhitespace(EXPECTED_TEXT) + + assert(result == expected) + } + + "CSVReader.csv" should "work for other delimiters" in { + val filePath = s"$csvFilesDirectory/semicolon-delimited.csv" + + val csvReader = new CSVReader(inferTableStructure = false, delimiter = ";") + val csvDf = csvReader.csv(filePath) + val elements = getFirstElementsArray(csvDf, csvReader.getOutputColumn) + + val tableElement = elements.find(_.elementType == ElementType.TABLE) + val textElement = elements.find(_.elementType == ElementType.NARRATIVE_TEXT) + + assert(tableElement.isEmpty) + assert(textElement.isDefined) + assert(textElement.get.content.trim.nonEmpty) + } + + def getFirstElement(df: org.apache.spark.sql.DataFrame, outputCol: String): HTMLElement = { + import df.sparkSession.implicits._ + df.select(outputCol).as[Seq[HTMLElement]].head.head + } + + def cleanExtraWhitespace(text: String): String = { + // Replace non-breaking spaces (\u00A0) and newlines with a single space + val cleanedText = text + .replaceAll("[\\u00A0\\n]", " ") + .replaceAll(" {2,}", " ") + cleanedText.trim + } + +} diff --git a/src/test/scala/com/johnsnowlabs/reader/HTMLReaderTest.scala b/src/test/scala/com/johnsnowlabs/reader/HTMLReaderTest.scala index b59b537be4ac89..ffd91def9aa1ad 100644 --- a/src/test/scala/com/johnsnowlabs/reader/HTMLReaderTest.scala +++ b/src/test/scala/com/johnsnowlabs/reader/HTMLReaderTest.scala @@ -15,8 +15,8 @@ */ package com.johnsnowlabs.reader -import com.johnsnowlabs.tags.FastTest -import org.apache.spark.sql.functions.{col, explode, map_keys} +import com.johnsnowlabs.tags.{FastTest, SlowTest} +import org.apache.spark.sql.functions.{col, explode} import org.scalatest.flatspec.AnyFlatSpec class HTMLReaderTest extends AnyFlatSpec { @@ -26,7 +26,6 @@ class HTMLReaderTest extends AnyFlatSpec { it should "read html as dataframe" taggedAs FastTest in { val HTMLReader = new HTMLReader() val htmlDF = HTMLReader.read(htmlFilesDirectory) - htmlDF.show() assert(!htmlDF.select(col("html").getItem(0)).isEmpty) assert(!htmlDF.columns.contains("content")) @@ -35,25 +34,22 @@ class HTMLReaderTest extends AnyFlatSpec { it should "read html as dataframe with params" taggedAs FastTest in { val HTMLReader = new HTMLReader(titleFontSize = 12) val htmlDF = HTMLReader.read(htmlFilesDirectory) - htmlDF.show() assert(!htmlDF.select(col("html").getItem(0)).isEmpty) assert(!htmlDF.columns.contains("content")) } - it should "parse an html in real time" taggedAs FastTest in { + it should "parse HTML in real time" taggedAs SlowTest in { val HTMLReader = new HTMLReader() val htmlDF 
= HTMLReader.read("https://www.wikipedia.org") - htmlDF.show() assert(!htmlDF.select(col("html").getItem(0)).isEmpty) assert(!htmlDF.columns.contains("content")) } - it should "parse URLS in real time" taggedAs FastTest in { + it should "parse URLs in real time" taggedAs SlowTest in { val HTMLReader = new HTMLReader() val htmlDF = HTMLReader.read(Array("https://www.wikipedia.org", "https://example.com/")) - htmlDF.show() assert(!htmlDF.select(col("html").getItem(0)).isEmpty) assert(!htmlDF.columns.contains("content")) @@ -62,7 +58,6 @@ it should "store content" taggedAs FastTest in { val HTMLReader = new HTMLReader(storeContent = true) val htmlDF = HTMLReader.read(htmlFilesDirectory) - htmlDF.show() assert(!htmlDF.select(col("html").getItem(0)).isEmpty) assert(htmlDF.columns.contains("content")) @@ -72,16 +67,16 @@ val HTMLReader = new HTMLReader(headers = Map("User-Agent" -> "Mozilla/5.0", "Accept-Language" -> "es-ES")) val htmlDF = HTMLReader.read("https://www.google.com") - htmlDF.show() assert(!htmlDF.select(col("html").getItem(0)).isEmpty) assert(!htmlDF.columns.contains("content")) } - it should "output as title for font size >= 14" taggedAs FastTest in { - val HTMLReader = new HTMLReader(titleFontSize = 14) + it should "output as title for font size >= 19" taggedAs FastTest in { + val HTMLReader = new HTMLReader(titleFontSize = 19) val htmlDF = HTMLReader.read(s"$htmlFilesDirectory/title-test.html") + htmlDF.show(truncate = false) val titleDF = htmlDF .select(explode(col("html")).as("exploded_html")) .filter(col("exploded_html.elementType") === ElementType.TITLE) @@ -90,8 +85,8 @@ assert(titleDF.count() == 2) } - it should "output as title for font size >= 16" taggedAs FastTest in { - val HTMLReader = new HTMLReader(titleFontSize = 16) + it should "output as title for font size >= 22" taggedAs FastTest in { + val HTMLReader = new HTMLReader(titleFontSize = 22) val htmlDF = HTMLReader.read(s"$htmlFilesDirectory/title-test.html") val titleDF = htmlDF @@ -102,4 +97,52 @@ assert(titleDF.count() == 1) } + it should "correctly parse div tags" taggedAs FastTest in { + val HTMLReader = new HTMLReader() + val htmlDF = HTMLReader.read(s"$htmlFilesDirectory/example-div.html") + val titleDF = htmlDF + .select(explode(col("html")).as("exploded_html")) + .filter(col("exploded_html.elementType") === ElementType.TITLE) + val textDF = htmlDF + .select(explode(col("html")).as("exploded_html")) + .filter(col("exploded_html.elementType") === ElementType.NARRATIVE_TEXT) + + assert(titleDF.count() == 1) + assert(textDF.count() == 1) + } + + it should "correctly parse bold and strong tags" taggedAs FastTest in { + val HTMLReader = new HTMLReader() + val htmlDF = HTMLReader.read(s"$htmlFilesDirectory/example-bold-strong.html") + htmlDF.show(truncate = false) + val titleDF = htmlDF + .select(explode(col("html")).as("exploded_html")) + .filter(col("exploded_html.elementType") === ElementType.TITLE) + + assert(titleDF.count() == 2) + } + + it should "correctly parse caption and th tags" taggedAs FastTest in { + val HTMLReader = new HTMLReader() + val htmlDF = HTMLReader.read(s"$htmlFilesDirectory/example-caption-th.html") + htmlDF.show(truncate = false) + val tableDF = htmlDF + .select(explode(col("html")).as("exploded_html")) + .filter(col("exploded_html.elementType") === ElementType.TABLE) + + assert(tableDF.count() == 1) + } + + it 
should "include title tag value in metadata" taggedAs FastTest in { + val HTMLReader = new HTMLReader(includeTitleTag = true) + val htmlDF = HTMLReader.read(s"$htmlFilesDirectory/example-caption-th.html") + htmlDF.show(truncate = false) + + val titleDF = htmlDF + .select(explode(col("html")).as("exploded_html")) + .filter(col("exploded_html.elementType") === ElementType.TITLE) + + assert(titleDF.count() == 1) + } + } diff --git a/src/test/scala/com/johnsnowlabs/reader/MarkdownReaderTest.scala b/src/test/scala/com/johnsnowlabs/reader/MarkdownReaderTest.scala index 28edc9e3444dc0..91668073729f6c 100644 --- a/src/test/scala/com/johnsnowlabs/reader/MarkdownReaderTest.scala +++ b/src/test/scala/com/johnsnowlabs/reader/MarkdownReaderTest.scala @@ -80,7 +80,7 @@ class MarkdownReaderTest extends AnyFlatSpec { } it should "parse README.md and the first element must be a TITLE" taggedAs FastTest in { - val mdDf = mdReader.md(filePath = s"$mdDirectory/README.md") // Update path if needed + val mdDf = mdReader.md(filePath = s"$mdDirectory/README.md") val elements: Seq[HTMLElement] = mdDf .select(mdReader.getOutputColumn) diff --git a/src/test/scala/com/johnsnowlabs/reader/PdfReaderTest.scala b/src/test/scala/com/johnsnowlabs/reader/PdfReaderTest.scala new file mode 100644 index 00000000000000..6e1803088f2599 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/reader/PdfReaderTest.scala @@ -0,0 +1,68 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.reader + +import com.johnsnowlabs.tags.FastTest +import org.scalatest.flatspec.AnyFlatSpec +import org.apache.spark.sql.functions.{col, explode} + +class PdfReaderTest extends AnyFlatSpec { + + val pdfDirectory = "src/test/resources/reader/pdf/" + + "PdfReader" should "read a PDF file as dataframe" taggedAs FastTest in { + val pdfReader = new PdfReader() + val pdfDf = pdfReader.pdf(s"$pdfDirectory/text_3_pages.pdf") + pdfDf.show() + + assert(!pdfDf.select(col("pdf").getItem(0)).isEmpty) + assert(!pdfDf.columns.contains("content")) + } + + it should "store content" taggedAs FastTest in { + val pdfReader = new PdfReader(storeContent = true) + val pdfDf = pdfReader.pdf(s"$pdfDirectory/text_3_pages.pdf") + pdfDf.show() + + assert(!pdfDf.select(col("pdf").getItem(0)).isEmpty) + assert(pdfDf.columns.contains("content")) + } + + it should "identify text as titles based on threshold value" taggedAs FastTest in { + val pdfReader = new PdfReader(titleThreshold = 10) + val pdfDf = pdfReader.pdf(s"$pdfDirectory/pdf-title.pdf") + pdfDf.show(false) + + val titleDF = pdfDf + .select(explode(col("pdf")).as("exploded_pdf")) + .filter(col("exploded_pdf.elementType") === ElementType.TITLE) + titleDF.select("exploded_pdf").show(truncate = false) + + assert(titleDF.count() == 3) + } + + it should "handle corrupted files" taggedAs FastTest in { + val pdfReader = new PdfReader() + val pdfDf = pdfReader.pdf(s"src/test/resources/reader/pdf-corrupted/corrupted.pdf") + + val resultDF = pdfDf + .select(explode(col("pdf")).as("exploded_pdf")) + .filter(col("exploded_pdf.elementType") === ElementType.UNCATEGORIZED_TEXT) + + assert(resultDF.count() == 1) + } + +} diff --git a/src/test/scala/com/johnsnowlabs/reader/Reader2DocTest.scala b/src/test/scala/com/johnsnowlabs/reader/Reader2DocTest.scala new file mode 100644 index 00000000000000..f3f28c2510fcab --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/reader/Reader2DocTest.scala @@ -0,0 +1,250 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.reader + +import com.johnsnowlabs.nlp.annotators.SparkSessionTest +import com.johnsnowlabs.nlp.{Annotation, AssertAnnotations} +import com.johnsnowlabs.tags.FastTest +import org.apache.spark.ml.Pipeline +import org.scalatest.flatspec.AnyFlatSpec + +class Reader2DocTest extends AnyFlatSpec with SparkSessionTest { + + val htmlFilesDirectory = "src/test/resources/reader/html" + val docDirectory = "src/test/resources/reader/doc" + val txtDirectory = "src/test/resources/reader/txt/" + val pdfDirectory = "src/test/resources/reader/pdf/" + val mdDirectory = "src/test/resources/reader/md" + val xmlDirectory = "src/test/resources/reader/xml" + + "Reader2Doc" should "convert unstructured input to structured output for HTML" taggedAs FastTest in { + + val reader2Doc = new Reader2Doc() + .setContentType("text/html") + .setContentPath(s"$htmlFilesDirectory/example-div.html") + .setOutputCol("document") + + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(emptyDataSet) + + assert(resultDf.count() > 1) + } + + it should "output clean flatten text without any structured metadata" taggedAs FastTest in { + + val reader2Doc = new Reader2Doc() + .setContentType("text/html") + .setContentPath(s"$htmlFilesDirectory/example-div.html") + .setOutputCol("document") + .setFlattenOutput(true) + .setExplodeDocs(false) + + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(emptyDataSet) + resultDf.show(truncate = false) + + val expected: Array[Seq[Annotation]] = Array( + Seq( + Annotation( + "document", + 0, + 26, + "This Text Is Consider Title", + Map("sentence" -> "0"), + Array.emptyFloatArray), + Annotation( + "document", + 27, + 92, + "The text here is consider as narrative text, so it's content data.", + Map("sentence" -> "1"), + Array.emptyFloatArray))) + + val actual: Array[Seq[Annotation]] = AssertAnnotations.getActualResult(resultDf, "document") + + AssertAnnotations.assertFields(expected, actual) + + for { + doc <- actual + annotation <- doc + } { + assert( + annotation.metadata.keySet == Set("sentence"), + s"Metadata keys should only be 'sentence', but got: ${annotation.metadata.keySet}") + } + } + + it should "convert Reader output to Document format with one row per document" taggedAs FastTest in { + + val reader2Doc = new Reader2Doc() + .setContentType("text/html") + .setContentPath(s"$htmlFilesDirectory/example-div.html") + .setOutputCol("document") + .setExplodeDocs(false) + + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(emptyDataSet) + + assert(resultDf.count() == 1) + } + + it should "work with Tokenizer" taggedAs FastTest in { + val reader2Doc = new Reader2Doc() + .setContentType("text/html") + .setContentPath(s"$htmlFilesDirectory/fake-html.html") + .setOutputCol("document") + val pipeline = new Pipeline().setStages(Array(reader2Doc, tokenizer)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(emptyDataSet) + resultDf.select("document").show(truncate = false) + + assert(resultDf.count() > 1) + } + + it should "work for Text documents" taggedAs FastTest in { + val reader2Doc = new Reader2Doc() + .setContentType("text/plain") + .setContentPath(s"$txtDirectory/simple-text.txt") + .setOutputCol("document") + + val pipeline = new 
Pipeline().setStages(Array(reader2Doc)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(emptyDataSet) + + assert(resultDf.count() > 1) + } + + it should "work for Word documents" taggedAs FastTest in { + val reader2Doc = new Reader2Doc() + .setContentType("application/msword") + .setContentPath(s"$docDirectory/page-breaks.docx") + .setOutputCol("document") + + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(emptyDataSet) + + assert(resultDf.count() > 1) + } + + it should "work with PDF documents" taggedAs FastTest in { + val reader2Doc = new Reader2Doc() + .setContentType("application/pdf") + .setContentPath(s"$pdfDirectory/pdf-title.pdf") + .setOutputCol("document") + + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(emptyDataSet) + + assert(resultDf.count() > 1) + } + + it should "work with Markdown" taggedAs FastTest in { + val reader2Doc = new Reader2Doc() + .setContentType("text/markdown") + .setContentPath(s"$mdDirectory/simple.md") + .setOutputCol("document") + + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(emptyDataSet) + + assert(resultDf.count() > 1) + } + + it should "work with XML" taggedAs FastTest in { + val reader2Doc = new Reader2Doc() + .setContentType("application/xml") + .setContentPath(s"$xmlDirectory/test.xml") + .setOutputCol("document") + + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + + val pipelineModel = pipeline.fit(emptyDataSet) + val resultDf = pipelineModel.transform(emptyDataSet) + + assert(resultDf.count() > 1) + } + + it should "throw if contentPath is not set" taggedAs FastTest in { + val reader2Doc = new Reader2Doc() + .setContentType("text/html") + .setOutputCol("document") + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + val pipelineModel = pipeline.fit(emptyDataSet) + + val ex = intercept[IllegalArgumentException] { + pipelineModel.transform(emptyDataSet) + } + assert(ex.getMessage.contains("contentPath must be set")) + } + + it should "throw if contentPath is empty string" taggedAs FastTest in { + val reader2Doc = new Reader2Doc() + .setContentType("text/html") + .setOutputCol("document") + .setContentPath(" ") + + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + val pipelineModel = pipeline.fit(emptyDataSet) + + val ex = intercept[IllegalArgumentException] { + pipelineModel.transform(emptyDataSet) + } + assert(ex.getMessage.contains("contentPath must be set")) + } + + it should "throw if contentType is not set" taggedAs FastTest in { + val reader2Doc = new Reader2Doc() + .setContentPath("/some/path/file.txt") + .setOutputCol("document") + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + val pipelineModel = pipeline.fit(emptyDataSet) + + val ex = intercept[IllegalArgumentException] { + pipelineModel.transform(emptyDataSet) + } + assert(ex.getMessage.contains("contentType must be set")) + } + + it should "throw if contentType is empty string" taggedAs FastTest in { + val reader2Doc = new Reader2Doc() + .setContentPath("/some/path/file.txt") + .setContentType("") + .setOutputCol("document") + + val pipeline = new Pipeline().setStages(Array(reader2Doc)) + val pipelineModel = pipeline.fit(emptyDataSet) + + val ex = intercept[IllegalArgumentException] { + pipelineModel.transform(emptyDataSet) + } + assert(ex.getMessage.contains("contentType must be set")) + } + +} diff --git a/src/test/scala/com/johnsnowlabs/reader/TextReaderTest.scala b/src/test/scala/com/johnsnowlabs/reader/TextReaderTest.scala index 33685f4eeca2fd..5ff4dcb81fb210 100644 --- a/src/test/scala/com/johnsnowlabs/reader/TextReaderTest.scala +++ b/src/test/scala/com/johnsnowlabs/reader/TextReaderTest.scala @@ -75,7 +75,9 @@ class TextReaderTest extends AnyFlatSpec { "At the end of the lane, the fox met a bear.", mutable.Map("paragraph" -> "0"))) - assert(elements == expectedElements) + val actualBasic = elements.map(e => (e.elementType, e.content)) + val expectedBasic = expectedElements.map(e => (e.elementType, e.content)) + assert(actualBasic == expectedBasic) } it should "group broken paragraphs reading from file" taggedAs FastTest in { @@ -100,7 +102,9 @@ class TextReaderTest extends AnyFlatSpec { "At the end of the lane, the fox met a bear.", mutable.Map("paragraph" -> "0"))) - assert(elements == expectedElements) + val actualBasic = elements.map(e => (e.elementType, e.content)) + val expectedBasic = expectedElements.map(e => (e.elementType, e.content)) + assert(actualBasic == expectedBasic) } it should "paragraph split with custom regex" taggedAs FastTest in { @@ -134,7 +138,9 @@ the fox met a friendly bear.""" "At the end of the lane the fox met a friendly bear.", mutable.Map("paragraph" -> "0"))) - assert(elements == expectedElements) + val actualBasic = elements.map(e => (e.elementType, e.content)) + val expectedBasic = expectedElements.map(e => (e.elementType, e.content)) + assert(actualBasic == expectedBasic) } it should "output as title for font size >= 40" taggedAs FastTest in { diff --git a/src/test/scala/com/johnsnowlabs/reader/util/HTMLParserTest.scala b/src/test/scala/com/johnsnowlabs/reader/util/HTMLParserTest.scala new file mode 100644 index 00000000000000..e29efe7b2db5bd --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/reader/util/HTMLParserTest.scala @@ -0,0 +1,123 @@ +package com.johnsnowlabs.reader.util + +import org.scalatest.flatspec.AnyFlatSpec + +class HTMLParserTest extends AnyFlatSpec { + + private val PtToPx = 1.333 + + "extractFontSize" should "extract px values correctly" in { + assert(HTMLParser.extractFontSize("font-size: 18px;") == 18) + assert(HTMLParser.extractFontSize("font-size : 24 px ;") == 24) + } + + it should "extract pt values and convert to px" in { + assert(HTMLParser.extractFontSize("font-size: 12pt;") == Math.round(12 * PtToPx).toInt) + assert(HTMLParser.extractFontSize("font-size: 14 pt;") == Math.round(14 * PtToPx).toInt) + } + + it should "extract em values using default baseEmPx" in { + assert(HTMLParser.extractFontSize("font-size: 2em;") == 32) // 2 * 16 + assert(HTMLParser.extractFontSize("font-size: 1.08em;") == Math.round(1.08 * 16).toInt) + } + + it should "extract rem values using default baseRemPx" in { + assert(HTMLParser.extractFontSize("font-size: 1.5rem;") == 24) // 1.5 * 16 + assert(HTMLParser.extractFontSize("font-size: 0.9rem;") == Math.round(0.9 * 16).toInt) + } + + it should "extract percent values using default parentPx" in { + assert(HTMLParser.extractFontSize("font-size: 200%;") == 32) // 200% of 16 + assert(HTMLParser.extractFontSize("font-size: 75%;") == 12) + } + + it should "allow overriding baseEmPx, baseRemPx, and parentPx" in { + assert(HTMLParser.extractFontSize("font-size: 2em;", baseEmPx = 20) == 40) + assert(HTMLParser.extractFontSize("font-size: 1.5rem;", baseRemPx = 10) == 15) + 
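// Percent values resolve against the overridden parentPx: 50% of 10px is 5px +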
assert(HTMLParser.extractFontSize("font-size: 50%;", parentPx = 10) == 5) + } + + it should "return 0 for missing or unrecognized font-size" in { + assert(HTMLParser.extractFontSize("font-weight: bold;") == 0) + assert(HTMLParser.extractFontSize("font-size: large;") == 0) + assert(HTMLParser.extractFontSize("") == 0) + } + + it should "handle spaces and mixed case units" in { + assert(HTMLParser.extractFontSize("font-size : 18PX ;") == 18) + assert(HTMLParser.extractFontSize("font-size: 12Pt ;") == Math.round(12 * PtToPx).toInt) + assert(HTMLParser.extractFontSize("font-size : 1.2eM ;") == Math.round(1.2 * 16).toInt) + } + + // --- BOLD detection --- + "isFormattedAsTitle" should "detect 'font-weight:bold' as title" in { + assert(HTMLParser.isFormattedAsTitle("font-weight:bold;", 16)) + assert(HTMLParser.isFormattedAsTitle("font-weight: bold ;", 16)) + assert(HTMLParser.isFormattedAsTitle(" FONT-WEIGHT:BOLD ; ", 16)) // Mixed case + assert( + HTMLParser.isFormattedAsTitle("font-weight:bold;font-size:10px;", 16) + ) // Bold but small font + } + + it should "detect 'font-weight:bolder' as title" in { + assert(HTMLParser.isFormattedAsTitle("font-weight:bolder;", 16)) + assert(HTMLParser.isFormattedAsTitle("font-weight: bolder ; font-size:12px;", 16)) + } + + it should "detect numeric bold values" in { + assert(HTMLParser.isFormattedAsTitle("font-weight:700;", 16)) + assert(HTMLParser.isFormattedAsTitle("font-weight: 900 ;", 16)) + assert(HTMLParser.isFormattedAsTitle("font-weight:800; font-size:10px;", 16)) + } + + it should "not detect normal or light weights as title" in { + assert(!HTMLParser.isFormattedAsTitle("font-weight:400;", 16)) + assert(!HTMLParser.isFormattedAsTitle("font-weight:normal;", 16)) + assert(!HTMLParser.isFormattedAsTitle("font-weight:light;", 16)) + assert(!HTMLParser.isFormattedAsTitle("font-weight:100;", 16)) + } + + // --- LARGE FONT detection --- + it should "detect large font-size as title" in { + assert(HTMLParser.isFormattedAsTitle("font-size: 20px;", 16)) + assert(HTMLParser.isFormattedAsTitle("font-size: 1.5em;", 16)) // 24px + assert(!HTMLParser.isFormattedAsTitle("font-size: 12px;", 16)) + } + + // --- CENTERED TEXT detection --- + it should "detect centered bold text as title" in { + assert(HTMLParser.isFormattedAsTitle("font-weight:bold; text-align:center;", 16)) + assert(HTMLParser.isFormattedAsTitle("text-align:center; font-weight:bold;", 16)) + assert(HTMLParser.isFormattedAsTitle("font-weight:700; text-align:center;", 16)) + } + + it should "detect centered large text as title" in { + assert(HTMLParser.isFormattedAsTitle("text-align:center; font-size:20px;", 16)) + assert(!HTMLParser.isFormattedAsTitle("text-align:center; font-size:12px;", 16)) + } + + // --- NEGATIVE CASES --- + it should "return false for unrelated or empty styles" in { + assert(!HTMLParser.isFormattedAsTitle("font-size:12px;", 16)) + assert(!HTMLParser.isFormattedAsTitle("text-align:left;", 16)) + assert(!HTMLParser.isFormattedAsTitle("", 16)) + assert(!HTMLParser.isFormattedAsTitle("font-style:italic;", 16)) + } + + // --- MIXED & EDGE CASES --- + it should "handle mixed cases and excessive whitespace" in { + assert(HTMLParser.isFormattedAsTitle(" FONT-WEIGHT: BOLD ; ", 16)) + assert(HTMLParser.isFormattedAsTitle("font-size : 18PX ;", 16)) + assert(!HTMLParser.isFormattedAsTitle("font-size : 10PX ;", 16)) + } + + it should "detect bold and centered even if font-size is too small" in { + assert( + HTMLParser.isFormattedAsTitle("font-weight:bold; text-align:center; font-size:10px;", 
16)) + } + + it should "detect large and centered even if not bold" in { + assert(HTMLParser.isFormattedAsTitle("font-size:22px; text-align:center;", 16)) + } + +}
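For readers skimming these tests, the following is a minimal end-to-end sketch of the API the suites above exercise, showing how Reader2Doc output feeds a downstream annotator exactly as in the "work with Tokenizer" test. The object name, the local SparkSession setup, and the empty-DataFrame construction are illustrative assumptions; the Reader2Doc and Tokenizer calls mirror the test code, and the resource path is the same `example-div.html` fixture used in Reader2DocTest.

import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.reader.Reader2Doc
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession

// Sketch only: assumes Spark NLP 6.1.0+ on the classpath and the test
// resources checked out locally. Names marked below are hypothetical.
object Reader2DocSketch extends App { // hypothetical entry point

  // Local session for the sketch; the tests above inherit theirs from SparkSessionTest.
  val spark = SparkSession.builder()
    .appName("reader2doc-sketch")
    .master("local[*]")
    .getOrCreate()

  // Reader2Doc reads from contentPath at transform time, so an empty input
  // DataFrame is enough (the same emptyDataSet pattern as the tests).
  val emptyDf = spark.createDataFrame(Seq.empty[Tuple1[String]]).toDF("text")

  // Same configuration pattern as the Reader2DocTest cases above.
  val reader2Doc = new Reader2Doc()
    .setContentType("text/html")
    .setContentPath("src/test/resources/reader/html/example-div.html")
    .setOutputCol("document")

  // Because the output column uses the standard document annotation schema,
  // downstream annotators such as Tokenizer chain onto it directly.
  val tokenizer = new Tokenizer()
    .setInputCols(Array("document"))
    .setOutputCol("token")

  val pipeline = new Pipeline().setStages(Array(reader2Doc, tokenizer))
  val resultDf = pipeline.fit(emptyDf).transform(emptyDf)
  resultDf.select("document", "token").show(truncate = false)
}

Reading files at transform time rather than fit time is also why the validation tests above expect an IllegalArgumentException from transform, not from fit, when contentPath or contentType is missing.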