diff --git a/examples/python/data-preprocessing/SparkNLP_Reader2Doc_Demo.ipynb b/examples/python/data-preprocessing/SparkNLP_Reader2Doc_Demo.ipynb new file mode 100644 index 00000000000000..d625c8bcf56a14 --- /dev/null +++ b/examples/python/data-preprocessing/SparkNLP_Reader2Doc_Demo.ipynb @@ -0,0 +1,1395 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "tzcU5p2gdak9" + }, + "source": [ + "# Introducing Reader2Doc in Spark NLP\n", + "This notebook showcases the newly added `Reader2Doc` annotator in Spark NLP,\n", + "which provides a streamlined and user-friendly interface for reading files. It is useful for preprocessing data for NLP pipelines." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DczWop6QeE8F", + "outputId": "63c45993-626d-4b75-b4d4-57efe43b8a84" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.5.1\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RFOFhaEedalB" + }, + "source": [ + "## Setup and Initialization\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "Support for the **Reader2Doc** annotator was introduced in Spark NLP 6.1.0. Please make sure you have upgraded to the latest Spark NLP release." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1vLFuCnVnVd8" + }, + "source": [ + "- Let's install and set up Spark NLP in Google Colab. This part is pretty easy via our simple script." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "JVUu3mJXnXmm" + }, + "outputs": [], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-QXcOttbnmsI" + }, + "source": [ + "The output of Reader2Doc uses the same Annotation schema as other Spark NLP annotators. This means you can seamlessly integrate it into any Spark NLP pipeline or process that expects annotated data." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "0WIcptZ7nhp5" + }, + "outputs": [], + "source": [ + "from sparknlp.reader.reader2doc import Reader2Doc\n", + "from pyspark.ml import Pipeline\n", + "\n", + "empty_df = spark.createDataFrame([], \"string\").toDF(\"text\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ah8RigOanazZ" + }, + "source": [ + "For the local file examples, we will download several files from the Spark NLP GitHub repo:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "E3bCFJZn8TS0" + }, + "source": [ + "## Reading PDF Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CZP7vz-gn6Rl" + }, + "source": [ + "**Downloading PDF files**" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ya8qZe00dalC", + "outputId": "7b0ed5d2-aa8a-493f-fe32-ce9b1cf9c581" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:50:48-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/image_3_pages.pdf\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 15629 (15K) [application/octet-stream]\n", + "Saving to: ‘pdf-files/image_3_pages.pdf’\n", + "\n", + "image_3_pages.pdf 100%[===================>] 15.26K --.-KB/s in 0.001s \n", + "\n", + "2025-07-20 23:50:49 (13.4 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n", + "\n", + "--2025-07-20 23:50:49-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/pdf-title.pdf\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 25803 (25K) [application/octet-stream]\n", + "Saving to: ‘pdf-files/pdf-title.pdf’\n", + "\n", + "pdf-title.pdf 100%[===================>] 25.20K --.-KB/s in 0.002s \n", + "\n", + "2025-07-20 23:50:49 (16.0 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n", + "\n", + "--2025-07-20 23:50:49-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/text_3_pages.pdf\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 9487 (9.3K) [application/octet-stream]\n", + "Saving to: ‘pdf-files/text_3_pages.pdf’\n", + "\n", + "text_3_pages.pdf 100%[===================>] 9.26K --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:50:49 (60.9 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir pdf-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/image_3_pages.pdf -P pdf-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/pdf-title.pdf -P pdf-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/pdf/text_3_pages.pdf -P pdf-files" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3vz48AHQHyON", + "outputId": "cf838fd4-fdf7-47c8-c641-2dbbf2e021e2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 14...|\n", + "|[{document, 15, 3...|\n", + "|[{document, 36, 5...|\n", + "|[{document, 0, 14...|\n", + "|[{document, 15, 3...|\n", + "|[{document, 39, 6...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"application/pdf\") \\\n", + " .setContentPath(\"./pdf-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = 
model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "02qrQWIWP89R" + }, + "source": [ + "## Reading HTML Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "joUGu23jq4m4" + }, + "source": [ + "**Downloading HTML files**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bo7s-jZVrE7W", + "outputId": "01cd6445-85e7-4632-fddc-0276de3d2ce3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:04-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/html/example-10k.html\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 2456707 (2.3M) [text/plain]\n", + "Saving to: ‘html-files/example-10k.html’\n", + "\n", + "\r", + "example-10k.html 0%[ ] 0 --.-KB/s \r", + "example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.05s \n", + "\n", + "2025-07-20 23:51:04 (43.6 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n", + "\n", + "--2025-07-20 23:51:04-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/html/fake-html.html\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 665 [text/plain]\n", + "Saving to: ‘html-files/fake-html.html’\n", + "\n", + "fake-html.html 100%[===================>] 665 --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:04 (28.5 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir html-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/html/example-10k.html -P html-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/html/fake-html.html -P html-files" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dg13MEz2NDzE", + "outputId": "1097dfc0-dd7b-4ac6-e172-c883e10f7bf8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 12...|\n", + "|[{document, 13, 4...|\n", + "|[{document, 47, 6...|\n", + "|[{document, 69, 7...|\n", + "|[{document, 78, 1...|\n", + "|[{document, 164, ...|\n", + "|[{document, 207, ...|\n", + "|[{document, 297, ...|\n", + "|[{document, 330, ...|\n", + "|[{document, 363, ...|\n", + "|[{document, 382, ...|\n", + "|[{document, 447, ...|\n", + "|[{document, 702, ...|\n", + "|[{document, 755, ...|\n", + "|[{document, 862, ...|\n", + "|[{document, 992, ...|\n", + "|[{document, 1127,...|\n", + "|[{document, 1481,...|\n", + "|[{document, 1796,...|\n", + "|[{document, 2143,...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"text/html\") \\\n", + " .setContentPath(\"./html-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uMyqJX-K7dss" + }, + "source": [ + "## Reading MS Office Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dpYvufV2qgbB" + }, + "source": [ + "### Reading Word Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l-uW6gV8pUYM" + }, + "source": [ + "**Downloading Word files**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zLLEUl3KpYZ6", + "outputId": "b22f1af6-6bea-4c59-df27-53c829439928" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/contains-pictures.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 95087 (93K) [application/octet-stream]\n", + "Saving to: ‘word-files/contains-pictures.docx’\n", + "\n", + "contains-pictures.d 100%[===================>] 92.86K --.-KB/s in 0.02s \n", + "\n", + "2025-07-20 23:51:07 (4.77 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n", + "\n", + "--2025-07-20 23:51:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/fake_table.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 12392 (12K) [application/octet-stream]\n", + "Saving to: ‘word-files/fake_table.docx’\n", + "\n", + "fake_table.docx 100%[===================>] 12.10K --.-KB/s in 0.001s \n", + "\n", + "2025-07-20 23:51:07 (21.8 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n", + "\n", + "--2025-07-20 23:51:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/page-breaks.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 14584 (14K) [application/octet-stream]\n", + "Saving to: ‘word-files/page-breaks.docx’\n", + "\n", + "page-breaks.docx 100%[===================>] 14.24K --.-KB/s in 0.001s \n", + "\n", + "2025-07-20 23:51:08 (14.8 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir word-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/contains-pictures.docx -P word-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/fake_table.docx -P word-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/doc/page-breaks.docx -P word-files" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YId4UG1rOVQq", + "outputId": "868114c4-6605-423f-864e-dbf00875225c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 31...|\n", + "|[{document, 32, 4...|\n", + "|[{document, 430, ...|\n", + "|[{document, 504, ...|\n", + "|[{document, 586, ...|\n", + "|[{document, 0, 11...|\n", + "|[{document, 114, ...|\n", + "|[{document, 263, ...|\n", + "|[{document, 294, ...|\n", + "|[{document, 325, ...|\n", + "|[{document, 354, ...|\n", + "|[{document, 411, ...|\n", + "|[{document, 0, 11...|\n", + "|[{document, 12, 2...|\n", + "|[{document, 24, 3...|\n", + "|[{document, 35, 4...|\n", + "|[{document, 49, 6...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"application/msword\") \\\n", + " .setContentPath(\"./word-files\") \\\n", + 
" .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "E8ockED4NxLi" + }, + "source": [ + "### Reading PowerPoint Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A3lF0_7qqlZB" + }, + "source": [ + "**Downloading PowerPoint files**" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1jDRFmcHqpxn", + "outputId": "5cd0ee8d-417f-42c5-fff6-e70dcd281468" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/fake-power-point.pptx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 38412 (38K) [application/octet-stream]\n", + "Saving to: ‘ppt-files/fake-power-point.pptx’\n", + "\n", + "\r", + "fake-power-point.pp 0%[ ] 0 --.-KB/s \r", + "fake-power-point.pp 100%[===================>] 37.51K --.-KB/s in 0.008s \n", + "\n", + "2025-07-20 23:51:11 (4.88 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n", + "\n", + "--2025-07-20 23:51:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/fake-power-point-table.pptx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 39894 (39K) [application/octet-stream]\n", + "Saving to: ‘ppt-files/fake-power-point-table.pptx’\n", + "\n", + "fake-power-point-ta 100%[===================>] 38.96K --.-KB/s in 0.008s \n", + "\n", + "2025-07-20 23:51:11 (4.60 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n", + "\n", + "--2025-07-20 23:51:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/speaker-notes.pptx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 39414 (38K) [application/octet-stream]\n", + "Saving to: ‘ppt-files/speaker-notes.pptx’\n", + "\n", + "speaker-notes.pptx 100%[===================>] 38.49K --.-KB/s in 0.007s \n", + "\n", + "2025-07-20 23:51:11 (5.49 MB/s) - ‘ppt-files/speaker-notes.pptx’ saved [39414/39414]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir ppt-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/fake-power-point.pptx -P ppt-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/fake-power-point-table.pptx -P ppt-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1259-Implement-Reader2Doc-Annotator/src/test/resources/reader/ppt/speaker-notes.pptx -P ppt-files" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fPCpk7RTGRjo", + "outputId": "daeb374c-44c6-42cd-adbf-a42610456a61" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 20...|\n", + "|[{document, 21, 5...|\n", + "|[{document, 51, 8...|\n", + "|[{document, 89, 1...|\n", + "|[{document, 144, ...|\n", + "|[{document, 166, ...|\n", + "|[{document, 0, 20...|\n", + "|[{document, 21, 5...|\n", + "|[{document, 51, 8...|\n", + "|[{document, 89, 1...|\n", + "|[{document, 144, ...|\n", + "|[{document, 166, ...|\n", + "|[{document, 0, 19...|\n", + "|[{document, 20, 2...|\n", + "|[{document, 28, 3...|\n", + "|[{document, 36, 4...|\n", + "|[{document, 44, 4...|\n", + "|[{document, 47, 5...|\n", + "|[{document, 52, 5...|\n", + "|[{document, 56, 6...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"application/vnd.ms-powerpoint\") \\\n", + " .setContentPath(\"./ppt-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yHsnpNNmrWtR" + }, + "source": [ + "### Reading Excel Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "40ts9-MmqNHp" + }, + "source": [ + "**Downloading Excel files**" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "G3-BCYP6qQ4x", + "outputId": "11775571-4dd6-47f2-f1f2-b075f13a608c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 12541 (12K) [application/octet-stream]\n", + "Saving to: ‘excel-files/vodafone.xlsx’\n", + "\n", + "\r", + "vodafone.xlsx 0%[ ] 0 --.-KB/s \r", + "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0.001s \n", + "\n", + "2025-07-20 23:51:15 (18.2 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n", + "\n", + "--2025-07-20 23:51:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 38442 (38K) [application/octet-stream]\n", + "Saving to: ‘excel-files/2023-half-year-analyses-by-segment.xlsx’\n", + "\n", + "2023-half-year-anal 100%[===================>] 37.54K --.-KB/s in 0.007s \n", + "\n", + "2025-07-20 23:51:15 (5.15 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n", + "\n", + "--2025-07-20 23:51:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10676 (10K) [application/octet-stream]\n", + "Saving to: ‘excel-files/page-break-example.xlsx’\n", + "\n", + "page-break-example. 100%[===================>] 10.43K --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:16 (42.5 MB/s) - ‘excel-files/page-break-example.xlsx’ saved [10676/10676]\n", + "\n", + "--2025-07-20 23:51:16-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 9210 (9.0K) [application/octet-stream]\n", + "Saving to: ‘excel-files/xlsx-subtable-cases.xlsx’\n", + "\n", + "xlsx-subtable-cases 100%[===================>] 8.99K --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:16 (73.0 MB/s) - ‘excel-files/xlsx-subtable-cases.xlsx’ saved [9210/9210]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir excel-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx -P excel-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx -P excel-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx -P excel-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx -P excel-files" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PQ4MpGw6xCko", + "outputId": "84664ce1-20ff-4237-8f2d-10b75c6b4c87" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 21...|\n", + "|[{document, 22, 4...|\n", + "|[{document, 44, 6...|\n", + "|[{document, 63, 1...|\n", + "|[{document, 107, ...|\n", + "|[{document, 339, ...|\n", + "|[{document, 395, ...|\n", + "|[{document, 452, ...|\n", + "|[{document, 508, ...|\n", + "|[{document, 566, ...|\n", + "|[{document, 615, ...|\n", + "|[{document, 682, ...|\n", + "|[{document, 734, ...|\n", + "|[{document, 793, ...|\n", + "|[{document, 858, ...|\n", + "|[{document, 949, ...|\n", + "|[{document, 993, ...|\n", + "|[{document, 1225,...|\n", + "|[{document, 1282,...|\n", + "|[{document, 1339,...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"application/vnd.ms-excel\") \\\n", + " .setContentPath(\"./excel-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_GyL6D4N75i-" + }, + "source": [ + "## Reading Text Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ATDLz3Gws5ob" + }, + "source": [ + "**Downloading Text files**" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AV-krG6Ps8pq", + "outputId": "3d6080b7-ad02-4c2d-930a-1ce36743de74" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:19-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 300 [text/plain]\n", + "Saving to: ‘txt-files/simple-text.txt’\n", + "\n", + "\r", + "simple-text.txt 0%[ ] 0 --.-KB/s \r", + "simple-text.txt 100%[===================>] 300 --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:19 (11.3 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir txt-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt -P txt-files" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mutwZUFj720X", + "outputId": "0063e1e5-f7d9-481a-a26a-ade5633ad172" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 17...|\n", + "|[{document, 18, 1...|\n", + "|[{document, 145, ...|\n", + "|[{document, 161, ...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"text/plain\") \\\n", + " .setContentPath(\"./txt-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "epCp5DnQ8E7o" + }, + "source": [ + "## Reading XML Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QVq5C0Uqs4wU" + }, + "source": [ + "**Downloading XML files**" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Gip5P7Ess63U", + "outputId": "c47de770-fc8d-4e74-bc80-fe4bc4c86b83" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:20-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 538 [text/plain]\n", + "Saving to: ‘xml-files/multi-level.xml’\n", + "\n", + "\r", + "multi-level.xml 0%[ ] 0 --.-KB/s \r", + "multi-level.xml 100%[===================>] 538 --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:20 (26.1 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir xml-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AViMSzKQtP-o", + "outputId": "b1723d38-dfd8-4090-e135-726ca1cfef4f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 12...|\n", + "|[{document, 13, 2...|\n", + "|[{document, 25, 2...|\n", + "|[{document, 29, 5...|\n", + "|[{document, 52, 6...|\n", + "|[{document, 67, 7...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"application/xml\") \\\n", + " .setContentPath(\"./xml-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8qB4uXOFiqO0" + }, + "source": [ + "## Reading Markdown Documents" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "J4OpCThUiriY", + "outputId": "0f2c06cf-2d8f-42eb-97d5-2aafde97899b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:21-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1213-Adding-MarkdownReader/src/test/resources/reader/md/simple.md\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 184 [text/plain]\n", + "Saving to: ‘md-files/simple.md’\n", + "\n", + "\r", + "simple.md 0%[ ] 0 --.-KB/s \r", + "simple.md 100%[===================>] 184 --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:21 (2.67 MB/s) - ‘md-files/simple.md’ saved [184/184]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir md-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1213-Adding-MarkdownReader/src/test/resources/reader/md/simple.md -P md-files" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZjpuIEatz5yt", + "outputId": "9e401ac0-0f71-489b-a69d-8a1a66d28458" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 11...|\n", + "|[{document, 12, 7...|\n", + "|[{document, 80, 8...|\n", + "|[{document, 88, 1...|\n", + "|[{document, 102, ...|\n", + "|[{document, 115, ...|\n", + "|[{document, 129, ...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"text/markdown\") \\\n", + " .setContentPath(\"./md-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_CuYYlw8tGQO" + }, + "source": [ + "## Reading Email Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K3Fyab6wret-" + }, + "source": [ + "**Downloading Email files**" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yYMVpVQurk7G", + "outputId": "ea24ce84-276d-4085-bc38-0381d5bd470e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-07-20 23:51:22-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 3175 (3.1K) [text/plain]\n", + "Saving to: ‘email-files/email-text-attachments.eml’\n", + "\n", + "\r", + " email-tex 0%[ ] 0 --.-KB/s \r", + "email-text-attachme 100%[===================>] 3.10K --.-KB/s in 0s \n", + "\n", + "2025-07-20 23:51:22 (35.7 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n", + "\n", + "--2025-07-20 23:51:22-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 1324361 (1.3M) [text/plain]\n", + "Saving to: ‘email-files/test-several-attachments.eml’\n", + "\n", + "test-several-attach 100%[===================>] 1.26M --.-KB/s in 0.05s \n", + "\n", + "2025-07-20 23:51:23 (27.1 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir email-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml -P email-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml -P email-files" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gK-Te-BWtIxQ", + "outputId": "8001ce16-d240-4c8c-e2b2-5bca89348f6e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 23...|\n", + "|[{document, 24, 1...|\n", + "|[{document, 162, ...|\n", + "|[{document, 1419,...|\n", + "|[{document, 1431,...|\n", + "|[{document, 1456,...|\n", + "|[{document, 0, 21...|\n", + "|[{document, 22, 7...|\n", + "|[{document, 74, 1...|\n", + "|[{document, 1045,...|\n", + "|[{document, 1057,...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"message/rfc822\") \\\n", + " .setContentPath(\"./email-files\") \\\n", + " .setOutputCol(\"document\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GMxLm81mLv_c" + }, + "source": [ + "## Parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z4YGo70IMA9q" + }, + "source": [ + "We can keep all the annotations of a file in a single row by setting `explodeDocs` to `False`" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zFdH0OV2L96F", + "outputId": "32afaed2-dd27-418e-b78f-a11014e0bc6f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 23...|\n", + "|[{document, 0, 21...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"message/rfc822\") \\\n", + " .setContentPath(\"./email-files\") \\\n", + " .setOutputCol(\"document\") \\\n", + " .setExplodeDocs(False)\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bDF87g1eM2L0" + }, + "source": [ + "We can output plain text with minimal metadata by setting `flattenOutput` to `True`" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ACuEwYIuM74C", + "outputId": "c4da0a00-9e1a-47f0-e2a9-55ad0df7b57e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| document|\n", + "+--------------------+\n", + "|[{document, 0, 12...|\n", + "|[{document, 13, 4...|\n", + 
"|[{document, 47, 6...|\n", + "|[{document, 69, 7...|\n", + "|[{document, 78, 1...|\n", + "|[{document, 164, ...|\n", + "|[{document, 207, ...|\n", + "|[{document, 297, ...|\n", + "|[{document, 330, ...|\n", + "|[{document, 363, ...|\n", + "|[{document, 382, ...|\n", + "|[{document, 447, ...|\n", + "|[{document, 702, ...|\n", + "|[{document, 755, ...|\n", + "|[{document, 862, ...|\n", + "|[{document, 992, ...|\n", + "|[{document, 1127,...|\n", + "|[{document, 1481,...|\n", + "|[{document, 1796,...|\n", + "|[{document, 2143,...|\n", + "+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "reader2doc = Reader2Doc() \\\n", + " .setContentType(\"text/html\") \\\n", + " .setContentPath(\"./html-files\") \\\n", + " .setOutputCol(\"document\") \\\n", + " .setFlattenOutput(True)\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)\n", + "result_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aucJ6Aa9Ne4k" + }, + "source": [ + "## Pipeline Integration" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4NlyM21qNir5" + }, + "source": [ + "We can integrate with pipelines. For example, with a simple `Tokenizer`:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "KXkLK7WWNgS4" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "empty_df = spark.createDataFrame([], \"string\").toDF(\"text\")\n", + "\n", + "regex_tok = RegexTokenizer() \\\n", + " .setInputCols([\"document\"]) \\\n", + " .setOutputCol(\"regex_token\")\n", + "\n", + "pipeline = Pipeline(stages=[reader2doc, regex_tok])\n", + "model = pipeline.fit(empty_df)\n", + "\n", + "result_df = model.transform(empty_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mc3RPhLROAg8", + "outputId": "e8d28875-769c-4a6b-cfdc-8820f81d7a7e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+\n", + "| document| regex_token|\n", + "+--------------------+--------------------+\n", + "|[{document, 0, 12...|[{token, 0, 5, UN...|\n", + "|[{document, 13, 4...|[{token, 13, 22, ...|\n", + "|[{document, 47, 6...|[{token, 47, 57, ...|\n", + "|[{document, 69, 7...|[{token, 69, 72, ...|\n", + "|[{document, 78, 1...|[{token, 78, 78, ...|\n", + "|[{document, 164, ...|[{token, 164, 166...|\n", + "|[{document, 207, ...|[{token, 207, 207...|\n", + "|[{document, 297, ...|[{token, 297, 299...|\n", + "|[{document, 330, ...|[{token, 330, 339...|\n", + "|[{document, 363, ...|[{token, 363, 368...|\n", + "|[{document, 382, ...|[{token, 382, 387...|\n", + "|[{document, 447, ...|[{token, 447, 452...|\n", + "|[{document, 702, ...|[{token, 702, 711...|\n", + "|[{document, 755, ...|[{token, 755, 759...|\n", + "|[{document, 862, ...|[{token, 862, 869...|\n", + "|[{document, 992, ...|[{token, 992, 999...|\n", + "|[{document, 1127,...|[{token, 1127, 11...|\n", + "|[{document, 1481,...|[{token, 1481, 14...|\n", + "|[{document, 1796,...|[{token, 1796, 18...|\n", + "|[{document, 2143,...|[{token, 2143, 21...|\n", + "+--------------------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "result_df.show()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + 
"display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/python/sparknlp/reader/reader2doc.py b/python/sparknlp/reader/reader2doc.py new file mode 100644 index 00000000000000..e6782c046e5d7c --- /dev/null +++ b/python/sparknlp/reader/reader2doc.py @@ -0,0 +1,188 @@ +# Copyright 2017-2025 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from pyspark import keyword_only +from pyspark.ml.param import TypeConverters, Params, Param + +from sparknlp.common import AnnotatorType +from sparknlp.internal import AnnotatorTransformer +from sparknlp.partition.partition_properties import * + + +class Reader2Doc( + AnnotatorTransformer, + HasEmailReaderProperties, + HasExcelReaderProperties, + HasHTMLReaderProperties, + HasPowerPointProperties, + HasTextReaderProperties, +): + """ + The Reader2Doc annotator allows you to use reading files more smoothly within existing + Spark NLP workflows, enabling seamless reuse of your pipelines. + + Reader2Doc can be used for extracting structured content from various document types + using Spark NLP readers. It supports reading from many file types and returns parsed + output as a structured Spark DataFrame. + + Supported formats include: + + - Plain text + - HTML + - Word (.doc/.docx) + - Excel (.xls/.xlsx) + - PowerPoint (.ppt/.pptx) + - Email files (.eml, .msg) + - PDFs + + Examples + -------- + >>> from johnsnowlabs.reader import Reader2Doc + >>> from johnsnowlabs.nlp.base import DocumentAssembler + >>> from pyspark.ml import Pipeline + >>> # Initialize Reader2Doc for PDF files + >>> reader2doc = Reader2Doc() \\ + ... .setContentType("application/pdf") \\ + ... 
.setContentPath(f"{pdf_directory}/") + >>> # Build the pipeline with the Reader2Doc stage + >>> pipeline = Pipeline(stages=[reader2doc]) + >>> # Fit the pipeline to an empty DataFrame + >>> pipeline_model = pipeline.fit(empty_data_set) + >>> result_df = pipeline_model.transform(empty_data_set) + >>> # Show the resulting DataFrame + >>> result_df.show() + +------------------------------------------------------------------------------------------------------------------------------------+ + |document | + +------------------------------------------------------------------------------------------------------------------------------------+ + |[{'document', 0, 14, 'This is a Title', {'pageNumber': 1, 'elementType': 'Title', 'fileName': 'pdf-title.pdf'}, []}] | + |[{'document', 15, 38, 'This is a narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]| + |[{'document', 39, 68, 'This is another narrative text', {'pageNumber': 1, 'elementType': 'NarrativeText', 'fileName': 'pdf-title.pdf'}, []}]| + +------------------------------------------------------------------------------------------------------------------------------------+ +""" + + name = "Reader2Doc" + outputAnnotatorType = AnnotatorType.DOCUMENT + + contentPath = Param( + Params._dummy(), + "contentPath", + "contentPath path to files to read", + typeConverter=TypeConverters.toString, + ) + + outputCol = Param( + Params._dummy(), + "outputCol", + "output column name", + typeConverter=TypeConverters.toString, + ) + + contentType = Param( + Params._dummy(), + "contentType", + "Set the content type to load following MIME specification", + typeConverter=TypeConverters.toString, + ) + + explodeDocs = Param( + Params._dummy(), + "explodeDocs", + "whether to explode the documents into separate rows", + typeConverter=TypeConverters.toBoolean, + ) + + flattenOutput = Param( + Params._dummy(), + "flattenOutput", + "If true, output is flattened to plain text with minimal metadata", + typeConverter=TypeConverters.toBoolean, + ) + + titleThreshold = Param( + Params._dummy(), + "titleThreshold", + "Minimum font size threshold for title detection in PDF docs", + typeConverter=TypeConverters.toFloat, + ) + + @keyword_only + def __init__(self): + super(Reader2Doc, self).__init__(classname="com.johnsnowlabs.reader.Reader2Doc") + self._setDefault(outputCol="document") + + @keyword_only + def setParams(self): + kwargs = self._input_kwargs + return self._set(**kwargs) + + def setContentPath(self, value): + """Sets content path. + + Parameters + ---------- + value : str + contentPath path to files to read + """ + return self._set(contentPath=value) + + def setContentType(self, value): + """ + Set the content type to load following MIME specification + + Parameters + ---------- + value : str + content type to load following MIME specification + """ + return self._set(contentType=value) + + def setExplodeDocs(self, value): + """Sets whether to explode the documents into separate rows. + + + Parameters + ---------- + value : boolean + Whether to explode the documents into separate rows + """ + return self._set(explodeDocs=value) + + def setOutputCol(self, value): + """Sets output column name. + + Parameters + ---------- + value : str + Name of the Output Column + """ + return self._set(outputCol=value) + + def setFlattenOutput(self, value): + """Sets whether to flatten the output to plain text with minimal metadata. 
+ + Parameters + ---------- + value : bool + If true, output is flattened to plain text with minimal metadata + """ + return self._set(flattenOutput=value) + + def setTitleThreshold(self, value): + """Sets the minimum font size threshold for title detection in PDF documents. + + Parameters + ---------- + value : float + Minimum font size threshold for title detection in PDF docs + """ + return self._set(titleThreshold=value) diff --git a/python/sparknlp/reader/sparknlp_reader.py b/python/sparknlp/reader/sparknlp_reader.py index 6dc744279c19f8..d2c5d82954d894 100644 --- a/python/sparknlp/reader/sparknlp_reader.py +++ b/python/sparknlp/reader/sparknlp_reader.py @@ -413,4 +413,49 @@ def md(self, filePath): if not isinstance(filePath, str): raise TypeError("filePath must be a string") jdf = self._java_obj.md(filePath) + return self.getDataFrame(self.spark, jdf) + + def csv(self, csvPath): + """Reads CSV files and returns a Spark DataFrame. + + Parameters + ---------- + csvPath : str + Path to a CSV file or a directory containing CSV files. + + Returns + ------- + pyspark.sql.DataFrame + A DataFrame containing parsed CSV content. + + Examples + -------- + >>> from sparknlp.reader import SparkNLPReader + >>> csv_df = SparkNLPReader(spark).csv("home/user/csv-directory") + + You can use SparkNLP for one line of code + + >>> import sparknlp + >>> csv_df = sparknlp.read().csv("home/user/csv-directory") + >>> csv_df.show(truncate=False) + +-----------------------------------------------------------------------------------------------------------------------------------------+ + |csv | + +-----------------------------------------------------------------------------------------------------------------------------------------+ + |[{NarrativeText, Alice 100 Bob 95, {}}, {Table, <table><tr><td>Alice</td><td>100</td></tr><tr><td>Bob</td><td>95</td></tr></table>, {}}] | + +-----------------------------------------------------------------------------------------------------------------------------------------+
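+ """ + if not isinstance(csvPath, str): + raise TypeError("csvPath must be a string") + jdf = self._java_obj.csv(csvPath) + return self.getDataFrame(self.spark, jdf)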
elements - tag match { - case "title" | "h1" | "h2" | "h3" | "header" => true - case "p" => isFormattedAsTitle(elem) // Check if <p> behaves like a title - case _ => elem.attr("role").toLowerCase == "heading" // ARIA role="heading" - } + private def isTextElement(elem: Element): Boolean = { + !isFormattedAsTitle(elem) && + (elem.attr("style").toLowerCase.contains("text") || + elem.tagName().toLowerCase == "p" || + (elem.tagName().toLowerCase == "div" && isParagraphLikeElement(elem))) } private def isFormattedAsTitle(elem: Element): Boolean = { - // Check for bold text, large font size, or centered alignment val style = elem.attr("style").toLowerCase - val isBold = style.contains("font-weight:bold") - val isLargeFont = style.contains("font-size") && extractFontSize(style) >= titleFontSize - val isCentered = style.contains("text-align:center") - - isBold || isLargeFont || (isCentered && isBold) || (isCentered && isLargeFont) - } - - private def extractFontSize(style: String): Int = { - val sizePattern = """font-size:(\d+)pt""".r - sizePattern.findFirstMatchIn(style) match { - case Some(m) => m.group(1).toInt - case None => 0 - } + val hasBoldTag = + elem.getElementsByTag("b").size() > 0 || elem.getElementsByTag("strong").size() > 0 + hasBoldTag || HTMLParser.isFormattedAsTitle(style, titleFontSize) } private def extractNestedTableContent(elem: Element): String = { diff --git a/src/main/scala/com/johnsnowlabs/reader/PdfReader.scala b/src/main/scala/com/johnsnowlabs/reader/PdfReader.scala new file mode 100644 index 00000000000000..610b20dd609de7 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/reader/PdfReader.scala @@ -0,0 +1,159 @@ +/* + * Copyright 2017-2025 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.reader + +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithBinaryFile +import org.apache.pdfbox.pdmodel.PDDocument +import org.apache.pdfbox.text.{PDFTextStripper, TextPosition} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, udf} +import java.io.ByteArrayInputStream +import scala.collection.JavaConverters._ +import scala.collection.mutable + +/** Class to parse and read PDF files. + * + * @param titleThreshold + * Minimum font size threshold used as part of heuristic rules to detect title elements based + * on formatting (e.g., bold, centered, capitalized). By default, it is set to 18. + * @param storeContent + * Whether to include the raw file content in the output DataFrame as a separate 'content' column. + * + * pdfPath: this is a path to a directory of PDF files or a path to a PDF file E.g. 
+ * "path/pdf/files" + * + * ==Example== + * {{{ + * val path = "./pdf-files/pdf-doc.pdf" + * val PdfReader = new PdfReader() + * val pdfDF = PdfReader.read(url) + * }}} + * + * {{{ + * pdfDF.show() + * +--------------------+--------------------+ + * | path| html| + * +--------------------+--------------------+ + * |file:/content/htm...|[{Title, My First...| + * +--------------------+--------------------+ + * + * pdfDF.printSchema() + * root + * |-- path: string (nullable = true) + * |-- pdf: array (nullable = true) + * | |-- element: struct (containsNull = true) + * | | |-- elementType: string (nullable = true) + * | | |-- content: string (nullable = true) + * | | |-- metadata: map (nullable = true) + * | | | |-- key: string + * | | | |-- value: string (valueContainsNull = true) + * }}} + * For more examples please refer to this + * [[https://github.com/JohnSnowLabs/spark-nlp/examples/python/reader/SparkNLP_PDF_Reader_Demo.ipynb notebook]]. + */ +class PdfReader(storeContent: Boolean = false, titleThreshold: Double = 18.0) + extends Serializable { + + private lazy val spark = ResourceHelper.spark + private var outputColumn = "pdf" + + def setOutputColumn(name: String): this.type = { + require(name.nonEmpty, "Output column name cannot be empty.") + outputColumn = name + this + } + def getOutputColumn: String = outputColumn + + def pdf(filePath: String): DataFrame = { + if (!ResourceHelper.validFile(filePath)) + throw new IllegalArgumentException(s"Invalid filePath: $filePath") + + val binaryDF = datasetWithBinaryFile(spark, filePath) + val withElements = binaryDF.withColumn(outputColumn, parsePdfUDF(col("content"))) + if (storeContent) withElements.select("path", outputColumn, "content") + else withElements.select("path", outputColumn) + } + + private val parsePdfUDF = udf((data: Array[Byte]) => pdfToHTMLElement(data)) + + def pdfToHTMLElement(content: Array[Byte]): Seq[HTMLElement] = { + val docInputStream = new ByteArrayInputStream(content) + try { + val pdfDoc = PDDocument.load(docInputStream) + val elements = extractElementsFromPdf(pdfDoc) + pdfDoc.close() + elements + } catch { + case e: Exception => + Seq( + HTMLElement( + ElementType.UNCATEGORIZED_TEXT, + s"Could not parse PDF: ${e.getMessage}", + mutable.Map())) + } finally { + docInputStream.close() + } + } + + private def extractElementsFromPdf(pdfDoc: PDDocument): Seq[HTMLElement] = { + val collectedElements = mutable.ListBuffer[HTMLElement]() + val textStripper = new PDFTextStripper() { + override def writeString( + text: String, + textPositions: java.util.List[TextPosition]): Unit = { + val lineGroups = groupTextPositionsByLine(textPositions) + val lineElements = lineGroups.flatMap { case (_, linePositions) => + classifyLineElement(linePositions, getCurrentPageNo) + } + collectedElements ++= lineElements + } + } + textStripper.setSortByPosition(true) + textStripper.setStartPage(1) + textStripper.setEndPage(pdfDoc.getNumberOfPages) + textStripper.getText(pdfDoc) + collectedElements + } + + private def groupTextPositionsByLine( + textPositions: java.util.List[TextPosition]): Map[Int, Seq[TextPosition]] = { + val yTolerance = 2f // Potential parameter, since needs to experiment to fit your PDFs + textPositions.asScala.groupBy(tp => (tp.getY / yTolerance).round) + } + + private def classifyLineElement( + linePositions: Seq[TextPosition], + pageNumber: Int): Option[HTMLElement] = { + val lineText = linePositions.map(_.getUnicode).mkString.trim + if (lineText.isEmpty) return None + + val averageFontSize = 
+
+  private def classifyLineElement(
+      linePositions: Seq[TextPosition],
+      pageNumber: Int): Option[HTMLElement] = {
+    val lineText = linePositions.map(_.getUnicode).mkString.trim
+    if (lineText.isEmpty) return None
+
+    val averageFontSize = linePositions.map(_.getFontSize).sum / linePositions.size
+    val mostCommonFontName = linePositions.groupBy(_.getFont.getName).maxBy(_._2.size)._1
+
+    val elementType =
+      if (isTitle(averageFontSize, mostCommonFontName)) ElementType.TITLE
+      else ElementType.NARRATIVE_TEXT
+
+    val metadata = mutable.Map("pageNumber" -> pageNumber.toString)
+    Some(HTMLElement(elementType, lineText, metadata))
+  }
+
+  private def isTitle(fontSize: Double, fontName: String): Boolean = {
+    fontSize >= titleThreshold || fontName.toLowerCase.contains("bold")
+  }
+
+}
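The end-to-end flow, then: PDFBox streams glyph positions, lines are rebuilt by the y-bucket grouping, and each line is tagged `Title` or `NarrativeText` via `isTitle`. A minimal usage sketch of the class above (the paths and the 14.0 threshold are illustrative assumptions, not values from the diff):

```scala
import com.johnsnowlabs.reader.PdfReader

// Tag anything rendered at >= 14pt (or set in a bold-named font) as a Title,
// instead of the 18.0 default used elsewhere in this diff.
val pdfReader = new PdfReader(storeContent = false, titleThreshold = 14.0)
val pdfDf = pdfReader.pdf("./pdf-files")

// One row per input file; the "pdf" column carries the classified elements.
pdfDf.select("path", "pdf").show(truncate = false)
```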
diff --git a/src/main/scala/com/johnsnowlabs/reader/Reader2Doc.scala b/src/main/scala/com/johnsnowlabs/reader/Reader2Doc.scala
new file mode 100644
index 00000000000000..86b956eee89caf
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/reader/Reader2Doc.scala
@@ -0,0 +1,261 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.reader
+
+import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
+import com.johnsnowlabs.nlp.{Annotation, HasOutputAnnotationCol, HasOutputAnnotatorType}
+import com.johnsnowlabs.partition.util.PartitionHelper.{
+  datasetWithBinaryFile,
+  datasetWithTextFile,
+  isStringContent
+}
+import com.johnsnowlabs.partition.{
+  HasEmailReaderProperties,
+  HasExcelReaderProperties,
+  HasHTMLReaderProperties,
+  HasPowerPointProperties,
+  HasReaderProperties,
+  HasTextReaderProperties,
+  HasXmlReaderProperties,
+  Partition
+}
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap}
+import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
+import org.apache.spark.sql.functions.{array, col, explode, udf}
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
+
+import scala.jdk.CollectionConverters.mapAsJavaMapConverter
+
+/** The Reader2Doc annotator makes file reading fit smoothly into existing Spark NLP workflows,
+ * enabling seamless reuse of your pipelines. Reader2Doc can be used for extracting structured
+ * content from various document types using Spark NLP readers. It supports reading from many
+ * file types and returns the parsed output as a structured Spark DataFrame.
+ *
+ * Supported formats include plain text, HTML, Word (.doc/.docx), Excel (.xls/.xlsx), PowerPoint
+ * (.ppt/.pptx), email files (.eml, .msg), and PDFs.
+ *
+ * ==Example==
+ * {{{
+ * import com.johnsnowlabs.reader.Reader2Doc
+ * import com.johnsnowlabs.nlp.base.DocumentAssembler
+ * import org.apache.spark.ml.Pipeline
+ *
+ * val reader2Doc = new Reader2Doc()
+ *   .setContentType("application/pdf")
+ *   .setContentPath(s"$pdfDirectory/")
+ *   .setOutputCol("document")
+ *
+ * val pipeline = new Pipeline()
+ *   .setStages(Array(reader2Doc))
+ *
+ * val pipelineModel = pipeline.fit(emptyDataSet)
+ * val resultDf = pipelineModel.transform(emptyDataSet)
+ *
+ * resultDf.show()
+ * +------------------------------------------------------------------------------------------------------------------------------------+
+ * |document                                                                                                                            |
+ * +------------------------------------------------------------------------------------------------------------------------------------+
+ * |[{document, 0, 14, This is a Title, {pageNumber -> 1, elementType -> Title, fileName -> pdf-title.pdf}, []}]                        |
+ * |[{document, 15, 38, This is a narrative text, {pageNumber -> 1, elementType -> NarrativeText, fileName -> pdf-title.pdf}, []}]      |
+ * |[{document, 39, 68, This is another narrative text, {pageNumber -> 1, elementType -> NarrativeText, fileName -> pdf-title.pdf}, []}]|
+ * +------------------------------------------------------------------------------------------------------------------------------------+
+ * }}}
+ */
+class Reader2Doc(override val uid: String)
+    extends Transformer
+    with DefaultParamsWritable
+    with HasOutputAnnotatorType
+    with HasOutputAnnotationCol
+    with HasReaderProperties
+    with HasEmailReaderProperties
+    with HasExcelReaderProperties
+    with HasHTMLReaderProperties
+    with HasPowerPointProperties
+    with HasTextReaderProperties
+    with HasXmlReaderProperties {
+
+  def this() = this(Identifiable.randomUID("Reader2Doc"))
+
+  val explodeDocs: BooleanParam =
+    new BooleanParam(this, "explodeDocs", "whether to explode the documents into separate rows")
+
+  def setExplodeDocs(value: Boolean): this.type = set(explodeDocs, value)
+
+  val flattenOutput: BooleanParam =
+    new BooleanParam(
+      this,
+      "flattenOutput",
+      "If true, output is flattened to plain text with minimal metadata")
+
+  def setFlattenOutput(value: Boolean): this.type = set(flattenOutput, value)
+
+  val titleThreshold: Param[Float] =
+    new Param[Float](
+      this,
+      "titleThreshold",
+      "Minimum font size threshold for title detection in PDF docs")
+
+  def setTitleThreshold(value: Float): this.type = {
+    set(titleThreshold, value)
+  }
+
+  setDefault(
+    this.explodeDocs -> true,
+    contentType -> "",
+    flattenOutput -> false,
+    titleThreshold -> 18)
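+  // With these defaults, every extracted element is exploded into its own row
+  // (explodeDocs = true), full metadata is kept (flattenOutput = false), and
+  // titleThreshold = 18 matches PdfReader's default font-size cutoff.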
+
+  override def transform(dataset: Dataset[_]): DataFrame = {
+    validateRequiredParameters()
+
+    val partitionDf = partitionContent(partitionBuilder, dataset)
+
+    val annotatedDf = partitionDf
+      .withColumn(
+        getOutputCol,
+        wrapColumnMetadata(
+          partitionToAnnotation($(flattenOutput))(col("partition"), col("fileName"))))
+      .select(getOutputCol)
+
+    afterAnnotate(annotatedDf)
+  }
+
+  private def partitionBuilder: Partition = {
+    val params = Map(
+      "contentType" -> $(contentType),
+      "storeContent" -> $(storeContent).toString,
+      "titleFontSize" -> $(titleFontSize).toString,
+      "inferTableStructure" -> $(inferTableStructure).toString,
+      "includePageBreaks" -> $(includePageBreaks).toString,
+      "addAttachmentContent" -> $(addAttachmentContent).toString,
+      "cellSeparator" -> $(cellSeparator),
+      "appendCells" -> $(appendCells).toString,
+      "timeout" -> $(timeout).toString,
+      "includeSlideNotes" -> $(includeSlideNotes).toString,
+      "titleLengthSize" -> $(titleLengthSize).toString,
+      "groupBrokenParagraphs" -> $(groupBrokenParagraphs).toString,
+      "paragraphSplit" -> $(paragraphSplit),
+      "shortLineWordThreshold" -> $(shortLineWordThreshold).toString,
+      "maxLineCount" -> $(maxLineCount).toString,
+      "threshold" -> $(threshold).toString,
+      "xmlKeepTags" -> $(xmlKeepTags).toString,
+      "onlyLeafNodes" -> $(onlyLeafNodes).toString,
+      "titleThreshold" -> $(titleThreshold).toString)
+    new Partition(params.asJava)
+  }
+
+  private def partitionContent(partition: Partition, dataset: Dataset[_]): DataFrame = {
+
+    if (isStringContent($(contentType))) {
+      val partitionUDF =
+        udf((text: String) => partition.partitionStringContent(text, $(this.headers).asJava))
+      val stringContentDF = datasetWithTextFile(dataset.sparkSession, $(contentPath))
+      stringContentDF
+        .withColumn(partition.getOutputColumn, partitionUDF(col("content")))
+        .withColumn("fileName", getFileName(col("path")))
+    } else {
+      val binaryContentDF = datasetWithBinaryFile(dataset.sparkSession, $(contentPath))
+      val partitionUDF =
+        udf((input: Array[Byte]) => partition.partitionBytesContent(input))
+      binaryContentDF
+        .withColumn(partition.getOutputColumn, partitionUDF(col("content")))
+        .withColumn("fileName", getFileName(col("path")))
+    }
+  }
+
+  private def afterAnnotate(dataset: DataFrame): DataFrame = {
+    if ($(explodeDocs)) {
+      dataset
+        .select(dataset.columns.filterNot(_ == getOutputCol).map(col) :+ explode(
+          col(getOutputCol)).as("_tmp"): _*)
+        .withColumn(
+          getOutputCol,
+          array(col("_tmp"))
+            .as(getOutputCol, dataset.schema.fields.find(_.name == getOutputCol).get.metadata))
+        .drop("_tmp")
+    } else dataset
+  }
+
+  private def validateRequiredParameters(): Unit = {
+    require(
+      $(contentPath) != null && $(contentPath).trim.nonEmpty,
+      "contentPath must be set and not empty")
+    require(
+      $(contentType) != null && $(contentType).trim.nonEmpty,
+      "contentType must be set and not empty")
+  }
+
+  private val getFileName = udf { path: String =>
+    if (path != null) path.split("/").last else ""
+  }
+
+  private def partitionToAnnotation(flatten: Boolean) = udf {
+    (partitions: Seq[Row], fileName: String) =>
+      if (partitions == null) Nil
+      else {
+        var currentOffset = 0
+        partitions.map { part =>
+          val elementType = part.getAs[String]("elementType")
+          val content = part.getAs[String]("content")
+          val metadata = part.getAs[Map[String, String]]("metadata")
+          val begin = currentOffset
+          val end = currentOffset + (if (content != null) content.length else 0) - 1
+          currentOffset = end + 1
+
+          // Compute new metadata
+          val baseMeta = if (metadata != null) metadata else Map.empty[String, String]
+          val withExtras = baseMeta +
+            ("elementType" -> elementType) +
+            ("fileName" -> fileName)
+          val finalMeta =
+            if (flatten) withExtras.filterKeys(_ == "sentence")
+            else withExtras
+
+          Annotation(
+            annotatorType = outputAnnotatorType,
+            begin = begin,
+            end = end,
+            result = content,
+            metadata = finalMeta,
+            embeddings = Array.emptyFloatArray)
+        }
+      }
+  }
+
+  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)
+
+  override val outputAnnotatorType: AnnotatorType = DOCUMENT
+
+  private lazy val columnMetadata: Metadata = {
+    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
+    metadataBuilder.putString("annotatorType", outputAnnotatorType)
+    metadataBuilder.build
+  }
+
+  override def transformSchema(schema: StructType): StructType = {
+    val outputFields = schema.fields :+
+      StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, columnMetadata)
+    StructType(outputFields)
+  }
+
+  private def wrapColumnMetadata(col: Column): Column = {
+    col.as(getOutputCol, columnMetadata)
+  }
+
+}
+
+object Reader2Doc extends DefaultParamsReadable[Reader2Doc]
diff --git a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
index 4b45b039e5b1c1..c7fc5ccafb3691 100644
--- a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
+++ b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
@@ -19,14 +19,13 @@ import com.johnsnowlabs.nlp.annotators.cleaners.util.CleanerHelper.{
   BLOCK_SPLIT_PATTERN,
   DOUBLE_PARAGRAPH_PATTERN
 }
-import com.johnsnowlabs.nlp.util.io.ResourceHelper
 import com.johnsnowlabs.reader.util.pdf.TextStripperType
 import com.johnsnowlabs.reader.util.PartitionOptions.{
   getDefaultBoolean,
   getDefaultInt,
-  getDefaultString
+  getDefaultString,
+  getDefaultDouble
 }
-import org.apache.spark.ml.Pipeline
 import org.apache.spark.sql.DataFrame
 
 import scala.collection.JavaConverters._
@@ -93,35 +92,60 @@ class SparkNLPReader(
 
   def html(htmlPath: String): DataFrame = {
     val htmlReader =
-      new HTMLReader(getTitleFontSize, getStoreContent, getTimeout, headers = htmlHeaders)
+      new HTMLReader(
+        getTitleFontSize,
+        getStoreContent,
+        getTimeout,
+        getIncludeTitleTag,
+        headers = htmlHeaders)
     setOutputColumn(htmlReader.getOutputColumn)
     htmlReader.read(htmlPath)
   }
 
   def htmlToHTMLElement(html: String): Seq[HTMLElement] = {
     val htmlReader =
-      new HTMLReader(getTitleFontSize, getStoreContent, getTimeout, headers = htmlHeaders)
+      new HTMLReader(
+        getTitleFontSize,
+        getStoreContent,
+        getTimeout,
+        getIncludeTitleTag,
+        headers = htmlHeaders)
     setOutputColumn(htmlReader.getOutputColumn)
     htmlReader.htmlToHTMLElement(html)
   }
 
   def urlToHTMLElement(url: String): Seq[HTMLElement] = {
     val htmlReader =
-      new HTMLReader(getTitleFontSize, getStoreContent, getTimeout, headers = htmlHeaders)
+      new HTMLReader(
+        getTitleFontSize,
+        getStoreContent,
+        getTimeout,
+        getIncludeTitleTag,
+        headers = htmlHeaders)
     setOutputColumn(htmlReader.getOutputColumn)
     htmlReader.urlToHTMLElement(url)
   }
 
   def html(urls: Array[String]): DataFrame = {
     val htmlReader =
-      new HTMLReader(getTitleFontSize, getStoreContent, getTimeout, headers = htmlHeaders)
+      new HTMLReader(
+        getTitleFontSize,
+        getStoreContent,
+        getTimeout,
+        getIncludeTitleTag,
+        headers = htmlHeaders)
     setOutputColumn(htmlReader.getOutputColumn)
     htmlReader.read(urls)
   }
 
   def html(urls: java.util.List[String]): DataFrame = {
     val htmlReader =
-      new HTMLReader(getTitleFontSize, getStoreContent, getTimeout, headers = htmlHeaders)
+      new HTMLReader(
+        getTitleFontSize,
+        getStoreContent,
+        getTimeout,
+        getIncludeTitleTag,
+        headers = htmlHeaders)
     setOutputColumn(htmlReader.getOutputColumn)
     htmlReader.read(urls.asScala.toArray)
   }
@@ -142,6 +166,13 @@ class SparkNLPReader(
     getDefaultInt(params.asScala.toMap, Seq("timeout"), default = 30)
   }
 
+  private def getIncludeTitleTag: Boolean = {
+    getDefaultBoolean(
+      params.asScala.toMap,
+      Seq("includeTitleTag", "include_title_tag"),
+      default = false)
+  }
+
   /** Instantiates class to read email files.
    *
    * emailPath: this is a path to a directory of email files or a path to an email file E.g.
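Since `getIncludeTitleTag` resolves its value with `getDefaultBoolean` over both camelCase and snake_case keys, the new option can be switched on through the reader's params map. A small sketch, assuming the params-map constructor this file already uses (the file path and font size below are illustrative):

```scala
import com.johnsnowlabs.reader.SparkNLPReader
import scala.collection.JavaConverters._

// Enable the new includeTitleTag option added to HTMLReader in this diff;
// either "includeTitleTag" or "include_title_tag" would be picked up.
val params = Map("includeTitleTag" -> "true", "titleFontSize" -> "16").asJava
val reader = new SparkNLPReader(params)

val htmlDf = reader.html("src/test/resources/reader/html/example-bold-strong.html")
htmlDf.show(truncate = false)
```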
@@ -302,22 +333,20 @@ class SparkNLPReader(
    * Parameter with custom configuration
    */
   def pdf(pdfPath: String): DataFrame = {
-    val spark = ResourceHelper.spark
-    spark.conf.set("spark.sql.legacy.allowUntypedScalaUDF", "true")
-    val pdfToText = new PdfToText()
-      .setStoreSplittedPdf(getStoreSplittedPdf)
-      .setSplitPage(getSplitPage)
-      .setOnlyPageNum(getOnlyPageNum)
-      .setTextStripper(getTextStripper)
-      .setSort(getSort)
-      .setExtractCoordinates(getExtractCoordinates)
-      .setNormalizeLigatures(getNormalizeLigatures)
-    val binaryPdfDF = spark.read.format("binaryFile").load(pdfPath)
-    val pipelineModel = new Pipeline()
-      .setStages(Array(pdfToText))
-      .fit(binaryPdfDF)
-
-    pipelineModel.transform(binaryPdfDF)
+    val pdfReader = new PdfReader(getStoreContent, getTitleThreshold)
+    pdfReader.pdf(pdfPath)
+  }
+
+  def pdf(content: Array[Byte]): Seq[HTMLElement] = {
+    val pdfReader = new PdfReader(getStoreContent, getTitleThreshold)
+    pdfReader.pdfToHTMLElement(content)
+  }
+
+  private def getTitleThreshold: Double = {
+    getDefaultDouble(
+      params.asScala.toMap,
+      Seq("titleThreshold", "title_threshold"),
+      default = 18.0)
   }
 
   private def getStoreSplittedPdf: Boolean = {
@@ -757,4 +786,73 @@ class SparkNLPReader(
     markdownReader.parseMarkdownWithTables(mdContent)
   }
 
+  /** Instantiates class to read CSV files.
+   *
+   * csvPath: this is a path to a directory of CSV files or a path to a CSV file. E.g.,
+   * "path/csv/files"
+   *
+   * ==Example==
+   * {{{
+   * val csvPath = "home/user/csv-directory"
+   * val sparkNLPReader = new SparkNLPReader()
+   * val csvDf = sparkNLPReader.csv(csvPath)
+   * }}}
+   *
+   * ==Example 2==
+   * You can use SparkNLP with one line of code
+   * {{{
+   * val csvDf = SparkNLP.read.csv(csvPath)
+   * }}}
+   *
+   * {{{
+   * csvDf.select("csv").show(false)
+   * +-----------------------------------------------------------------------------------------------------------------------------------------+
+   * |csv                                                                                                                                      |
+   * +-----------------------------------------------------------------------------------------------------------------------------------------+
+   * |[{NarrativeText, Alice 100 Bob 95, {}}, {Table, 
| Alice | 100 |
| Bob | 95 |
elements + tag match { + case "title" | "h1" | "h2" | "h3" | "header" => true + case "p" | "div" => isFormattedAsTitle(style, titleFontSize) + case _ => role == "heading" // ARIA role="heading" + } + } + +} diff --git a/src/main/scala/com/johnsnowlabs/reader/util/PartitionOptions.scala b/src/main/scala/com/johnsnowlabs/reader/util/PartitionOptions.scala index 0d5ec60a6a7120..77255d8f29ac5a 100644 --- a/src/main/scala/com/johnsnowlabs/reader/util/PartitionOptions.scala +++ b/src/main/scala/com/johnsnowlabs/reader/util/PartitionOptions.scala @@ -50,4 +50,15 @@ object PartitionOptions { .getOrElse(default) } + def getDefaultDouble( + params: Map[String, String], + options: Seq[String], + default: Double): Double = { + options + .flatMap(params.get) + .flatMap(value => Try(value.toDouble).toOption) + .headOption + .getOrElse(default) + } + } diff --git a/src/test/resources/reader/csv/semicolon-delimited.csv b/src/test/resources/reader/csv/semicolon-delimited.csv new file mode 100644 index 00000000000000..5b3d9cf16c9371 --- /dev/null +++ b/src/test/resources/reader/csv/semicolon-delimited.csv @@ -0,0 +1,5 @@ +Lorem, ipsum; dolor sit; amet +consectetur; adipiscing; elit +sed, do; eiusmod; tempor incididunt +ut labore; et, dolore; magna aliqua +Ut enim; ad minim; veniam, quis diff --git a/src/test/resources/reader/csv/stanley-cups-utf-16.csv b/src/test/resources/reader/csv/stanley-cups-utf-16.csv new file mode 100644 index 00000000000000..b152e27aac8e0a Binary files /dev/null and b/src/test/resources/reader/csv/stanley-cups-utf-16.csv differ diff --git a/src/test/resources/reader/csv/stanley-cups.csv b/src/test/resources/reader/csv/stanley-cups.csv new file mode 100644 index 00000000000000..ab6de889333da4 --- /dev/null +++ b/src/test/resources/reader/csv/stanley-cups.csv @@ -0,0 +1,5 @@ +Stanley Cups,, +Team,Location,Stanley Cups +Blues,STL,1 +Flyers,PHI,2 +Maple Leafs,TOR,13 diff --git a/src/test/resources/reader/html/example-bold-strong.html b/src/test/resources/reader/html/example-bold-strong.html new file mode 100644 index 00000000000000..5a92267ed76560 --- /dev/null +++ b/src/test/resources/reader/html/example-bold-strong.html @@ -0,0 +1,18 @@ + + +
+ +| Name | +Subject | +Grade | +
|---|---|---|
| Alice | +Math | +A | +
| Bob | +Science | +B+ | +