Commits
23 commits
11001bf
[SPARKNLP-1113] Adding Partition feature
danilojsl Mar 17, 2025
cebc516
[SPARKNLP-1118] Adding headers, ssl-verify, request timeout, page bre…
danilojsl Mar 31, 2025
1faa91d
[SPARKNLP-1116] Adding groupBrokenParagraphs option
danilojsl Apr 4, 2025
e98dd26
[SPARKNLP-1116] Adding includeSlideNotes option
danilojsl Apr 7, 2025
4dec816
[SPARKNLP-1116] Adding findSubtable option
danilojsl Apr 8, 2025
edfa581
[SPARKNLP-1116] Adding findSubtable option in SparkNLPReader
danilojsl Apr 8, 2025
38bae06
[SPARKNLP-1116] Renaming findSubtable to appendCells option in SparkN…
danilojsl Apr 8, 2025
4e9cd76
[SPARKNLP-1116] Handling headers null issue in SparkNLPReader for Pyt…
danilojsl Apr 10, 2025
f1ea42f
[SPARKNLP-1116] Refactoring parameters spark-nlp reader getters to ma…
danilojsl Apr 11, 2025
c4ab336
[SPARKNLP-1116] Adding Partitioning demo notebook
danilojsl Apr 30, 2025
058a8a5
[SPARKNLP-1174] Adding PartitionTransformer
danilojsl May 5, 2025
48cf2b3
[SPARKNLP-1174] Adding missing unit tests in readers
danilojsl May 14, 2025
5b0c581
[SPARKNLP-1174] Moving PDF parameters to HasPdfProperties
danilojsl May 14, 2025
ce4b9fb
[SPARKNLP-1174] Adding validation for partition URL content
danilojsl May 16, 2025
fcbf30f
[SPARKNLP-1174] Formatting modified files
danilojsl May 16, 2025
5fffd34
Merge branch 'feature/SPARKNLP-1174-Adding-PartitionTransformer' of h…
paulamib123 May 20, 2025
953e03d
[SPARKNLP-1174] Fix reading as text file content
danilojsl May 24, 2025
aaa342b
[SPARKNLP-1174] Adding PartitionTransformer demo notebook [skip test]
danilojsl May 24, 2025
367504f
[SPARKNLP-1174] Updates PartitionTransformer demo notebook [skip test]
danilojsl May 24, 2025
3846eaf
[SPARKNLP-1174] Updates PartitionTransformer file link [skip test]
danilojsl May 24, 2025
37e6e85
Documentation for SparkNLP Readers and Partition class (#14581)
paulamib123 May 26, 2025
55504dd
Merge branch 'feature/SPARKNLP-1174-Adding-PartitionTransformer' of h…
paulamib123 May 26, 2025
410a535
fixed errors for docs in partition module
paulamib123 May 26, 2025
@@ -0,0 +1,315 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "tzcU5p2gdak9"
},
"source": [
"# Introducing PartitionTransformer in SparkNLP\n",
"Spark NLP Readers and `Partition` help build structured inputs for your downstream NLP tasks.​\n",
"\n",
"The new `PartitionTransformer` makes your current Spark NLP workflow smoother by allowing to reuse your pipelines seamlessly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sparknlp\n",
"# # let's start Spark with Spark NLP\n",
"spark = sparknlp.start()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "JWHyJJBdkSAZ"
},
"source": [
"## Setup and Initialization\n",
"Let's keep in mind a few things before we start 😊\n",
"\n",
"Support for **PartitionTransformer** was introduced in Spark NLP 6.0.2 Please make sure you have upgraded to the latest Spark NLP release.\n",
"\n",
"For local files example we will download different files from Spark NLP Github repo:"
]
},
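{
"cell_type": "markdown",
"metadata": {},
"source": [
"If your environment does not yet have Spark NLP 6.0.2+, a typical upgrade step is shown below. This is only a minimal sketch, assuming a pip-based environment such as Colab:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assumed setup step: install/upgrade PySpark and Spark NLP from PyPI\n",
"!pip install -q --upgrade pyspark spark-nlp"
]
},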
{
"cell_type": "markdown",
"metadata": {
"id": "CAg4inaqkU8J"
},
"source": [
"Downloading HTML files"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bo7s-jZVrE7W",
"outputId": "2ce20a85-f3f1-4e93-9a7d-da60a415e6cf"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2025-05-24 14:57:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1174-Adding-PartitionTransformer/src/test/resources/reader/html/example-10k.html\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 2456707 (2.3M) [text/plain]\n",
"Saving to: ‘html-files/example-10k.html’\n",
"\n",
"example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.07s \n",
"\n",
"2025-05-24 14:57:08 (31.6 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n",
"\n",
"--2025-05-24 14:57:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1174-Adding-PartitionTransformer/src/test/resources/reader/html/fake-html.html\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 665 [text/plain]\n",
"Saving to: ‘html-files/fake-html.html’\n",
"\n",
"fake-html.html 100%[===================>] 665 --.-KB/s in 0s \n",
"\n",
"2025-05-24 14:57:08 (38.0 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n",
"\n"
]
}
],
"source": [
"!mkdir html-files\n",
"!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html -P html-files\n",
"!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html -P html-files"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EoFI66NAdalE"
},
"source": [
"## Partitioning Documents"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nluIcWMbM_rx"
},
"source": [
"`PartitionTransformer` outpus a different schema than `Partition`, here we can expect our common Annotation schema"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mWnypHRwXruC",
"outputId": "b82b20f4-cb27-43ab-8ed3-e4e5b12acee2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+--------------------+--------------------+--------------------+\n",
"| path| content| text| partition|\n",
"+--------------------+--------------------+--------------------+--------------------+\n",
"|file:/content/htm...|<?xml version=\"1...|[{Title, UNITED S...|[{document, 0, 12...|\n",
"|file:/content/htm...|<!DOCTYPE html>\\n...|[{Title, My First...|[{document, 0, 15...|\n",
"+--------------------+--------------------+--------------------+--------------------+\n",
"\n"
]
}
],
"source": [
"from pyspark.ml import Pipeline\n",
"from sparknlp.partition.partition_transformer import *\n",
"\n",
"empty_df = spark.createDataFrame([], \"string\").toDF(\"text\")\n",
"\n",
"partition_transformer = PartitionTransformer() \\\n",
" .setInputCols([\"text\"]) \\\n",
" .setContentType(\"text/html\") \\\n",
" .setContentPath(\"./html-files\") \\\n",
" .setOutputCol(\"partition\")\n",
"\n",
"pipeline = Pipeline(stages=[\n",
" partition_transformer\n",
"])\n",
"\n",
"pipeline_model = pipeline.fit(empty_df)\n",
"result_df = pipeline_model.transform(empty_df)\n",
"\n",
"result_df.show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EFMhyfnc_g1V",
"outputId": "b04f2d7c-aff3-4bb4-93c4-0007151211a7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- path: string (nullable = true)\n",
" |-- content: string (nullable = true)\n",
" |-- text: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- elementType: string (nullable = true)\n",
" | | |-- content: string (nullable = true)\n",
" | | |-- metadata: map (nullable = true)\n",
" | | | |-- key: string\n",
" | | | |-- value: string (valueContainsNull = true)\n",
" |-- partition: array (nullable = true)\n",
" | |-- element: struct (containsNull = true)\n",
" | | |-- annotatorType: string (nullable = true)\n",
" | | |-- begin: integer (nullable = false)\n",
" | | |-- end: integer (nullable = false)\n",
" | | |-- result: string (nullable = true)\n",
" | | |-- metadata: map (nullable = true)\n",
" | | | |-- key: string\n",
" | | | |-- value: string (valueContainsNull = true)\n",
" | | |-- embeddings: array (nullable = true)\n",
" | | | |-- element: float (containsNull = false)\n",
"\n"
]
}
],
"source": [
"result_df.printSchema()"
]
},
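{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a small illustrative sketch (plain Spark SQL, assuming only the fields shown in the schema above), you can flatten the `partition` annotations to inspect each element's offsets and text:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import functions as F\n",
"\n",
"# Explode the array of annotations and keep the begin/end offsets and the text\n",
"result_df.select(F.explode(\"partition\").alias(\"p\")) \\\n",
" .select(\"p.begin\", \"p.end\", \"p.result\") \\\n",
" .show(truncate=60)"
]
},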
{
"cell_type": "markdown",
"metadata": {
"id": "gBNYByJ5Bqq6"
},
"source": [
"You can integrate `PartitionTransformer` directly into your existing Spark NLP pipelines.​"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "W7LLHf_0BrtQ"
},
"outputs": [],
"source": [
"text = (\n",
" \"The big brown fox\\n\"\n",
" \"was walking down the lane.\\n\"\n",
" \"\\n\"\n",
" \"At the end of the lane,\\n\"\n",
" \"the fox met a bear.\"\n",
")\n",
"\n",
"testDataSet = spark.createDataFrame(\n",
" [(text,)],\n",
" [\"text\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"id": "gPsYuhgOlg4G"
},
"outputs": [],
"source": [
"from pyspark.ml import Pipeline\n",
"from sparknlp import DocumentAssembler\n",
"\n",
"emptyDataSet = spark.createDataFrame([], testDataSet.schema)\n",
"\n",
"documentAssembler = DocumentAssembler() \\\n",
" .setInputCol(\"text\") \\\n",
" .setOutputCol(\"document\")\n",
"\n",
"partition = PartitionTransformer() \\\n",
" .setInputCols([\"document\"]) \\\n",
" .setOutputCol(\"partition\") \\\n",
" .setGroupBrokenParagraphs(True)\n",
"\n",
"pipeline = Pipeline(stages=[documentAssembler, partition])\n",
"pipelineModel = pipeline.fit(emptyDataSet)"
]
},
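{
"cell_type": "markdown",
"metadata": {},
"source": [
"One way to reuse this pipeline later is standard MLlib persistence. This is only a sketch: it assumes `PartitionTransformer` supports `PipelineModel` save/load like other Spark NLP annotators, and the path `/tmp/partition_pipeline` is just an example."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.ml import PipelineModel\n",
"\n",
"# Assumption: PartitionTransformer is writable/readable like other Spark NLP annotators\n",
"pipelineModel.write().overwrite().save(\"/tmp/partition_pipeline\")\n",
"reloadedModel = PipelineModel.load(\"/tmp/partition_pipeline\")"
]
},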
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xR9-8vDtlq5o",
"outputId": "4f56f1d1-152f-4fbc-b426-acdea9b7952b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
"|partition |\n",
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
"|[{document, 0, 43, The big brown fox was walking down the lane., {paragraph -> 0}, []}, {document, 0, 42, At the end of the lane, the fox met a bear., {paragraph -> 0}, []}]|\n",
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
"\n"
]
}
],
"source": [
"resultDf = pipelineModel.transform(testDataSet)\n",
"resultDf.select(\"partition\").show(truncate=False)"
]
}
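,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because `PartitionTransformer` emits standard `document` annotations (as seen in the output above), downstream annotators can consume them directly. The cell below is a minimal sketch of that idea, assuming a plain `Tokenizer` reading the `partition` column:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sparknlp.annotator import Tokenizer\n",
"\n",
"# Sketch: tokenize the paragraphs produced by PartitionTransformer\n",
"tokenizer = Tokenizer() \\\n",
" .setInputCols([\"partition\"]) \\\n",
" .setOutputCol(\"token\")\n",
"\n",
"tokenPipeline = Pipeline(stages=[documentAssembler, partition, tokenizer])\n",
"tokenModel = tokenPipeline.fit(emptyDataSet)\n",
"tokenModel.transform(testDataSet).select(\"token.result\").show(truncate=False)"
]
}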
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}