diff --git a/CHANGELOG b/CHANGELOG index 702350931f60ed..c31c21a0660d4b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,20 @@ ======= -6.0.1 +6.0.3 +======= +---------------- +New Features & Enhancements +---------------- +* Introducing E5-V Universal Embeddings (SPARKNLP-1143) +* Enhanced Chunking Strategies (SPARKNLP-1125) +* New XML Reader (SPARKNLP-1119) + +---------------- +Bug Fixes +---------------- +* Fixed typo for Excel reader notebook + +======= +6.0.2 ======= ---------------- New Features & Enhancements diff --git a/README.md b/README.md index 8a1b52ddda0d56..b9da074c7f4c08 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==6.0.2 pyspark==3.3.1 +$ pip install spark-nlp==6.0.3 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -129,7 +129,7 @@ For a quick example of using pipelines and models take a look at our official [d ### Apache Spark Support -Spark NLP *6.0.2* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x +Spark NLP *6.0.3* has been built on top of Apache Spark 3.4 while fully supporting Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -159,7 +159,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http ### Databricks Support -Spark NLP 6.0.2 has been tested and is compatible with the following runtimes: +Spark NLP 6.0.3 has been tested and is compatible with the following runtimes: | **CPU** | **GPU** | |--------------------|--------------------| @@ -176,7 +176,7 @@ We are compatible with older runtimes. 
For a full list check databricks support ### EMR Support -Spark NLP 6.0.2 has been tested and is compatible with the following EMR releases: +Spark NLP 6.0.3 has been tested and is compatible with the following EMR releases: | **EMR Release** | |--------------------| diff --git a/build.sbt b/build.sbt index 1dbf63e351a275..652613c33f6f62 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "6.0.2" +version := "6.0.3" (ThisBuild / scalaVersion) := scalaVer diff --git a/conda/meta.yaml b/conda/meta.yaml index 5006fa819237f0..944566505b3821 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,5 +1,5 @@ {% set name = "spark-nlp" %} -{% set version = "6.0.2" %} +{% set version = "6.0.3" %} package: name: {{ name|lower }} @@ -7,7 +7,7 @@ package: source: url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/spark_nlp-{{ version }}.tar.gz - sha256: 8b97358206809a123076bcd58aba3b6487086c95c6370be6a9a34f0d5568b43d + sha256: ff09f27c512401cff1ec3af572069b2e2af35b87a0f6737c5340538bac10faf7 build: noarch: python diff --git a/docs/en/transformer_entries/E5VEmbeddings.md b/docs/en/transformer_entries/E5VEmbeddings.md new file mode 100644 index 00000000000000..68ff482cd2f900 --- /dev/null +++ b/docs/en/transformer_entries/E5VEmbeddings.md @@ -0,0 +1,133 @@ +{%- capture title -%} +E5VEmbeddings +{%- endcapture -%} + +{%- capture description -%} +Universal multimodal embeddings using E5-V. + +E5-V is a multimodal embedding model that bridges the modality gap between text and images, enabling strong performance in cross-modal retrieval, classification, clustering, and more. It supports both image+text and text-only embedding scenarios, and is fine-tuned from lmms-lab/llama3-llava-next-8b. The default model is `"e5v_int4"`. + +Note that this annotator is only supported for Spark Versions 3.4 and up. + +Pretrained models can be loaded with `pretrained` of the companion object: + +```scala +val embeddings = E5VEmbeddings.pretrained() + .setInputCols("image_assembler") + .setOutputCol("e5v") +``` + +For available pretrained models please see the +[Models Hub](https://sparknlp.org/models?q=E5V). + +For extended examples of usage, see +[E5VEmbeddingsTestSpec](https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5VEmbeddingsTestSpec.scala). 
+ +**Sources**: + +- [E5-V: Universal Embeddings with Multimodal Large Language Models (arXiv)](https://arxiv.org/abs/2407.12580) +- [Hugging Face Model Card](https://huggingface.co/royokong/e5-v) +- [E5-V Github Repository](https://github.com/kongds/E5-V) +{%- endcapture -%} + +{%- capture input_anno -%} +IMAGE +{%- endcapture -%} + +{%- capture output_anno -%} +SENTENCE_EMBEDDINGS +{%- endcapture -%} + +{%- capture python_example -%} +# Image + Text Embedding +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline +from pyspark.sql.functions import lit + +image_df = spark.read.format("image").option("dropInvalid", True).load(imageFolder) +imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" +test_df = image_df.withColumn("text", lit(imagePrompt)) +imageAssembler = ImageAssembler() \ + .setInputCol("image") \ + .setOutputCol("image_assembler") +e5vEmbeddings = E5VEmbeddings.pretrained() \ + .setInputCols(["image_assembler"]) \ + .setOutputCol("e5v") +pipeline = Pipeline().setStages([ + imageAssembler, + e5vEmbeddings +]) +result = pipeline.fit(test_df).transform(test_df) +result.select("e5v.embeddings").show(truncate=False) + +# Text-Only Embedding +from sparknlp.util import EmbeddingsDataFrameUtils +textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" +textDesc = "A cat sitting in a box." +nullImageDF = spark.createDataFrame( + spark.sparkContext.parallelize([EmbeddingsDataFrameUtils.emptyImageRow]), + EmbeddingsDataFrameUtils.imageSchema) +textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc))) +e5vEmbeddings = E5VEmbeddings.pretrained() \ + .setInputCols(["image"]) \ + .setOutputCol("e5v") +result = e5vEmbeddings.transform(textDF) +result.select("e5v.embeddings").show(truncate=False) +{%- endcapture -%} + +{%- capture scala_example -%} +// Image + Text Embedding +import org.apache.spark.sql.functions.lit +import com.johnsnowlabs.nlp.base.ImageAssembler +import com.johnsnowlabs.nlp.embeddings.E5VEmbeddings +import org.apache.spark.ml.Pipeline + +val imageDF = spark.read.format("image").option("dropInvalid", value = true).load(imageFolder) +val imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" +val testDF = imageDF.withColumn("text", lit(imagePrompt)) +val imageAssembler = new ImageAssembler().setInputCol("image").setOutputCol("image_assembler") +val e5vEmbeddings = E5VEmbeddings.pretrained() + .setInputCols("image_assembler") + .setOutputCol("e5v") +val pipeline = new Pipeline().setStages(Array(imageAssembler, e5vEmbeddings)) +val result = pipeline.fit(testDF).transform(testDF) +result.select("e5v.embeddings").show(truncate = false) + +// Text-Only Embedding +import com.johnsnowlabs.nlp.util.EmbeddingsDataFrameUtils.{emptyImageRow, imageSchema} +val textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" +val textDesc = "A cat sitting in a box." 
+val nullImageDF = spark.createDataFrame(spark.sparkContext.parallelize(Seq(emptyImageRow)), imageSchema) +val textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc))) +val e5vEmbeddings = E5VEmbeddings.pretrained() + .setInputCols("image") + .setOutputCol("e5v") +val result2 = e5vEmbeddings.transform(textDF) +result2.select("e5v.embeddings").show(truncate = false) +{%- endcapture -%} + +{%- capture api_link -%} +[E5VEmbeddings](/api/com/johnsnowlabs/nlp/embeddings/E5VEmbeddings) +{%- endcapture -%} + +{%- capture python_api_link -%} +[E5VEmbeddings](/api/python/reference/autosummary/sparknlp/annotator/cv/e5v_embeddings/index.html#sparknlp.annotator.cv.e5v_embeddings.E5VEmbeddings) +{%- endcapture -%} + +{%- capture source_link -%} +[E5VEmbeddings](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5VEmbeddings.scala) +{%- endcapture -%} + +{% include templates/anno_template.md + title=title + description=description + input_anno=input_anno + output_anno=output_anno + python_example=python_example + scala_example=scala_example + api_link=api_link + python_api_link=python_api_link + source_link=source_link +%} \ No newline at end of file diff --git a/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb b/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb index c5eecc20945f3f..659bdbc309f127 100644 --- a/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb +++ b/examples/python/data-preprocessing/SparkNLP_Partition_Demo.ipynb @@ -20,8 +20,30 @@ "## Setup and Initialization\n", "Let's keep in mind a few things before we start 😊\n", "\n", - "Support for **Partitioning** files was introduced in Spark NLP 6.0.1 Please make sure you have upgraded to the latest Spark NLP release.\n", - "\n", + "Support for **Partitioning** files was introduced in Spark NLP 6.0.1. Please make sure you have upgraded to the latest Spark NLP release." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Let's install and setup Spark NLP in Google Colab\n", "- This part is pretty easy via our simple script" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For local files example we will download different files from Spark NLP Github repo:" ] }, @@ -42,34 +64,34 @@ "base_uri": "https://localhost:8080/" }, "id": "bo7s-jZVrE7W", - "outputId": "e7234d36-765e-4a29-f922-02ceab1626dd" + "outputId": "b0e91448-3b2c-4dab-84c7-5e7d8bad0be5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:05-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html\n", + "--2025-06-09 22:10:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/example-10k.html\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 2456707 (2.3M) [text/plain]\n", "Saving to: ‘html-files/example-10k.html’\n", "\n", - "example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.05s \n", + "example-10k.html 100%[===================>] 2.34M --.-KB/s in 0.04s \n", "\n", - "2025-05-26 23:11:06 (45.1 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n", + "2025-06-09 22:10:23 (52.9 MB/s) - ‘html-files/example-10k.html’ saved [2456707/2456707]\n", "\n", - "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/html/fake-html.html\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 665 [text/plain]\n", "Saving to: ‘html-files/fake-html.html’\n", "\n", "fake-html.html 100%[===================>] 665 --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:06 (30.2 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n", + "2025-06-09 22:10:24 (18.3 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n", "\n" ] } @@ -97,38 +119,36 @@ "base_uri": "https://localhost:8080/" }, "id": "ya8qZe00dalC", - "outputId": "ba520f44-c4b9-45b1-f03c-6a8e3a33320b" + "outputId": "9b4fbf52-9ecc-454b-bef1-0ce31dadb7c7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/image_3_pages.pdf\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", + "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/image_3_pages.pdf\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 15629 (15K) [application/octet-stream]\n", "Saving to: ‘pdf-files/image_3_pages.pdf’\n", "\n", - "\r", - "image_3_pages.pdf 0%[ ] 0 --.-KB/s \r", "image_3_pages.pdf 100%[===================>] 15.26K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:06 (25.5 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n", + "2025-06-09 22:10:24 (24.3 MB/s) - ‘pdf-files/image_3_pages.pdf’ saved [15629/15629]\n", "\n", - "--2025-05-26 23:11:06-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf\n", + "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 25803 (25K) [application/octet-stream]\n", "Saving to: ‘pdf-files/pdf-title.pdf’\n", "\n", - "pdf-title.pdf 100%[===================>] 25.20K --.-KB/s in 0s \n", + "pdf-title.pdf 100%[===================>] 25.20K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:06 (58.5 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n", + "2025-06-09 22:10:24 (21.2 MB/s) - ‘pdf-files/pdf-title.pdf’ saved [25803/25803]\n", "\n", - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf\n", + "--2025-06-09 22:10:24-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", @@ -137,7 +157,7 @@ "\n", "text_3_pages.pdf 100%[===================>] 9.26K --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:07 (79.2 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n", + "2025-06-09 22:10:24 (73.3 MB/s) - ‘pdf-files/text_3_pages.pdf’ saved [9487/9487]\n", "\n" ] } @@ -166,47 +186,45 @@ "base_uri": "https://localhost:8080/" }, "id": "zLLEUl3KpYZ6", - "outputId": "4346e6e1-18ec-47a8-92c0-c8bc588f3441" + "outputId": "407e9405-6cc9-4724-f576-f52c503cb52d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/contains-pictures.docx\n", + "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/contains-pictures.docx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 95087 (93K) [application/octet-stream]\n", "Saving to: ‘word-files/contains-pictures.docx’\n", "\n", - "\r", - "contains-pictures.d 0%[ ] 0 --.-KB/s \r", - "contains-pictures.d 100%[===================>] 92.86K --.-KB/s in 0.01s \n", + "contains-pictures.d 100%[===================>] 92.86K --.-KB/s in 0.02s \n", "\n", - "2025-05-26 23:11:07 (6.85 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n", + "2025-06-09 22:10:25 (4.74 MB/s) - ‘word-files/contains-pictures.docx’ saved [95087/95087]\n", "\n", - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/fake_table.docx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/fake_table.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.111.133, 185.199.108.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 12392 (12K) [application/octet-stream]\n", "Saving to: ‘word-files/fake_table.docx’\n", "\n", "fake_table.docx 100%[===================>] 12.10K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:07 (17.7 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n", + "2025-06-09 22:10:25 (18.9 MB/s) - ‘word-files/fake_table.docx’ saved [12392/12392]\n", "\n", - "--2025-05-26 23:11:07-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/page-breaks.docx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:25-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/doc/page-breaks.docx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 14584 (14K) [application/octet-stream]\n", "Saving to: ‘word-files/page-breaks.docx’\n", "\n", "page-breaks.docx 100%[===================>] 14.24K --.-KB/s in 0.001s \n", "\n", - "2025-05-26 23:11:08 (22.4 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n", + "2025-06-09 22:10:25 (21.5 MB/s) - ‘word-files/page-breaks.docx’ saved [14584/14584]\n", "\n" ] } @@ -235,48 +253,58 @@ "base_uri": "https://localhost:8080/" }, "id": "G3-BCYP6qQ4x", - "outputId": "38489a6e-588d-4a1b-e319-0c7f66559ca0" + "outputId": "95c5a31d-eed9-47a1-bb55-0868daec7da7" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/vodafone.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 12541 (12K) [application/octet-stream]\n", "Saving to: ‘excel-files/vodafone.xlsx’\n", "\n", "\r", "vodafone.xlsx 0%[ ] 0 --.-KB/s \r", - "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0.001s \n", + "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:08 (22.2 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n", + "2025-06-09 22:10:26 (30.4 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n", "\n", - "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 38442 (38K) [application/octet-stream]\n", "Saving to: ‘excel-files/2023-half-year-analyses-by-segment.xlsx’\n", "\n", - "2023-half-year-anal 100%[===================>] 37.54K --.-KB/s in 0.007s \n", + "2023-half-year-anal 100%[===================>] 37.54K --.-KB/s in 0.01s \n", "\n", - "2025-05-26 23:11:08 (5.37 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n", + "2025-06-09 22:10:26 (3.43 MB/s) - ‘excel-files/2023-half-year-analyses-by-segment.xlsx’ saved [38442/38442]\n", "\n", - "--2025-05-26 23:11:08-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/page-break-example.xlsx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 404 Not Found\n", - "2025-05-26 23:11:09 ERROR 404: Not Found.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 10676 (10K) [application/octet-stream]\n", + "Saving to: ‘excel-files/page-break-example.xlsx’\n", + "\n", + "page-break-example. 100%[===================>] 10.43K --.-KB/s in 0s \n", "\n", - "--2025-05-26 23:11:09-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n", + "2025-06-09 22:10:26 (79.4 MB/s) - ‘excel-files/page-break-example.xlsx’ saved [10676/10676]\n", + "\n", + "--2025-06-09 22:10:26-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 404 Not Found\n", - "2025-05-26 23:11:09 ERROR 404: Not Found.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 9210 (9.0K) [application/octet-stream]\n", + "Saving to: ‘excel-files/xlsx-subtable-cases.xlsx’\n", + "\n", + "xlsx-subtable-cases 100%[===================>] 8.99K --.-KB/s in 0s \n", + "\n", + "2025-06-09 22:10:26 (65.5 MB/s) - ‘excel-files/xlsx-subtable-cases.xlsx’ saved [9210/9210]\n", "\n" ] } @@ -289,17 +317,6 @@ "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xls/xlsx-subtable-cases.xlsx -P excel-files" ] }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "LcSYn6q7jW9-" - }, - "outputs": [], - "source": [ - "!cp drive/MyDrive/JSL/PageBreakExample.xlsx ./excel-files" - ] - }, { "cell_type": "markdown", "metadata": { @@ -317,42 +334,45 @@ "base_uri": "https://localhost:8080/" }, "id": "1jDRFmcHqpxn", - "outputId": "4d59c445-3764-41a8-c91b-9231d401eac6" + "outputId": "cd7e3c96-bb5f-49ab-f466-56ec6be20f75" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:09-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point.pptx\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "--2025-06-09 22:10:27-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point.pptx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 38412 (38K) [application/octet-stream]\n", "Saving to: ‘ppt-files/fake-power-point.pptx’\n", "\n", - "\r", - "fake-power-point.pp 0%[ ] 0 --.-KB/s \r", - "fake-power-point.pp 100%[===================>] 37.51K --.-KB/s in 0.007s \n", + "fake-power-point.pp 100%[===================>] 37.51K --.-KB/s in 0.01s \n", "\n", - "2025-05-26 23:11:10 (5.29 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n", + "2025-06-09 22:10:27 (3.41 MB/s) - ‘ppt-files/fake-power-point.pptx’ saved [38412/38412]\n", "\n", - "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point-table.pptx\n", + "--2025-06-09 22:10:27-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/fake-power-point-table.pptx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 39894 (39K) [application/octet-stream]\n", "Saving to: ‘ppt-files/fake-power-point-table.pptx’\n", "\n", - "fake-power-point-ta 100%[===================>] 38.96K --.-KB/s in 0.006s \n", + "fake-power-point-ta 100%[===================>] 38.96K --.-KB/s in 0.008s \n", "\n", - "2025-05-26 23:11:10 (6.73 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n", + "2025-06-09 22:10:28 (4.93 MB/s) - ‘ppt-files/fake-power-point-table.pptx’ saved [39894/39894]\n", "\n", - "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/speaker-notes.pptx\n", + "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/ppt/speaker-notes.pptx\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", - "HTTP request sent, awaiting response... 404 Not Found\n", - "2025-05-26 23:11:10 ERROR 404: Not Found.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 39414 (38K) [application/octet-stream]\n", + "Saving to: ‘ppt-files/speaker-notes.pptx’\n", + "\n", + "speaker-notes.pptx 100%[===================>] 38.49K --.-KB/s in 0.008s \n", + "\n", + "2025-06-09 22:10:28 (4.76 MB/s) - ‘ppt-files/speaker-notes.pptx’ saved [39414/39414]\n", "\n" ] } @@ -381,14 +401,14 @@ "base_uri": "https://localhost:8080/" }, "id": "yYMVpVQurk7G", - "outputId": "cedb0e39-f137-4759-a158-0b84ed31b282" + "outputId": "293a864a-2980-4502-c6dc-a1d3cee815ee" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:10-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n", + "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/email-text-attachments.eml\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", @@ -399,18 +419,18 @@ " email-tex 0%[ ] 0 --.-KB/s \r", "email-text-attachme 100%[===================>] 3.10K --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:11 (49.2 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n", + "2025-06-09 22:10:28 (21.2 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n", "\n", - "--2025-05-26 23:11:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "--2025-06-09 22:10:28-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/email/test-several-attachments.eml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 1324361 (1.3M) [text/plain]\n", "Saving to: ‘email-files/test-several-attachments.eml’\n", "\n", "test-several-attach 100%[===================>] 1.26M --.-KB/s in 0.04s \n", "\n", - "2025-05-26 23:11:11 (32.0 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n", + "2025-06-09 22:10:29 (30.2 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n", "\n" ] } @@ -438,14 +458,14 @@ "base_uri": "https://localhost:8080/" }, "id": "AV-krG6Ps8pq", - "outputId": "c407a77f-11d5-4a3c-85e0-4abffa48bd12" + "outputId": "bd7317e0-97d3-4f30-a800-6ffa8148f266" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2025-05-26 23:11:11-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n", + "--2025-06-09 22:10:29-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", @@ -456,7 +476,7 @@ "simple-text.txt 0%[ ] 0 --.-KB/s \r", "simple-text.txt 100%[===================>] 300 --.-KB/s in 0s \n", "\n", - "2025-05-26 23:11:11 (4.81 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n", + "2025-06-09 22:10:29 (3.39 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n", "\n" ] } @@ -466,6 +486,51 @@ "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt -P txt-files" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "QVq5C0Uqs4wU" + }, + "source": [ + "**Downloading XML files**" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Gip5P7Ess63U", + "outputId": "dde0fa15-2571-4b4a-ef73-517fe2b7a7a7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-06-09 22:15:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/multi-level.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 538 [text/plain]\n", + "Saving to: ‘xml-files/multi-level.xml’\n", + "\n", + "\r", + "multi-level.xml 0%[ ] 0 --.-KB/s \r", + "multi-level.xml 100%[===================>] 538 --.-KB/s in 0s \n", + "\n", + "2025-06-09 22:15:15 (21.2 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir xml-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files" + ] + }, { "cell_type": "markdown", "metadata": { @@ -478,13 +543,13 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bAkMjJ1vdalE", - "outputId": "15401bcc-3cb2-474a-d771-0efed1eaf9cd" + "outputId": "582dcc26-76ea-4cac-c5f6-46e009b639f9" }, "outputs": [ { @@ -519,13 +584,13 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VWbUgoVQrO8m", - "outputId": "36bbf310-7ee5-474a-93f2-4d940d3c0547" + "outputId": "56f4f9ce-41bb-48ba-b5db-7e1bde47d8d8" }, "outputs": [ { @@ -558,13 +623,13 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YFzeGJJ3ICVM", - "outputId": "01c349aa-16d2-4e0d-8a30-11399caf2ef2" + "outputId": "fc9bc68c-2b20-479e-8fe8-3e380877cebf" }, "outputs": [ { @@ -597,13 +662,13 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "y_xl0ahaJ0Hy", - "outputId": "6040b119-2eca-4c58-f51b-e20fbefeef8d" + "outputId": "327222b8-0c6b-4578-8fde-4f14f9835edc" }, "outputs": [ { @@ -636,13 +701,13 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4JnKvFe5KVDf", - "outputId": "d91d1ee5-d4a3-48a1-b40a-d5f6bf997025" + "outputId": "c9252fb7-3840-4c95-d461-a56eef9adaea" }, "outputs": [ { @@ -675,13 +740,13 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_sldwjppKoPl", - "outputId": "467e9085-86dd-43df-f63b-a707b920d3b3" + "outputId": "0619383d-abf4-43a6-f63d-ad81897f8d9e" }, "outputs": [ { @@ -714,13 +779,13 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GltbZAjmKwQs", - "outputId": "c3f18b1f-06df-4233-8874-e9702c465e69" + "outputId": "df9ae11b-0186-4e61-d6ff-9581c597ccd1" }, "outputs": [ { @@ -731,9 +796,9 @@ "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n", "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", + 
"|file:/content/pdf...|2025-06-09 22:10:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", "\n" ] @@ -798,7 +863,9 @@ "| `infer_table_structure` | Word, Excel, PowerPoint | Whether to generate an HTML table representation from structured table content. When enabled, a full `` element is added alongside cell-level elements, based on row and column layout. |\n", "| `append_cells` | Excel | Whether to append all rows into a single content block instead of creating separate elements per row. |\n", "| `cell_separator` | Excel | String used to join cell values in a row when assembling textual output |\n", - "| `add_attachment_content` | Email | Whether to extract and include the textual content of plain-text attachments in the output |" + "| `add_attachment_content` | Email | Whether to extract and include the textual content of plain-text attachments in the output |\n", + "| `xml_keep_tags` | XML | Whether to retain original XML tag names and include them in the metadata for each extracted element |\n", + "| `only_leaf_nodes` | XML | If true, only the deepest elements are extracted. If false, all elements are extracted|" ] }, { @@ -812,13 +879,13 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gDJyUi_9R4fr", - "outputId": "4aebe625-444d-4161-be23-512708ced1b5" + "outputId": "181d8e88-7a0b-4a6e-f497-7fd4add3726c" }, "outputs": [ { @@ -830,8 +897,8 @@ "| path| doc|\n", "+--------------------+--------------------+\n", "|file:/content/wor...|[{NarrativeText, ...|\n", - "|file:/content/wor...|[{Header, An inli...|\n", "|file:/content/wor...|[{Table, Header C...|\n", + "|file:/content/wor...|[{Header, An inli...|\n", "+--------------------+--------------------+\n", "\n" ] @@ -843,50 +910,23 @@ ] }, { - "cell_type": "code", - "execution_count": 23, + "cell_type": "markdown", "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3vz48AHQHyON", - "outputId": "f3ba8c4b-3bfc-453a-d8d4-f86a5fca0a1b" + "id": "F0lCz9OyPYYh" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Warning::Spark Session already created, some configs may not take.\n", - "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n", - "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 25803|This is a Title \\...| 842| 596| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 1|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 15629| \\n| 841| 595| NULL| NULL| 2|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", - "|file:/content/pdf...|2025-05-26 23:11:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", - 
"+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", - "\n" - ] - } - ], "source": [ - "partition_df = Partition(content_type = \"application/pdf\").partition(\"./pdf-files\")\n", - "partition_df.show()" + "We can use the `store_content` option to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "S50lqIFskNO3", - "outputId": "e52f4cde-cfb9-4a55-d989-6e9fe40a0321" + "id": "qExdRJ2aPsYV", + "outputId": "9a033a02-4bae-4570-aaba-b81c23b8e0e1" }, "outputs": [ { @@ -894,38 +934,40 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|path |xls |\n", - "+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|file:/content/excel-files/PageBreakExample.xlsx|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}]|\n", - 
"+-----------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+--------------------+--------------------+--------------------+\n", + "| path| doc| content|\n", + "+--------------------+--------------------+--------------------+\n", + "|file:/content/wor...|[{NarrativeText, ...|[50 4B 03 04 14 0...|\n", + "|file:/content/wor...|[{Table, Header C...|[50 4B 03 04 14 0...|\n", + "|file:/content/wor...|[{Header, An inli...|[50 4B 03 04 14 0...|\n", + "+--------------------+--------------------+--------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(content_type = \"application/vnd.ms-excel\").partition(\"./excel-files/PageBreakExample.xlsx\")\n", - "partition_df.show(truncate=False)" + "partition_df = Partition(content_type = \"application/msword\", store_content = True).partition(\"./word-files\")\n", + "partition_df.show()" ] }, { "cell_type": "markdown", "metadata": { - "id": "F0lCz9OyPYYh" + "id": "E3bCFJZn8TS0" }, "source": [ - "We can use the `store_content` option to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output" + "## Partitioning PDF Files" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "qExdRJ2aPsYV", - "outputId": "0284de34-ce6a-4d1e-91bc-268521111015" + "id": "3vz48AHQHyON", + "outputId": "19369e63-f963-4422-a791-57ea5394df1a" }, "outputs": [ { @@ -933,19 +975,23 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+--------------------+--------------------+--------------------+\n", - "| path| doc| content|\n", - "+--------------------+--------------------+--------------------+\n", - "|file:/content/wor...|[{NarrativeText, ...|[50 4B 03 04 14 0...|\n", - "|file:/content/wor...|[{Header, An inli...|[50 4B 03 04 14 0...|\n", - "|file:/content/wor...|[{Table, Header C...|[50 4B 03 04 14 0...|\n", - "+--------------------+--------------------+--------------------+\n", + "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", + "| path| modificationTime|length| text|height_dimension|width_dimension|content|exception|pagenum|\n", + "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 25803|This is a Title \\...| 842| 596| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 1|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 15629| \\n| 841| 595| NULL| NULL| 2|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| This is a page.\\n| 841| 595| NULL| NULL| 0|\n", + "|file:/content/pdf...|2025-06-09 
22:10:...| 9487|This is another p...| 841| 595| NULL| NULL| 1|\n", + "|file:/content/pdf...|2025-06-09 22:10:...| 9487| Yet another page.\\n| 841| 595| NULL| NULL| 2|\n", + "+--------------------+--------------------+------+--------------------+----------------+---------------+-------+---------+-------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(content_type = \"application/msword\", store_content = True).partition(\"./word-files\")\n", + "partition_df = Partition(content_type = \"application/pdf\").partition(\"./pdf-files\")\n", "partition_df.show()" ] }, @@ -969,13 +1015,13 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_2J0zEmma8jm", - "outputId": "405391bf-60bf-4632-ef0e-e84496049c71" + "outputId": "90f668d7-03d9-496f-dc82-a620c59f9c08" }, "outputs": [ { @@ -1018,13 +1064,13 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4sY2ADN8dusy", - "outputId": "98af2c82-8a55-46ff-f631-7775431820cb" + "outputId": "8164237e-6835-404a-d7a7-b5ef0ef99c6d" }, "outputs": [ { @@ -1046,24 +1092,33 @@ "partition_df.show(truncate=False)" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "uMyqJX-K7dss" + }, + "source": [ + "## Partitioning MS Office documents" + ] + }, { "cell_type": "markdown", "metadata": { "id": "_9dDTCrpGdoN" }, "source": [ - "For Word documents, use `includePageBreaks` to preserve structural information like page boundaries, which are inserted as HTML tables in the output." + "For Excel documents, use `includePageBreaks` to preserve structural information like page boundaries, which are inserted as HTML tables in the output." ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7ICTZmLGk3Sa", - "outputId": "5e31a551-2746-4c45-b933-56f55e4866c9" + "outputId": "1796055a-808c-4eff-fc86-14e29cf9b53e" }, "outputs": [ { @@ -1087,13 +1142,13 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YId4UG1rOVQq", - "outputId": "7de8b4be-9936-4330-8a0f-019c3a55182a" + "outputId": "32827dea-d7b3-4137-abff-9e4502f8cd93" }, "outputs": [ { @@ -1118,38 +1173,21 @@ { "cell_type": "markdown", "metadata": { - "id": "jpRmFNPNNqkf" - }, - "source": [ - "When parsing plain text files, `group_broken_paragraphs` can be enabled to intelligently merge broken paragraphs by interpreting blank lines as true paragraph breaks." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "id": "HwnYBQ5l7rDM" + "id": "E8ockED4NxLi" }, - "outputs": [], "source": [ - "text = (\n", - " \"The big brown fox\\n\"\n", - " \"was walking down the lane.\\n\"\n", - " \"\\n\"\n", - " \"At the end of the lane,\\n\"\n", - " \"the fox met a bear.\"\n", - " )" + "For PowerPoint files, the `include_slide_notes` flag ensures that speaker notes from each slide are extracted and included in the output." 
] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "mutwZUFj720X", - "outputId": "87cd31c5-2f94-4777-9ea5-b6edf8277347" + "id": "fPCpk7RTGRjo", + "outputId": "a818ecd7-8580-4098-b30f-6e46b8ef6baa" }, "outputs": [ { @@ -1157,61 +1195,77 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|txt |\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|path |ppt |\n", + "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|file:/content/ppt-files/speaker-notes.pptx|[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|\n", + "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "text_df = Partition(group_broken_paragraphs=True).partition_text(text = text)\n", - "text_df.show(truncate=False)" + "partition_df = Partition(include_slide_notes = True).partition(\"./ppt-files/speaker-notes.pptx\")\n", + "partition_df.show(truncate=False)" ] }, { "cell_type": "markdown", "metadata": { - "id": "E8ockED4NxLi" + "id": "qRfRSGvhN303" }, "source": [ - "For PowerPoint files, the `include_slide_notes` flag ensures that speaker notes from each slide are extracted and included in the output." + "In Excel files, enabling `infer_table_structure` allows Partition to generate an HTML representation of table structures, useful for downstream parsing or display." 
] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 35, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "xF8F-5CP3qWY", - "outputId": "71b5e0cb-b22a-4774-a7b6-83c4fd67fadb" + "id": "twLdjGxZWiOJ", + "outputId": "8adcaa80-b02c-4e8f-8205-20efa8c40b4b" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "fake-power-point.pptx fake-power-point-table.pptx\n" + "Warning::Spark Session already created, some configs may not take.\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xls |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}, {HTML,
<table><tr><td>Date</td><td>Fri Jul 19 00:00:00 UTC 2024</td></tr><tr><td>Assets</td><td></td><td>Debts</td></tr><tr><td>Bank1</td><td>5865.43</td><td>Credit Card1</td><td>2000.0</td></tr><tr><td>Bank2</td><td>10140.19</td><td>Credit Card2</td><td>1500.0</td></tr><tr><td>Bank3</td><td>1200.0</td><td>Credit Card3</td><td>348.0</td></tr><tr><td>Bank4</td><td>1438.27</td><td>Total</td><td>SUM(F3:F5)</td></tr><tr><td>Total</td><td>SUM(B3:B6)</td></tr></table>
, {SheetName -> Sheet1}}]|\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" ] } ], "source": [ - "!ls ppt-files" + "partition_df = Partition(infer_table_structure = True).partition(\"./excel-files/page-break-example.xlsx\")\n", + "partition_df.select(\"xls\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8txswwbjN8Mg" + }, + "source": [ + "With Excel inputs, set `append_cells` to concatenate all cell values in a row into a single string instead of separating each cell individually." ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "fPCpk7RTGRjo", - "outputId": "74144c26-5060-4c99-f291-a097b838e774" + "id": "PQ4MpGw6xCko", + "outputId": "aaf807a7-27b9-40cc-8a75-58be077f8403" }, "outputs": [ { @@ -1219,38 +1273,64 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|path |ppt |\n", - "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|file:/content/ppt-files/speaker-notes.pptx|[{Title, Adding a Bullet Slide, {}}, {ListItem, • Find the bullet slide layout, {}}, {ListItem, – Use _TextFrame.text for first bullet, {}}, {ListItem, • Use _TextFrame.add_paragraph() for subsequent bullets, {}}, {NarrativeText, Here is a lot of text!, {}}, {NarrativeText, Here is some text in a text box!, {}}]|\n", - "+------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + 
"+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xls |\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{NarrativeText, a\\tb\\nc\\td\\te\\n- f\\na\\nb\\tc\\nd\\te\\na\\nb\\nc\\td\\ne\\tf\\na\\tb\\nc\\td\\n2. e\\na\\tb\\nc\\td\\ne\\nf\\na\\nb\\tc\\nd\\te\\nf\\na\\nb\\nc\\td\\ne\\tf\\ng\\na\\nb\\tc\\nd\\te\\nf\\ng\\na\\nb\\nc\\td\\ne\\tf\\ng\\nh\\na\\tb\\tc\\na\\nb\\tc\\td\\na\\tb\\tc\\nd\\ne, {SheetName -> Sheet1}}]|\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(include_slide_notes = True).partition(\"./ppt-files/speaker-notes.pptx\")\n", - "partition_df.show(truncate=False)" + "partition_df = Partition(append_cells = True).partition(\"./excel-files/xlsx-subtable-cases.xlsx\")\n", + "partition_df.select(\"xls\").show(truncate=False)" ] }, { "cell_type": "markdown", "metadata": { - "id": "qRfRSGvhN303" + "id": "_GyL6D4N75i-" }, "source": [ - "In Excel files, enabling `infer_table_structure` allows Partition to generate an HTML representation of table structures, useful for downstream parsing or display." + "## Partitioning Text Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jpRmFNPNNqkf" + }, + "source": [ + "When parsing plain text files, `group_broken_paragraphs` can be enabled to intelligently merge broken paragraphs by interpreting blank lines as true paragraph breaks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "id": "HwnYBQ5l7rDM" + }, + "outputs": [], + "source": [ + "text = (\n", + " \"The big brown fox\\n\"\n", + " \"was walking down the lane.\\n\"\n", + " \"\\n\"\n", + " \"At the end of the lane,\\n\"\n", + " \"the fox met a bear.\"\n", + " )" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "twLdjGxZWiOJ", - "outputId": "ec340358-7279-4247-b27c-5a0a25f38ee6" + "id": "mutwZUFj720X", + "outputId": "8b4f474d-2f3f-4e81-cecf-5de420561124" }, "outputs": [ { @@ -1258,38 +1338,47 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|xls |\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[{Title, Date\\tFri Jul 19 00:00:00 UTC 2024, {location -> (0, 1), SheetName -> Sheet1}}, {Title, Assets\\t\\tDebts, {location -> (1, 4), SheetName -> Sheet1}}, {NarrativeText, Bank1\\t5865.43\\tCredit Card1\\t2000.0, {location -> (2, 5), SheetName -> Sheet1}}, {NarrativeText, Bank2\\t10140.19\\tCredit Card2\\t1500.0, {location -> (3, 5), SheetName -> Sheet1}}, {NarrativeText, Bank3\\t1200.0\\tCredit Card3\\t348.0, {location -> (4, 5), SheetName -> Sheet1}}, {Title, Bank4\\t1438.27\\tTotal\\tSUM(F3:F5), {location -> (5, 5), SheetName -> Sheet1}}, {Title, 
Total\\tSUM(B3:B6), {location -> (6, 1), SheetName -> Sheet1}}, {HTML,
<table><tr><td>Date</td><td>Fri Jul 19 00:00:00 UTC 2024</td></tr>
<tr><td>Assets</td><td></td><td>Debts</td></tr>
<tr><td>Bank1</td><td>5865.43</td><td>Credit Card1</td><td>2000.0</td></tr>
<tr><td>Bank2</td><td>10140.19</td><td>Credit Card2</td><td>1500.0</td></tr>
<tr><td>Bank3</td><td>1200.0</td><td>Credit Card3</td><td>348.0</td></tr>
<tr><td>Bank4</td><td>1438.27</td><td>Total</td><td>SUM(F3:F5)</td></tr>
<tr><td>Total</td><td>SUM(B3:B6)</td></tr></table>
, {SheetName -> Sheet1}}]|\n", - "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|txt |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{NarrativeText, The big brown fox was walking down the lane., {paragraph -> 0}}, {NarrativeText, At the end of the lane, the fox met a bear., {paragraph -> 0}}]|\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "\n" ] } ], "source": [ - "partition_df = Partition(infer_table_structure = True).partition(\"./excel-files/page-break-example.xlsx\")\n", - "partition_df.select(\"xls\").show(truncate=False)" + "text_df = Partition(group_broken_paragraphs=True).partition_text(text = text)\n", + "text_df.show(truncate=False)" ] }, { "cell_type": "markdown", "metadata": { - "id": "8txswwbjN8Mg" + "id": "epCp5DnQ8E7o" }, "source": [ - "With Excel inputs, set `append_cells` to concatenate all cell values in a row into a single string instead of separating each cell individually." 
+ "## Partitioning XML Files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DWX0nkc4tM7J" + }, + "source": [ + "In Spark NLP 6.0.3 we added support for XML files" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 45, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "PQ4MpGw6xCko", - "outputId": "808783d2-f15b-45ae-90fb-a623243898f3" + "id": "AViMSzKQtP-o", + "outputId": "147a1ef9-3f14-4832-a050-e60c8ac9544b" }, "outputs": [ { @@ -1297,18 +1386,18 @@ "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|xls |\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", - "|[{NarrativeText, a\\tb\\nc\\td\\te\\n- f\\na\\nb\\tc\\nd\\te\\na\\nb\\nc\\td\\ne\\tf\\na\\tb\\nc\\td\\n2. e\\na\\tb\\nc\\td\\ne\\nf\\na\\nb\\tc\\nd\\te\\nf\\na\\nb\\nc\\td\\ne\\tf\\ng\\na\\nb\\tc\\nd\\te\\nf\\ng\\na\\nb\\nc\\td\\ne\\tf\\ng\\nh\\na\\tb\\tc\\na\\nb\\tc\\td\\na\\tb\\tc\\nd\\ne, {SheetName -> Sheet1}}]|\n", - "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xml |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> title}}, 
{Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> year}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> title}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> year}}]|\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" ] } ], "source": [ - "partition_df = Partition(append_cells = True).partition(\"./excel-files/xlsx-subtable-cases.xlsx\")\n", - "partition_df.select(\"xls\").show(truncate=False)" + "partition_df = Partition(xml_keep_tags = True).partition(\"./xml-files/multi-level.xml\")\n", + "partition_df.select(\"xml\").show(truncate=False)" ] } ], diff --git a/examples/python/data-preprocessing/SparkNLP_Partition_with_Chunking_Demo.ipynb b/examples/python/data-preprocessing/SparkNLP_Partition_with_Chunking_Demo.ipynb new file mode 100644 index 00000000000000..8bb4a05faf6d46 --- /dev/null +++ b/examples/python/data-preprocessing/SparkNLP_Partition_with_Chunking_Demo.ipynb @@ -0,0 +1,362 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "tzcU5p2gdak9" + }, + "source": [ + "# Introducing Partition with Semantic Chunking in Spark NLP\n", + "This notebook showcases the newly added `Partition` component in Spark NLP,\n", + "providing a streamlined and user-friendly interface for interacting with Spark NLP readers." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RFOFhaEedalB" + }, + "source": [ + "## Setup and Initialization\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "Support for **partitioning** files was introduced in Spark NLP 6.0.1.\n", + "\n", + "Chunking support was added in Spark NLP 6.0.3.\n", + "Please make sure you have upgraded to the latest Spark NLP release.\n", + "\n", + "For the local files example, we will download a couple of files from the Spark NLP GitHub repo:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ATDLz3Gws5ob" + }, + "source": [ + "**Downloading Files**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "g7PMCOJo0ZlU" + }, + "outputs": [], + "source": [ + "!mkdir txt-files\n", + "!mkdir html-files" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AV-krG6Ps8pq", + "outputId": 
"ea4c2484-6e83-4a7a-a000-537f38189ed0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-06-06 15:19:01-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1125-Implement-Chunking-Strategies/src/test/resources/reader/txt/long-text.txt\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1032 (1.0K) [text/plain]\n", + "Saving to: ‘txt-files/long-text.txt’\n", + "\n", + "long-text.txt 100%[===================>] 1.01K --.-KB/s in 0s \n", + "\n", + "2025-06-06 15:19:01 (58.1 MB/s) - ‘txt-files/long-text.txt’ saved [1032/1032]\n", + "\n", + "--2025-06-06 15:19:01-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1125-Implement-Chunking-Strategies/src/test/resources/reader/html/fake-html.html\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 665 [text/plain]\n", + "Saving to: ‘html-files/fake-html.html’\n", + "\n", + "fake-html.html 100%[===================>] 665 --.-KB/s in 0s \n", + "\n", + "2025-06-06 15:19:02 (26.7 MB/s) - ‘html-files/fake-html.html’ saved [665/665]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1125-Implement-Chunking-Strategies/src/test/resources/reader/txt/long-text.txt -P txt-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1125-Implement-Chunking-Strategies/src/test/resources/reader/html/fake-html.html -P html-files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EoFI66NAdalE" + }, + "source": [ + "## Partitioning Documents with Chunking\n", + "Use the `basic` chunking to segment data into coherent chunks based on character limits" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bAkMjJ1vdalE", + "outputId": "75831f62-c84a-4170-f87e-e70a6c1ef39d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ], + "source": [ + "from sparknlp.partition.partition import Partition\n", + "\n", + "partition_df = Partition(content_type = \"text/plain\", chunking_strategy = \"basic\").partition(\"./txt-files/long-text.txt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k6uvYxiVzGsG" + }, + "source": [ + "Output without `basic` chunk:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3L-Tp017qgqb", + "outputId": "98af5f84-5abc-4554-bab7-7dd9c5212612" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|col |\n", + "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|Ukrainian forces reportedly advanced in the western Donetsk-eastern Zaporizhia Oblast border area and in western Zaporizhia Oblast amid Ukrainian counteroffensive operations in southern and eastern Ukraine. 
Tavriisk Group of Forces Spokesperson Oleksandr Shtupun reported that Ukrainian forces are advancing in the directions of Novoprokopivka (13km south of Orikhiv), Mala Tokmachka (9km southeast of Orikhiv), and Ocheretuvate (25km southeast of Orikhiv) in western Zaporizhia Oblast.[1] Shtupun also stated that Ukrainian forces advanced near Urozhaine (9km south of Velyka Novosilka) and Robotyne (10km south of Orikhiv) and achieved unspecified successes near Staromayorske (9km south of Velyka Novosilka) in the Berdyansk direction (western Donetsk-eastern Zaporizhia Oblast border area) and in an unspecified location in the Melitopol direction (western Zaporizhia Oblast).[2] Ukrainian Eastern Group of Forces Spokesperson Ilya Yevlash stated that Ukrainian forces continued offensive operations in the Bakhmut direction.[3]|\n", + "+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import explode, col\n", + "\n", + "result_df = partition_df.select(explode(col(\"txt.content\")))\n", + "result_df.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EQJvQsnxzRg1" + }, + "source": [ + "Output with `basic` chunk:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VlhnXCV5qr4J", + "outputId": "cdaf98f1-3109-4770-adaf-f51c80a59ab9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|col |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|Ukrainian forces 
reportedly advanced in the western Donetsk-eastern Zaporizhia Oblast border area and in western Zaporizhia Oblast amid Ukrainian counteroffensive operations in southern and eastern Ukraine. Tavriisk Group of Forces Spokesperson Oleksandr Shtupun reported that Ukrainian forces are advancing in the directions of Novoprokopivka (13km south of Orikhiv), Mala Tokmachka (9km southeast of Orikhiv), and Ocheretuvate (25km southeast of Orikhiv) in western Zaporizhia Oblast.[1] Shtupun|\n", + "|also stated that Ukrainian forces advanced near Urozhaine (9km south of Velyka Novosilka) and Robotyne (10km south of Orikhiv) and achieved unspecified successes near Staromayorske (9km south of Velyka Novosilka) in the Berdyansk direction (western Donetsk-eastern Zaporizhia Oblast border area) and in an unspecified location in the Melitopol direction (western Zaporizhia Oblast).[2] Ukrainian Eastern Group of Forces Spokesperson Ilya Yevlash stated that Ukrainian forces continued offensive |\n", + "|operations in the Bakhmut direction.[3] |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "result_df = partition_df.select(explode(col(\"chunks.content\")))\n", + "result_df.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4YYTB7G6zbmN" + }, + "source": [ + "Use `by_title` chunking to group sections in documents with headings, tables, and mixed semantic elements" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PxTf0Ot23ZaO", + "outputId": "9b02a493-b4d0-41fc-c5ee-9ed8ab2de194" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n" + ] + } + ], + "source": [ + "partition_df = Partition(content_type = \"text/html\", chunking_strategy = \"by_title\", combineTextUnderNChars = 50).partition(\"./html-files/fake-html.html\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YXMf3cBfz_2-" + }, + "source": [ + "Output without `by_title` chunk:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "O-_R-W86sFo-", + "outputId": "6f07e491-c556-41af-89da-273e905d0e8b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------------------------------------------------------------------------------------+\n", + "|col |\n", + "+----------------------------------------------------------------------------------------------------------------------------------+\n", + "|My First Heading |\n", + "|My Second Heading |\n", + "|My first paragraph. lorem ipsum dolor set amet. 
if the cow comes home under the sun how do you fault the cow for it's worn hooves?|\n", + "|A Third Heading |\n", + "|Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2 |\n", + "+----------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "result_df = partition_df.select(explode(col(\"html.content\")))\n", + "result_df.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EhLOvpfe0JIe" + }, + "source": [ + "Output with `by_title` chunk:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WhSWaeYGrvP-", + "outputId": "8f5da326-029c-4ad6-c201-c5d2f2f8fa7d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|col |\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|My First Heading My Second Heading My first paragraph. lorem ipsum dolor set amet. if the cow comes home under the sun how do you fault the cow for it's worn hooves? A Third Heading|\n", + "|Column 1 Column 2 Row 1, Cell 1 Row 1, Cell 2 Row 2, Cell 1 Row 2, Cell 2 |\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "result_df = partition_df.select(explode(col(\"chunks.content\")))\n", + "result_df.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BB2FEfegGuxl" + }, + "source": [ + "You can also use distributed file systems (DFS) like:\n", + "- Databricks: `dbfs://`\n", + "- HDFS: `hdfs://`\n", + "- Microsoft Fabric OneLake: `abfss://`" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/python/data-preprocessing/SparkNLP_RAG_Demo_with_PartitionTransformer_and_Semantic_Chunking.ipynb b/examples/python/data-preprocessing/SparkNLP_RAG_Demo_with_PartitionTransformer_and_Semantic_Chunking.ipynb new file mode 100644 index 00000000000000..5155d7f55a49d5 --- /dev/null +++ b/examples/python/data-preprocessing/SparkNLP_RAG_Demo_with_PartitionTransformer_and_Semantic_Chunking.ipynb @@ -0,0 +1,622 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "tzcU5p2gdak9" + }, + "source": [ + "# Introducing Chunking in the PartitionTransformer in Spark NLP\n", + "This notebook demonstrates how to use **Spark NLP's PartitionTransformer** for\n", + " chunking documents, enabling efficient text segmentation.\n", + "\n", + "We further showcase a practical application of this chunking strategy in the context of **Retrieval-Augmented Generation (RAG)**.\n", + "\n", + "We can 
use this powerful method to enhance the performance of large language models by supplying context-relevant information from a knowledge base." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3y_JC9AmJtYr" + }, + "source": [ + "Creating Files" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "bo7s-jZVrE7W" + }, + "outputs": [], + "source": [ + "!echo -e \"Introduction: RAG stands for Retrieval-Augmented Generation. Why RAG? It improves factual accuracy and adds fresh or private data to LLMs. Chunking: Breaks documents into pieces so they can be embedded. Semantic Chunking: Focus on respecting document structure like sections. Summary: RAG is powerful when paired with good chunking!\" > rag_intro.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "lkJ-P8-50Nhy" + }, + "outputs": [], + "source": [ + "!echo -e \"Tomatoes grow best in warm weather with plenty of sun. It's important to water them regularly and use nutrient-rich soil. They are typically planted after the last frost and harvested in late summer.\" > tomatoes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ay-nZLk_J0C4", + "outputId": "983de5e8-7ee8-434f-c4e2-7e742f97f189" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Introduction: RAG stands for Retrieval-Augmented Generation. Why RAG? It improves factual accuracy and adds fresh or private data to LLMs. Chunking: Breaks documents into pieces so they can be embedded. Semantic Chunking: Focus on respecting document structure like sections. Summary: RAG is powerful when paired with good chunking!\n" + ] + } + ], + "source": [ + "!cat rag_intro.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YmCMs_uU0Qkm", + "outputId": "55c22d57-c1ff-4628-b410-9ef322820dec" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tomatoes grow best in warm weather with plenty of sun. It's important to water them regularly and use nutrient-rich soil. 
They are typically planted after the last frost and harvested in late summer.\n" + ] + } + ], + "source": [ + "!cat tomatoes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "FpiTDDMx0Rx-" + }, + "outputs": [], + "source": [ + "!mkdir txt-data\n", + "!cp rag_intro.txt txt-data/rag_intro.txt\n", + "!cp tomatoes.txt txt-data/tomatoes.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only run this cell when you are using Spark NLP on Google Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import Spark NLP\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "import sparknlp\n", + "\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EoFI66NAdalE" + }, + "source": [ + "## Partitioning Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nluIcWMbM_rx" + }, + "source": [ + "Partition Transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mWnypHRwXruC", + "outputId": "a2a8e50b-dcf2-423b-94fe-1c61fa7deda2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+--------------------+--------------------+\n", + "| path| content| text| chunks|\n", + "+--------------------+--------------------+--------------------+--------------------+\n", + "|file:/content/txt...|Tomatoes grow bes...|[{NarrativeText, ...|[{document, 0, 19...|\n", + "|file:/content/txt...|Introduction: RAG...|[{NarrativeText, ...|[{document, 0, 33...|\n", + "+--------------------+--------------------+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.ml import Pipeline\n", + "from sparknlp.partition.partition_transformer import *\n", + "\n", + "empty_df = spark.createDataFrame([], \"string\").toDF(\"text\")\n", + "\n", + "partition_transformer = PartitionTransformer() \\\n", + " .setInputCols([\"text\"]) \\\n", + " .setContentType(\"text/plain\") \\\n", + " .setContentPath(\"./txt-data\") \\\n", + " .setOutputCol(\"chunks\") \\\n", + " .setChunkingStrategy(\"basic\") \\\n", + " .setMaxCharacters(140)\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " partition_transformer\n", + "])\n", + "\n", + "pipeline_model = pipeline.fit(empty_df)\n", + "result_df = pipeline_model.transform(empty_df)\n", + "\n", + "result_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EFMhyfnc_g1V", + "outputId": "57befaf7-91af-40b3-acca-9b67c623543a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|chunks |\n", + 
"+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{document, 0, 198, Tomatoes grow best in warm weather with plenty of sun. It's important to water them regularly and use nutrient-rich soil. They are typically planted after the last frost and harvested in late summer., {paragraph -> 0}, []}] |\n", + "|[{document, 0, 331, Introduction: RAG stands for Retrieval-Augmented Generation. Why RAG? It improves factual accuracy and adds fresh or private data to LLMs. Chunking: Breaks documents into pieces so they can be embedded. Semantic Chunking: Focus on respecting document structure like sections. Summary: RAG is powerful when paired with good chunking!, {paragraph -> 0}, []}]|\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "result_df.select(\"chunks\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gBNYByJ5Bqq6" + }, + "source": [ + "RAG Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "W7LLHf_0BrtQ", + "outputId": "2e6e3577-044b-4c01-84a2-6d80d73e2a58" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "small_bert_L2_768 download started this may take some time.\n", + "Approximate size to download 135.3 MB\n", + "[OK!]\n" + ] + } + ], + "source": [ + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols([\"chunks\"]) \\\n", + " .setOutputCol(\"token\")\n", + "\n", + "bert_embeddings = BertEmbeddings.pretrained() \\\n", + " .setInputCols([\"chunks\", \"token\"]) \\\n", + " .setOutputCol(\"embeddings\")\n", + "\n", + "sentence_embeddings = SentenceEmbeddings() \\\n", + " .setInputCols([\"chunks\", \"embeddings\"]) \\\n", + " .setOutputCol(\"sentence_embeddings\") \\\n", + " .setPoolingStrategy(\"AVERAGE\")\n", + "\n", + "finisher = EmbeddingsFinisher().setInputCols([\"sentence_embeddings\"]).setOutputCols([\"finished_sentence_embeddings\"]).setOutputAsVector(True)\n", + "\n", + "rag_pipeline = Pipeline(stages=[\n", + " partition_transformer,\n", + " tokenizer,\n", + " bert_embeddings,\n", + " sentence_embeddings,\n", + " finisher\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sY3fW-93CL2J" + }, + "source": [ + "Embed a Knowledge Base" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "LR0E1EdjCEjS" + }, + "outputs": [], + "source": [ + "rag_model = rag_pipeline.fit(empty_df)\n", + "kb_df = rag_model.transform(empty_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fKfZCpqLl5WZ", + "outputId": "38e64d2b-ab95-4beb-e6fb-c4b0ce65d654" + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "| path| content| text| chunks| token| embeddings| sentence_embeddings|finished_sentence_embeddings|\n", + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "|file:/content/txt...|Tomatoes grow bes...|[{NarrativeText, ...|[{document, 0, 19...|[{token, 0, 7, To...|[{word_embeddings...|[{sentence_embedd...| [[0.6935687065124...|\n", + "|file:/content/txt...|Introduction: RAG...|[{NarrativeText, ...|[{document, 0, 33...|[{token, 0, 11, I...|[{word_embeddings...|[{sentence_embedd...| [[0.5774036645889...|\n", + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "\n" + ] + } + ], + "source": [ + "kb_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-IhJqVfU2HJj", + "outputId": "9ac8bb5f-cc84-40fe-b181-95083baa3c25" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|chunks |\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{document, 0, 198, Tomatoes grow best in warm weather with plenty of sun. It's important to water them regularly and use nutrient-rich soil. They are typically planted after the last frost and harvested in late summer., {paragraph -> 0}, []}] |\n", + "|[{document, 0, 331, Introduction: RAG stands for Retrieval-Augmented Generation. Why RAG? It improves factual accuracy and adds fresh or private data to LLMs. Chunking: Breaks documents into pieces so they can be embedded. Semantic Chunking: Focus on respecting document structure like sections. 
Summary: RAG is powerful when paired with good chunking!, {paragraph -> 0}, []}]|\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "kb_df.select(\"chunks\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "t6d9Za6jbdqF" + }, + "source": [ + "Next, we prepare the output of the Spark NLP RAG pipeline by aligning each chunk of text with its embedding vector." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "OZsD7pfZm0br" + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import posexplode, monotonically_increasing_id\n", + "from pyspark.ml.functions import vector_to_array\n", + "\n", + "kb_df = kb_df.withColumn(\"doc_id\", monotonically_increasing_id())\n", + "exploded_chunks = kb_df.selectExpr(\"doc_id\", \"chunks.result as chunks\") \\\n", + "    .select(posexplode(\"chunks\").alias(\"pos\", \"chunk_text\"), \"doc_id\")\n", + "\n", + "exploded_vectors = kb_df.selectExpr(\"doc_id\", \"finished_sentence_embeddings as vectors\") \\\n", + "    .select(posexplode(\"vectors\").alias(\"pos\", \"vector\"), \"doc_id\")\n", + "\n", + "aligned_df = exploded_chunks.join(exploded_vectors, on=[\"doc_id\", \"pos\"]).select(\"doc_id\", \"chunk_text\", \"vector\")\n", + "\n", + "aligned_df = aligned_df.withColumn(\"vector\", vector_to_array(\"vector\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uMelNiiiHfrU", + "outputId": "aa123f33-2522-458a-905c-cd66266f25cf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+--------------------+--------------------+\n", + "|doc_id| chunk_text| vector|\n", + "+------+--------------------+--------------------+\n", + "| 0|Tomatoes grow bes...|[0.69356870651245...|\n", + "| 1|Introduction: RAG...|[0.57740366458892...|\n", + "+------+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "aligned_df_clean = aligned_df.select(\"doc_id\", \"chunk_text\", \"vector\").cache()\n", + "aligned_df_clean.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UuyM3NdN4ttf" + }, + "source": [ + "Query Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a1HVp-g34z6g", + "outputId": "b67ec752-65d8-497e-ea90-4eabd72eaadd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "small_bert_L2_768 download started this may take some time.\n", + "Approximate size to download 135.3 MB\n", + "[OK!]\n" + ] + } + ], + "source": [ + "document_assembler = DocumentAssembler() \\\n", + "    .setInputCol(\"text\") \\\n", + "    .setOutputCol(\"document\")\n", + "\n", + "sentence_detector = SentenceDetector() \\\n", + "    .setInputCols([\"document\"]) \\\n", + "    .setOutputCol(\"sentence\")\n", + "\n", + "tokenizer = Tokenizer() \\\n", + "    .setInputCols([\"sentence\"]) \\\n", + "    .setOutputCol(\"token\")\n", + "\n", + "bert_embeddings = BertEmbeddings.pretrained() \\\n", + "    .setInputCols([\"sentence\", \"token\"]) \\\n", + " 
.setOutputCol(\"embeddings\")\n", + "\n", + "sentence_embeddings = SentenceEmbeddings() \\\n", + " .setInputCols([\"sentence\", \"embeddings\"]) \\\n", + " .setOutputCol(\"sentence_embeddings\") \\\n", + " .setPoolingStrategy(\"AVERAGE\")\n", + "\n", + "query_pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " sentence_detector,\n", + " tokenizer,\n", + " bert_embeddings,\n", + " sentence_embeddings,\n", + " finisher\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "Numk3cjdoRI3" + }, + "outputs": [], + "source": [ + "query = \"What is semantic chunking?\"\n", + "query_df = spark.createDataFrame([[query]]).toDF(\"text\")\n", + "query_model = query_pipeline.fit(query_df)\n", + "# query_model = rag_pipeline.fit(query_df)\n", + "query_result = query_model.transform(query_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Kv_mpg-n4cvi", + "outputId": "28f4bcfc-3292-4fba-f274-59090bf423ac" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "| text| document| sentence| token| embeddings| sentence_embeddings|finished_sentence_embeddings|\n", + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "|What is semantic ...|[{document, 0, 25...|[{document, 0, 25...|[{token, 0, 3, Wh...|[{word_embeddings...|[{sentence_embedd...| [[0.3536282181739...|\n", + "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------------+\n", + "\n" + ] + } + ], + "source": [ + "query_result.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "JqfkKYkXoYd8" + }, + "outputs": [], + "source": [ + "query_vector = query_result.select(\"finished_sentence_embeddings\").first()[0][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "id": "LvP5QoaSoEZv" + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf, col\n", + "from pyspark.sql.types import FloatType\n", + "import numpy as np\n", + "\n", + "def cosine_sim(vec1, vec2):\n", + " v1, v2 = np.array(vec1), np.array(vec2)\n", + " return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))\n", + "\n", + "# Register UDF\n", + "cosine_sim_udf = udf(lambda v: cosine_sim(v, query_vector), FloatType())\n", + "\n", + "# Add similarity score to each chunk\n", + "scored_chunks = aligned_df_clean.withColumn(\"similarity\", cosine_sim_udf(col(\"vector\"))) \\\n", + " .orderBy(col(\"similarity\").desc())" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "__Db-4tpJz6N", + "outputId": "55bf0969-b9fa-4fda-feea-6fd75a9e8804" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------+--------------------+--------------------+----------+\n", + "|doc_id| chunk_text| vector|similarity|\n", + "+------+--------------------+--------------------+----------+\n", + "| 1|Introduction: RAG...|[0.57740366458892...|0.61944675|\n", + "| 0|Tomatoes grow bes...|[0.69356870651245...| 0.2762234|\n", + 
"+------+--------------------+--------------------+----------+\n", + "\n" + ] + } + ], + "source": [ + "scored_chunks.show()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/python/reader/SparkNLP_PowerPoint_Reader_Demo.ipynb b/examples/python/reader/SparkNLP_PowerPoint_Reader_Demo.ipynb index b70c0ac889c7b1..fcbe884c96baf9 100644 --- a/examples/python/reader/SparkNLP_PowerPoint_Reader_Demo.ipynb +++ b/examples/python/reader/SparkNLP_PowerPoint_Reader_Demo.ipynb @@ -109,7 +109,7 @@ }, "source": [ "## Parsing PowerPoint slides from Local Files\n", - "Use the `ppt()` method to parse Excel content from local directories." + "Use the `ppt()` method to parse PowerPoint content from local directories." ] }, { diff --git a/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb b/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb new file mode 100644 index 00000000000000..38b43aed37b95e --- /dev/null +++ b/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb @@ -0,0 +1,339 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "tzcU5p2gdak9" + }, + "source": [ + "# Introducing XML reader in SparkNLP\n", + "This notebook showcases the newly added `sparknlp.read().xml()` method in Spark NLP that parses XML content from both local files and real-time URLs into a Spark DataFrame.\n", + "\n", + "**Key Features:**\n", + "- Ability to parse XML from local directories and URLs.\n", + "- Versatile support for varied data ingestion scenarios." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RFOFhaEedalB" + }, + "source": [ + "## Setup and Initialization\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "Support for reading xml files was introduced in Spark NLP 6.1.0. Please make sure you have upgraded to the latest Spark NLP release." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y3hWfT5q-npM" + }, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "u3ORYVyb-pRI" + }, + "outputs": [], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oIbFQyEo-tat" + }, + "source": [ + "For local files example we will download a couple of XML files from Spark NLP Github repo:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ya8qZe00dalC", + "outputId": "7d597910-9826-4472-9fdc-5b8ac398e6cf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-06-09 21:43:40-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/multi-level.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 538 [text/plain]\n", + "Saving to: ‘xml-files/multi-level.xml’\n", + "\n", + "\r", + "multi-level.xml 0%[ ] 0 --.-KB/s \r", + "multi-level.xml 100%[===================>] 538 --.-KB/s in 0s \n", + "\n", + "2025-06-09 21:43:40 (34.0 MB/s) - ‘xml-files/multi-level.xml’ saved [538/538]\n", + "\n", + "--2025-06-09 21:43:40-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1119-Implement-XML-Reader/src/test/resources/reader/xml/test.xml\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 382 [text/plain]\n", + "Saving to: ‘xml-files/test.xml’\n", + "\n", + "test.xml 100%[===================>] 382 --.-KB/s in 0s \n", + "\n", + "2025-06-09 21:43:40 (7.58 MB/s) - ‘xml-files/test.xml’ saved [382/382]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir xml-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/multi-level.xml -P xml-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/xml/test.xml -P xml-files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EoFI66NAdalE" + }, + "source": [ + "## Parsing XML from Local Files\n", + "Use the `xml()` method to parse XML content from local directories." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bAkMjJ1vdalE", + "outputId": "0bba10be-75de-48de-9a06-d6197d35218f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+--------------------+--------------------+\n", + "| path| xml|\n", + "+--------------------+--------------------+\n", + "|file:/content/xml...|[{Title, Harry Po...|\n", + "|file:/content/xml...|[{Title, The Alch...|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "xml_df = sparknlp.read().xml(\"./xml-files\")\n", + "\n", + "xml_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oBj0cHPXSD1m", + "outputId": "00951736-40d4-4f9e-fe25-cc5117405269" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- path: string (nullable = true)\n", + " |-- xml: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- elementType: string (nullable = true)\n", + " | | |-- content: string (nullable = true)\n", + " | | |-- metadata: map (nullable = true)\n", + " | | | |-- key: string\n", + " | | | |-- value: string (valueContainsNull = true)\n", + "\n" + ] + } + ], + "source": [ + "xml_df.printSchema()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FrVKxdySz8pR" + }, + "source": [ + "### Configuration Parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CC_klLwhV8um" + }, + "source": [ + "`xmlKeepTags`: When true, includes the tag name of 
each XML element in the metadata under the key `tag`." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aNfN0fQC0Vzz", + "outputId": "ebdb1393-b91c-4c60-d7e7-b7ecc6465171" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xml |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> title}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> author}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> year}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc, tag -> price}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> title}}, {Title, Erik T. 
Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> author}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> year}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46, tag -> price}}]|\n", + "|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> title}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, tag -> year}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> title}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> author}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b, tag -> year}}] |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "params = {\"xmlKeepTags\": \"true\"}\n", + "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n", + "xml_df.select(\"xml\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "t06KtTItWQ4R" + }, + "source": [ + "`onlyLeafNodes`: When true, includes only leaf elements (i.e., elements with no child elements) in the output. When false, all elements (including containers) are included." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jTM1btqNntUL", + "outputId": "f86a0b28-73ac-46d1-8d26-f920e2d935cd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xml |\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{UncategorizedText, , {elementId -> 931f811d0c9b488a01a7875f80992a62}}, {UncategorizedText, , {elementId -> 
1f610d9429ab17d0d7ab49ee3069b4fc, parentId -> 931f811d0c9b488a01a7875f80992a62}}, {Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, J K. Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, , {elementId -> 249aff1b3e9835325b45e51cdfc4ad46, parentId -> 931f811d0c9b488a01a7875f80992a62}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}] |\n", + "|[{UncategorizedText, , {elementId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> d7416d9cac3ba3af57ef6b6b71d7841b, parentId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> b79ae4ca74ec00f63a00b6cd66acc1e0, parentId -> d7416d9cac3ba3af57ef6b6b71d7841b}}, {UncategorizedText, , {elementId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a, parentId -> b79ae4ca74ec00f63a00b6cd66acc1e0}}, {Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, , {elementId -> 9ebecf846e7dea80c563ebcb2f7d4a9a, parentId -> 8f4f71ddf1b6429fbec582add2cb963f}}, {UncategorizedText, , {elementId -> 80472cd1880f453b8adecc61870748ba, parentId -> 9ebecf846e7dea80c563ebcb2f7d4a9a}}, {UncategorizedText, , {elementId -> 9708b29025b53d9f54c723ee005b647b, parentId -> 80472cd1880f453b8adecc61870748ba}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b}}]|\n", + 
"+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "params = {\"onlyLeafNodes\": \"false\"}\n", + "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n", + "xml_df.select(\"xml\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O8DePUq8nkYm" + }, + "source": [ + "You can access the raw content of the file using the `storeContent` parameter" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E0S5aRb5WFLf", + "outputId": "5e624eeb-fbc1-47a4-ff21-aef410a10bb2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + 
"+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|path |content |xml |\n", + "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|file:/content/xml-files/test.xml |\\n \\n Harry Potter\\n J K. Rowling\\n 2005\\n 29.99\\n \\n \\n Learning XML\\n Erik T. Ray\\n 2003\\n 39.95\\n \\n |[{Title, Harry Potter, {elementId -> 42962e493b50acee6acdd7851128bbb3, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, J K. 
Rowling, {elementId -> 28f300ecb3ddf2a297416caf0b936a15, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 2005, {elementId -> 1486c560869e6720e2668f318be8c4b0, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {UncategorizedText, 29.99, {elementId -> 52f0aebb3d4d4d08290edd1b6016ec2a, parentId -> 1f610d9429ab17d0d7ab49ee3069b4fc}}, {Title, Learning XML, {elementId -> 26f1538c947d0c13d84679137dd718d6, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {Title, Erik T. Ray, {elementId -> 3b7e3c115d8f5d645d739fcf961ceef4, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 2003, {elementId -> 98e22aa418bbc4eec79d7abf6d43ef71, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}, {UncategorizedText, 39.95, {elementId -> 2758d8ea75e72394c27bbe4b8feba4f7, parentId -> 249aff1b3e9835325b45e51cdfc4ad46}}]|\n", + "|file:/content/xml-files/multi-level.xml|\\n
\\n \\n \\n The Alchemist\\n Paulo Coelho\\n 1988\\n \\n \\n
\\n
\\n \\n \\n A Brief History of Time\\n Stephen Hawking\\n 1988\\n \\n \\n
\\n
\\n|[{Title, The Alchemist, {elementId -> 3f0b15f67f42de56d13e76244399ff1b, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, Paulo Coelho, {elementId -> c36286e42e975f08e839ed574509626c, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 14b03a92e8c7cf57ee62bfcdeadb1e6a}}, {Title, A Brief History of Time, {elementId -> 1aa35512b27fd41a8f8f9cf58c10f46e, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {Title, Stephen Hawking, {elementId -> 7877d555703011ffc6f0b9abbf1f8355, parentId -> 9708b29025b53d9f54c723ee005b647b}}, {UncategorizedText, 1988, {elementId -> 2337fd4aef45764877639e9363feacd7, parentId -> 9708b29025b53d9f54c723ee005b647b}}] |\n", + "+---------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "params = {\"storeContent\": \"true\"}\n", + "xml_df = sparknlp.read(params).xml(\"./xml-files\")\n", + "xml_df.show(truncate=False)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_E5VEmbeddings.ipynb b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_E5VEmbeddings.ipynb new file mode 100644 index 00000000000000..a0757f51a05f0e --- /dev/null +++ b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_E5VEmbeddings.ipynb @@ -0,0 +1,1530 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c9c512f5", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_E5VEmbeddings.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "860a240a", + "metadata": {}, + "source": [ + "# Import OpenVINO E5-V models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "This notebook provides a detailed walkthrough on optimizing and importing E5-V models from HuggingFace for use in Spark NLP, with the [Intel OpenVINO toolkit](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html). The focus is on converting the model to the OpenVINO format and applying precision optimizations (INT8 and INT4) to enhance performance and efficiency on CPU platforms using [Optimum Intel](https://huggingface.co/docs/optimum/main/en/intel/inference).\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- OpenVINO support was introduced in `Spark NLP 5.4.0`, enabling high-performance CPU inference for models. So please make sure you have upgraded to the latest Spark NLP release.\n", + "- Model quantization is a computationally expensive process, so it is recommended to use a runtime with more than 32GB of memory for exporting the quantized model from HuggingFace.\n", + "- You can import E5-V models via `E5VEmbeddings`. The default model, `royokong/e5-v`, is fine-tuned from `lmms-lab/llama3-llava-next-8b` for universal multimodal embeddings.\n", + "- Reference: [royokong/e5-v](https://huggingface.co/royokong/e5-v)\n", + "- Some [example models](https://huggingface.co/models?search=e5-v)" + ] + }, + { + "cell_type": "markdown", + "id": "100a6911", + "metadata": {}, + "source": [ + "## 1. Export and Save the HuggingFace model\n", + "\n", + "- Let's install `transformers` and `openvino` packages with other dependencies. You don't need `openvino` installed to run Spark NLP; however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` to version `4.41.2`. This doesn't mean it won't work with future releases, but we wanted you to know which versions have been tested successfully." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "902635c5", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "529ad224", + "metadata": {}, + "outputs": [], + "source": [ + "# Install OpenVINO and NNCF for model optimization\n", + "import platform\n", + "\n", + "%pip install -q \"einops\" \"torch>2.1\" \"torchvision\" \"matplotlib>=3.4\" \"timm>=0.9.8\" \"transformers==4.41.2\" \"pillow\" \"gradio>=4.19\" --extra-index-url https://download.pytorch.org/whl/cpu\n", + "%pip install -q -U --pre \"openvino>=2025.0\" \"openvino-tokenizers>=2025.0\" \"openvino-genai>=2025.0\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n", + "%pip install -q \"accelerate\" \"nncf>=2.14.0\" \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu\n", + "\n", + "if platform.system() == \"Darwin\":\n", + " %pip install -q \"numpy<2.0.0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3997e780", + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg -O dog.jpg" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c1623528", + "metadata": {}, + "outputs": [], + "source": [ + "model_id = \"royokong/e5-v\"\n", + "output_dir = f\"./models/int4/{model_id}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "46678a0b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "Loading checkpoint shards: 100%|██████████| 4/4 [01:20<00:00, 20.18s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "111" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration\n", + "import torch\n", + "import gc\n", + "\n", + "processor = LlavaNextProcessor.from_pretrained(model_id)\n", + "image_encoder_model, input_embedding_model, language_model = None, None, None\n", + "\n", + "\n", + "class ImageEncoder(torch.nn.Module):\n", + " def __init__(self, config, vision_tower, multi_modal_projector):\n", + " super().__init__()\n", + " self.config = config\n", + " self.vision_tower = vision_tower\n", + " self.multi_modal_projector = multi_modal_projector\n", + "\n", + " def forward(self, pixel_values):\n", + " batch_size, num_patches, num_channels, height, width = pixel_values.shape\n", + " reshaped_pixel_values = pixel_values.view(\n", + " batch_size * num_patches, num_channels, height, width\n", + " )\n", + " image_features = self.vision_tower(\n", + " reshaped_pixel_values, output_hidden_states=True\n", + " )\n", + " selected_image_feature = image_features.hidden_states[\n", + " self.config.vision_feature_layer\n", + " ]\n", + " if self.config.vision_feature_select_strategy == \"default\":\n", + " selected_image_feature = selected_image_feature[:, 1:]\n", + " elif self.config.vision_feature_select_strategy == \"full\":\n", + " selected_image_feature = selected_image_feature\n", + " image_features = self.multi_modal_projector(selected_image_feature)\n", + " return image_features\n", + "\n", + "\n", + "model = LlavaNextForConditionalGeneration.from_pretrained(\n", + " model_id, low_cpu_mem_usage=True\n", + ")\n", + "model.config.save_pretrained(output_dir)\n", + "image_encoder_model = ImageEncoder(\n", + " model.config, model.vision_tower, model.multi_modal_projector\n", + ")\n", + "input_embedding_model = model.get_input_embeddings()\n", + "language_model = model.language_model\n", + "del model\n", + "gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1908bc09", + "metadata": {}, + "outputs": [], + "source": [ + "import openvino as ov\n", + "from pathlib import Path\n", + "\n", + "core = ov.Core()\n", + "device = \"CPU\"\n", + "# Output directory for the converted OpenVINO models\n", + "output_dir = f\"./models/int4/{model_id}\"\n", + "output_dir = Path(output_dir)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2341d4b8", + "metadata": {}, + "outputs": [], + "source": [ + "IMAGE_ENCODER_PATH = output_dir / \"openvino_vision_embeddings_model.xml\"\n", + "LANGUAGE_MODEL_PATH = output_dir / \"openvino_language_model.xml\"\n", + "INPUT_EMBEDDING_PATH = output_dir / \"openvino_text_embeddings_model.xml\"\n", + "\n", + "IMAGE_PACKER_PATH = output_dir / \"openvino_image_packer.xml\"\n", + "MULTIMODAL_MERGER_PATH = output_dir / \"openvino_multimodal_merger.xml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6a0e77cd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/transformers/modeling_utils.py:4481: 
FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/transformers/models/clip/modeling_clip.py:276: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):\n", + "/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/transformers/models/clip/modeling_clip.py:316: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):\n" + ] + }, + { + "data": { + "text/plain": [ + "7397" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "import openvino as ov\n", + "import gc\n", + "\n", + "\n", + "def cleanup_torchscript_cache():\n", + " \"\"\"\n", + " Helper for removing cached model representation\n", + " \"\"\"\n", + " torch._C._jit_clear_class_registry()\n", + " torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()\n", + " torch.jit._state._clear_class_state()\n", + "\n", + "\n", + "if not IMAGE_ENCODER_PATH.exists():\n", + " ov_image_encoder = ov.convert_model(\n", + " image_encoder_model, example_input=torch.zeros((1, 5, 3, 336, 336))\n", + " )\n", + " ov.save_model(ov_image_encoder, IMAGE_ENCODER_PATH)\n", + " del ov_image_encoder\n", + " cleanup_torchscript_cache()\n", + "\n", + "del image_encoder_model\n", + "gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0147d547", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "117" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "llm_input = None\n", + "\n", + "llm_input = input_embedding_model(torch.ones((2, 2), dtype=torch.int64))\n", + "\n", + "if not INPUT_EMBEDDING_PATH.exists():\n", + " ov_input_embeddings_model = ov.convert_model(\n", + " input_embedding_model, example_input=torch.ones((2, 2), dtype=torch.int64)\n", + " )\n", + " ov.save_model(ov_input_embeddings_model, INPUT_EMBEDDING_PATH)\n", + " del ov_input_embeddings_model\n", + " cleanup_torchscript_cache()\n", + "\n", + "del input_embedding_model\n", + "gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "18b0be05", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/openvino/runtime/__init__.py:10: DeprecationWarning: The `openvino.runtime` module is deprecated and will be removed in the 2026.0 release. 
Please replace `openvino.runtime` with `openvino`.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from typing import Optional, Tuple, List\n", + "from openvino.runtime import opset13\n", + "import numpy as np\n", + "\n", + "\n", + "def model_has_state(ov_model: ov.Model):\n", + " return len(ov_model.get_sinks()) > 0\n", + "\n", + "\n", + "def model_has_input_output_name(ov_model: ov.Model, name: str):\n", + " \"\"\"\n", + " Helper function for checking that the model has a specified input or output name\n", + "\n", + " Parameters:\n", + " ov_model (ov.Model):\n", + " openvino model\n", + " name (str):\n", + " name of input or output\n", + "\n", + " Returns:\n", + " True if input or output with requested name exists else False\n", + " \"\"\"\n", + " return name in sum(\n", + " [list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], []\n", + " )\n", + "\n", + "\n", + "def fuse_cache_reorder(\n", + " ov_model: ov.Model,\n", + " not_kv_inputs: List[str],\n", + " key_value_input_names: List[str],\n", + " gather_dim: int,\n", + "):\n", + " \"\"\"\n", + " Fuses reorder_cache during the generate cycle into ov.Model. Used with stateful models, because we cannot modify model state directly.\n", + "\n", + " Adds a new beam_idx parameter and a Gather op for each kv-cache input in a given model.\n", + " Should be run before make_stateful. Implements optimum's _reorder_cache\n", + " inside the model at the beginning of each iteration.\n", + " Gather works along given gather_dim dimension that may vary from model to model.\n", + " KV-cache inputs are identified based on names in key_value_input_names.\n", + " Appends the new beam_idx parameter to not_kv_inputs.\n", + "\n", + " Parameters:\n", + " ov_model (`ov.Model`):\n", + " openvino model for processing\n", + " not_kv_inputs (`List[str]`):\n", + " list of input nodes in the model that are not related to past key values\n", + " key_value_input_names (`List[str]`):\n", + " list of names for key value input layers\n", + " gather_dim (int):\n", + " dimension for gathering cache during reorder pass\n", + " \"\"\"\n", + "\n", + " if model_has_input_output_name(ov_model, \"beam_idx\"):\n", + " raise ValueError(\"Model already has fused cache\")\n", + " input_batch = ov_model.input(\"inputs_embeds\").get_partial_shape()[0]\n", + " beam_idx = opset13.parameter(\n", + " name=\"beam_idx\", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch])\n", + " )\n", + " beam_idx.output(0).get_tensor().add_names({\"beam_idx\"}) # why list is not accepted?\n", + " ov_model.add_parameters([beam_idx])\n", + " not_kv_inputs.append(ov_model.inputs[-1])\n", + " # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx\n", + " for input_name in key_value_input_names:\n", + " parameter_output_port = ov_model.input(input_name)\n", + " consumers = parameter_output_port.get_target_inputs()\n", + " gather = opset13.gather(\n", + " parameter_output_port, beam_idx, opset13.constant(gather_dim)\n", + " )\n", + " for consumer in consumers:\n", + " consumer.replace_source_output(gather.output(0))\n", + " ov_model.validate_nodes_and_infer_types()\n", + "\n", + "\n", + "def build_state_initializer(ov_model: ov.Model, batch_dim: int):\n", + " \"\"\"\n", + " Build initialization ShapeOf Expression for all ReadValue ops\n", + "\n", + " Parameters:\n", + " ov_model (ov.Model):\n", + " openvino model\n", + " batch_dim (int):\n", + " index of dimension corresponding to batch size\n", + " \"\"\"\n", + " input_ids = ov_model.input(\"inputs_embeds\")\n", + " batch = 
opset13.gather(\n", + " opset13.shape_of(input_ids, output_type=\"i64\"),\n", + " opset13.constant([0]),\n", + " opset13.constant(0),\n", + " )\n", + " for op in ov_model.get_ops():\n", + " if op.get_type_name() == \"ReadValue\":\n", + " dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))]\n", + " dims[batch_dim] = batch\n", + " dims = [\n", + " (\n", + " opset13.constant(np.array([dim], dtype=np.int64))\n", + " if isinstance(dim, int)\n", + " else dim\n", + " )\n", + " for dim in dims\n", + " ]\n", + " shape = opset13.concat(dims, axis=0)\n", + " broadcast = opset13.broadcast(\n", + " opset13.constant(0.0, dtype=op.get_output_element_type(0)), shape\n", + " )\n", + " op.set_arguments([broadcast])\n", + " ov_model.validate_nodes_and_infer_types()\n", + "\n", + "\n", + "def make_stateful(\n", + " ov_model: ov.Model,\n", + " not_kv_inputs: List[str],\n", + " key_value_input_names: List[str],\n", + " key_value_output_names: List[str],\n", + " batch_dim: int,\n", + " num_attention_heads: int,\n", + " num_beams_and_batch: int = None,\n", + "):\n", + " \"\"\"\n", + " Hides kv-cache inputs and outputs inside the model as variables.\n", + "\n", + " Parameters:\n", + " ov_model (ov.Model):\n", + " openvino model\n", + " not_kv_inputs (`List[str]`):\n", + " list of input nodes in the model that are not related to past key values\n", + " key_value_input_names (`List[str]`):\n", + " list of names for key value input layers\n", + " key_value_output_names (`List[str]`):\n", + " list of names for key value output layers\n", + " batch_dim (int):\n", + " index of batch dimension in key value layers\n", + " num_attention_heads (int):\n", + " number of attention heads for batch dimension initialization\n", + " num_beams_and_batch (int):\n", + " precalculated number of beams and batch for shapes initialization\n", + " \"\"\"\n", + " from openvino._offline_transformations import apply_make_stateful_transformation\n", + "\n", + " input_output_map = {}\n", + "\n", + " if num_beams_and_batch is not None:\n", + " # Set batch size for input_ids and attention mask to avoid a dynamic dimension being propagated from the end of the model back to ReadValue\n", + " for input in not_kv_inputs:\n", + " shape = input.get_partial_shape()\n", + " if shape.rank.get_length() <= 2: # == 1 for beam_index\n", + " shape[0] = num_beams_and_batch\n", + " input.get_node().set_partial_shape(shape)\n", + " for kv_name_pair in zip(key_value_input_names, key_value_output_names):\n", + " input_output_map[kv_name_pair[0]] = kv_name_pair[1]\n", + " if num_beams_and_batch is not None:\n", + " input = ov_model.input(kv_name_pair[0])\n", + " shape = input.get_partial_shape()\n", + " shape[batch_dim] = num_beams_and_batch * num_attention_heads\n", + " input.get_node().set_partial_shape(shape)\n", + "\n", + " if num_beams_and_batch is not None:\n", + " # Re-validate the model if shapes were altered above\n", + " ov_model.validate_nodes_and_infer_types()\n", + "\n", + " apply_make_stateful_transformation(ov_model, input_output_map)\n", + " if num_beams_and_batch is None:\n", + " build_state_initializer(ov_model, batch_dim)\n", + "\n", + "\n", + "def patch_stateful(ov_model):\n", + " key_value_input_names = [key.get_any_name() for key in ov_model.inputs[2:-1]]\n", + " key_value_output_names = [key.get_any_name() for key in ov_model.outputs[1:]]\n", + " not_kv_inputs = [\n", + " input\n", + " for input in ov_model.inputs\n", + " if not any(name in key_value_input_names for name in input.get_names())\n", + " ]\n", + " if not key_value_input_names 
or not key_value_output_names:\n", + " return\n", + " batch_dim = 0\n", + " num_attention_heads = 1\n", + "\n", + " fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)\n", + " make_stateful(\n", + " ov_model,\n", + " not_kv_inputs,\n", + " key_value_input_names,\n", + " key_value_output_names,\n", + " batch_dim,\n", + " num_attention_heads,\n", + " None,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5cd69acd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00, 1.00s/it]\n", + "/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py:1060: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if sequence_length != 1:\n", + "/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/torch/jit/_trace.py:165: UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations. 
(Triggered internally at /pytorch/build/aten/src/ATen/core/TensorBody.h:489.)\n", + " if a.grad is not None:\n" + ] + } + ], + "source": [ + "import types\n", + "\n", + "make_stateful_model = False\n", + "core = ov.Core()\n", + "model = LlavaNextForConditionalGeneration.from_pretrained(\n", + " model_id, low_cpu_mem_usage=True\n", + ")\n", + "language_model = model.language_model\n", + "if not LANGUAGE_MODEL_PATH.exists() or True:\n", + "\n", + " def forward_wrap(\n", + " self,\n", + " attention_mask,\n", + " position_ids=None,\n", + " past_key_values=None,\n", + " inputs_embeds=None,\n", + " ):\n", + " result = self._orig_forward(\n", + " input_ids=None,\n", + " attention_mask=attention_mask,\n", + " position_ids=position_ids,\n", + " past_key_values=past_key_values,\n", + " inputs_embeds=inputs_embeds,\n", + " output_hidden_states=True,\n", + " return_dict=True,\n", + " )\n", + " return result[\"hidden_states\"][-1][:, -1, :]\n", + "\n", + " model_inputs = [\"attention_mask\", \"position_ids\"]\n", + " model_outputs = [\"last_hidden_state\"]\n", + " model_inputs.append(\"inputs_embeds\")\n", + " language_model.config.torchscript = True\n", + " position_ids = torch.tensor([[2, 3], [2, 3]])\n", + " language_model._orig_forward = language_model.forward\n", + " language_model.forward = types.MethodType(forward_wrap, language_model)\n", + " ov_model = ov.convert_model(\n", + " language_model,\n", + " example_input={\n", + " \"inputs_embeds\": llm_input,\n", + " \"attention_mask\": torch.ones((2, 4)),\n", + " \"position_ids\": position_ids,\n", + " },\n", + " )\n", + "\n", + " for input, input_name in zip(ov_model.inputs, model_inputs):\n", + " input.get_tensor().set_names({input_name})\n", + "\n", + " for output, output_name in zip(ov_model.outputs, model_outputs):\n", + " output.get_tensor().set_names({output_name})\n", + " if make_stateful_model:\n", + " patch_stateful(ov_model)\n", + " ov.save_model(ov_model, LANGUAGE_MODEL_PATH)\n", + " del ov_model\n", + " cleanup_torchscript_cache()\n", + " gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "49838499", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:nncf:NNCF provides best results with torch==2.6.*, while current torch version is 2.7.0+cpu. If you encounter issues, consider switching to torch==2.6.*\n", + "INFO:nncf:Statistics of the bitwidth distribution:\n", + "┍━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n", + "│ Weight compression mode │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n", + "┝━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n", + "│ int8_asym │ 1% (1 / 224) │ 0% (0 / 223) │\n", + "├───────────────────────────┼─────────────────────────────┼────────────────────────────────────────┤\n", + "│ int4_asym │ 99% (223 / 224) │ 100% (223 / 223) │\n", + "┕━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n" + ] + }, + { + "data": { + "text/html": [ + "
/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/rich/live.py:231: UserWarning: install \"ipywidgets\" \n",
+       "for Jupyter support\n",
+       "  warnings.warn('install \"ipywidgets\" for Jupyter support')\n",
+       "
\n" + ], + "text/plain": [ + "/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/rich/live.py:231: UserWarning: install \"ipywidgets\" \n", + "for Jupyter support\n", + " warnings.warn('install \"ipywidgets\" for Jupyter support')\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import nncf\n",
+    "\n",
+    "compression_configuration = {\n",
+    "    \"mode\": nncf.CompressWeightsMode.INT4_ASYM,\n",
+    "    \"group_size\": 64,\n",
+    "    \"ratio\": 1.0,\n",
+    "}\n",
+    "LANGUAGE_MODEL_PATH_INT4 = (\n",
+    "    LANGUAGE_MODEL_PATH.parent / LANGUAGE_MODEL_PATH.name.replace(\".xml\", \"-int4.xml\")\n",
+    ")\n",
+    "ov_model = core.read_model(LANGUAGE_MODEL_PATH)\n",
+    "ov_model_compressed = nncf.compress_weights(ov_model, **compression_configuration)\n",
+    "ov.save_model(ov_model_compressed, LANGUAGE_MODEL_PATH_INT4)\n"
+   ]
+  },
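+  {
+   "cell_type": "markdown",
+   "id": "0a1b2c3d",
+   "metadata": {},
+   "source": [
+    "Optionally, we can sanity-check the compressed IR before moving on. This is a minimal sketch, not part of the original export flow: it assumes `core` and `LANGUAGE_MODEL_PATH_INT4` from the cells above, compiles the INT4 model on CPU, and lists its input names (`compiled_int4` is just an illustrative name)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e5f6a7b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal sanity check (sketch): compile the INT4 IR on CPU and list its\n",
+    "# input names to confirm the compressed model is loadable.\n",
+    "compiled_int4 = core.compile_model(str(LANGUAGE_MODEL_PATH_INT4), \"CPU\")\n",
+    "print([inp.get_any_name() for inp in compiled_int4.inputs])"
+   ]
+  },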
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "695c2fbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "\n",
+    "class UnpadImage(nn.Module):\n",
+    "    def __init__(self):\n",
+    "        super(UnpadImage, self).__init__()\n",
+    "\n",
+    "    def forward(self, tensor, original_size, current_size):\n",
+    "        \"\"\"\n",
+    "        Unpads an image tensor to its original size based on the current size.\n",
+    "        Args:\n",
+    "            tensor (torch.Tensor): The input image tensor of shape (C, H, W).\n",
+    "            original_size (torch.Tensor): The original size of the image tensor as (H, W).\n",
+    "            current_size (torch.Tensor): The current size of the image tensor as (H, W).\n",
+    "        \"\"\"\n",
+    "        # tensor: (C, H, W)\n",
+    "        original_size = original_size.to(torch.float32)\n",
+    "        original_height, original_width = original_size[0], original_size[1]\n",
+    "        current_height, current_width = current_size[0], current_size[1]\n",
+    "\n",
+    "        original_aspect_ratio = original_width / original_height\n",
+    "        current_aspect_ratio = current_width / current_height\n",
+    "\n",
+    "        # Comparison\n",
+    "        condition = original_aspect_ratio > current_aspect_ratio\n",
+    "\n",
+    "        # Branch 1: vertical padding\n",
+    "        scale_factor_1 = current_width.float() / original_width.float()\n",
+    "        new_height = (original_height.float() * scale_factor_1).int()\n",
+    "        pad_top = ((current_height.float() - new_height) / 2).floor().long()\n",
+    "\n",
+    "        # Branch 2: horizontal padding\n",
+    "        scale_factor_2 = current_height.float() / original_height.float()\n",
+    "        new_width = (original_width.float() * scale_factor_2).int()\n",
+    "        pad_left = ((current_width.float() - new_width) / 2).floor().long()\n",
+    "\n",
+    "        zero = torch.zeros(1, dtype=pad_top.dtype, device=tensor.device).squeeze(0)\n",
+    "\n",
+    "        # Use torch.where to conditionally compute slicing\n",
+    "        y_start = torch.where(condition, pad_top, zero)\n",
+    "        y_end = torch.where(condition, current_height - pad_top, current_height)\n",
+    "\n",
+    "        x_start = torch.where(condition, zero, pad_left)\n",
+    "        x_end = torch.where(condition, current_width - pad_left, current_width)\n",
+    "        out = tensor[:, y_start.int() : y_end.int(), x_start.int() : x_end.int()]\n",
+    "        return out  # Remove batch dimension if needed\n"
+   ]
+  },
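+  {
+   "cell_type": "markdown",
+   "id": "8c9d0e1f",
+   "metadata": {},
+   "source": [
+    "A quick shape check for `UnpadImage` (a sketch with hypothetical sizes, not part of the export flow): a 640x480 original fitted onto a 24x24 canvas was letterboxed top/bottom, so unpadding should keep only the middle 18 rows."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a3b4c5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch (hypothetical sizes): a 640x480 original on a 24x24 canvas was padded\n",
+    "# top/bottom, so unpadding keeps the middle 18 rows -> expected shape (3, 18, 24).\n",
+    "unpad = UnpadImage()\n",
+    "dummy = torch.zeros(3, 24, 24)\n",
+    "unpadded = unpad(dummy, torch.tensor([480, 640]), torch.tensor([24, 24]))\n",
+    "print(unpadded.shape)"
+   ]
+  },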
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "ba325001",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import math\n",
+    "\n",
+    "\n",
+    "class PackImageFeatures(nn.Module):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__()\n",
+    "        self.config = config\n",
+    "        self.unpad_image = UnpadImage()\n",
+    "        self.height = config.vision_config.image_size // config.vision_config.patch_size\n",
+    "        self.width = config.vision_config.image_size // config.vision_config.patch_size\n",
+    "\n",
+    "    def forward(self, image_feature, image_sizes, num_patch_height, num_patch_width):\n",
+    "        # we image features is a single image features, so we can remove the loop\n",
+    "        base_image_features = image_feature[0]\n",
+    "        features = image_feature[1:]  # Skip the first token\n",
+    "        features = (\n",
+    "            features.view(\n",
+    "                num_patch_height, num_patch_width, self.height, self.width, -1\n",
+    "            )\n",
+    "            .permute(4, 0, 2, 1, 3)\n",
+    "            .contiguous()\n",
+    "            .flatten(1, 2)\n",
+    "            .flatten(2, 3)\n",
+    "        )\n",
+    "        features = self.unpad_image(\n",
+    "            features, image_sizes[0], torch._shape_as_tensor(features)[1:3]\n",
+    "        )\n",
+    "        features = features.flatten(1, 2).transpose(0, 1)\n",
+    "        features = torch.cat([base_image_features, features], dim=0)\n",
+    "        return features.unsqueeze(0)"
+   ]
+  },
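+  {
+   "cell_type": "markdown",
+   "id": "6e7f8a9b",
+   "metadata": {},
+   "source": [
+    "Similarly, a toy run of `PackImageFeatures` (hypothetical shapes; assumes `model` from the conversion cell above is still loaded): one base patch plus a 2x2 grid of patches with an illustrative embedding size of 64 should pack into a single `(1, tokens, 64)` tensor."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c1d2e3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch (toy embedding size 64): image_feature = base patch + 2x2 grid of patches.\n",
+    "packer = PackImageFeatures(model.config)\n",
+    "n = packer.height  # tokens per patch side (image_size // patch_size)\n",
+    "toy_feats = torch.zeros(5, n * n, 64)\n",
+    "# A square original image means unpadding crops nothing here.\n",
+    "packed = packer(toy_feats, torch.tensor([[336, 336]]), 2, 2)\n",
+    "print(packed.shape)"
+   ]
+  },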
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "f911f0a0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "\n",
+    "class MergeInputWithImageFeatures(nn.Module):\n",
+    "    def __init__(self, pad_token_id=0, image_token_index=0):\n",
+    "        super().__init__()\n",
+    "        self.pad_token_id = pad_token_id\n",
+    "        self.image_token_index = image_token_index\n",
+    "\n",
+    "    def forward(self, image_features, inputs_embeds, input_ids, attention_mask):\n",
+    "        num_images, num_image_patches, embed_dim = image_features.shape\n",
+    "        batch_size, sequence_length = input_ids.shape\n",
+    "\n",
+    "        # left_padding = torch.sum(input_ids[:, -1] == self.pad_token_id) == 0  # Removed, not needed now\n",
+    "\n",
+    "        special_image_token_mask = input_ids == self.image_token_index  # [B, S]\n",
+    "        num_special_image_tokens = special_image_token_mask.sum(dim=-1)  # [B]\n",
+    "\n",
+    "        max_embed_dim = (\n",
+    "            num_special_image_tokens.max() * (num_image_patches - 1)\n",
+    "        ) + sequence_length  # scalar\n",
+    "\n",
+    "        batch_indices, non_image_indices = torch.where(\n",
+    "            input_ids != self.image_token_index\n",
+    "        )  # [N], [N]\n",
+    "\n",
+    "        # Step 2: Compute new token positions\n",
+    "        new_token_positions = (\n",
+    "            torch.cumsum(special_image_token_mask * (num_image_patches - 1) + 1, dim=-1)\n",
+    "            - 1\n",
+    "        )  # [B, S]\n",
+    "\n",
+    "        nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]  # [B]\n",
+    "\n",
+    "        # left_padding_flag = (input_ids[:, -1] != self.pad_token_id).to(nb_image_pad.dtype)  # original\n",
+    "        left_padding_flag = (\n",
+    "            input_ids[:, -1] != self.pad_token_id\n",
+    "        ).long()  # more idiomatic torch\n",
+    "        # new_token_positions = new_token_positions + (left_padding_flag[:, None] * nb_image_pad[:, None])  # original\n",
+    "        new_token_positions += (\n",
+    "            left_padding_flag[:, None] * nb_image_pad[:, None]\n",
+    "        )  # updated\n",
+    "\n",
+    "        text_to_overwrite = new_token_positions[batch_indices, non_image_indices]  # [N]\n",
+    "\n",
+    "        # Step 3: Init final tensors\n",
+    "        final_embedding = torch.zeros(\n",
+    "            batch_size,\n",
+    "            max_embed_dim,\n",
+    "            embed_dim,\n",
+    "            dtype=inputs_embeds.dtype,\n",
+    "            device=inputs_embeds.device,\n",
+    "        )\n",
+    "        final_attention_mask = torch.zeros(\n",
+    "            batch_size,\n",
+    "            max_embed_dim,\n",
+    "            dtype=attention_mask.dtype,\n",
+    "            device=inputs_embeds.device,\n",
+    "        )\n",
+    "\n",
+    "        # final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]  # original\n",
+    "        final_embedding.index_put_(\n",
+    "            (batch_indices, text_to_overwrite),\n",
+    "            inputs_embeds[batch_indices, non_image_indices],\n",
+    "        )  # torch native\n",
+    "\n",
+    "        # final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]  # original\n",
+    "        final_attention_mask.index_put_(\n",
+    "            (batch_indices, text_to_overwrite),\n",
+    "            attention_mask[batch_indices, non_image_indices],\n",
+    "        )  # torch native\n",
+    "\n",
+    "        # Step 5: fill in image features\n",
+    "        image_to_overwrite = (final_embedding == 0).all(dim=-1)  # [B, L]\n",
+    "        image_to_overwrite &= (image_to_overwrite.cumsum(-1) - 1) >= nb_image_pad[\n",
+    "            :, None\n",
+    "        ]  # apply pad cutoff\n",
+    "\n",
+    "        flat_image_features = image_features.reshape(-1, embed_dim).to(\n",
+    "            inputs_embeds.device\n",
+    "        )  # [N_img, D]\n",
+    "\n",
+    "        # final_embedding[image_to_overwrite] = flat_image_features  # original\n",
+    "        final_embedding[image_to_overwrite] = flat_image_features[\n",
+    "            : image_to_overwrite.sum()\n",
+    "        ]  # safe assignment\n",
+    "\n",
+    "        final_attention_mask |= image_to_overwrite  # logical or with existing mask\n",
+    "\n",
+    "        position_ids = final_attention_mask.cumsum(-1) - 1\n",
+    "        position_ids = position_ids.masked_fill(final_attention_mask == 0, 1)\n",
+    "\n",
+    "        # Step 6: remove pad token embeddings\n",
+    "        batch_pad_indices, pad_token_positions = torch.where(\n",
+    "            input_ids == self.pad_token_id\n",
+    "        )  # [N_pad]\n",
+    "        indices_to_mask = new_token_positions[\n",
+    "            batch_pad_indices, pad_token_positions\n",
+    "        ]  # [N_pad]\n",
+    "\n",
+    "        # final_embedding[batch_pad_indices, indices_to_mask] = 0  # original\n",
+    "        final_embedding.index_put_(\n",
+    "            (batch_pad_indices, indices_to_mask),\n",
+    "            torch.zeros_like(final_embedding[batch_pad_indices, indices_to_mask]),\n",
+    "        )  # updated\n",
+    "\n",
+    "        return {\n",
+    "            \"final_embedding\": final_embedding,\n",
+    "            \"final_attention_mask\": final_attention_mask,\n",
+    "            \"position_ids\": position_ids,\n",
+    "        }\n"
+   ]
+  },
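+  {
+   "cell_type": "markdown",
+   "id": "b7d42e1c",
+   "metadata": {},
+   "source": [
+    "`MergeInputWithImageFeatures` splices the packed image features into the text embedding sequence at the image-token position. The key index arithmetic is `new_token_positions`: every image token grows by `num_image_patches - 1` slots, shifting the text that follows it to the right. A toy sketch with made-up token ids and 3 patches per image:\n",
+    "\n",
+    "```python\n",
+    "import torch\n",
+    "\n",
+    "toy_ids = torch.tensor([[1, 32000, 7, 8]])  # 32000 stands in for the image token\n",
+    "special = toy_ids == 32000\n",
+    "new_pos = torch.cumsum(special * (3 - 1) + 1, dim=-1) - 1\n",
+    "print(new_pos)  # tensor([[0, 3, 4, 5]]): the two extra patch slots push the text right\n",
+    "```"
+   ]
+  },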
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bfb25757",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compile the models\n",
+    "language_model = core.read_model(LANGUAGE_MODEL_PATH)\n",
+    "compiled_language_model = core.compile_model(language_model, \"AUTO\")\n",
+    "\n",
+    "image_embed_model = core.compile_model(IMAGE_ENCODER_PATH, device)\n",
+    "text_embeddings_model = core.compile_model(INPUT_EMBEDDING_PATH, device)\n",
+    "\n",
+    "if IMAGE_PACKER_PATH.exists():\n",
+    "    image_packer_model = core.compile_model(IMAGE_PACKER_PATH, device)\n",
+    "else:\n",
+    "    image_packer_model = None\n",
+    "if MULTIMODAL_MERGER_PATH.exists():\n",
+    "    multimodal_merger_model = core.compile_model(MULTIMODAL_MERGER_PATH, device)\n",
+    "else:\n",
+    "    multimodal_merger_model = None"
+   ]
+  },
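+  {
+   "cell_type": "markdown",
+   "id": "c8f13a5d",
+   "metadata": {},
+   "source": [
+    "Compiling the exported models here lets us run them on real tensors and capture example inputs for tracing the packer and merger below. A quick way to sanity-check the expected tensor names of the compiled language model (the `any_name` property may vary across OpenVINO releases):\n",
+    "\n",
+    "```python\n",
+    "for inp in compiled_language_model.inputs:\n",
+    "    print(inp.any_name)\n",
+    "```"
+   ]
+  },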
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "0d5643ee",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
+      "/home/prabod/anaconda3/envs/e5v/lib/python3.11/site-packages/huggingface_hub/file_download.py:943: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Image size: (360, 282), Mode: RGB\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "import requests\n",
+    "from PIL import Image\n",
+    "from transformers import AutoTokenizer, AutoConfig\n",
+    "from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration\n",
+    "\n",
+    "llama3_template = \"<|start_header_id|>user<|end_header_id|>\\n\\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n \\n\"\n",
+    "\n",
+    "processor = LlavaNextProcessor.from_pretrained(\"royokong/e5-v\")\n",
+    "\n",
+    "config = AutoConfig.from_pretrained(\"royokong/e5-v\")\n",
+    "img_prompt = llama3_template.format(\"<image>\\nSummary above image in one word: \")\n",
+    "text_prompt = llama3_template.format(\"<sent>\\nSummary above sentence in one word: \")\n",
+    "\n",
+    "images = [Image.open(\"dog.jpg\").convert(\"RGB\")]\n",
+    "\n",
+    "for image in images:\n",
+    "    print(f\"Image size: {image.size}, Mode: {image.mode}\")\n",
+    "\n",
+    "texts = [\"A dog sitting in the grass.\"]\n",
+    "\n",
+    "text_inputs = processor(\n",
+    "    [text_prompt.replace(\"<sent>\", text) for text in texts],\n",
+    "    return_tensors=\"pt\",\n",
+    "    padding=True,\n",
+    ")\n",
+    "img_inputs = processor(\n",
+    "    [img_prompt] * len(images), images, return_tensors=\"pt\", padding=True\n",
+    ")"
+   ]
+  },
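+  {
+   "cell_type": "markdown",
+   "id": "d2e84b6f",
+   "metadata": {},
+   "source": [
+    "The prompts follow the E5-V recipe: the Llama-3 chat template wraps a `<image>` or `<sent>` placeholder plus the instruction \"Summary above image/sentence in one word:\", so either modality is compressed into a single-word summary position whose last hidden state serves as the embedding. The processor expands `<image>` into patch tokens, while `<sent>` is replaced with the raw text before tokenization:\n",
+    "\n",
+    "```python\n",
+    "print(img_prompt)\n",
+    "print(text_prompt.replace(\"<sent>\", texts[0]))\n",
+    "```"
+   ]
+  },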
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "ad1b402c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "img_input_ids = img_inputs[\"input_ids\"]\n",
+    "img_attention_mask = img_inputs[\"attention_mask\"]\n",
+    "image_sizes = img_inputs[\"image_sizes\"]\n",
+    "pixel_values = img_inputs[\"pixel_values\"]\n",
+    "\n",
+    "text_input_ids = text_inputs[\"input_ids\"]\n",
+    "text_attention_mask = text_inputs[\"attention_mask\"]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "2649d101",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_features = torch.from_numpy(image_embed_model(pixel_values)[0])\n",
+    "image_inputs_embeds = torch.from_numpy(text_embeddings_model(img_input_ids)[0])\n",
+    "text_inputs_embeds = torch.from_numpy(text_embeddings_model(text_input_ids)[0])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "844968c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_packer = PackImageFeatures(config)\n",
+    "input_merger = MergeInputWithImageFeatures(\n",
+    "    pad_token_id=processor.tokenizer.pad_token_id,\n",
+    "    image_token_index=config.image_token_index,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "190da649",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from typing import Union, List, Tuple\n",
+    "import torch\n",
+    "\n",
+    "\n",
+    "def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:\n",
+    "    \"\"\"\n",
+    "    Selects the best resolution from a list of possible resolutions based on the original size.\n",
+    "\n",
+    "    This is done by calculating the effective and wasted resolution for each possible resolution.\n",
+    "\n",
+    "    The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.\n",
+    "\n",
+    "    Args:\n",
+    "        original_size (tuple):\n",
+    "            The original size of the image in the format (height, width).\n",
+    "        possible_resolutions (list):\n",
+    "            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].\n",
+    "\n",
+    "    Returns:\n",
+    "        tuple: The best fit resolution in the format (height, width).\n",
+    "    \"\"\"\n",
+    "    original_height, original_width = original_size\n",
+    "    best_fit = None\n",
+    "    max_effective_resolution = 0\n",
+    "    min_wasted_resolution = float(\"inf\")\n",
+    "\n",
+    "    for height, width in possible_resolutions:\n",
+    "        scale = min(width / original_width, height / original_height)\n",
+    "        downscaled_width, downscaled_height = (\n",
+    "            int(original_width * scale),\n",
+    "            int(original_height * scale),\n",
+    "        )\n",
+    "        effective_resolution = min(\n",
+    "            downscaled_width * downscaled_height, original_width * original_height\n",
+    "        )\n",
+    "        wasted_resolution = (width * height) - effective_resolution\n",
+    "\n",
+    "        if effective_resolution > max_effective_resolution or (\n",
+    "            effective_resolution == max_effective_resolution\n",
+    "            and wasted_resolution < min_wasted_resolution\n",
+    "        ):\n",
+    "            max_effective_resolution = effective_resolution\n",
+    "            min_wasted_resolution = wasted_resolution\n",
+    "            best_fit = (height, width)\n",
+    "\n",
+    "    return best_fit\n",
+    "\n",
+    "\n",
+    "def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):\n",
+    "    \"\"\"\n",
+    "    Calculate the number of patches after the preprocessing for images of any resolution.\n",
+    "\n",
+    "    Args:\n",
+    "        image_size (`Union[torch.LongTensor, np.ndarray, Tuple[int, int]]`):\n",
+    "            The size of the input image in the format (height, width).\n",
+    "        grid_pinpoints (`List`):\n",
+    "            A list containing possible resolutions. Each item in the list should be a tuple or list\n",
+    "            of the form `(height, width)`.\n",
+    "        patch_size (`int`):\n",
+    "            The size of each image patch.\n",
+    "\n",
+    "    Returns:\n",
+    "        int: the number of patches\n",
+    "    \"\"\"\n",
+    "    if not isinstance(grid_pinpoints, list):\n",
+    "        raise ValueError(\"grid_pinpoints should be a list of tuples or lists\")\n",
+    "\n",
+    "    # ! VERY IMPORTANT: if image_size is a tensor or array, it must be converted to a list first, otherwise the resolution selection below miscomputes\n",
+    "    if not isinstance(image_size, (list, tuple)):\n",
+    "        if not isinstance(image_size, (torch.Tensor, np.ndarray)):\n",
+    "            raise ValueError(\n",
+    "                f\"image_size invalid type {type(image_size)} with value {image_size}\"\n",
+    "            )\n",
+    "        image_size = image_size.tolist()\n",
+    "\n",
+    "    best_resolution = select_best_resolution(image_size, grid_pinpoints)\n",
+    "    height, width = best_resolution\n",
+    "    num_patches = 0\n",
+    "    # consider changing this to ceil(height / patch_size) * ceil(width / patch_size) + 1\n",
+    "    for i in range(0, height, patch_size):\n",
+    "        for j in range(0, width, patch_size):\n",
+    "            num_patches += 1\n",
+    "    # add the base patch\n",
+    "    num_patches += 1\n",
+    "    return num_patches\n",
+    "\n",
+    "\n",
+    "def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):\n",
+    "    \"\"\"\n",
+    "    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.\n",
+    "\n",
+    "    Args:\n",
+    "        image_size (`tuple`):\n",
+    "            The size of the input image in the format (height, width).\n",
+    "        grid_pinpoints (`List`):\n",
+    "            A list containing possible resolutions. Each item in the list should be a tuple or list\n",
+    "            of the form `(height, width)`.\n",
+    "        patch_size (`int`):\n",
+    "            The size of each image patch.\n",
+    "\n",
+    "    Returns:\n",
+    "        tuple: The shape of the image patch grid in the format (num_patch_height, num_patch_width).\n",
+    "    \"\"\"\n",
+    "    if not isinstance(grid_pinpoints, list):\n",
+    "        raise ValueError(\"grid_pinpoints should be a list of tuples or lists\")\n",
+    "\n",
+    "    # ! VERY IMPORTANT: if image_size is a tensor or array, it must be converted to a list first, otherwise the resolution selection below miscomputes\n",
+    "    if not isinstance(image_size, (list, tuple)):\n",
+    "        if not isinstance(image_size, (torch.Tensor, np.ndarray)):\n",
+    "            raise ValueError(\n",
+    "                f\"image_size has invalid type {type(image_size)}; expected list, tuple, np.ndarray or torch.Tensor\"\n",
+    "            )\n",
+    "        image_size = image_size.tolist()\n",
+    "\n",
+    "    height, width = select_best_resolution(image_size, grid_pinpoints)\n",
+    "    return height // patch_size, width // patch_size"
+   ]
+  },
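+  {
+   "cell_type": "markdown",
+   "id": "e5a27c3b",
+   "metadata": {},
+   "source": [
+    "A quick illustration of the resolution selection, using a hypothetical subset of grid pinpoints. For a 282x360 (H, W) image, the landscape grid should win: it matches the effective resolution of the square grid while wasting less area:\n",
+    "\n",
+    "```python\n",
+    "print(select_best_resolution((282, 360), [(336, 672), (672, 336), (672, 672)]))\n",
+    "# expected: (336, 672), i.e. a 1x2 grid of 336px tiles\n",
+    "```"
+   ]
+  },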
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "bcbec245",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# note: the helper returns the grid shape height-first\n",
+    "num_patch_height, num_patch_width = get_anyres_image_grid_shape(\n",
+    "    image_sizes[0],\n",
+    "    config.image_grid_pinpoints,\n",
+    "    config.vision_config.image_size,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "40620525",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "packed_image_features = image_packer(\n",
+    "    image_features,\n",
+    "    image_sizes,\n",
+    "    num_patch_height=num_patch_height,\n",
+    "    num_patch_width=num_patch_width\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "0eb947f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if IMAGE_PACKER_PATH.exists():\n",
+    "    IMAGE_PACKER_PATH.unlink()\n",
+    "\n",
+    "ov_image_packer = ov.convert_model(\n",
+    "    image_packer,\n",
+    "    example_input={\n",
+    "        \"image_feature\": image_features,\n",
+    "        \"image_sizes\": image_sizes,\n",
+    "        \"num_patch_height\": torch.tensor(num_patch_height, dtype=torch.int64),\n",
+    "        \"num_patch_width\": torch.tensor(num_patch_width, dtype=torch.int64)\n",
+    "    }\n",
+    ")\n",
+    "ov.save_model(ov_image_packer, IMAGE_PACKER_PATH)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "f2fae423",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if MULTIMODAL_MERGER_PATH.exists():\n",
+    "    MULTIMODAL_MERGER_PATH.unlink()\n",
+    "ov_multimodal_merger = ov.convert_model(\n",
+    "    input_merger,\n",
+    "    example_input={\n",
+    "        \"image_features\": packed_image_features,\n",
+    "        \"inputs_embeds\": image_inputs_embeds,\n",
+    "        \"input_ids\": img_input_ids,\n",
+    "        \"attention_mask\": img_attention_mask\n",
+    "    }\n",
+    ")\n",
+    "ov.save_model(ov_multimodal_merger, MULTIMODAL_MERGER_PATH)\n",
+    "cleanup_torchscript_cache()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "0599dd94",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import shutil\n",
+    "import os\n",
+    "if not os.path.exists(f\"{output_dir}/assets\"):\n",
+    "    output_dir = Path(output_dir)\n",
+    "    assets_dir = output_dir/\"assets\"\n",
+    "    assets_dir.mkdir(exist_ok=True)\n",
+    "    processor.save_pretrained(output_dir)\n",
+    "    # copy all the assets to the assets directory (json files, vocab files, etc.)\n",
+    "    for file in output_dir.glob(\"*.json\"):\n",
+    "        shutil.copy(file, assets_dir)\n"
+   ]
+  },
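+  {
+   "cell_type": "markdown",
+   "id": "f1b09d4a",
+   "metadata": {},
+   "source": [
+    "Spark NLP's `loadSavedModel` reads the OpenVINO IR files from the export folder and the tokenizer/config JSONs from `assets/` (the usual Spark NLP import layout), so it is worth verifying the layout before importing:\n",
+    "\n",
+    "```python\n",
+    "for p in sorted(Path(output_dir).rglob(\"*\")):\n",
+    "    print(p.relative_to(output_dir))\n",
+    "```"
+   ]
+  },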
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "27e894ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# delete the f32 language model\n",
+    "if LANGUAGE_MODEL_PATH.exists():\n",
+    "    LANGUAGE_MODEL_PATH.unlink()\n",
+    "\n",
+    "# delete the f32 language model bin file if exists\n",
+    "if LANGUAGE_MODEL_PATH.with_suffix(\".bin\").exists():\n",
+    "    LANGUAGE_MODEL_PATH.with_suffix(\".bin\").unlink()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ff9ecebb",
+   "metadata": {},
+   "source": [
+    "## 2. Test the Exported Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "17eaa581",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "IMAGE_ENCODER_PATH = output_dir / \"openvino_vision_embeddings_model.xml\"\n",
+    "LANGUAGE_MODEL_PATH = output_dir / \"openvino_language_model-int4.xml\"\n",
+    "INPUT_EMBEDDING_PATH = output_dir / \"openvino_text_embeddings_model.xml\"\n",
+    "\n",
+    "IMAGE_PACKER_PATH = output_dir / \"openvino_image_packer.xml\"\n",
+    "MULTIMODAL_MERGER_PATH = output_dir / \"openvino_multimodal_merger.xml\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "0782a4a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# compile the models\n",
+    "language_model = core.read_model(LANGUAGE_MODEL_PATH)\n",
+    "compiled_language_model = core.compile_model(language_model, \"AUTO\")\n",
+    "\n",
+    "image_embed_model = core.compile_model(IMAGE_ENCODER_PATH, device)\n",
+    "text_embeddings_model = core.compile_model(INPUT_EMBEDDING_PATH, device)\n",
+    "\n",
+    "if IMAGE_PACKER_PATH.exists():\n",
+    "    image_packer_model = core.compile_model(IMAGE_PACKER_PATH, device)\n",
+    "else:\n",
+    "    image_packer_model = None\n",
+    "if MULTIMODAL_MERGER_PATH.exists():\n",
+    "    multimodal_merger_model = core.compile_model(MULTIMODAL_MERGER_PATH, device)\n",
+    "else:\n",
+    "    multimodal_merger_model = None\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "4b88a40c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use openvino model to pack the image features\n",
+    "packed_image_features = image_packer_model({\n",
+    "    'image_feature': image_features,\n",
+    "    'image_sizes': image_sizes,\n",
+    "    'num_patch_height': torch.tensor(num_patch_height, dtype=torch.int64),\n",
+    "    'num_patch_width': torch.tensor(num_patch_width, dtype=torch.int64)\n",
+    "})[0]\n",
+    "packed_image_features = torch.from_numpy(packed_image_features)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "1a69b30b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use openvino model to merge the image features with text features\n",
+    "merger_out = multimodal_merger_model({\n",
+    "        \"image_features\": packed_image_features,\n",
+    "        \"inputs_embeds\": image_inputs_embeds,\n",
+    "        \"input_ids\": img_input_ids,\n",
+    "        \"attention_mask\": img_attention_mask\n",
+    "    }\n",
+    ")\n",
+    "image_final_embeds = torch.from_numpy(merger_out['final_embedding'])\n",
+    "image_final_attention_mask = torch.from_numpy(merger_out['final_attention_mask'])\n",
+    "image_position_ids = torch.from_numpy(merger_out['position_ids'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "131763dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "request = compiled_language_model.create_infer_request()\n",
+    "img_input_lm = {\n",
+    "    \"inputs_embeds\": image_final_embeds.detach().numpy(),\n",
+    "    \"attention_mask\": image_final_attention_mask.detach().numpy(),\n",
+    "    \"position_ids\": image_position_ids.detach().numpy(),\n",
+    "}\n",
+    "request.start_async(img_input_lm, share_inputs=True)\n",
+    "request.wait()\n",
+    "img_lm_output = torch.from_numpy(request.get_tensor(\"last_hidden_state\").data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "68787196",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_request = compiled_language_model.create_infer_request()\n",
+    "text_position_ids = text_attention_mask.long().cumsum(-1) - 1\n",
+    "text_position_ids.masked_fill_(text_attention_mask == 0, 1)\n",
+    "text_input_lm = {\n",
+    "    \"inputs_embeds\": text_inputs_embeds.detach().numpy(),\n",
+    "    \"attention_mask\": text_attention_mask.detach().numpy(),\n",
+    "    \"position_ids\": text_position_ids.detach().numpy(),\n",
+    "}\n",
+    "text_request.start_async(text_input_lm, share_inputs=True)\n",
+    "text_request.wait()\n",
+    "text_lm_output = torch.from_numpy(text_request.get_tensor(\"last_hidden_state\").data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "df6a5ae1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[0.7158]])\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch.nn.functional as F\n",
+    "\n",
+    "txt_embed = F.normalize(text_lm_output, dim=-1)\n",
+    "img_embed = F.normalize(img_lm_output, dim=-1)\n",
+    "\n",
+    "print(txt_embed @ img_embed.T)"
+   ]
+  },
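+  {
+   "cell_type": "markdown",
+   "id": "a9c35e7d",
+   "metadata": {},
+   "source": [
+    "Both vectors are L2-normalized, so the dot product above is exactly cosine similarity; the built-in helper reproduces the same number from the raw outputs:\n",
+    "\n",
+    "```python\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "print(F.cosine_similarity(text_lm_output, img_lm_output, dim=-1))\n",
+    "```"
+   ]
+  },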
+  {
+   "cell_type": "markdown",
+   "id": "3764af1b",
+   "metadata": {},
+   "source": [
+    "## 3. Import and Save E5V in Spark NLP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "265ecf82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "285bb60c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sparknlp\n",
+    "\n",
+    "# let's start Spark with Spark NLP\n",
+    "spark = sparknlp.start()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "18611787",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_id = \"royokong/e5-v\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8ca2060a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "25/06/10 03:45:32 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n",
+      "25/06/10 03:45:41 WARN NativeLibrary: Failed to load library null: java.lang.UnsatisfiedLinkError: Can't load library: /tmp/openvino-native4021672575912693842/libtbb.so.2\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING: An illegal reflective access operation has occurred\n",
+      "WARNING: Illegal reflective access by org.apache.spark.util.SizeEstimator$ (file:/home/prabod/spark/jars/spark-core_2.12-3.3.2.jar) to field java.util.regex.Pattern.pattern\n",
+      "WARNING: Please consider reporting this to the maintainers of org.apache.spark.util.SizeEstimator$\n",
+      "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
+      "WARNING: All illegal access operations will be denied in a future release\n"
+     ]
+    }
+   ],
+   "source": [
+    "e5v_embeddings_sn = E5VEmbeddings \\\n",
+    "    .loadSavedModel(str(output_dir), spark) \\\n",
+    "    .setInputCols(\"image_assembler\") \\\n",
+    "    .setOutputCol(\"e5v\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "d5b60572",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    }
+   ],
+   "source": [
+    "e5v_embeddings_sn.write().overwrite().save(f\"file:///tmp/{model_id}_spark_nlp\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dd9cf656",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sparknlp\n",
+    "from sparknlp.base import *\n",
+    "from sparknlp.annotator import *\n",
+    "from pyspark.sql.functions import lit\n",
+    "from pyspark.ml import Pipeline\n",
+    "from sparknlp.util import EmbeddingsDataFrameUtils\n",
+    "\n",
+    "from pathlib import Path\n",
+    "import os\n",
+    "\n",
+    "# download a test image into the ./images folder\n",
+    "\n",
+    "url1 = \"https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11\"\n",
+    "\n",
+    "Path(\"images\").mkdir(exist_ok=True)\n",
+    "\n",
+    "!wget -q -O images/image1.jpg {url1}\n",
+    "\n",
+    "images_path = \"file://\" + os.getcwd() + \"/images/\"\n",
+    "\n",
+    "imagePrompt = \"<|start_header_id|>user<|end_header_id|>\\n\\n<image>\\\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n \\n\"\n",
+    "image_df = spark.read.format(\"image\").option(\"dropInvalid\", True).load(images_path)\n",
+    "test_df = image_df.withColumn(\"text\", lit(imagePrompt))\n",
+    "\n",
+    "textPrompt = \"<|start_header_id|>user<|end_header_id|>\\n\\n<sent>\\\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n \\n\"\n",
+    "textDesc = \"A cat sitting in a box.\"\n",
+    "nullImageDF = spark.createDataFrame(\n",
+    "    [EmbeddingsDataFrameUtils.emptyImageRow],\n",
+    "    schema=EmbeddingsDataFrameUtils.imageSchema,\n",
+    ")\n",
+    "textDF = nullImageDF.withColumn(\"text\", lit(textPrompt.replace(\"<sent>\", textDesc)))\n",
+    "\n",
+    "test_df = test_df.union(textDF)\n",
+    "\n",
+    "imageAssembler = ImageAssembler() \\\n",
+    "            .setInputCol(\"image\") \\\n",
+    "            .setOutputCol(\"image_assembler\")\n",
+    "e5v = E5VEmbeddings.load(f\"file:///tmp/{model_id}_spark_nlp\") \\\n",
+    "    .setInputCols([\"image_assembler\"]) \\\n",
+    "    .setOutputCol(\"e5v\")\n",
+    "pipeline = Pipeline().setStages([imageAssembler, e5v])\n",
+    "results = pipeline.fit(test_df).transform(test_df)\n",
+    "results.select(\"e5v.embeddings\").show(truncate=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "e5v",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python/README.md b/python/README.md
index 4185cb673f6df8..05088b0fb6dccf 100644
--- a/python/README.md
+++ b/python/README.md
@@ -63,7 +63,7 @@ $ java -version
 $ conda create -n sparknlp python=3.7 -y
 $ conda activate sparknlp
 # spark-nlp by default is based on pyspark 3.x
-$ pip install spark-nlp==6.0.2 pyspark==3.3.1
+$ pip install spark-nlp==6.0.3 pyspark==3.3.1
 ```
 
 In Python console or Jupyter `Python3` kernel:
@@ -129,7 +129,7 @@ For a quick example of using pipelines and models take a look at our official [d
 
 ### Apache Spark Support
 
-Spark NLP *6.0.2* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
+Spark NLP *6.0.3* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
 
 | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
 |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
@@ -159,7 +159,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
 
 ### Databricks Support
 
-Spark NLP 6.0.2 has been tested and is compatible with the following runtimes:
+Spark NLP 6.0.3 has been tested and is compatible with the following runtimes:
 
 | **CPU**            | **GPU**            |
 |--------------------|--------------------|
@@ -176,7 +176,7 @@ We are compatible with older runtimes. For a full list check databricks support
 
 ### EMR Support
 
-Spark NLP 6.0.2 has been tested and is compatible with the following EMR releases:
+Spark NLP 6.0.3 has been tested and is compatible with the following EMR releases:
 
 | **EMR Release**    |
 |--------------------|
diff --git a/python/setup.py b/python/setup.py
index 4e25a688f18c51..2c4b63df248851 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -41,7 +41,7 @@
     # project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
 
-    version='6.0.2',  # Required
+    version='6.0.3',  # Required
 
     # This is a one-line description or tagline of what your project does. This
     # corresponds to the 'Summary' metadata field:
diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py
index 11fdbd17ab7e90..ff98a0aa1fd539 100644
--- a/python/sparknlp/__init__.py
+++ b/python/sparknlp/__init__.py
@@ -66,7 +66,7 @@
 annotators = annotator
 embeddings = annotator
 
-__version__ = "6.0.2"
+__version__ = "6.0.3"
 
 
 def start(gpu=False,
diff --git a/python/sparknlp/annotator/embeddings/__init__.py b/python/sparknlp/annotator/embeddings/__init__.py
index da453d2c555037..f93ac2e3b11ec4 100644
--- a/python/sparknlp/annotator/embeddings/__init__.py
+++ b/python/sparknlp/annotator/embeddings/__init__.py
@@ -41,3 +41,4 @@
 from sparknlp.annotator.embeddings.snowflake_embeddings import *
 from sparknlp.annotator.embeddings.nomic_embeddings import *
 from sparknlp.annotator.embeddings.auto_gguf_embeddings import *
+from sparknlp.annotator.embeddings.e5v_embeddings import *
\ No newline at end of file
diff --git a/python/sparknlp/annotator/embeddings/e5v_embeddings.py b/python/sparknlp/annotator/embeddings/e5v_embeddings.py
new file mode 100644
index 00000000000000..e8ee518a40333e
--- /dev/null
+++ b/python/sparknlp/annotator/embeddings/e5v_embeddings.py
@@ -0,0 +1,138 @@
+#  Copyright 2017-2024 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+from sparknlp.common import *
+
+class E5VEmbeddings(AnnotatorModel,
+                    HasBatchedAnnotateImage,
+                    HasImageFeatureProperties,
+                    HasEngine,
+                    HasRescaleFactor):
+    """Universal multimodal embeddings using the E5-V model (see https://huggingface.co/royokong/e5-v).
+
+    E5-V bridges the modality gap between different input types (text, image) and demonstrates strong performance in multimodal embeddings, even without fine-tuning. It also supports a single-modality training approach, where the model is trained exclusively on text pairs, often yielding better performance than multimodal training.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
+
+    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \\
+    ...     .setInputCols(["image_assembler"]) \\
+    ...     .setOutputCol("e5v")
+
+    The default model is ``"e5v_int4"``, if no name is provided.
+
+    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?q=E5V>`__.
+
+    ====================== =======================
+    Input Annotation types Output Annotation type
+    ====================== =======================
+    ``IMAGE``              ``SENTENCE_EMBEDDINGS``
+    ====================== =======================
+
+    Examples
+    --------
+    Image + Text Embedding:
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> image_df = spark.read.format("image").option("dropInvalid", value = True).load(imageFolder)
+    >>> imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+    >>> test_df = image_df.withColumn("text", lit(imagePrompt))
+    >>> imageAssembler = ImageAssembler() \\
+    ...     .setInputCol("image") \\
+    ...     .setOutputCol("image_assembler")
+    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \\
+    ...     .setInputCols(["image_assembler"]) \\
+    ...     .setOutputCol("e5v")
+    >>> pipeline = Pipeline().setStages([
+    ...     imageAssembler,
+    ...     e5vEmbeddings
+    ... ])
+    >>> result = pipeline.fit(test_df).transform(test_df)
+    >>> result.select("e5v.embeddings").show(truncate = False)
+
+    Text-Only Embedding:
+    >>> from sparknlp.util import EmbeddingsDataFrameUtils
+    >>> textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+    >>> textDesc = "A cat sitting in a box."
+    >>> nullImageDF = spark.createDataFrame(spark.sparkContext.parallelize([EmbeddingsDataFrameUtils.emptyImageRow]), EmbeddingsDataFrameUtils.imageSchema)
+    >>> textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))
+    >>> imageAssembler = ImageAssembler().setInputCol("image").setOutputCol("image_assembler")
+    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \\
+    ...     .setInputCols(["image_assembler"]).setOutputCol("e5v")
+    >>> result = e5vEmbeddings.transform(imageAssembler.transform(textDF))
+    >>> result.select("e5v.embeddings").show(truncate = False)
+    """
+
+    name = "E5VEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.IMAGE]
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.E5VEmbeddings", java_model=None):
+        """Initializes the E5VEmbeddings annotator.
+
+        Parameters
+        ----------
+        classname : str, optional
+            The Java class name of the annotator, by default "com.johnsnowlabs.nlp.embeddings.E5VEmbeddings"
+        java_model : Optional[java.lang.Object], optional
+            A pre-initialized Java model, by default None
+        """
+        super(E5VEmbeddings, self).__init__(classname=classname, java_model=java_model)
+        self._setDefault()
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session, use_openvino=False):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+        use_openvino : bool, optional
+            Whether to use OpenVINO engine, by default False
+
+        Returns
+        -------
+        E5VEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _E5VEmbeddingsLoader
+        jModel = _E5VEmbeddingsLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+        return E5VEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="e5v_int4", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "e5v_int4"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        E5VEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(E5VEmbeddings, name, lang, remote_loc)
\ No newline at end of file
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
index 25f34ce4eba599..e7300ab8586b5c 100644
--- a/python/sparknlp/internal/__init__.py
+++ b/python/sparknlp/internal/__init__.py
@@ -1165,3 +1165,11 @@ def __init__(self, path, jspark, use_openvino=False):
             jspark,
             use_openvino,
         )
+class _E5VEmbeddingsLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark, use_openvino=False):
+        super(_E5VEmbeddingsLoader, self).__init__(
+            "com.johnsnowlabs.nlp.embeddings.E5VEmbeddings.loadSavedModel",
+            path,
+            jspark,
+            use_openvino
+        )
\ No newline at end of file
diff --git a/python/sparknlp/partition/partition_properties.py b/python/sparknlp/partition/partition_properties.py
index 3bea2e77610da7..a13f9167eef668 100644
--- a/python/sparknlp/partition/partition_properties.py
+++ b/python/sparknlp/partition/partition_properties.py
@@ -254,4 +254,69 @@ def setThreshold(self, value):
         return self._set(threshold=value)
 
     def getThreshold(self):
-        return self.getOrDefault(self.threshold)
\ No newline at end of file
+        return self.getOrDefault(self.threshold)
+
+class HasChunkerProperties(Params):
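+    """Mixin exposing the parameters that control how Partition output is
+    chunked: strategy selection, chunk size limits, overlap between chunks,
+    and merging of small adjacent sections."""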
+
+    chunkingStrategy = Param(
+        Params._dummy(),
+        "chunkingStrategy",
+        "Set the chunking strategy",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setChunkingStrategy(self, value):
+        return self._set(chunkingStrategy=value)
+
+    maxCharacters = Param(
+        Params._dummy(),
+        "maxCharacters",
+        "Set the maximum number of characters",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setMaxCharacters(self, value):
+        return self._set(maxCharacters=value)
+
+    newAfterNChars = Param(
+        Params._dummy(),
+        "newAfterNChars",
+        "Insert a new chunk after N characters",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setNewAfterNChars(self, value):
+        return self._set(newAfterNChars=value)
+
+    overlap = Param(
+        Params._dummy(),
+        "overlap",
+        "Set the number of overlapping characters between chunks",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setOverlap(self, value):
+        return self._set(overlap=value)
+
+    combineTextUnderNChars = Param(
+        Params._dummy(),
+        "combineTextUnderNChars",
+        "Threshold to merge adjacent small sections",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setCombineTextUnderNChars(self, value):
+        return self._set(combineTextUnderNChars=value)
+
+    overlapAll = Param(
+        Params._dummy(),
+        "overlapAll",
+        "Apply overlap context between all sections, not just split chunks",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setOverlapAll(self, value):
+        return self._set(overlapAll=value)
diff --git a/python/sparknlp/partition/partition_transformer.py b/python/sparknlp/partition/partition_transformer.py
index 0598c3aaa20af2..a971bb44ae78b7 100644
--- a/python/sparknlp/partition/partition_transformer.py
+++ b/python/sparknlp/partition/partition_transformer.py
@@ -15,13 +15,15 @@
 from sparknlp.common import *
 from sparknlp.partition.partition_properties import *
 
+
 class PartitionTransformer(
     AnnotatorModel,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
     HasHTMLReaderProperties,
     HasPowerPointProperties,
-    HasTextReaderProperties
+    HasTextReaderProperties,
+    HasChunkerProperties
 ):
     """
     The PartitionTransformer annotator allows you to use the Partition feature more smoothly
@@ -162,10 +164,6 @@ def setIncludePageBreaks(self, value):
     def getIncludePageBreaks(self):
         return self.getOrDefault(self.includePageBreaks)
 
-    # def setHeaders(self, headers: Dict[str, str]):
-    #     self._call_java("setHeadersPython", headers)
-    #     return self
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
                  java_model=None):
@@ -192,5 +190,11 @@ def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
             paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
             shortLineWordThreshold=5,
             maxLineCount=2000,
-            threshold=0.1
-        )
+            threshold=0.1,
+            chunkingStrategy="",
+            maxCharacters=100,
+            newAfterNChars=-1,
+            overlap=0,
+            combineTextUnderNChars=0,
+            overlapAll=False
+        )
\ No newline at end of file
diff --git a/python/sparknlp/reader/sparknlp_reader.py b/python/sparknlp/reader/sparknlp_reader.py
index dfd865116f3821..86bf5781053050 100644
--- a/python/sparknlp/reader/sparknlp_reader.py
+++ b/python/sparknlp/reader/sparknlp_reader.py
@@ -322,4 +322,49 @@ def txt(self, docPath):
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")
         jdf = self._java_obj.txt(docPath)
+        return self.getDataFrame(self.spark, jdf)
+
+    def xml(self, docPath):
+        """Reads XML files and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        docPath : str
+            Path to an XML file or a directory containing XML files.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed XML content.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory")
+
+        You can use SparkNLP for one line of code
+
+        >>> import sparknlp
+        >>> xml_df = sparknlp.read().xml("home/user/xml-directory")
+        >>> xml_df.show(truncate=False)
+        +-----------------------------------------------------------+
+        |xml                                                       |
+        +-----------------------------------------------------------+
+        |[{Title, John Smith, {elementId -> ..., tag -> title}}]   |
+        +-----------------------------------------------------------+
+
+        >>> xml_df.printSchema()
+        root
+         |-- path: string (nullable = true)
+         |-- xml: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        if not isinstance(docPath, str):
+            raise TypeError("docPath must be a string")
+        jdf = self._java_obj.xml(docPath)
         return self.getDataFrame(self.spark, jdf)
\ No newline at end of file
diff --git a/python/sparknlp/util.py b/python/sparknlp/util.py
index 0bbacd410a9e8f..8381337b5873a5 100644
--- a/python/sparknlp/util.py
+++ b/python/sparknlp/util.py
@@ -15,6 +15,9 @@
 
 
 import sparknlp.internal as _internal
+import numpy as np
+from pyspark.sql import Row
+from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BinaryType
 
 
 def get_config_path():
@@ -33,3 +36,31 @@ def exportConllFiles(*args):
             _internal._CoNLLGeneratorExportFromTargetAndPipeline(*args).apply()
         else:
             raise NotImplementedError(f"No exportConllFiles alternative takes {num_args} parameters")
+
+
+class EmbeddingsDataFrameUtils:
+    """
+    Utility for creating DataFrames compatible with multimodal embedding models (e.g., E5VEmbeddings) for text-only scenarios.
+    Provides:
+      - imageSchema: the expected schema for Spark image DataFrames
+      - emptyImageRow: a dummy image row for text-only embedding
+    """
+    imageSchema = StructType([
+        StructField(
+            "image",
+            StructType([
+                StructField("origin", StringType(), True),
+                StructField("height", IntegerType(), True),
+                StructField("width", IntegerType(), True),
+                StructField("nChannels", IntegerType(), True),
+                StructField("mode", IntegerType(), True),
+                StructField("data", BinaryType(), True),
+            ]),
+        )
+    ])
+    emptyImageRow = Row(Row("", 0, 0, 0, 0, bytes()))
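+
+    # Hypothetical usage sketch, building a text-only input for E5VEmbeddings:
+    #   df = spark.createDataFrame([EmbeddingsDataFrameUtils.emptyImageRow],
+    #                              schema=EmbeddingsDataFrameUtils.imageSchema)
+    #   df = df.withColumn("text", lit("A cat sitting in a box."))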
diff --git a/python/test/annotator/embeddings/e5v_embeddings_test.py b/python/test/annotator/embeddings/e5v_embeddings_test.py
new file mode 100644
index 00000000000000..249484232284d3
--- /dev/null
+++ b/python/test/annotator/embeddings/e5v_embeddings_test.py
@@ -0,0 +1,64 @@
+#  Copyright 2017-2024 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import os
+import unittest
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from pyspark.ml import Pipeline
+from pyspark.sql.functions import lit
+from test.util import SparkContextForTest
+
+@pytest.mark.slow
+class E5VEmbeddingsTestSpec(unittest.TestCase):
+    def setUp(self):
+        self.spark = SparkContextForTest.spark
+        self.images_path = "file://"+os.getcwd() + "/../src/test/resources/image/"
+
+    def test_image_and_text_embedding(self):
+        # Simulate image+text embedding (requires actual image files for full test)
+        image_folder = os.environ.get("E5V_IMAGE_TEST_FOLDER", self.images_path)
+        imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+        image_df = self.spark.read.format("image").option("dropInvalid", True).load(image_folder)
+        test_df = image_df.withColumn("text", lit(imagePrompt))
+
+        imageAssembler = ImageAssembler() \
+            .setInputCol("image") \
+            .setOutputCol("image_assembler")
+        e5v = E5VEmbeddings.pretrained() \
+            .setInputCols(["image_assembler"]) \
+            .setOutputCol("e5v")
+        pipeline = Pipeline().setStages([imageAssembler, e5v])
+        results = pipeline.fit(test_df).transform(test_df)
+        results.select("e5v.embeddings").show(truncate=True)
+
+    def test_text_only_embedding(self):
+        # Simulate text-only embedding using emptyImageRow and imageSchema
+        from sparknlp.util import EmbeddingsDataFrameUtils
+        textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+        textDesc = "A cat sitting in a box."
+        nullImageDF = self.spark.createDataFrame(
+            self.spark.sparkContext.parallelize([EmbeddingsDataFrameUtils.emptyImageRow]),
+            EmbeddingsDataFrameUtils.imageSchema)
+        textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))
+        imageAssembler = ImageAssembler() \
+            .setInputCol("image") \
+            .setOutputCol("image_assembler")
+        e5v = E5VEmbeddings.pretrained() \
+            .setInputCols(["image_assembler"]) \
+            .setOutputCol("e5v")
+        pipeline = Pipeline().setStages([imageAssembler, e5v])
+        results = pipeline.fit(textDF).transform(textDF)
+        results.select("e5v.embeddings").show(truncate=True)
\ No newline at end of file
diff --git a/python/test/partition/partition_transformer_test.py b/python/test/partition/partition_transformer_test.py
index decd1fcb176e16..270486c561eece 100644
--- a/python/test/partition/partition_transformer_test.py
+++ b/python/test/partition/partition_transformer_test.py
@@ -11,6 +11,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import os
 import unittest
 
 import pytest
@@ -80,4 +81,33 @@ def runTest(self):
         resultDf = pipelineModel.transform(self.testDataSet)
         resultDf.show(truncate=False)
 
-        self.assertTrue(resultDf.select("partition").count() > 0)
\ No newline at end of file
+        self.assertTrue(resultDf.select("partition").count() > 0)
+
+
+@pytest.mark.fast
+class PartitionTransformerChunkTestSpec(unittest.TestCase):
+
+    def setUp(self):
+        self.spark = SparkContextForTest.spark
+        self.content_path = f"file:///{os.getcwd()}/../src/test/resources/reader/txt/long-text.txt"
+        self.testDataSet = self.spark.createDataFrame(
+            [("An example with DocumentAssembler annotator",)],
+            ["text"]
+        )
+        self.emptyDataSet = self.spark.createDataFrame([], self.testDataSet.schema)
+
+    def runTest(self):
+        partition = PartitionTransformer() \
+            .setInputCols(["text"]) \
+            .setContentPath(self.content_path) \
+            .setOutputCol("partition") \
+            .setChunkingStrategy("basic") \
+            .setMaxCharacters(140)
+
+        pipeline = Pipeline(stages=[partition])
+        pipelineModel = pipeline.fit(self.emptyDataSet)
+
+        resultDf = pipelineModel.transform(self.emptyDataSet)
+        resultDf.show(truncate=False)
+
+        self.assertTrue(resultDf.select("partition").count() >= 0)
\ No newline at end of file
diff --git a/python/test/sparknlp_test.py b/python/test/sparknlp_test.py
index 68ea10b36476bf..c2baa14fec213d 100644
--- a/python/test/sparknlp_test.py
+++ b/python/test/sparknlp_test.py
@@ -125,4 +125,18 @@ def runTest(self):
         txt_df = sparknlp.read().txt(self.txt_file)
         txt_df.show()
 
-        self.assertTrue(txt_df.select("txt").count() > 0)
\ No newline at end of file
+        self.assertTrue(txt_df.select("txt").count() > 0)
+
+
+@pytest.mark.fast
+class SparkNLPTestXMLFilesSpec(unittest.TestCase):
+
+    def setUp(self):
+        self.data = SparkContextForTest.data
+        self.xml_files = f"file:///{os.getcwd()}/../src/test/resources/reader/xml"
+
+    def runTest(self):
+        xml_df = sparknlp.read().xml(self.xml_files)
+        xml_df.show()
+
+        self.assertTrue(xml_df.select("xml").count() > 0)
\ No newline at end of file
diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh
index 3e4afce648ec51..08c3e4b9b4a3e3 100644
--- a/scripts/colab_setup.sh
+++ b/scripts/colab_setup.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 #default values for pyspark, spark-nlp, and SPARK_HOME
-SPARKNLP="6.0.2"
+SPARKNLP="6.0.3"
 PYSPARK="3.4.4"
 
 while getopts s:p:g option; do
diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh
index 98c89af03f80a5..105fc4e3d96a3f 100644
--- a/scripts/kaggle_setup.sh
+++ b/scripts/kaggle_setup.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 #default values for pyspark, spark-nlp, and SPARK_HOME
-SPARKNLP="6.0.2"
+SPARKNLP="6.0.3"
 PYSPARK="3.2.3"
 
 while getopts s:p:g option
diff --git a/scripts/sagemaker_setup.sh b/scripts/sagemaker_setup.sh
index f27b949f49994a..113135444eaca2 100644
--- a/scripts/sagemaker_setup.sh
+++ b/scripts/sagemaker_setup.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # Default values for pyspark, spark-nlp, and SPARK_HOME
-SPARKNLP="6.0.2"
+SPARKNLP="6.0.3"
 PYSPARK="3.2.3"
 
 echo "Setup SageMaker for PySpark $PYSPARK and Spark NLP $SPARKNLP"
diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/E5V.scala b/src/main/scala/com/johnsnowlabs/ml/ai/E5V.scala
new file mode 100644
index 00000000000000..6653fe97cdebcb
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/E5V.scala
@@ -0,0 +1,411 @@
+/*
+ * Copyright 2017-2022 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.ml.ai
+
+import com.johnsnowlabs.ml.ai.util.Generation.GenerationConfig
+import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
+import com.johnsnowlabs.ml.openvino.OpenvinoWrapper.E5VWrappers
+import com.johnsnowlabs.ml.util.{ONNX, Openvino}
+import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
+import com.johnsnowlabs.nlp._
+import com.johnsnowlabs.nlp.annotators.common.SentenceSplit
+import com.johnsnowlabs.ml.ai.util.transform.E5VUtils
+import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor
+import com.johnsnowlabs.nlp.annotators.cv.util.io.ImageIOUtils
+import com.johnsnowlabs.nlp.annotators.cv.util.transform.ImageResizeUtils
+import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BpeTokenizer, LLAVATokenizer, SpecialTokens}
+import org.intel.openvino.InferRequest
+
+private[johnsnowlabs] class E5V(
+    val onnxWrappers: Option[DecoderWrappers],
+    val openvinoWrapper: Option[E5VWrappers],
+    merges: Map[(String, String), Int],
+    vocabulary: Map[String, Int],
+    addedTokens: Map[String, Int],
+    preprocessor: Preprocessor,
+    generationConfig: GenerationConfig,
+    imageToken: Int,
+    imageGridPinpoints: Map[Int, Array[Int]],
+    patchSize: Int)
+    extends Serializable {
+
+  val detectedEngine: String =
+    if (onnxWrappers.isDefined) ONNX.name
+    else if (openvinoWrapper.isDefined) Openvino.name
+    else Openvino.name // default to OpenVINO when no wrapper is provided
+
+  private val GenerationConfig(
+    bosTokenId: Int,
+    paddingTokenId: Int,
+    eosTokenId: Int,
+    vocabSize: Int,
+    beginSuppressTokens,
+    suppressTokenIds,
+    forcedDecoderIds) =
+    generationConfig
+  val reversedVocabulary: Map[Int, String] = vocabulary.map(_.swap)
+  val specialTokens: SpecialTokens = SpecialTokens(
+    vocabulary,
+    startTokenString = reversedVocabulary(bosTokenId),
+    endTokenString = reversedVocabulary(eosTokenId),
+    unkTokenString = reversedVocabulary(eosTokenId),
+    maskTokenString = reversedVocabulary(eosTokenId),
+    padTokenString = reversedVocabulary(paddingTokenId),
+    additionalStrings = addedTokens.keys.toArray)
+
+  val bpeTokenizer: LLAVATokenizer = BpeTokenizer
+    .forModel(
+      "llava",
+      merges = merges,
+      vocab = vocabulary,
+      specialTokens = Some(specialTokens),
+      addPrefixSpaceToSentence = false,
+      alwaysAddPrefix = false,
+      prependString = "")
+    .asInstanceOf[LLAVATokenizer]
+
+  /** Decode a sequence of sentences
+    * @param sentences
+    *   Sequence of sentences
+    * @return
+    *   Sequence of decoded sentences
+    */
+  def decode(sentences: Array[Array[Int]]): Seq[String] = {
+    sentences.map(s => bpeTokenizer.decodeTokens(s.map(_.toInt)))
+  }
+
+  /** Encode a sequence of sentences
+    * @param sentences
+    *   Sequence of sentences
+    * @return
+    *   Sequence of encoded sentences
+    */
+  def encodeText(sentences: Seq[Annotation]): Seq[Array[Int]] = {
+
+    val tokens = SentenceSplit
+      .unpack(sentences)
+      .map { sentence =>
+        bpeTokenizer
+          .tokenize(sentence)
+          .map(bpeTokenizer.encode)
+          .flatMap(_.map(_.pieceId))
+      }
+    tokens
+  }
+
+  def encode(
+      imageAnnotations: Seq[AnnotationImage],
+      sentences: Seq[Annotation],
+      preprocessor: Preprocessor): (
+      Seq[Array[Int]],
+      Option[Array[Array[Array[Array[Array[Float]]]]]],
+      Option[Array[(Int, Int)]]) = {
+    val encodedText = encodeText(sentences).toArray
+
+    // keep only image annotations whose height and width are > 0
+    val imageAnnotationsFiltered =
+      imageAnnotations.filter(annot => annot.width > 0 && annot.height > 0)
+
+    val preprocessedImages = if (imageAnnotationsFiltered.nonEmpty) {
+      Some(encodeImage(imageAnnotationsFiltered.toArray, preprocessor))
+    } else {
+      None
+    }
+    val imageSizes = if (imageAnnotationsFiltered.nonEmpty) {
+      Some(imageAnnotationsFiltered.map(annot => (annot.width, annot.height)).toArray)
+    } else {
+      None
+    }
+
+    (encodedText, preprocessedImages, imageSizes)
+  }
+
+  def tag(
+      batch: Seq[Array[Int]],
+      images: Option[Array[Array[Array[Array[Array[Float]]]]]],
+      imageSizes: Option[Array[(Int, Int)]]): Array[Array[Float]] = {
+
+    val pixelValues = images
+    val expandedDecoderInputsVals = batch
+
+    val inferRequestLanguageModel =
+      openvinoWrapper.get.languageModel.getCompiledModel().create_infer_request()
+    val inferRequestVisionEmbeddingsModel =
+      openvinoWrapper.get.visionEmbeddingsModel.getCompiledModel().create_infer_request()
+    val inferRequestTextEmbeddingsModel =
+      openvinoWrapper.get.textEmbeddingsModel.getCompiledModel().create_infer_request()
+    val inferRequestImagePackerModel =
+      openvinoWrapper.get.imagePackerModel.getCompiledModel().create_infer_request()
+    val inferRequestMergeModel =
+      openvinoWrapper.get.mergeModel.getCompiledModel().create_infer_request()
+
+    val generatedEmbeddings = getModelOutputs(
+      decoderInputIds = expandedDecoderInputsVals.toArray,
+      pixelValues = pixelValues,
+      imageSizes = imageSizes,
+      inferRequestLanguageModel = inferRequestLanguageModel,
+      inferRequestVisionEmbeddingsModel = inferRequestVisionEmbeddingsModel,
+      inferRequestTextEmbeddingsModel = inferRequestTextEmbeddingsModel,
+      inferRequestImagePackerModel = inferRequestImagePackerModel,
+      inferRequestMergeModel = inferRequestMergeModel)
+    generatedEmbeddings
+  }
+
+  def predict(
+      sentences: Seq[Annotation],
+      imageAnnotations: Seq[AnnotationImage]): Seq[Annotation] = {
+
+    val (encodedText, preprocessedImages, imageSizes) =
+      encode(imageAnnotations, sentences, preprocessor)
+    val sentenceEmbeddings = tag(encodedText, preprocessedImages, imageSizes)
+
+    val annotations = sentences.zip(sentenceEmbeddings).map { case (sentence, vectors) =>
+      Annotation(
+        annotatorType = AnnotatorType.SENTENCE_EMBEDDINGS,
+        begin = sentence.begin,
+        end = sentence.end,
+        result = sentence.result,
+        metadata = sentence.metadata,
+        embeddings = vectors)
+    }
+    annotations
+  }
+
+  def getModelOutputs(
+      decoderInputIds: Array[Array[Int]],
+      pixelValues: Option[Array[Array[Array[Array[Array[Float]]]]]],
+      imageSizes: Option[Array[(Int, Int)]],
+      inferRequestLanguageModel: InferRequest,
+      inferRequestVisionEmbeddingsModel: InferRequest,
+      inferRequestTextEmbeddingsModel: InferRequest,
+      inferRequestImagePackerModel: InferRequest,
+      inferRequestMergeModel: InferRequest): Array[Array[Float]] = {
+
+    val (inputIdsLong, inputPositionIDsLong): (Array[Long], Array[Long]) = {
+      // First pass
+      val inpIdsLong = decoderInputIds.flatMap { tokenIds => tokenIds.map(_.toLong) }
+      val posIdsLong = decoderInputIds.flatMap { tokenIds =>
+        tokenIds.zipWithIndex.map { case (_, i) =>
+          i.toLong
+        }
+      }
+      (inpIdsLong, posIdsLong)
+    }
+
+    val attentionMask: Array[Long] = decoderInputIds.flatMap { tokenIds => tokenIds.map(_ => 1L) }
+    val batchSize: Int = decoderInputIds.length
+    val shape: Array[Int] = Array(batchSize, inputIdsLong.length / batchSize)
+
+    val decoderAttentionMask: org.intel.openvino.Tensor =
+      new org.intel.openvino.Tensor(Array(batchSize, decoderInputIds.head.length), attentionMask)
+    val decoderPositionIDs: org.intel.openvino.Tensor =
+      new org.intel.openvino.Tensor(shape, inputPositionIDsLong)
+
+    val (finalEmbeds, finalAttentionMask, finalPositionIds) = getMultimodalEmbeddings(
+      decoderInputIds,
+      pixelValues,
+      imageSizes,
+      decoderAttentionMask,
+      inferRequestVisionEmbeddingsModel,
+      inferRequestTextEmbeddingsModel,
+      inferRequestImagePackerModel,
+      inferRequestMergeModel)
+
+    inferRequestLanguageModel.set_tensor("inputs_embeds", finalEmbeds)
+    if (finalAttentionMask.isDefined) {
+      val finalAttentionMaskFloatTensor = new org.intel.openvino.Tensor(
+        finalAttentionMask.get.get_shape(),
+        // flat array of 1.0f values matching the merged attention mask shape
+        Array.fill(finalAttentionMask.get.get_shape().product)(1.0f))
+      inferRequestLanguageModel.set_tensor("attention_mask", finalAttentionMaskFloatTensor)
+    } else {
+      val attentionMaskFloat: Array[Float] =
+        decoderInputIds.flatMap { tokenIds => tokenIds.map(_ => 1f) }
+      val attentionMaskFloatTensor =
+        new org.intel.openvino.Tensor(
+          Array(batchSize, decoderInputIds.head.length),
+          attentionMaskFloat)
+      inferRequestLanguageModel.set_tensor("attention_mask", attentionMaskFloatTensor)
+    }
+    if (finalPositionIds.isDefined) {
+      inferRequestLanguageModel.set_tensor("position_ids", finalPositionIds.get)
+    } else {
+      inferRequestLanguageModel.set_tensor("position_ids", decoderPositionIDs)
+    }
+    inferRequestLanguageModel.infer()
+
+    val result = inferRequestLanguageModel.get_tensor("last_hidden_state")
+    val hiddenStateData = result.data()
+    val hiddenStateShape = result.get_shape()
+    val batchSizeResult = hiddenStateShape(0)
+    val hiddenSize = hiddenStateShape(1)
+    // Reshape to (batch, hidden_size) and return as Array[Array[Float]]
+    Array.tabulate(batchSizeResult) { b =>
+      val start = b * hiddenSize
+      val end = start + hiddenSize
+      hiddenStateData.slice(start, end)
+    }
+
+  }
+
+  private def encodeImage(
+      annotations: Array[AnnotationImage],
+      preprocessor: Preprocessor): Array[Array[Array[Array[Array[Float]]]]] = {
+
+    val batchProcessedImages = annotations.map { annot =>
+      val bufferedImage = ImageIOUtils.byteToBufferedImage(
+        bytes = annot.result,
+        w = annot.width,
+        h = annot.height,
+        nChannels = annot.nChannels)
+      val bestResolution = E5VUtils.selectBestResolution(
+        (bufferedImage.getHeight, bufferedImage.getWidth),
+        imageGridPinpoints.map { case (_, pinpoints) =>
+          (pinpoints(0), pinpoints(1))
+        }.toList)
+
+      val (newHeight, newWidth) = E5VUtils.getPatchOutputSize(bufferedImage, bestResolution)
+      val resizedForPatches = ImageResizeUtils.resizeBufferedImage(
+        width = newWidth,
+        height = newHeight,
+        resample = preprocessor.resample)(bufferedImage)
+
+      val paddedForPatches = E5VUtils.padImage(resizedForPatches, bestResolution)
+
+      var patches = E5VUtils.divideToPatches(paddedForPatches, patchSize)
+
+      // add the reshaped original image as the first patch
+      val resizedOriginalImage = ImageResizeUtils.resizeBufferedImage(
+        width = preprocessor.size,
+        height = preprocessor.size,
+        resample = preprocessor.resample)(bufferedImage)
+
+      patches = List(resizedOriginalImage) ++ patches
+      patches.map { patch =>
+        ImageResizeUtils.normalizeAndConvertBufferedImage(
+          img = patch,
+          mean = preprocessor.image_mean,
+          std = preprocessor.image_std,
+          doNormalize = preprocessor.do_normalize,
+          doRescale = preprocessor.do_rescale,
+          rescaleFactor = preprocessor.rescale_factor)
+      }.toArray
+    }
+
+    batchProcessedImages
+
+  }
+
+  def getMultimodalEmbeddings(
+      inputIds: Array[Array[Int]],
+      pixelValues: Option[Array[Array[Array[Array[Array[Float]]]]]],
+      imageSizes: Option[Array[(Int, Int)]],
+      attentionMask: org.intel.openvino.Tensor,
+      inferRequestVisionEmbeddingsModel: InferRequest,
+      inferRequestTextEmbeddingsModel: InferRequest,
+      inferRequestImagePackerModel: InferRequest,
+      inferRequestMergeModel: InferRequest): (
+      org.intel.openvino.Tensor,
+      Option[org.intel.openvino.Tensor],
+      Option[org.intel.openvino.Tensor]) = {
+
+    val inputIdsLong: Array[Long] = inputIds.flatMap(_.map(_.toLong))
+    val batchSize: Int = inputIds.length
+    val shape: Array[Int] = Array(batchSize, inputIdsLong.length / batchSize)
+    val inputIdsLongTensor = new org.intel.openvino.Tensor(shape, inputIdsLong)
+
+    // If pixelValues and imageSizes are present, do multimodal
+    (pixelValues, imageSizes, attentionMask) match {
+      case (Some(pixels), Some(sizes), attnMask) if pixels.nonEmpty && sizes.nonEmpty =>
+        // 1. Get image features
+        val pixelShape = Array(
+          pixels.length,
+          pixels.head.length,
+          pixels.head.head.length,
+          pixels.head.head.head.length,
+          pixels.head.head.head.head.length)
+        // Flatten the pixel values to match the expected input shape
+        val flattenedPixels = pixels.flatten.flatten.flatten.flatten
+        val pixelTensor =
+          new org.intel.openvino.Tensor(pixelShape, flattenedPixels)
+
+        inferRequestVisionEmbeddingsModel.set_tensor("pixel_values", pixelTensor)
+        inferRequestVisionEmbeddingsModel.infer()
+        val imageFeatures = inferRequestVisionEmbeddingsModel.get_output_tensor()
+
+        // 2. Compute patch grid shape from the image size and configured grid pinpoints
+        val (numPatchHeight, numPatchWidth) =
+          E5VUtils.getAnyResImageGridShape(
+            sizes.head,
+            imageGridPinpoints.map { case (_, pinpoints) =>
+              (pinpoints(0), pinpoints(1))
+            }.toList,
+            preprocessor.size)
+
+        // 3. Pack image features
+        val imageSizesTensor = new org.intel.openvino.Tensor(
+          Array(sizes.length, 2),
+          sizes.flatMap(t => Array(t._1.toLong, t._2.toLong)))
+
+        val numPatchHeightTensor =
+          new org.intel.openvino.Tensor(Array[Int](), Array(numPatchHeight.toLong))
+
+        val numPatchWidthTensor =
+          new org.intel.openvino.Tensor(Array[Int](), Array(numPatchWidth.toLong))
+
+        inferRequestImagePackerModel.set_tensor("image_feature", imageFeatures)
+        inferRequestImagePackerModel.set_tensor("image_sizes", imageSizesTensor)
+        inferRequestImagePackerModel.set_tensor("num_patch_height", numPatchHeightTensor)
+        inferRequestImagePackerModel.set_tensor("num_patch_width", numPatchWidthTensor)
+        inferRequestImagePackerModel.infer()
+
+        val packedImageFeatures = inferRequestImagePackerModel.get_output_tensor()
+
+        // 4. Get text embeddings
+        inferRequestTextEmbeddingsModel.set_input_tensor(inputIdsLongTensor)
+        inferRequestTextEmbeddingsModel.infer()
+        val textEmbeddings = inferRequestTextEmbeddingsModel.get_output_tensor()
+
+        // 5. Merge image and text embeddings
+        inferRequestMergeModel.set_tensor("image_features", packedImageFeatures)
+        inferRequestMergeModel.set_tensor("inputs_embeds", textEmbeddings)
+        inferRequestMergeModel.set_tensor("input_ids", inputIdsLongTensor)
+
+        inferRequestMergeModel.set_tensor("attention_mask", attnMask)
+        inferRequestMergeModel.infer()
+        (
+          inferRequestMergeModel.get_tensor("final_embedding"),
+          Some(inferRequestMergeModel.get_tensor("final_attention_mask")),
+          Some(inferRequestMergeModel.get_tensor("position_ids")))
+      case _ =>
+        // Text-only
+        inferRequestTextEmbeddingsModel.set_input_tensor(inputIdsLongTensor)
+        inferRequestTextEmbeddingsModel.infer()
+        (inferRequestTextEmbeddingsModel.get_output_tensor(), None, None)
+    }
+  }
+
+}
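
For reference, `predict` above emits one `SENTENCE_EMBEDDINGS` annotation per input row, and these vectors are typically compared with cosine similarity for cross-modal retrieval. A hedged sketch; `cosineSimilarity` is a hypothetical helper, not part of this diff:

```scala
// Hypothetical helper (not part of this diff): cosine similarity between two
// embedding vectors taken from Annotation.embeddings.
def cosineSimilarity(a: Array[Float], b: Array[Float]): Double = {
  require(a.length == b.length, "embedding dimensions must match")
  val dot = a.zip(b).map { case (x, y) => x.toDouble * y.toDouble }.sum
  val normA = math.sqrt(a.map(x => x.toDouble * x).sum)
  val normB = math.sqrt(b.map(x => x.toDouble * x).sum)
  dot / (normA * normB)
}
```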
diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/util/transform/E5VUtils.scala b/src/main/scala/com/johnsnowlabs/ml/ai/util/transform/E5VUtils.scala
new file mode 100644
index 00000000000000..a2561d2bd28613
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/util/transform/E5VUtils.scala
@@ -0,0 +1,134 @@
+package com.johnsnowlabs.ml.ai.util.transform
+
+import java.awt.image.BufferedImage
+import java.awt.{Color, Graphics2D}
+
+object ChannelDimension extends Enumeration {
+  type ChannelDimension = Value
+  val FIRST, LAST = Value
+}
+
+object E5VUtils {
+  import ChannelDimension._
+
+  def selectBestResolution(
+      originalSize: (Int, Int),
+      possibleResolutions: List[(Int, Int)]): (Int, Int) = {
+    val (originalHeight, originalWidth) = originalSize
+    var bestFit: (Int, Int) = possibleResolutions.head
+    var maxEffectiveResolution = 0
+    var minWastedResolution = Double.PositiveInfinity
+
+    for ((height, width) <- possibleResolutions) {
+      val scale = math.min(width.toDouble / originalWidth, height.toDouble / originalHeight)
+      val downscaledWidth = (originalWidth * scale).toInt
+      val downscaledHeight = (originalHeight * scale).toInt
+      val effectiveResolution =
+        math.min(downscaledWidth * downscaledHeight, originalWidth * originalHeight)
+      val wastedResolution = (width * height) - effectiveResolution
+
+      if (effectiveResolution > maxEffectiveResolution ||
+        (effectiveResolution == maxEffectiveResolution && wastedResolution < minWastedResolution)) {
+        maxEffectiveResolution = effectiveResolution
+        minWastedResolution = wastedResolution
+        bestFit = (height, width)
+      }
+    }
+    bestFit
+  }
+
+  def imageSizeToNumPatches(
+      imageSize: (Int, Int),
+      gridPinpoints: List[(Int, Int)],
+      patchSize: Int): Int = {
+    val (height, width) = selectBestResolution(imageSize, gridPinpoints)
+    val numPatches = (0 until height by patchSize).size * (0 until width by patchSize).size
+    // add the base patch
+    numPatches + 1
+  }
+
+  def getAnyResImageGridShape(
+      imageSize: (Int, Int),
+      gridPinpoints: List[(Int, Int)],
+      patchSize: Int): (Int, Int) = {
+    val (height, width) = selectBestResolution(imageSize, gridPinpoints)
+    (height / patchSize, width / patchSize)
+  }
+
+  def getImageSize(image: BufferedImage): (Int, Int) = {
+    (image.getHeight, image.getWidth)
+  }
+
+  def expandToSquare(image: BufferedImage, backgroundColor: Color): BufferedImage = {
+    val width = image.getWidth
+    val height = image.getHeight
+    if (width == height) {
+      image
+    } else if (width > height) {
+      val result = new BufferedImage(width, width, image.getType)
+      val g = result.createGraphics()
+      g.setColor(backgroundColor)
+      g.fillRect(0, 0, width, width)
+      g.drawImage(image, 0, (width - height) / 2, null)
+      g.dispose()
+      result
+    } else {
+      val result = new BufferedImage(height, height, image.getType)
+      val g = result.createGraphics()
+      g.setColor(backgroundColor)
+      g.fillRect(0, 0, height, height)
+      g.drawImage(image, (height - width) / 2, 0, null)
+      g.dispose()
+      result
+    }
+  }
+
+  def divideToPatches(image: BufferedImage, patchSize: Int): List[BufferedImage] = {
+    val width = image.getWidth
+    val height = image.getHeight
+    val patches = for {
+      i <- 0 until height by patchSize
+      j <- 0 until width by patchSize
+    } yield {
+      val w = math.min(patchSize, width - j)
+      val h = math.min(patchSize, height - i)
+      image.getSubimage(j, i, w, h)
+    }
+    patches.toList
+  }
+
+  def getPatchOutputSize(image: BufferedImage, targetResolution: (Int, Int)): (Int, Int) = {
+    val (originalHeight, originalWidth) = getImageSize(image)
+    val (targetHeight, targetWidth) = targetResolution
+
+    val scaleW = targetWidth.toDouble / originalWidth
+    val scaleH = targetHeight.toDouble / originalHeight
+
+    if (scaleW < scaleH) {
+      val newWidth = targetWidth
+      val newHeight = math.min(math.ceil(originalHeight * scaleW).toInt, targetHeight)
+      (newHeight, newWidth)
+    } else {
+      val newHeight = targetHeight
+      val newWidth = math.min(math.ceil(originalWidth * scaleH).toInt, targetWidth)
+      (newHeight, newWidth)
+    }
+  }
+
+  def padImage(image: BufferedImage, targetResolution: (Int, Int)): BufferedImage = {
+    val (targetHeight, targetWidth) = targetResolution
+    val (originalHeight, originalWidth) = getImageSize(image)
+    val (newHeight, newWidth) = getPatchOutputSize(image, targetResolution)
+    val result = new BufferedImage(targetWidth, targetHeight, image.getType)
+    val g = result.createGraphics()
+    g.setColor(Color.BLACK)
+    g.fillRect(0, 0, newWidth, newHeight)
+    g.drawImage(
+      image,
+      (targetWidth - originalWidth) / 2,
+      (targetHeight - originalHeight) / 2,
+      null)
+    g.dispose()
+    result
+  }
+}
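
The utilities above mirror the LLaVA-NeXT "any resolution" preprocessing. A hedged sketch of how they compose; the pinpoint values and image size below are illustrative, not taken from a real model config:

```scala
import com.johnsnowlabs.ml.ai.util.transform.E5VUtils

val pinpoints = List((336, 672), (672, 336), (672, 672)) // illustrative (height, width) pinpoints
val originalSize = (480, 640)                            // (height, width) of the source image

// Pick the pinpoint resolution that preserves the most effective pixels.
val best = E5VUtils.selectBestResolution(originalSize, pinpoints)

// Derive the patch grid used by the image packer, assuming a 336-pixel patch size.
val (rows, cols) = E5VUtils.getAnyResImageGridShape(originalSize, pinpoints, 336)
```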
diff --git a/src/main/scala/com/johnsnowlabs/ml/openvino/OpenvinoWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/openvino/OpenvinoWrapper.scala
index 274e085325aaf3..961995ffdd1511 100644
--- a/src/main/scala/com/johnsnowlabs/ml/openvino/OpenvinoWrapper.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/openvino/OpenvinoWrapper.scala
@@ -285,4 +285,10 @@ object OpenvinoWrapper {
       textEmbeddingsModel: OpenvinoWrapper,
       imageEmbedModel: OpenvinoWrapper,
       modelMergerModel: OpenvinoWrapper)
+  case class E5VWrappers(
+      languageModel: OpenvinoWrapper,
+      visionEmbeddingsModel: OpenvinoWrapper,
+      textEmbeddingsModel: OpenvinoWrapper,
+      imagePackerModel: OpenvinoWrapper,
+      mergeModel: OpenvinoWrapper)
 }
diff --git a/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala b/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala
index ae620dc78cbaa5..839346b79453b5 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala
@@ -152,7 +152,21 @@ class ImageAssembler(override val uid: String)
           result = image.get.data,
           metadata = metadata,
           text = text.getOrElse("")))
-    } else Seq.empty
+    } else if (text.isDefined) {
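+      // text-only input: emit a placeholder AnnotationImage so multimodal
+      // annotators can still consume the text (see EmbeddingsDataFrameUtils)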
+      Seq(
+        AnnotationImage(
+          annotatorType = outputAnnotatorType,
+          origin = "",
+          height = 0,
+          width = 0,
+          nChannels = 0,
+          mode = 0,
+          result = Array.emptyByteArray,
+          metadata = metadata,
+          text = text.getOrElse("")))
+    } else {
+      Seq.empty[AnnotationImage]
+    }
 
   }
 
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5VEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5VEmbeddings.scala
new file mode 100644
index 00000000000000..657d012c04734e
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5VEmbeddings.scala
@@ -0,0 +1,641 @@
+/*
+ * Copyright 2017-2024 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.nlp.embeddings
+
+import com.johnsnowlabs.ml.ai.util.Generation.GenerationConfig
+import com.johnsnowlabs.ml.ai.E5V
+import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
+import com.johnsnowlabs.ml.util.LoadExternalModel.{
+  loadJsonStringAsset,
+  loadTextAsset,
+  modelSanityCheck,
+  notSupportedEngineError
+}
+import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor
+import com.johnsnowlabs.ml.util.Openvino
+import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, IMAGE, SENTENCE_EMBEDDINGS}
+import com.johnsnowlabs.nlp._
+import org.json4s.{DefaultFormats, JValue}
+import org.json4s.jackson.JsonMethods.parse
+import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel}
+import com.johnsnowlabs.ml.openvino.OpenvinoWrapper.E5VWrappers
+import com.johnsnowlabs.nlp.serialization.{MapFeature, StructFeature}
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.ml.param.{IntArrayParam, IntParam}
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType, StructField, StructType}
+import org.apache.spark.sql.{Row, SparkSession}
+
+/** E5VEmbeddings provides universal multimodal embeddings using the E5-V model, which is
+  * fine-tuned from lmms-lab/llama3-llava-next-8b.
+  *
+  * E5-V bridges the modality gap between different input types (text, image) and demonstrates
+  * strong performance in multimodal embeddings, even without fine-tuning. It also supports a
+  * single-modality training approach, where the model is trained exclusively on text pairs, often
+  * yielding better performance than multimodal training.
+  *
+  * For more details, see the Hugging Face model card: https://huggingface.co/royokong/e5-v
+  *
+  * ==Overview==
+  *
+  * E5-V can embed both text and images into a shared space, enabling cross-modal retrieval and
+  * similarity tasks. The model is designed for universal embeddings and is suitable for scenarios
+  * where you want to compare or retrieve across modalities.
+  *
+  * ==Example==
+  *
+  * ===Image + Text Embedding===
+  * {{{
+  * import org.apache.spark.sql.functions.lit
+  * import com.johnsnowlabs.nlp.base.ImageAssembler
+  * import com.johnsnowlabs.nlp.embeddings.E5VEmbeddings
+  * import org.apache.spark.ml.Pipeline
+  *
+  * val imageDF = spark.read.format("image").option("dropInvalid", value = true).load(imageFolder)
+  * val imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+  * val testDF = imageDF.withColumn("text", lit(imagePrompt))
+  *
+  * val imageAssembler = new ImageAssembler().setInputCol("image").setOutputCol("image_assembler")
+  * val e5vEmbeddings = E5VEmbeddings.pretrained()
+  *   .setInputCols("image_assembler")
+  *   .setOutputCol("e5v")
+  *
+  * val pipeline = new Pipeline().setStages(Array(imageAssembler, e5vEmbeddings))
+  * val result = pipeline.fit(testDF).transform(testDF)
+  * result.select("e5v.embeddings").show(truncate = false)
+  * }}}
+  *
+  * ===Text-Only Embedding===
+  * {{{
+  * import org.apache.spark.sql.SparkSession
+  * import org.apache.spark.sql.functions.lit
+  * import com.johnsnowlabs.nlp.util.EmbeddingsDataFrameUtils.{emptyImageRow, imageSchema}
+  * import com.johnsnowlabs.nlp.embeddings.E5VEmbeddings
+  *
+  * val spark: SparkSession = ...
+  * val textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+  * val textDesc = "A cat sitting in a box."
+  * val nullImageDF =
+  *   spark.createDataFrame(spark.sparkContext.parallelize(Seq(emptyImageRow)), imageSchema)
+  * val textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))
+  *
+  * val e5vEmbeddings = E5VEmbeddings.pretrained()
+  *   .setInputCols("image")
+  *   .setOutputCol("e5v")
+  * val result = e5vEmbeddings.transform(textDF)
+  * result.select("e5v.embeddings").show(truncate = false)
+  * }}}
+  *
+  * ==References==
+  *   - Hugging Face model card: https://huggingface.co/royokong/e5-v
+  *   - Paper: https://arxiv.org/abs/2407.12580
+  *   - Code: https://github.com/kongds/E5-V
+  *
+  * @see
+  *   [[CLIPForZeroShotClassification]] for Zero Shot Image Classifier
+  * @see
+  *   [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer
+  *   based classifiers
+  * @groupname anno Annotator types
+  * @groupdesc anno
+  *   Required input and expected output annotator types
+  * @groupname Ungrouped Members
+  * @groupname param Parameters
+  * @groupname setParam Parameter setters
+  * @groupname getParam Parameter getters
+  * @groupprio param  1
+  * @groupprio anno  2
+  * @groupprio Ungrouped 3
+  * @groupprio setParam  4
+  * @groupprio getParam  5
+  * @groupdesc param
+  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
+  *   parameter values through setters and getters, respectively.
+  */
+
+class E5VEmbeddings(override val uid: String)
+    extends AnnotatorModel[E5VEmbeddings]
+    with HasBatchedAnnotateImage[E5VEmbeddings]
+    with HasImageFeatureProperties
+    with WriteOpenvinoModel
+    with HasGeneratorProperties
+    with HasEngine {
+
+  /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator
+    * type
+    */
+  def this() = this(Identifiable.randomUID("E5VEmbeddings"))
+
+  /** Input and output annotator types: takes IMAGE annotations and produces
+    * SENTENCE_EMBEDDINGS
+    */
+  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(IMAGE)
+  override val outputAnnotatorType: AnnotatorType = SENTENCE_EMBEDDINGS
+
+  /** @group setParam */
+  def setRandomSeed(value: Int): E5VEmbeddings.this.type = {
+    if (randomSeed.isEmpty) {
+      this.randomSeed = Some(value)
+    }
+    this
+  }
+
+  /** A list of token ids which are ignored in the decoder's output (Default: `Array()`)
+    *
+    * @group param
+    */
+  var ignoreTokenIds = new IntArrayParam(
+    this,
+    "ignoreTokenIds",
+    "A list of token ids which are ignored in the decoder's output")
+
+  /** @group setParam */
+  def setIgnoreTokenIds(tokenIds: Array[Int]): E5VEmbeddings.this.type = {
+    set(ignoreTokenIds, tokenIds)
+  }
+
+  /** @group getParam */
+  def getIgnoreTokenIds: Array[Int] = $(ignoreTokenIds)
+
+  /** Vocabulary used to encode the words to ids with bpeTokenizer.encode
+    *
+    * @group param
+    */
+  val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected()
+
+  /** @group setParam */
+  def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value)
+
+  /** Holding merges.txt coming from RoBERTa model
+    *
+    * @group param
+    */
+  val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected()
+
+  /** @group setParam */
+  def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value)
+
+  /** Additional tokens to be added to the vocabulary
+    *
+    * @group param
+    */
+  val addedTokens: MapFeature[String, Int] = new MapFeature(this, "addedTokens").setProtected()
+
+  /** @group setParam */
+  def setAddedTokens(value: Map[String, Int]): this.type = set(addedTokens, value)
+
+  /** Stop tokens to terminate the generation
+    *
+    * @group param
+    */
+  override val stopTokenIds =
+    new IntArrayParam(this, "stopTokenIds", "Stop tokens to terminate the generation")
+
+  /** @group setParam */
+  override def setStopTokenIds(value: Array[Int]): this.type = {
+    set(stopTokenIds, value)
+  }
+
+  /** @group getParam */
+  override def getStopTokenIds: Array[Int] = $(stopTokenIds)
+
+  private var _model: Option[Broadcast[E5V]] = None
+  val generationConfig: StructFeature[GenerationConfig] =
+    new StructFeature(this, "generationConfig").setProtected()
+
+  def setGenerationConfig(value: GenerationConfig): this.type =
+    set(generationConfig, value)
+
+  def getGenerationConfig: GenerationConfig = $$(generationConfig)
+
+  val imageToken =
+    new IntParam(this, "imageToken", "Token id for image embeddings")
+
+  /** @group setParam */
+  def setImageToken(value: Int): this.type = set(imageToken, value)
+
+  /** @group getParam */
+  def getImageToken: Int = $(imageToken)
+
+  /** Pinpoints for image grid, used to extract image features from the grid
+    *
+    * @group param
+    */
+  val imageGridPinpoints: MapFeature[Int, Array[Int]] = new MapFeature(this, "imageGridPinpoints")
+
+  /** @group setParam */
+  def setImageGridPinpoints(value: Map[Int, Array[Int]]): this.type =
+    set(imageGridPinpoints, value)
+
+  /** @group getParam */
+  def getImageGridPinpoints: Map[Int, Array[Int]] = $$(imageGridPinpoints)
+
+  /** Patch size for image embeddings
+    *
+    * @group param
+    */
+  val patchSize: IntParam =
+    new IntParam(this, "patchSize", "Patch size for image embeddings, default is 336")
+
+  /** @group setParam */
+  def setPatchSize(value: Int): this.type = set(patchSize, value)
+
+  /** @group getParam */
+  def getPatchSize: Int = $(patchSize)
+
+  /** @group setParam */
+  def setModelIfNotSet(
+      spark: SparkSession,
+      preprocessor: Preprocessor,
+      onnxWrappers: Option[DecoderWrappers],
+      openvinoWrapper: Option[E5VWrappers]): this.type = {
+    if (_model.isEmpty) {
+      _model = Some(
+        spark.sparkContext.broadcast(
+          new E5V(
+            onnxWrappers,
+            openvinoWrapper,
+            $$(merges),
+            $$(vocabulary),
+            $$(addedTokens),
+            preprocessor,
+            generationConfig = getGenerationConfig,
+            imageToken = getImageToken,
+            imageGridPinpoints = getImageGridPinpoints,
+            patchSize = getPatchSize)))
+    }
+    this
+  }
+
+  /** @group getParam */
+  def getModelIfNotSet: E5V = _model.get.value
+
+  setDefault(
+    minOutputLength -> 0,
+    maxOutputLength -> 20,
+    doSample -> false,
+    temperature -> 0.6,
+    topK -> -1,
+    topP -> 0.9,
+    repetitionPenalty -> 1.0,
+    noRepeatNgramSize -> 3,
+    ignoreTokenIds -> Array(),
+    batchSize -> 1,
+    beamSize -> 1,
+    maxInputLength -> 4096,
+    stopTokenIds -> Array(2),
+    imageToken -> 128256,
+    patchSize -> 336)
+
+  /** takes a document and annotations and produces new annotations of this annotator's annotation
+    * type
+    *
+    * @param batchedAnnotations
+    *   Annotations in batches that correspond to inputAnnotationCols generated by previous
+    *   annotators if any
+    * @return
+    *   any number of annotations processed for every batch of input annotations. Not necessarily
+    *   a one-to-one relationship
+    */
+  override def batchAnnotate(
+      batchedAnnotations: Seq[Array[AnnotationImage]]): Seq[Seq[Annotation]] = {
+
+    batchedAnnotations.map { annotationImages =>
+      val questionAnnotations = extractInputAnnotation(annotationImages)
+      getModelIfNotSet.predict(questionAnnotations, annotationImages.toSeq)
+    }
+  }
+
+  private def extractInputAnnotation(
+      annotationImages: Array[AnnotationImage]): Seq[Annotation] = {
+    val questions = annotationImages.map(annotationImage => {
+      val imageText =
+        if (annotationImage.text.nonEmpty) annotationImage.text
+        else
+          "<|user|> \n <|image|> This is an image\n <|end|>\n <|assistant|>\n" // default question
+      Annotation(imageText)
+    })
+
+    questions
+  }
+
+  override def onWrite(path: String, spark: SparkSession): Unit = {
+    super.onWrite(path, spark)
+    getEngine match {
+      case Openvino.name =>
+        val wrappers = getModelIfNotSet.openvinoWrapper
+        writeOpenvinoModels(
+          path,
+          spark,
+          Seq((wrappers.get.languageModel, "openvino_language_model-int4.xml")),
+          E5VEmbeddings.suffix)
+
+        writeOpenvinoModels(
+          path,
+          spark,
+          Seq((wrappers.get.visionEmbeddingsModel, "openvino_vision_embeddings_model.xml")),
+          E5VEmbeddings.suffix)
+
+        writeOpenvinoModels(
+          path,
+          spark,
+          Seq((wrappers.get.textEmbeddingsModel, "openvino_text_embeddings_model.xml")),
+          E5VEmbeddings.suffix)
+
+        writeOpenvinoModels(
+          path,
+          spark,
+          Seq((wrappers.get.imagePackerModel, "openvino_image_packer.xml")),
+          E5VEmbeddings.suffix)
+
+        writeOpenvinoModels(
+          path,
+          spark,
+          Seq((wrappers.get.mergeModel, "openvino_multimodal_merger.xml")),
+          E5VEmbeddings.suffix)
+      case _ =>
+        throw new Exception(notSupportedEngineError)
+    }
+  }
+
+}
+
+trait ReadablePretrainedE5VEmbeddings
+    extends ParamsAndFeaturesReadable[E5VEmbeddings]
+    with HasPretrained[E5VEmbeddings] {
+
+  override val defaultModelName: Some[String] = Some("e5v_int4")
+
+  /** Java-compliant overrides */
+  override def pretrained(): E5VEmbeddings = super.pretrained()
+
+  override def pretrained(name: String): E5VEmbeddings =
+    super.pretrained(name)
+
+  override def pretrained(name: String, lang: String): E5VEmbeddings =
+    super.pretrained(name, lang)
+
+  override def pretrained(name: String, lang: String, remoteLoc: String): E5VEmbeddings =
+    super.pretrained(name, lang, remoteLoc)
+
+}
+
+trait ReadE5VEmbeddingsDLModel extends ReadOpenvinoModel {
+  this: ParamsAndFeaturesReadable[E5VEmbeddings] =>
+  val suffix: String = "_e5v"
+  override val openvinoFile: String = "e5v_openvino"
+  def readModel(instance: E5VEmbeddings, path: String, spark: SparkSession): Unit = {
+    instance.getEngine match {
+      case Openvino.name =>
+        val languageModelWrappers =
+          readOpenvinoModels(path, spark, Seq("openvino_language_model-int4.xml"), suffix)
+
+        val visionEmbeddingsModelWrappers =
+          readOpenvinoModels(path, spark, Seq("openvino_vision_embeddings_model.xml"), suffix)
+
+        val textEmbeddingsModelWrappers =
+          readOpenvinoModels(path, spark, Seq("openvino_text_embeddings_model.xml"), suffix)
+
+        val imagePackerModelWrappers =
+          readOpenvinoModels(path, spark, Seq("openvino_image_packer.xml"), suffix)
+
+        val mergeModelWrappers =
+          readOpenvinoModels(path, spark, Seq("openvino_multimodal_merger.xml"), suffix)
+
+        val ovWrapper = E5VWrappers(
+          languageModel = languageModelWrappers("openvino_language_model-int4.xml"),
+          visionEmbeddingsModel =
+            visionEmbeddingsModelWrappers("openvino_vision_embeddings_model.xml"),
+          textEmbeddingsModel = textEmbeddingsModelWrappers("openvino_text_embeddings_model.xml"),
+          mergeModel = mergeModelWrappers("openvino_multimodal_merger.xml"),
+          imagePackerModel = imagePackerModelWrappers("openvino_image_packer.xml"))
+        val preprocessor = Preprocessor(
+          do_normalize = true,
+          do_resize = true,
+          "E5VFeatureExtractor",
+          instance.getImageMean,
+          instance.getImageStd,
+          instance.getResample,
+          instance.getSize)
+        instance.setModelIfNotSet(spark, preprocessor, None, Some(ovWrapper))
+      case _ => {
+        throw new Exception(notSupportedEngineError)
+      }
+    }
+  }
+
+  addReader(readModel)
+
+  def loadSavedModel(
+      modelPath: String,
+      spark: SparkSession,
+      useOpenvino: Boolean = false): E5VEmbeddings = {
+    implicit val formats: DefaultFormats.type = DefaultFormats // for json4s
+    val (localModelPath, detectedEngine) =
+      modelSanityCheck(
+        modelPath,
+        isDecoder = false,
+        custom = Some(
+          List(
+            "openvino_language_model-int4",
+            "openvino_vision_embeddings_model",
+            "openvino_text_embeddings_model",
+            "openvino_image_packer",
+            "openvino_multimodal_merger")))
+    val modelConfig: JValue =
+      parse(loadJsonStringAsset(localModelPath, "config.json"))
+
+    val preprocessorConfigJsonContent =
+      loadJsonStringAsset(localModelPath, "preprocessor_config.json")
+    val preprocessorConfig = Preprocessor.loadPreprocessorConfig(preprocessorConfigJsonContent)
+    val beginSuppressTokens: Array[Int] =
+      (modelConfig \ "begin_suppress_tokens").extract[Array[Int]]
+
+    val suppressTokenIds: Array[Int] =
+      (modelConfig \ "suppress_tokens").extract[Array[Int]]
+
+    val forcedDecoderIds: Array[(Int, Int)] =
+      (modelConfig \ "forced_decoder_ids").extract[Array[Array[Int]]].map {
+        case idxWithTokenId: Array[Int] if idxWithTokenId.length == 2 =>
+          (idxWithTokenId(0), idxWithTokenId(1))
+        case _ =>
+          throw new Exception(
+            "Could not extract forced_decoder_ids. Should be a list of tuples with 2 entries.")
+      }
+
+    def arrayOrNone[T](array: Array[T]): Option[Array[T]] =
+      if (array.nonEmpty) Some(array) else None
+
+    val bosTokenId = (modelConfig \ "text_config" \ "bos_token_id").extract[Int]
+    val eosTokenId = (modelConfig \ "text_config" \ "eos_token_id").extract[Int]
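+    // the config defines no dedicated pad token, so the EOS token id is reused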
+    val padTokenId = (modelConfig \ "text_config" \ "eos_token_id").extract[Int]
+    val vocabSize = (modelConfig \ "text_config" \ "vocab_size").extract[Int]
+
+    val imageToken = (modelConfig \ "image_token_index").extract[Int]
+    val imageGridPinpoints: Array[Array[Int]] =
+      (modelConfig \ "image_grid_pinpoints").extract[Array[Array[Int]]]
+    val imageGridPinpointsMap: Map[Int, Array[Int]] =
+      imageGridPinpoints.zipWithIndex.map { case (pinpoints, index) =>
+        (index, pinpoints)
+      }.toMap
+    // Check if tokenizer.json exists
+    val tokenizerPath = s"$localModelPath/assets/tokenizer.json"
+    val tokenizerExists = new java.io.File(tokenizerPath).exists()
+    val (vocabs, addedTokens, bytePairs) = if (tokenizerExists) {
+      val tokenizerConfig: JValue = parse(loadJsonStringAsset(localModelPath, "tokenizer.json"))
+      // extract vocab from tokenizer.json ( model -> vocab)
+      var vocabs: Map[String, Int] =
+        (tokenizerConfig \ "model" \ "vocab").extract[Map[String, Int]]
+
+      // extract merges from tokenizer.json ( model -> merges)
+      val bytePairs = (tokenizerConfig \ "model" \ "merges")
+        .extract[List[String]]
+        .map(_.split(" "))
+        .filter(w => w.length == 2)
+        .map { case Array(c1, c2) => (c1, c2) }
+        .zipWithIndex
+        .toMap
+      // extract added_tokens from tokenizer.json (added_tokens)
+      // "added_tokens": [
+      //    {
+      //      "id": 128000,
+      //      "content": "<|begin_of_text|>",
+      //      "single_word": false,
+      //      "lstrip": false,
+      //      "rstrip": false,
+      //      "normalized": false,
+      //      "special": true
+      //    }, ...
+      //  ]
+      val addedTokens = (tokenizerConfig \ "added_tokens")
+        .extract[List[Map[String, Any]]]
+        .map { token =>
+          val id = token("id").asInstanceOf[BigInt].intValue()
+          val content = token("content").asInstanceOf[String]
+          (content, id)
+        }
+        .toMap
+
+      // update vocab with added tokens
+      addedTokens.foreach { case (content, id) =>
+        vocabs += (content -> id)
+      }
+      (vocabs, addedTokens, bytePairs)
+    } else {
+      val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap
+      val addedTokens = loadTextAsset(localModelPath, "added_tokens.txt").zipWithIndex.toMap
+      val bytePairs = loadTextAsset(localModelPath, "merges.txt")
+        .map(_.split(" "))
+        .filter(w => w.length == 2)
+        .map { case Array(c1, c2) => (c1, c2) }
+        .zipWithIndex
+        .toMap
+      (vocabs, addedTokens, bytePairs)
+    }
+
+    val annotatorModel = new E5VEmbeddings()
+      .setGenerationConfig(
+        GenerationConfig(
+          bosTokenId,
+          padTokenId,
+          eosTokenId,
+          vocabSize,
+          arrayOrNone(beginSuppressTokens),
+          arrayOrNone(suppressTokenIds),
+          arrayOrNone(forcedDecoderIds)))
+      .setVocabulary(vocabs)
+      .setMerges(bytePairs)
+      .setAddedTokens(addedTokens)
+      .setImageToken(imageToken)
+      .setSize(preprocessorConfig.size)
+      .setImageMean(preprocessorConfig.image_mean)
+      .setImageStd(preprocessorConfig.image_std)
+      .setResample(preprocessorConfig.resample)
+      .setImageGridPinpoints(imageGridPinpointsMap)
+
+    val modelEngine =
+      if (useOpenvino)
+        Openvino.name
+      else
+        detectedEngine
+    annotatorModel.set(annotatorModel.engine, modelEngine)
+
+    detectedEngine match {
+      case Openvino.name =>
+        val visionWrapper =
+          OpenvinoWrapper.read(
+            spark,
+            localModelPath,
+            zipped = false,
+            useBundle = true,
+            detectedEngine = detectedEngine,
+            modelName = "openvino_vision_embeddings_model")
+        val textWrapper =
+          OpenvinoWrapper.read(
+            spark,
+            localModelPath,
+            zipped = false,
+            useBundle = true,
+            detectedEngine = detectedEngine,
+            modelName = "openvino_text_embeddings_model")
+
+        val imagePackerModelWrapper =
+          OpenvinoWrapper.read(
+            spark,
+            localModelPath,
+            zipped = false,
+            useBundle = true,
+            detectedEngine = detectedEngine,
+            modelName = "openvino_image_packer")
+
+        val mergeWrapper =
+          OpenvinoWrapper.read(
+            spark,
+            localModelPath,
+            zipped = false,
+            useBundle = true,
+            detectedEngine = detectedEngine,
+            modelName = "openvino_multimodal_merger")
+        val languageModelWrapper =
+          OpenvinoWrapper.read(
+            spark,
+            localModelPath,
+            zipped = false,
+            useBundle = true,
+            detectedEngine = detectedEngine,
+            modelName = "openvino_language_model-int4")
+
+        val openvinoWrapper = E5VWrappers(
+          languageModel = languageModelWrapper,
+          visionEmbeddingsModel = visionWrapper,
+          textEmbeddingsModel = textWrapper,
+          imagePackerModel = imagePackerModelWrapper,
+          mergeModel = mergeWrapper)
+        annotatorModel.setModelIfNotSet(spark, preprocessorConfig, None, Some(openvinoWrapper))
+      case _ =>
+        throw new Exception(notSupportedEngineError)
+    }
+
+    annotatorModel
+  }
+}
+
+object E5VEmbeddings extends ReadablePretrainedE5VEmbeddings with ReadE5VEmbeddingsDLModel
diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala
index 6cb1ab2aa565f0..fcd7c11be57029 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala
@@ -710,7 +710,8 @@ object PythonResourceDownloader {
     "PaliGemmaForMultiModal" -> PaliGemmaForMultiModal,
     "Gemma3ForMultiModal" -> Gemma3ForMultiModal,
     "InternVLForMultiModal" -> InternVLForMultiModal,
-    "Florence2Transformer" -> Florence2Transformer)
+    "Florence2Transformer" -> Florence2Transformer,
+    "E5VEmbeddings" -> E5VEmbeddings)
 
   // List pairs of types such as the one with key type can load a pretrained model from the value type
   val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering")
diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/EmbeddingsDataFrameUtils.scala b/src/main/scala/com/johnsnowlabs/nlp/util/EmbeddingsDataFrameUtils.scala
new file mode 100644
index 00000000000000..5c701306da0c22
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/nlp/util/EmbeddingsDataFrameUtils.scala
@@ -0,0 +1,22 @@
+package com.johnsnowlabs.nlp.util
+
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types._
+
+object EmbeddingsDataFrameUtils {
+  // Schema Spark expects for `format("image")`
+  val imageSchema: StructType = StructType(
+    Seq(
+      StructField(
+        "image",
+        StructType(Seq(
+          StructField("origin", StringType, true),
+          StructField("height", IntegerType, true),
+          StructField("width", IntegerType, true),
+          StructField("nChannels", IntegerType, true),
+          StructField("mode", IntegerType, true),
+          StructField("data", BinaryType, true))))))
+
+  // A reusable null image row for text-only embedding scenarios
+  val emptyImageRow: Row = Row(Row("", 0, 0, 0, 0, Array[Byte]()))
+}
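
A minimal sketch of the text-only scenario these helpers enable, following the E5VEmbeddings scaladoc; the text value is illustrative:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit
import com.johnsnowlabs.nlp.util.EmbeddingsDataFrameUtils.{emptyImageRow, imageSchema}

val spark: SparkSession = SparkSession.builder().getOrCreate()

// One placeholder image row per text to embed; ImageAssembler forwards the text.
val nullImageDF =
  spark.createDataFrame(spark.sparkContext.parallelize(Seq(emptyImageRow)), imageSchema)
val textDF = nullImageDF.withColumn("text", lit("A cat sitting in a box."))
```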
diff --git a/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala b/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala
new file mode 100644
index 00000000000000..881f07e8664985
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.reader.HTMLElement
+
+import scala.collection.mutable
+
+object BasicChunker {
+
+  /** Splits a list of [[HTMLElement]]s into chunks constrained by a maximum number of characters.
+    *
+    * This method ensures that no chunk exceeds the specified `maxCharacters` limit. Optionally, a
+    * `newAfterNChars` parameter can be used to set a soft boundary for starting new chunks
+    * earlier, and `overlap` can be used to retain trailing characters from the previous chunk in
+    * the next one (when splitting long elements).
+    *
+    * @param elements
+    *   The list of [[HTMLElement]]s to be chunked.
+    * @param maxCharacters
+    *   The hard limit on the number of characters per chunk.
+    * @param newAfterNChars
+    *   Optional soft limit for starting a new chunk before reaching `maxCharacters`. If set to
+    *   -1, this soft limit is ignored.
+    * @param overlap
+    *   Number of trailing characters to overlap between chunks when splitting long elements. This
+    *   helps maintain context in downstream NLP tasks.
+    * @return
+    *   A list of [[Chunk]] objects, each containing a group of elements whose combined content
+    *   length does not exceed the specified limits.
+    */
+
+  def chunkBasic(
+      elements: List[HTMLElement],
+      maxCharacters: Int,
+      newAfterNChars: Int = -1,
+      overlap: Int = 0): List[Chunk] = {
+    val softLimit = if (newAfterNChars > 0) newAfterNChars else maxCharacters
+    var currentChunk = List.empty[HTMLElement]
+    var currentLength = 0
+    val chunks = mutable.ListBuffer.empty[Chunk]
+
+    def finalizeChunk(): Unit = {
+      if (currentChunk.nonEmpty) {
+        chunks += Chunk(currentChunk)
+        currentChunk = List.empty[HTMLElement]
+        currentLength = 0
+      }
+    }
+
+    for (element <- elements) {
+      val elLength = element.content.length
+
+      if (elLength > maxCharacters) {
+        val splitElements = splitHTMLElement(element, maxCharacters, overlap)
+        for (splitEl <- splitElements) {
+          if (currentLength + splitEl.content.length > maxCharacters || currentLength >= softLimit)
+            finalizeChunk()
+          currentChunk :+= splitEl
+          currentLength += splitEl.content.length
+        }
+      } else {
+        if (currentLength + elLength > maxCharacters || currentLength >= softLimit)
+          finalizeChunk()
+        currentChunk :+= element
+        currentLength += elLength
+      }
+    }
+
+    finalizeChunk()
+    chunks.toList
+  }
+
+  private def splitHTMLElement(
+      element: HTMLElement,
+      maxLen: Int,
+      overlap: Int): List[HTMLElement] = {
+    val words = element.content.split(" ")
+    val buffer = mutable.ListBuffer.empty[HTMLElement]
+    var chunk = new StringBuilder
+
+    for (word <- words) {
+      if (chunk.length + word.length + 1 > maxLen) {
+        val text = chunk.toString().trim
+        buffer += element.copy(content = text)
+        chunk = new StringBuilder
+        if (overlap > 0 && text.length >= overlap)
+          chunk.append(text.takeRight(overlap)).append(" ")
+      }
+      chunk.append(word).append(" ")
+    }
+
+    if (chunk.nonEmpty)
+      buffer += element.copy(content = chunk.toString().trim)
+
+    buffer.toList
+  }
+}
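
A hedged usage sketch of `chunkBasic`; the element contents and limits are illustrative, and the `HTMLElement` shape is taken from its use in PartitionChunker below:

```scala
import com.johnsnowlabs.partition.BasicChunker
import com.johnsnowlabs.reader.HTMLElement
import scala.collection.mutable

val elements = List(
  HTMLElement("NarrativeText", "First paragraph of the document.", mutable.Map.empty[String, String]),
  HTMLElement("NarrativeText", "Second paragraph, noticeably longer than the first one.", mutable.Map.empty[String, String]))

// Hard cap of 60 characters per chunk, no soft limit, 10 characters of overlap
// carried over when a single long element has to be split.
val chunks = BasicChunker.chunkBasic(elements, maxCharacters = 60, newAfterNChars = -1, overlap = 10)
chunks.foreach(chunk => println(chunk.length))
```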
diff --git a/src/main/scala/com/johnsnowlabs/partition/Chunk.scala b/src/main/scala/com/johnsnowlabs/partition/Chunk.scala
new file mode 100644
index 00000000000000..04e6a2585378e1
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/partition/Chunk.scala
@@ -0,0 +1,7 @@
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.reader.HTMLElement
+
+case class Chunk(elements: List[HTMLElement]) {
+  def length: Int = elements.map(_.content.length).sum
+}
diff --git a/src/main/scala/com/johnsnowlabs/partition/HasChunkerProperties.scala b/src/main/scala/com/johnsnowlabs/partition/HasChunkerProperties.scala
new file mode 100644
index 00000000000000..82de0df0ca13d7
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/partition/HasChunkerProperties.scala
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable
+import org.apache.spark.ml.param.Param
+
+trait HasChunkerProperties extends ParamsAndFeaturesWritable {
+
+  val chunkingStrategy = new Param[String](this, "chunkingStrategy", "Set the chunking strategy")
+
+  def setChunkingStrategy(value: String): this.type = set(chunkingStrategy, value)
+
+  val maxCharacters =
+    new Param[Int](this, "maxCharacters", "Set the maximum number of characters")
+
+  def setMaxCharacters(value: Int): this.type = set(maxCharacters, value)
+
+  val newAfterNChars =
+    new Param[Int](this, "newAfterNChars", "Insert a new chunk after N characters")
+
+  def setNewAfterNChars(value: Int): this.type = set(newAfterNChars, value)
+
+  val overlap =
+    new Param[Int](this, "overlap", "Set the number of overlapping characters between chunks")
+
+  def setOverlap(value: Int): this.type = set(overlap, value)
+
+  val combineTextUnderNChars =
+    new Param[Int](this, "combineTextUnderNChars", "Threshold to merge adjacent small sections")
+
+  def setCombineTextUnderNChars(value: Int): this.type =
+    set(combineTextUnderNChars, value)
+
+  val overlapAll =
+    new Param[Boolean](
+      this,
+      "overlapAll",
+      "Apply overlap context between all sections, not just split chunks")
+
+  def setOverlapAll(value: Boolean): this.type = set(overlapAll, value)
+
+  setDefault(
+    chunkingStrategy -> "",
+    maxCharacters -> 100,
+    newAfterNChars -> -1,
+    overlap -> 0,
+    combineTextUnderNChars -> 0,
+    overlapAll -> false)
+
+}
diff --git a/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala b/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala
new file mode 100644
index 00000000000000..4993bc65a8cd8b
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/partition/HasXmlReaderProperties.scala
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable
+import org.apache.spark.ml.param.Param
+
+trait HasXmlReaderProperties extends ParamsAndFeaturesWritable {
+
+  val xmlKeepTags = new Param[Boolean](
+    this,
+    "xmlKeepTags",
+    "Whether to include XML tag names as metadata in the output.")
+
+  def setXmlKeepTags(value: Boolean): this.type = set(xmlKeepTags, value)
+
+  val onlyLeafNodes = new Param[Boolean](
+    this,
+    "onlyLeafNodes",
+    "If true, only processes XML leaf nodes (no nested children).")
+
+  def setOnlyLeafNodes(value: Boolean): this.type = set(onlyLeafNodes, value)
+
+  setDefault(xmlKeepTags -> false, onlyLeafNodes -> true)
+}
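
A hedged sketch of the new XML path through `Partition`; it assumes the existing `partition(path)` entry point forwards these option keys to the XML reader, and the file path is illustrative:

```scala
import com.johnsnowlabs.partition.Partition
import scala.collection.JavaConverters._

val params = Map(
  "xmlKeepTags" -> "true", // keep XML tag names in the output metadata
  "onlyLeafNodes" -> "true" // emit only leaf nodes
).asJava

val xmlDF = new Partition(params).partition("./data/example.xml")
```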
diff --git a/src/main/scala/com/johnsnowlabs/partition/Partition.scala b/src/main/scala/com/johnsnowlabs/partition/Partition.scala
index 1480d33a8c053d..2e6f69b8c5b4c4 100644
--- a/src/main/scala/com/johnsnowlabs/partition/Partition.scala
+++ b/src/main/scala/com/johnsnowlabs/partition/Partition.scala
@@ -144,7 +144,13 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case None => getReaderByExtension(path, sparkNLPReader)
     }
 
-    reader(path)
+    val partitionResult = reader(path)
+    if (hasChunkerStrategy) {
+      val chunker = new PartitionChunker(params.asScala.toMap)
+      partitionResult.withColumn(
+        "chunks",
+        chunker.chunkUDF()(partitionResult(sparkNLPReader.getOutputColumn)))
+    } else partitionResult
   }
 
   def partitionStringContent(
@@ -182,6 +188,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
           "application/vnd.openxmlformats-officedocument.presentationml.presentation" =>
         sparkNLPReader.ppt
       case "application/pdf" => sparkNLPReader.pdf
+      case "application/xml" => sparkNLPReader.xml
       case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
     }
   }
@@ -193,6 +200,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case "text/plain" => sparkNLPReader.txtToHTMLElement
       case "text/html" => sparkNLPReader.htmlToHTMLElement
       case "url" => sparkNLPReader.urlToHTMLElement
+      case "application/xml" => sparkNLPReader.xmlToHTMLElement
       case _ => throw new IllegalArgumentException(s"Unsupported content type: $contentType")
     }
   }
@@ -228,6 +236,7 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case "xls" | "xlsx" => sparkNLPReader.xls
       case "ppt" | "pptx" => sparkNLPReader.ppt
       case "pdf" => sparkNLPReader.pdf
+      case "xml" => sparkNLPReader.xml
       case _ => throw new IllegalArgumentException(s"Unsupported file type: $extension")
     }
   }
@@ -342,6 +351,11 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       .headOption
   }
 
+  private def hasChunkerStrategy: Boolean = {
+    Seq("chunking_strategy", "chunkingStrategy")
+      .exists(params.asScala.contains)
+  }
+
 }
 
 object Partition {
diff --git a/src/main/scala/com/johnsnowlabs/partition/PartitionChunker.scala b/src/main/scala/com/johnsnowlabs/partition/PartitionChunker.scala
new file mode 100644
index 00000000000000..84187d40b60364
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/partition/PartitionChunker.scala
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.partition.BasicChunker.chunkBasic
+import com.johnsnowlabs.partition.TitleChunker.chunkByTitle
+import com.johnsnowlabs.reader.HTMLElement
+import com.johnsnowlabs.reader.util.PartitionOptions.{
+  getDefaultBoolean,
+  getDefaultInt,
+  getDefaultString
+}
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.expressions.UserDefinedFunction
+import org.apache.spark.sql.functions.udf
+
+import scala.collection.mutable
+
+class PartitionChunker(chunkerOptions: Map[String, String]) extends Serializable {
+
+  def chunkUDF(): UserDefinedFunction = {
+    udf((elements: Seq[Row]) => {
+      val htmlElements = elements.map { row =>
+        val elementType = row.getAs[String]("elementType")
+        val content = row.getAs[String]("content")
+        val metadata = row.getAs[Map[String, String]]("metadata")
+        HTMLElement(elementType, content, mutable.Map.empty ++ metadata)
+      }.toList
+
+      val chunks = getChunkerStrategy match {
+        case "basic" => chunkBasic(htmlElements, getMaxCharacters, getNewAfterNChars, getOverlap)
+        case "byTitle" | "by_title" =>
+          chunkByTitle(
+            htmlElements,
+            getMaxCharacters,
+            getCombineTextUnderNChars,
+            getOverlap,
+            getNewAfterNChars,
+            getOverlapAll)
+        case _ =>
+          throw new IllegalArgumentException(s"Unknown chunker strategy: $getChunkerStrategy")
+      }
+
+      chunks.flatMap(_.elements)
+    })
+  }
+
+  private def getMaxCharacters: Int = {
+    getDefaultInt(chunkerOptions, Seq("maxCharacters", "max_characters"), default = 500)
+  }
+
+  private def getNewAfterNChars: Int = {
+    getDefaultInt(chunkerOptions, Seq("newAfterNChars", "new_after_n_chars"), default = -1)
+  }
+
+  private def getOverlap: Int = {
+    getDefaultInt(chunkerOptions, Seq("overlap", "overlap"), default = 0)
+  }
+
+  private def getChunkerStrategy: String = {
+    getDefaultString(
+      chunkerOptions,
+      Seq("chunkingStrategy", "chunking_strategy"),
+      default = "none")
+  }
+
+  private def getCombineTextUnderNChars: Int = {
+    getDefaultInt(
+      chunkerOptions,
+      Seq("combineTextUnderNChars", "combine_text_under_n_chars"),
+      default = 0)
+  }
+
+  private def getOverlapAll: Boolean = {
+    getDefaultBoolean(chunkerOptions, Seq("overlapAll", "overlap_all"), default = false)
+  }
+
+}
diff --git a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala
index 1dc2e48b8282ac..281af53931d72c 100644
--- a/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala
+++ b/src/main/scala/com/johnsnowlabs/partition/PartitionTransformer.scala
@@ -19,12 +19,12 @@ import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT}
 import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
 import com.johnsnowlabs.partition.util.PartitionHelper.{
   datasetWithBinaryFile,
-  datasetWithTxtFile,
+  datasetWithTextFile,
   isStringContent
 }
 import com.johnsnowlabs.reader.util.HasPdfProperties
 import com.johnsnowlabs.reader.{HTMLElement, PdfToText}
-import org.apache.spark.ml.{Pipeline, PipelineModel}
+import org.apache.spark.ml.PipelineModel
 import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.sql.functions.{col, explode, udf}
 import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}
@@ -85,7 +85,9 @@ class PartitionTransformer(override val uid: String)
     with HasHTMLReaderProperties
     with HasPowerPointProperties
     with HasTextReaderProperties
-    with HasPdfProperties {
+    with HasPdfProperties
+    with HasXmlReaderProperties
+    with HasChunkerProperties {
 
   def this() = this(Identifiable.randomUID("PartitionTransformer"))
   protected val logger: Logger = LoggerFactory.getLogger(getClass.getName)
@@ -150,16 +152,24 @@ class PartitionTransformer(override val uid: String)
       "paragraphSplit" -> $(paragraphSplit),
       "shortLineWordThreshold" -> $(shortLineWordThreshold).toString,
       "maxLineCount" -> $(maxLineCount).toString,
-      "threshold" -> $(threshold).toString)
+      "threshold" -> $(threshold).toString,
+      "chunkingStrategy" -> $(chunkingStrategy),
+      "maxCharacters" -> $(maxCharacters).toString,
+      "newAfterNChars" -> $(newAfterNChars).toString,
+      "overlap" -> $(overlap).toString,
+      "combineTextUnderNChars" -> $(combineTextUnderNChars).toString,
+      "overlapAll" -> $(overlapAll).toString,
+      "xmlKeepTags" -> $(xmlKeepTags).toString,
+      "onlyLeafNodes" -> $(onlyLeafNodes).toString)
     val partitionInstance = new Partition(params.asJava)
-    partitionInstance.setOutputColumn($(inputCols).head)
 
     val inputColum = if (get(inputCols).isDefined) {
       $(inputCols).head
     } else {
       partitionInstance.getOutputColumn
     }
-    partitionInstance.setOutputColumn($(inputCols).head)
+    partitionInstance.setOutputColumn(inputColum)
+
     val partitionDf = if (isStringContent($(contentType))) {
       val partitionUDF = udf((text: String) =>
         partitionInstance.partitionStringContent(text, $(this.headers).asJava))
@@ -167,7 +177,7 @@ class PartitionTransformer(override val uid: String)
 
       schemaFieldOpt match {
         case Some(StructField(_, StringType, _, _)) =>
-          val stringContentDF = datasetWithTxtFile(dataset.sparkSession, $(contentPath))
+          val stringContentDF = datasetWithTextFile(dataset.sparkSession, $(contentPath))
           stringContentDF
             .withColumn(inputColum, partitionUDF(col("content")))
 
@@ -192,12 +202,14 @@ class PartitionTransformer(override val uid: String)
       binaryContentDF.withColumn(inputColum, partitionUDF(col("content")))
     }
 
-    val colName = findHTMLElementColumn(partitionDf).getOrElse {
+    val htmlElementColumns = findHTMLElementColumns(partitionDf)
+
+    if (htmlElementColumns.isEmpty) {
       val schemaString = partitionDf.schema.treeString
       throw new IllegalArgumentException(
         s"""❌ No column of type Array[HTMLElement] was found in the DataFrame.
            |
-           |💡 Expected a column with schema matching: Array[HTMLElement]
+           |💡 Expected one or more columns with schema matching: Array[HTMLElement]
            |
            |🧪 DataFrame Schema:
            |$schemaString
@@ -208,7 +220,12 @@ class PartitionTransformer(override val uid: String)
            |  - metadata: Map[String, String]
      """.stripMargin)
     }
-    partitionDf.withColumn(getOutputCol, wrapColumnMetadata(convertToAnnotations(col(colName))))
+
+    // Transform each matching column
+    val transformedDf = htmlElementColumns.foldLeft(partitionDf) { (df, colName) =>
+      df.withColumn(getOutputCol, wrapColumnMetadata(convertToAnnotations(col(colName))))
+    }
+    transformedDf
   }
 
   private def convertToAnnotations = udf { elements: Seq[Row] =>
@@ -242,4 +259,14 @@ class PartitionTransformer(override val uid: String)
       .map(_.name)
   }
 
+  private def findHTMLElementColumns(dataFrame: DataFrame): Seq[String] = {
+    val htmlElementSchema = Encoders.product[HTMLElement].schema
+
+    dataFrame.schema.fields.collect {
+      case StructField(name, ArrayType(structType: StructType, _), _, _)
+          if structType == htmlElementSchema =>
+        name
+    }
+  }
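+
+  // Note: Encoders.product derives the canonical schema for HTMLElement
+  // (elementType, content, metadata), so any Array[struct] column with exactly
+  // that shape is picked up, regardless of its name.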
+
 }
diff --git a/src/main/scala/com/johnsnowlabs/partition/TitleChunker.scala b/src/main/scala/com/johnsnowlabs/partition/TitleChunker.scala
new file mode 100644
index 00000000000000..3a03151fd6303a
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/partition/TitleChunker.scala
@@ -0,0 +1,151 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.reader.{ElementType, HTMLElement}
+
+import scala.collection.mutable
+
+object TitleChunker {
+
+  /** Splits a list of HTML elements into semantically grouped Chunks based on Title and Table
+    * markers.
+    *
+    * @param elements
+    *   List of input HTML elements to chunk.
+    * @param maxCharacters
+    *   Maximum length allowed per chunk. Longer sections are split.
+    * @param combineTextUnderNChars
+    *   Threshold to merge adjacent small sections.
+    * @param overlap
+    *   Number of characters to repeat between consecutive chunks.
+    * @param newAfterNChars
+    *   Soft limit to trigger new section if length exceeded, even before maxCharacters.
+    * @param overlapAll
+    *   Apply overlap context between all sections, not just split chunks.
+    * @return
+    *   List of Chunks partitioned by title and content heuristics.
+    */
+  def chunkByTitle(
+      elements: List[HTMLElement],
+      maxCharacters: Int,
+      combineTextUnderNChars: Int = 0,
+      overlap: Int = 0,
+      newAfterNChars: Int = -1,
+      overlapAll: Boolean = false): List[Chunk] = {
+
+    val softLimit = if (newAfterNChars <= 0) maxCharacters else newAfterNChars
+    val chunks = mutable.ListBuffer.empty[Chunk]
+    val sections = mutable.ListBuffer.empty[List[HTMLElement]]
+    var currentSection = List.empty[HTMLElement]
+    var currentLength = 0
+    var currentPage = -1
+
+    for (element <- elements) {
+      val elementLength = element.content.length
+      val isTable = element.elementType == "Table"
+      val elementPage = element.metadata.getOrElse("pageNumber", "-1").toInt
+
+      val pageChanged = currentPage != -1 && elementPage != currentPage
+      val softLimitExceeded = currentSection.length >= 2 &&
+        (currentLength + elementLength > softLimit)
+
+      if (isTable) {
+        if (currentSection.nonEmpty) sections += currentSection
+        sections += List(element)
+        currentSection = List.empty
+        currentLength = 0
+        currentPage = -1
+      } else if (pageChanged || softLimitExceeded) {
+        if (currentSection.nonEmpty) sections += currentSection
+        currentSection = List(element)
+        currentLength = elementLength
+        currentPage = elementPage
+      } else {
+        currentSection :+= element
+        currentLength += elementLength
+        currentPage = elementPage
+      }
+    }
+    if (currentSection.nonEmpty) sections += currentSection
+
+    val mergedSections = sections.foldLeft(List.empty[List[HTMLElement]]) { (acc, section) =>
+      val sectionLength = section.map(_.content.length).sum
+      val canMerge = combineTextUnderNChars > 0 &&
+        sectionLength < combineTextUnderNChars &&
+        acc.nonEmpty &&
+        acc.last.exists(_.elementType != "Table") &&
+        section.exists(_.elementType != "Table")
+
+      if (canMerge) {
+        acc.init :+ (acc.last ++ section)
+      } else {
+        acc :+ section
+      }
+    }
+
+    var lastNarrativeText = ""
+    for (section <- mergedSections) {
+      if (section.exists(_.elementType == "Table")) {
+        chunks += Chunk(section)
+        lastNarrativeText = ""
+      } else {
+        val sectionText = section.map(_.content).mkString(" ")
+        val content =
+          if (overlap > 0 && lastNarrativeText.nonEmpty && (overlapAll || sectionText.length > maxCharacters))
+            lastNarrativeText.takeRight(overlap) + " " + sectionText
+          else sectionText
+
+        val merged = HTMLElement(ElementType.NARRATIVE_TEXT, content.trim, section.head.metadata)
+        val split = if (content.length > maxCharacters) {
+          splitHTMLElement(merged, maxCharacters, overlap)
+        } else List(merged)
+
+        chunks ++= split.map(e => Chunk(List(e)))
+        lastNarrativeText = sectionText
+      }
+    }
+
+    chunks.toList
+  }
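+
+  // Usage sketch (values are illustrative): group parsed elements into ~500-character
+  // sections, folding any section shorter than 100 characters into its neighbor.
+  //   val chunks = TitleChunker.chunkByTitle(elements, maxCharacters = 500,
+  //     combineTextUnderNChars = 100)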
+
+  private def splitHTMLElement(
+      element: HTMLElement,
+      maxLen: Int,
+      overlap: Int): List[HTMLElement] = {
+
+    val words = element.content.split(" ")
+    val buffer = mutable.ListBuffer.empty[HTMLElement]
+    var chunk = new StringBuilder
+
+    for (word <- words) {
+      if (chunk.length + word.length + 1 > maxLen) {
+        val text = chunk.toString().trim
+        buffer += element.copy(content = text)
+        chunk = new StringBuilder
+        if (overlap > 0 && text.length >= overlap)
+          chunk.append(text.takeRight(overlap)).append(" ")
+      }
+      chunk.append(word).append(" ")
+    }
+
+    if (chunk.nonEmpty)
+      buffer += element.copy(content = chunk.toString().trim)
+
+    buffer.toList
+  }
+
+}
diff --git a/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala b/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala
index a69b6f51dd51ba..2f7c959f86cf58 100644
--- a/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala
+++ b/src/main/scala/com/johnsnowlabs/partition/util/PartitionHelper.scala
@@ -29,10 +29,11 @@ object PartitionHelper {
     byteArrayRDD.toDF("path", "content")
   }
 
-  def datasetWithTxtFile(sparkSession: SparkSession, contentPath: String): DataFrame = {
+  def datasetWithTextFile(sparkSession: SparkSession, contentPath: String): DataFrame = {
     import sparkSession.implicits._
     val textFilesRDD = sparkSession.sparkContext.wholeTextFiles(contentPath)
-    textFilesRDD.toDF("path", "content")
+    textFilesRDD
+      .toDF("path", "content")
   }
 
   def isStringContent(contentType: String): Boolean = {
diff --git a/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala b/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala
index d205e3fca5dd1e..06a03003c4878a 100644
--- a/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala
+++ b/src/main/scala/com/johnsnowlabs/reader/HTMLReader.scala
@@ -17,7 +17,7 @@ package com.johnsnowlabs.reader
 
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
 import com.johnsnowlabs.nlp.util.io.ResourceHelper.{isValidURL, validFile}
-import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTxtFile
+import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions.{col, udf}
 import org.jsoup.Jsoup
@@ -109,7 +109,7 @@ class HTMLReader(
 
     ResourceHelper match {
       case _ if validFile(inputSource) && !inputSource.startsWith("http") =>
-        val htmlDf = datasetWithTxtFile(spark, inputSource)
+        val htmlDf = datasetWithTextFile(spark, inputSource)
           .withColumn(outputColumn, parseHtmlUDF(col("content")))
         if (storeContent) htmlDf.select("path", "content", outputColumn)
         else htmlDf.select("path", outputColumn)
diff --git a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
index a3af24454af074..216492876cc718 100644
--- a/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
+++ b/src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
@@ -33,7 +33,8 @@ import scala.collection.JavaConverters._
 
 class SparkNLPReader(
     params: java.util.Map[String, String] = new java.util.HashMap(),
-    headers: java.util.Map[String, String] = new java.util.HashMap()) {
+    headers: java.util.Map[String, String] = new java.util.HashMap())
+    extends Serializable {
 
   /** Instantiates class to read HTML files.
     *
@@ -295,7 +296,6 @@ class SparkNLPReader(
     *  |-- width_dimension: integer (nullable = true)
     *  |-- content: binary (nullable = true)
     *  |-- exception: string (nullable = true)
-    *  |-- pagenum: integer (nullable = true)
     * }}}
     *
     * @param params
@@ -641,4 +641,69 @@ class SparkNLPReader(
       default = BLOCK_SPLIT_PATTERN)
   }
 
+  /** Instantiates class to read XML files.
+    *
+    * xmlPath: this is a path to a directory of XML files or a path to an XML file. E.g.,
+    * "path/xml/files"
+    *
+    * ==Example==
+    * {{{
+    * val xmlPath = "home/user/xml-directory"
+    * val sparkNLPReader = new SparkNLPReader()
+    * val xmlDf = sparkNLPReader.xml(xmlPath)
+    * }}}
+    *
+    * ==Example 2==
+    * You can use SparkNLP for one line of code
+    * {{{
+    * val xmlDf = SparkNLP.read.xml(xmlPath)
+    * }}}
+    *
+    * {{{
+    * xmlDf.select("xml").show(false)
+    * +------------------------------------------------------------------------------------------------------------------------+
+    * |xml                                                                                                                    |
+    * +------------------------------------------------------------------------------------------------------------------------+
+    * |[{Title, John Smith, {elementId -> ..., tag -> title}}, {UncategorizedText, Some content..., {elementId -> ...}}]     |
+    * +------------------------------------------------------------------------------------------------------------------------+
+    *
+    * xmlDf.printSchema()
+    * root
+    *  |-- path: string (nullable = true)
+    *  |-- xml: array (nullable = true)
+    *  |    |-- element: struct (containsNull = true)
+    *  |    |    |-- elementType: string (nullable = true)
+    *  |    |    |-- content: string (nullable = true)
+    *  |    |    |-- metadata: map (nullable = true)
+    *  |    |    |    |-- key: string
+    *  |    |    |    |-- value: string (valueContainsNull = true)
+    * }}}
+    *
+    * @param xmlPath
+    *   Path to the XML file or directory
+    * @return
+    *   A DataFrame with parsed XML as structured elements
+    */
+
+  def xml(xmlPath: String): DataFrame = {
+    val xmlReader = new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes)
+    xmlReader.read(xmlPath)
+  }
+
+  def xmlToHTMLElement(xml: String): Seq[HTMLElement] = {
+    val xmlReader = new XMLReader(getStoreContent, getXmlKeepTags, getOnlyLeafNodes)
+    xmlReader.parseXml(xml)
+  }
+
+  private def getXmlKeepTags: Boolean = {
+    getDefaultBoolean(params.asScala.toMap, Seq("xmlKeepTags", "xml_keep_tags"), default = false)
+  }
+
+  private def getOnlyLeafNodes: Boolean = {
+    getDefaultBoolean(
+      params.asScala.toMap,
+      Seq("onlyLeafNodes", "only_leaf_nodes"),
+      default = true)
+  }
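+
+  // Usage sketch: both camelCase and snake_case keys are accepted, so the following
+  // (illustrative) configurations are equivalent.
+  //   new SparkNLPReader(Map("xmlKeepTags" -> "true").asJava).xml(xmlPath)
+  //   new SparkNLPReader(Map("xml_keep_tags" -> "true").asJava).xml(xmlPath)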
+
 }
diff --git a/src/main/scala/com/johnsnowlabs/reader/TextReader.scala b/src/main/scala/com/johnsnowlabs/reader/TextReader.scala
index d69050ab112031..ea0598a05940da 100644
--- a/src/main/scala/com/johnsnowlabs/reader/TextReader.scala
+++ b/src/main/scala/com/johnsnowlabs/reader/TextReader.scala
@@ -20,7 +20,7 @@ import com.johnsnowlabs.nlp.annotators.cleaners.util.CleanerHelper.{
   DOUBLE_PARAGRAPH_PATTERN
 }
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
-import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTxtFile
+import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile
 import com.johnsnowlabs.reader.util.TextParser
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions.{col, udf}
@@ -112,8 +112,9 @@ class TextReader(
     */
   def txt(filePath: String): DataFrame = {
     if (ResourceHelper.validFile(filePath)) {
-      val textDf = datasetWithTxtFile(spark, filePath)
-        .withColumn(outputColumn, parseTxtUDF(col("content")))
+      import spark.implicits._
+      val textDf = datasetWithTextFile(spark, filePath)
+        .withColumn(outputColumn, parseTxtUDF($"content"))
       if (storeContent) textDf.select("path", outputColumn, "content")
       else textDf.select("path", outputColumn)
     } else {
diff --git a/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala
new file mode 100644
index 00000000000000..3665cdec2b6651
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/reader/XMLReader.scala
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.reader
+
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
+import com.johnsnowlabs.nlp.util.io.ResourceHelper.validFile
+import com.johnsnowlabs.partition.util.PartitionHelper.datasetWithTextFile
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.functions.{col, udf}
+
+import scala.collection.mutable
+import scala.collection.mutable.ListBuffer
+import scala.xml.{Elem, Node, XML}
+
+/** Class to parse and read XML files.
+  *
+  * @param storeContent
+  *   Whether to include the raw XML content in the resulting DataFrame as a separate 'content'
+  *   column. By default, this is false.
+  *
+  * @param xmlKeepTags
+  *   Whether to retain original XML tag names and include them in the metadata for each extracted
+  *   element. Useful for preserving structure. Default is false.
+  *
+  * @param onlyLeafNodes
+  *   If true, only the deepest elements (those without child elements) are extracted. If false,
+  *   all elements are extracted. Default is true.
+  *
+  * ==Input Format==
+  * Input must be a valid path to an XML file or a directory containing XML files.
+  *
+  * ==Example==
+  * {{{
+  * val xmlPath = "./data/sample.xml"
+  * val xmlReader = new XMLReader()
+  * val xmlDf = xmlReader.read(xmlPath)
+  * }}}
+  *
+  * {{{
+  * xmlDf.show(truncate = false)
+  * +----------------------+--------------------------------------------------+
+  * |path                  |xml                                               |
+  * +----------------------+--------------------------------------------------+
+  * |file:/data/sample.xml |[{Title, My Book, {tag -> title}}, ...]          |
+  * +----------------------+--------------------------------------------------+
+  *
+  * xmlDf.printSchema()
+  * root
+  *  |-- path: string (nullable = true)
+  *  |-- xml: array (nullable = true)
+  *  |    |-- element: struct (containsNull = true)
+  *  |    |    |-- elementType: string (nullable = true)
+  *  |    |    |-- content: string (nullable = true)
+  *  |    |    |-- metadata: map (nullable = true)
+  *  |    |    |    |-- key: string
+  *  |    |    |    |-- value: string (valueContainsNull = true)
+  * }}}
+  *
+  * For more examples refer to:
+  * [[https://github.com/JohnSnowLabs/spark-nlp/examples/python/reader/SparkNLP_XML_Reader_Demo.ipynb notebook]]
+  */
+class XMLReader(
+    storeContent: Boolean = false,
+    xmlKeepTags: Boolean = false,
+    onlyLeafNodes: Boolean = true)
+    extends Serializable {
+
+  private lazy val spark = ResourceHelper.spark
+
+  private var outputColumn = "xml"
+
+  def setOutputColumn(value: String): this.type = {
+    require(value.nonEmpty, "Output column name cannot be empty.")
+    outputColumn = value
+    this
+  }
+
+  def read(inputSource: String): DataFrame = {
+    if (validFile(inputSource)) {
+      val xmlDf = datasetWithTextFile(spark, inputSource)
+        .withColumn(outputColumn, parseXmlUDF(col("content")))
+      if (storeContent) xmlDf.select("path", "content", outputColumn)
+      else xmlDf.select("path", outputColumn)
+    } else throw new IllegalArgumentException(s"Invalid inputSource: $inputSource")
+  }
+
+  private val parseXmlUDF = udf((xml: String) => {
+    parseXml(xml)
+  })
+
+  def parseXml(xmlString: String): List[HTMLElement] = {
+    val xml = XML.loadString(xmlString)
+    val elements = ListBuffer[HTMLElement]()
+
+    def traverse(node: Node, parentId: Option[String]): Unit = {
+      node match {
+        case elem: Elem =>
+          val tagName = elem.label.toLowerCase
+          val textContent = elem.text.trim
+          val elementId = hash(tagName + textContent)
+
+          val isLeaf = !elem.child.exists(_.isInstanceOf[Elem])
+
+          if (!onlyLeafNodes || isLeaf) {
+            val elementType = tagName match {
+              case "title" | "author" => ElementType.TITLE
+              case _ => ElementType.UNCATEGORIZED_TEXT
+            }
+
+            val metadata = mutable.Map[String, String]("elementId" -> elementId)
+            if (xmlKeepTags) metadata += ("tag" -> tagName)
+            parentId.foreach(id => metadata += ("parentId" -> id))
+
+            val content = if (isLeaf) textContent else ""
+            elements += HTMLElement(elementType, content, metadata)
+          }
+
+          // Traverse children
+          elem.child.foreach(traverse(_, Some(elementId)))
+
+        case _ => // Ignore other types
+      }
+    }
+
+    traverse(xml, None)
+    elements.toList
+  }
+
+  def hash(s: String): String = {
+    java.security.MessageDigest
+      .getInstance("MD5")
+      .digest(s.getBytes)
+      .map("%02x".format(_))
+      .mkString
+  }
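+
+  // Usage sketch: parseXml can also be called directly on an in-memory string,
+  // bypassing Spark entirely (the XML literal here is illustrative).
+  //   val elements = new XMLReader(xmlKeepTags = true)
+  //     .parseXml("<book><title>My Book</title><author>Jane Doe</author></book>")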
+
+}
diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala
index 9eddcc5dec52a0..17b67e8fa9712d 100644
--- a/src/main/scala/com/johnsnowlabs/util/Build.scala
+++ b/src/main/scala/com/johnsnowlabs/util/Build.scala
@@ -17,5 +17,5 @@
 package com.johnsnowlabs.util
 
 object Build {
-  val version: String = "6.0.2"
+  val version: String = "6.0.3"
 }
diff --git a/src/test/resources/reader/txt/long-text.txt b/src/test/resources/reader/txt/long-text.txt
new file mode 100644
index 00000000000000..cadaab9be2048e
--- /dev/null
+++ b/src/test/resources/reader/txt/long-text.txt
@@ -0,0 +1 @@
+Ukrainian forces reportedly advanced in the western Donetsk-eastern Zaporizhia Oblast border area and in western Zaporizhia Oblast amid Ukrainian counteroffensive operations in southern and eastern Ukraine. Tavriisk Group of Forces Spokesperson Oleksandr Shtupun reported that Ukrainian forces are advancing in the directions of Novoprokopivka (13km south of Orikhiv), Mala Tokmachka (9km southeast of Orikhiv), and Ocheretuvate (25km southeast of Orikhiv) in western Zaporizhia Oblast.[1] Shtupun also stated that Ukrainian forces advanced near Urozhaine (9km south of Velyka Novosilka) and Robotyne (10km south of Orikhiv) and achieved unspecified successes near Staromayorske (9km south of Velyka Novosilka) in the Berdyansk direction (western Donetsk-eastern Zaporizhia Oblast border area) and in an unspecified location in the Melitopol direction (western Zaporizhia Oblast).[2] Ukrainian Eastern Group of Forces Spokesperson Ilya Yevlash stated that Ukrainian forces continued offensive operations in the Bakhmut direction.[3]
\ No newline at end of file
diff --git a/src/test/resources/reader/xml/multi-level.xml b/src/test/resources/reader/xml/multi-level.xml
new file mode 100644
index 00000000000000..e14e5ad684be30
--- /dev/null
+++ b/src/test/resources/reader/xml/multi-level.xml
@@ -0,0 +1,20 @@
+<library>
+    <section name="novels">
+        <book>
+            <details>
+                <title>The Alchemist</title>
+                <author>Paulo Coelho</author>
+                <year>1988</year>
+            </details>
+        </book>
+    </section>
+    <section name="science">
+        <book>
+            <details>
+                <title>A Brief History of Time</title>
+                <author>Stephen Hawking</author>
+                <year>1988</year>
+            </details>
+        </book>
+    </section>
+</library>
diff --git a/src/test/resources/reader/xml/test.xml b/src/test/resources/reader/xml/test.xml
new file mode 100644
index 00000000000000..44bdab910b4c96
--- /dev/null
+++ b/src/test/resources/reader/xml/test.xml
@@ -0,0 +1,14 @@
+<bookstore>
+  <book>
+    <title>Harry Potter</title>
+    <author>J K. Rowling</author>
+    <year>2005</year>
+    <price>29.99</price>
+  </book>
+  <book>
+    <title>Learning XML</title>
+    <author>Erik T. Ray</author>
+    <year>2003</year>
+    <price>39.95</price>
+  </book>
+</bookstore>
\ No newline at end of file
diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5VEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5VEmbeddingsTestSpec.scala
new file mode 100644
index 00000000000000..9bc4d00f9b7eb2
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5VEmbeddingsTestSpec.scala
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2017-2022 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.nlp.embeddings
+
+import com.johnsnowlabs.nlp.{AssertAnnotations, ImageAssembler}
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
+import com.johnsnowlabs.tags.{FastTest, SlowTest}
+import org.apache.spark.ml.Pipeline
+import org.apache.spark.sql.{DataFrame, Encoder, Encoders}
+import org.apache.spark.sql.functions.{col, lit, size}
+import org.scalatest.flatspec.AnyFlatSpec
+import com.johnsnowlabs.nlp.util.EmbeddingsDataFrameUtils.{emptyImageRow, imageSchema}
+
+class E5VEmbeddingsTestSpec extends AnyFlatSpec {
+  lazy val model = getE5VEmbeddingsPipelineModel
+
+  val textPrompt =
+    "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+  val imagePrompt =
+    "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+
+  "E5V Embeddings" should "correctly embed sentences" taggedAs SlowTest in {
+    val testDF = getTestDF
+    val result = model.transform(testDF)
+
+    result.select("e5v.embeddings").show(true)
+  }
+
+  private def getTestDF: DataFrame = {
+    val imageFolder = "src/test/resources/image1/"
+    val imageDF: DataFrame = ResourceHelper.spark.read
+      .format("image")
+      .option("dropInvalid", value = true)
+      .load(imageFolder)
+
+    val testDF: DataFrame = imageDF.withColumn("text", lit(imagePrompt))
+    val textDesc = "A cat sitting in a box."
+
+    // Create DataFrame with a single null image row
+    val spark = ResourceHelper.spark
+    val nullImageDF =
+      spark.createDataFrame(spark.sparkContext.parallelize(Seq(emptyImageRow)), imageSchema)
+
+    val textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))
+
+    testDF.union(textDF)
+  }
+
+  private def getE5VEmbeddingsPipelineModel = {
+    val testDF = getTestDF
+
+    val imageAssembler: ImageAssembler = new ImageAssembler()
+      .setInputCol("image")
+      .setOutputCol("image_assembler")
+
+    val loadModel = E5VEmbeddings
+      .pretrained()
+      .setInputCols("image_assembler")
+      .setOutputCol("e5v")
+
+    val newPipeline: Pipeline =
+      new Pipeline().setStages(Array(imageAssembler, loadModel))
+
+    val pipelineModel = newPipeline.fit(testDF)
+
+    pipelineModel
+      .transform(testDF)
+      .show(truncate = true)
+
+    pipelineModel
+  }
+}
diff --git a/src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala b/src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala
new file mode 100644
index 00000000000000..4eabfff8b4304a
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/partition/PartitionChunkerTest.scala
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
+import com.johnsnowlabs.tags.FastTest
+import org.apache.spark.sql.functions.explode
+import org.scalatest.flatspec.AnyFlatSpec
+
+class PartitionChunkerTest extends AnyFlatSpec {
+  import ResourceHelper.spark.implicits._
+  val txtDirectory = "src/test/resources/reader/txt"
+  val htmlDirectory = "src/test/resources/reader/html"
+
+  "Partition" should "perform basic text chunking" taggedAs FastTest in {
+    val partitionOptions = Map("contentType" -> "text/plain", "chunkingStrategy" -> "basic")
+    val textDf = Partition(partitionOptions).partition(s"$txtDirectory/long-text.txt")
+    textDf.show(truncate = false)
+
+    val partitionDf = textDf.select(explode($"txt.content"))
+    partitionDf.show(truncate = false)
+    assert(partitionDf.count() == 1)
+
+    val chunkDf = textDf.select(explode($"chunks.content"))
+    chunkDf.show(truncate = false)
+    assert(chunkDf.count() > 1)
+  }
+
+  it should "perform chunking by title" taggedAs FastTest in {
+    val partitionOptions = Map(
+      "contentType" -> "text/html",
+      "titleFontSize" -> "14",
+      "chunkingStrategy" -> "byTitle",
+      "combineTextUnderNChars" -> "50")
+    val textDf = Partition(partitionOptions).partition(s"$htmlDirectory/fake-html.html")
+
+    val partitionDf = textDf.select(explode($"chunks.content"))
+    partitionDf.show(truncate = false)
+    assert(partitionDf.count() == 2)
+  }
+
+}
diff --git a/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala b/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
index 9937b95f59e512..05c5916c843424 100644
--- a/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
+++ b/src/test/scala/com/johnsnowlabs/partition/PartitionTest.scala
@@ -32,6 +32,7 @@ class PartitionTest extends AnyFlatSpec {
   val emailDirectory = "src/test/resources/reader/email"
   val htmlDirectory = "src/test/resources/reader/html"
   val pdfDirectory = "src/test/resources/reader/pdf"
+  val xmlDirectory = "src/test/resources/reader/xml"
 
   "Partition" should "work with text content_type" taggedAs FastTest in {
     val textDf = Partition(Map("content_type" -> "text/plain")).partition(txtDirectory)
@@ -181,4 +182,11 @@ class PartitionTest extends AnyFlatSpec {
     assert(elements == expectedElements)
   }
 
+  it should "work with XML content_type" taggedAs FastTest in {
+    val xmlDf = Partition(Map("content_type" -> "application/xml")).partition(xmlDirectory)
+    xmlDf.show()
+
+    assert(!xmlDf.select(col("xml")).isEmpty)
+  }
+
 }
diff --git a/src/test/scala/com/johnsnowlabs/partition/PartitionTransformerTest.scala b/src/test/scala/com/johnsnowlabs/partition/PartitionTransformerTest.scala
index 474cf4c1b13430..47d6f1d0c8e70b 100644
--- a/src/test/scala/com/johnsnowlabs/partition/PartitionTransformerTest.scala
+++ b/src/test/scala/com/johnsnowlabs/partition/PartitionTransformerTest.scala
@@ -27,10 +27,11 @@ class PartitionTransformerTest extends AnyFlatSpec with SparkSessionTest {
   val wordDirectory = "src/test/resources/reader/doc"
   val emailDirectory = "src/test/resources/reader/email"
   val htmlDirectory = "src/test/resources/reader/html"
+  val txtDirectory = "src/test/resources/reader/txt"
 
   "PartitionTransformer" should "work in a RAG pipeline" taggedAs SlowTest in {
     val partition = new PartitionTransformer()
-      .setInputCols("doc")
+      .setInputCols("text")
       .setContentType("application/msword")
       .setContentPath(s"$wordDirectory/fake_table.docx")
       .setOutputCol("partition")
@@ -46,7 +47,7 @@ class PartitionTransformerTest extends AnyFlatSpec with SparkSessionTest {
     val pipelineModel = pipeline.fit(emptyDataSet)
     val resultDf = pipelineModel.transform(emptyDataSet)
 
-    resultDf.select("doc", "partition", "translation").show(truncate = false)
+    resultDf.select("text", "partition", "translation").show(truncate = false)
 
     assert(resultDf.select("partition").count() > 0)
   }
@@ -167,4 +168,46 @@ class PartitionTransformerTest extends AnyFlatSpec with SparkSessionTest {
     assert(resultDf.select("partition").count() > 0)
   }
 
+  it should "chunk semantically" taggedAs FastTest in {
+    val partition = new PartitionTransformer()
+      .setInputCols("text")
+      .setContentType("text/plain")
+      .setContentPath(s"$txtDirectory")
+      .setOutputCol("chunks")
+      .setChunkingStrategy("basic")
+      .setMaxCharacters(72)
+
+    val pipeline = new Pipeline()
+      .setStages(Array(partition))
+
+    val pipelineModel = pipeline.fit(emptyDataSet)
+    val resultDf = pipelineModel.transform(emptyDataSet)
+    resultDf.show(truncate = false)
+  }
+
+  it should "chunk semantically with document assembler" taggedAs FastTest in {
+    import spark.implicits._
+    val testDataSet = Seq(
+      "Introduction: RAG stands for Retrieval-Augmented Generation." +
+        " Why RAG? It improves factual accuracy and adds fresh or private data to LLMs." +
+        " Chunking: Breaks documents into pieces so they can be embedded." +
+        " Semantic Chunking: Focus on respecting document structure like sections." +
+        " Summary: RAG is powerful when paired with good chunking!").toDS
+      .toDF("text")
+
+    val partition = new PartitionTransformer()
+      .setInputCols("document")
+      .setOutputCol("chunks")
+      .setChunkingStrategy("basic")
+      .setMaxCharacters(140)
+
+    val pipeline = new Pipeline()
+      .setStages(Array(documentAssembler, partition))
+
+    val pipelineModel = pipeline.fit(emptyDataSet)
+    val resultDf = pipelineModel.transform(testDataSet)
+    resultDf.select("chunks").show(truncate = false)
+
+  }
+
 }
diff --git a/src/test/scala/com/johnsnowlabs/partition/TitleChunkerTest.scala b/src/test/scala/com/johnsnowlabs/partition/TitleChunkerTest.scala
new file mode 100644
index 00000000000000..671b5403dcc43a
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/partition/TitleChunkerTest.scala
@@ -0,0 +1,73 @@
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.reader.HTMLElement
+import org.scalatest.flatspec.AnyFlatSpec
+
+import scala.collection.mutable
+
+class TitleChunkerTest extends AnyFlatSpec {
+
+  def element(et: String, text: String, page: Int = 1): HTMLElement =
+    HTMLElement(et, text, mutable.Map("pageNumber" -> page.toString))
+
+  "chunkByTitle" should "include titles in same chunk with following text" in {
+    val elements = List(
+      element("Title", "My First Heading"),
+      element("Title", "My Second Heading"),
+      element("NarrativeText", "My first paragraph. lorem ipsum dolor set amet."),
+      element("Title", "A Third Heading"))
+
+    val result = TitleChunker.chunkByTitle(elements, maxCharacters = 1000)
+
+    assert(result.length == 1)
+    val content = result.head.elements.head.content
+    assert(content.contains("My First Heading"))
+    assert(content.contains("My Second Heading"))
+  }
+
+  it should "split on soft limit newAfterNChars" in {
+    val elements = List(
+      element("Title", "Heading"),
+      element("NarrativeText", "a " * 50),
+      element("NarrativeText", "b " * 50))
+
+    val result = TitleChunker.chunkByTitle(elements, maxCharacters = 1000, newAfterNChars = 100)
+
+    assert(result.length == 2)
+  }
+
+  it should "add overlap context when overlapAll is true" in {
+    val elements = List(
+      element("Title", "Intro"),
+      element("NarrativeText", "The cow jumped over the moon. " * 5),
+      element("Title", "Next Section"),
+      element("NarrativeText", "And the dish ran away with the spoon."))
+
+    val maxCharacters = 100
+    val overlap = 10
+    val result = TitleChunker.chunkByTitle(
+      elements,
+      maxCharacters = maxCharacters,
+      overlap = overlap,
+      overlapAll = true)
+    assert(result.length >= 2)
+
+    val prevText = ("The cow jumped over the moon. " * 5).trim
+    val expectedOverlap = prevText.takeRight(overlap).trim
+    assert(result(1).elements.head.content.contains(expectedOverlap))
+  }
+
+  it should "chunk content correctly across page boundaries" in {
+    val elements = List(
+      element("Title", "Page 1 Heading"),
+      element("NarrativeText", "Text on page 1."),
+      element("Title", "Page 2 Heading", page = 2),
+      element("NarrativeText", "Text on page 2.", page = 2))
+
+    val result = TitleChunker.chunkByTitle(elements, maxCharacters = 1000)
+    assert(result.length == 2)
+    assert(result(0).elements.head.content.contains("Page 1 Heading"))
+    assert(result(1).elements.head.content.contains("Page 2 Heading"))
+  }
+
+}
diff --git a/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala b/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala
new file mode 100644
index 00000000000000..a75537803e61de
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/reader/XMLReaderTest.scala
@@ -0,0 +1,43 @@
+package com.johnsnowlabs.reader
+
+import com.johnsnowlabs.tags.FastTest
+import org.apache.spark.sql.functions.{array_contains, col, explode, map_keys}
+import org.scalatest.flatspec.AnyFlatSpec
+
+class XMLReaderTest extends AnyFlatSpec {
+
+  val xmlFilesDirectory = "./src/test/resources/reader/xml/"
+
+  "XMLReader" should "read xml as dataframe" taggedAs FastTest in {
+    val xmlReader = new XMLReader()
+    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/test.xml")
+    xmlDF.show(truncate = false)
+
+    assert(!xmlDF.select(col("xml").getItem(0)).isEmpty)
+    assert(!xmlDF.columns.contains("content"))
+  }
+
+  it should "include tags in the output" taggedAs FastTest in {
+    val xmlReader = new XMLReader(xmlKeepTags = true)
+    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/multi-level.xml")
+    xmlDF.show(truncate = false)
+
+    val explodedDf = xmlDF.withColumn("xml_exploded", explode(col("xml")))
+    val tagsDf = explodedDf.filter(col("xml_exploded.metadata")("tag") =!= "")
+
+    assert(tagsDf.count() > 0)
+  }
+
+  it should "output all nodes" taggedAs FastTest in {
+    val xmlReader = new XMLReader(onlyLeafNodes = false)
+    val xmlDF = xmlReader.read(s"$xmlFilesDirectory/multi-level.xml")
+    xmlDF.show(truncate = false)
+    val explodedDf = xmlDF.withColumn("xml_exploded", explode(col("xml")))
+
+    val noParentIdDf = explodedDf
+      .filter(!array_contains(map_keys(col("xml_exploded.metadata")), "parentId"))
+
+    assert(noParentIdDf.count() > 0)
+  }
+
+}