JohnSnowLabs · danilojsl · May 29, 2025 · May 29, 2025
diff --git a/python/sparknlp/partition/partition_properties.py b/python/sparknlp/partition/partition_properties.py
@@ -254,4 +254,46 @@ def setThreshold(self, value):
         return self._set(threshold=value)
 
     def getThreshold(self):
-        return self.getOrDefault(self.threshold)
+        return self.getOrDefault(self.threshold)
+
+class HasSemanticChunkerProperties(Params):
+    """Mixin declaring the parameters that control chunking of partitioned content.
+
+    Each parameter has a chainable setter and a matching getter.
+    """
+
+    chunkingStrategy = Param(
+        Params._dummy(),
+        "chunkingStrategy",
+        "Set the chunking strategy",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setChunkingStrategy(self, value):
+        """Sets the chunking strategy.
+
+        Parameters
+        ----------
+        value : str
+            Name of the chunking strategy.
+        """
+        return self._set(chunkingStrategy=value)
+
+    def getChunkingStrategy(self):
+        """Gets the chunking strategy."""
+        return self.getOrDefault(self.chunkingStrategy)
+
+    maxCharacters = Param(
+        Params._dummy(),
+        "maxCharacters",
+        "Set the maximum number of characters",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setMaxCharacters(self, value):
+        """Sets the maximum number of characters.
+
+        Parameters
+        ----------
+        value : int
+            Maximum number of characters.
+        """
+        return self._set(maxCharacters=value)
+
+    def getMaxCharacters(self):
+        """Gets the maximum number of characters."""
+        return self.getOrDefault(self.maxCharacters)
+
+    newAfterNChars = Param(
+        Params._dummy(),
+        "newAfterNChars",
+        "Insert a new chunk after N characters",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setNewAfterNChars(self, value):
+        """Sets the number of characters after which a new chunk is inserted.
+
+        Parameters
+        ----------
+        value : int
+            Number of characters after which a new chunk is inserted.
+        """
+        return self._set(newAfterNChars=value)
+
+    def getNewAfterNChars(self):
+        """Gets the number of characters after which a new chunk is inserted."""
+        return self.getOrDefault(self.newAfterNChars)
+
+    overlap = Param(
+        Params._dummy(),
+        "overlap",
+        "Set the number of overlapping characters between chunks",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setOverlap(self, value):
+        """Sets the number of overlapping characters between chunks.
+
+        Parameters
+        ----------
+        value : int
+            Number of overlapping characters between chunks.
+        """
+        return self._set(overlap=value)
+
+    def getOverlap(self):
+        """Gets the number of overlapping characters between chunks."""
+        return self.getOrDefault(self.overlap)
diff --git a/python/sparknlp/partition/partition_transformer.py b/python/sparknlp/partition/partition_transformer.py
@@ -15,13 +15,15 @@
 from sparknlp.common import *
 from sparknlp.partition.partition_properties import *
 
+
 class PartitionTransformer(
     AnnotatorModel,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
     HasHTMLReaderProperties,
     HasPowerPointProperties,
-    HasTextReaderProperties
+    HasTextReaderProperties,
+    HasSemanticChunkerProperties
 ):
     """
     The PartitionTransformer annotator allows you to use the Partition feature more smoothly
@@ -162,10 +164,6 @@ def setIncludePageBreaks(self, value):
     def getIncludePageBreaks(self):
         return self.getOrDefault(self.includePageBreaks)
 
-    # def setHeaders(self, headers: Dict[str, str]):
-    #     self._call_java("setHeadersPython", headers)
-    #     return self
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
                  java_model=None):
@@ -192,5 +190,9 @@ def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
             paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
             shortLineWordThreshold=5,
             maxLineCount=2000,
-            threshold=0.1
-        )
+            threshold=0.1,
+            chunkingStrategy="",
+            maxCharacters=100,
+            newAfterNChars=-1,
+            overlap=0
+        )
diff --git a/python/test/partition/partition_transformer_test.py b/python/test/partition/partition_transformer_test.py
@@ -11,6 +11,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import os
 import unittest
 
 import pytest
@@ -80,4 +81,33 @@ def runTest(self):
         resultDf = pipelineModel.transform(self.testDataSet)
         resultDf.show(truncate=False)
 
-        self.assertTrue(resultDf.select("partition").count() > 0)
+        self.assertTrue(resultDf.select("partition").count() > 0)
+
+
+@pytest.mark.slow
+class PartitionTransformerChunkTestSpec(unittest.TestCase):
+    """Runs PartitionTransformer with the "basic" chunking strategy on a text file."""
+
+    def setUp(self):
+        self.spark = SparkContextForTest.spark
+        self.content_path = f"file:///{os.getcwd()}/../src/test/resources/reader/txt/rag-example.txt"
+        self.testDataSet = self.spark.createDataFrame(
+            [("An example with DocumentAssembler annotator",)],
+            ["text"]
+        )
+        # The pipeline is fitted and run on an empty DataFrame that reuses the schema above.
+        self.emptyDataSet = self.spark.createDataFrame([], self.testDataSet.schema)
+
+    def runTest(self):
+        partition = PartitionTransformer() \
+            .setInputCols(["document"]) \
+            .setContentPath(self.content_path) \
+            .setOutputCol("partition") \
+            .setChunkingStrategy("basic") \
+            .setMaxCharacters(140)
+
+        pipeline = Pipeline(stages=[partition])
+        pipelineModel = pipeline.fit(self.emptyDataSet)
+
+        resultDf = pipelineModel.transform(self.emptyDataSet)
+        resultDf.show(truncate=False)
+
+        # Without an assertion the test could only fail by raising; check that the
+        # output column configured through setOutputCol is part of the result.
+        self.assertIn("partition", resultDf.columns)
diff --git a/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala b/src/main/scala/com/johnsnowlabs/partition/BasicChunker.scala
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.reader.HTMLElement
+
+import scala.collection.mutable
+
+/** A group of [[HTMLElement]]s that belong to the same chunk.
+  *
+  * @param elements
+  *   The elements grouped into this chunk.
+  */
+case class Chunk(elements: List[HTMLElement]) {
+
+  /** Total number of characters over the content of all elements in this chunk. */
+  def length: Int = elements.foldLeft(0)((total, element) => total + element.content.length)
+}
+
+object BasicChunker {
+
+  /** Splits a list of [[HTMLElement]]s into chunks constrained by a maximum number of characters.
+    *
+    * Elements are accumulated in order. A new chunk is started when adding the next element would
+    * exceed `maxCharacters`, or when the current chunk has already reached the soft limit. An
+    * element longer than `maxCharacters` is first split on spaces and its pieces are then
+    * accumulated like regular elements; `overlap` retains trailing characters of one piece at the
+    * start of the next one.
+    *
+    * The hard limit is best effort for split elements: a single word longer than `maxCharacters`
+    * is never broken, and overlap text is prepended to a piece without being re-checked against
+    * the limit.
+    *
+    * @param elements
+    *   The list of [[HTMLElement]]s to be chunked.
+    * @param maxCharacters
+    *   The hard limit on the number of characters per chunk.
+    * @param newAfterNChars
+    *   Optional soft limit for starting a new chunk before reaching `maxCharacters`. Any value
+    *   that is not positive (such as the default -1) disables the soft limit.
+    * @param overlap
+    *   Number of trailing characters to overlap between pieces when splitting long elements.
+    * @return
+    *   A list of non-empty [[Chunk]] objects, in the order of the input elements.
+    */
+  def chunkBasic(
+      elements: List[HTMLElement],
+      maxCharacters: Int,
+      newAfterNChars: Int = -1,
+      overlap: Int = 0): List[Chunk] = {
+    // Without a positive soft limit, only the hard limit decides when a chunk is closed.
+    val softLimit = if (newAfterNChars > 0) newAfterNChars else maxCharacters
+    val chunks = mutable.ListBuffer.empty[Chunk]
+    // ListBuffer gives constant-time appends while a chunk is being assembled.
+    val currentChunk = mutable.ListBuffer.empty[HTMLElement]
+    var currentLength = 0
+
+    // Closes the chunk under construction, if it holds anything, and starts a fresh one.
+    def finalizeChunk(): Unit = {
+      if (currentChunk.nonEmpty) {
+        chunks += Chunk(currentChunk.toList)
+        currentChunk.clear()
+        currentLength = 0
+      }
+    }
+
+    // Adds one element, first closing the current chunk when a limit would be crossed.
+    def append(element: HTMLElement): Unit = {
+      val elLength = element.content.length
+      if (currentLength + elLength > maxCharacters || currentLength >= softLimit)
+        finalizeChunk()
+      currentChunk += element
+      currentLength += elLength
+    }
+
+    for (element <- elements) {
+      if (element.content.length > maxCharacters)
+        splitHTMLElement(element, maxCharacters, overlap).foreach(append)
+      else
+        append(element)
+    }
+
+    finalizeChunk()
+    chunks.toList
+  }
+
+  /** Splits the content of one element on single spaces into pieces of roughly `maxLen`
+    * characters, copying the element for every piece.
+    *
+    * Blank pieces are never emitted. When `overlap` is positive and the piece just emitted is at
+    * least that long, its last `overlap` characters are prepended to the next piece.
+    *
+    * @param element
+    *   The element whose content is split; every piece is a copy with only `content` replaced.
+    * @param maxLen
+    *   Target maximum length of a piece.
+    * @param overlap
+    *   Number of trailing characters carried over from one piece to the next.
+    * @return
+    *   The pieces in their original order.
+    */
+  private def splitHTMLElement(
+      element: HTMLElement,
+      maxLen: Int,
+      overlap: Int): List[HTMLElement] = {
+    val words = element.content.split(" ")
+    val buffer = mutable.ListBuffer.empty[HTMLElement]
+    var chunk = new StringBuilder
+
+    // Skips blank text so that an over-long first word or repeated spaces cannot produce an
+    // element with empty content.
+    def emit(text: String): Unit = {
+      if (text.nonEmpty) buffer += element.copy(content = text)
+    }
+
+    for (word <- words) {
+      // The accumulator always ends with a space, and one more is counted after the word.
+      if (chunk.length + word.length + 1 > maxLen) {
+        val text = chunk.toString().trim
+        emit(text)
+        chunk = new StringBuilder
+        if (overlap > 0 && text.length >= overlap)
+          chunk.append(text.takeRight(overlap)).append(" ")
+      }
+      chunk.append(word).append(" ")
+    }
+
+    emit(chunk.toString().trim)
+
+    buffer.toList
+  }
+}
diff --git a/src/main/scala/com/johnsnowlabs/partition/HasSemanticChunkerProperties.scala b/src/main/scala/com/johnsnowlabs/partition/HasSemanticChunkerProperties.scala
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2017-2025 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.johnsnowlabs.partition
+
+import com.johnsnowlabs.nlp.ParamsAndFeaturesWritable
+import org.apache.spark.ml.param.{IntParam, Param}
+
+/** Parameters that configure how partitioned content is grouped into chunks.
+  *
+  * The integer parameters are declared as `IntParam` instead of `Param[Int]`: Spark's default
+  * `Param.jsonEncode` only supports strings, vectors and matrices, so a plain `Param[Int]` cannot
+  * be encoded when the parameters of a stage are written out.
+  */
+trait HasSemanticChunkerProperties extends ParamsAndFeaturesWritable {
+
+  /** Chunking strategy (Default: empty string). */
+  val chunkingStrategy = new Param[String](this, "chunkingStrategy", "Set the chunking strategy")
+
+  def setChunkingStrategy(value: String): this.type = set(chunkingStrategy, value)
+
+  /** Maximum number of characters (Default: 100). */
+  val maxCharacters =
+    new IntParam(this, "maxCharacters", "Set the maximum number of characters")
+
+  def setMaxCharacters(value: Int): this.type = set(maxCharacters, value)
+
+  /** Number of characters after which a new chunk is inserted (Default: -1). */
+  val newAfterNChars =
+    new IntParam(this, "newAfterNChars", "Insert a new chunk after N characters")
+
+  def setNewAfterNChars(value: Int): this.type = set(newAfterNChars, value)
+
+  /** Number of overlapping characters between chunks (Default: 0). */
+  val overlap =
+    new IntParam(this, "overlap", "Set the number of overlapping characters between chunks")
+
+  def setOverlap(value: Int): this.type = set(overlap, value)
+
+  // Must stay below the declarations above: the params have to be initialized before defaults
+  // can be registered for them.
+  setDefault(chunkingStrategy -> "", maxCharacters -> 100, newAfterNChars -> -1, overlap -> 0)
+
+}
diff --git a/src/main/scala/com/johnsnowlabs/partition/Partition.scala b/src/main/scala/com/johnsnowlabs/partition/Partition.scala
@@ -144,7 +144,13 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       case None => getReaderByExtension(path, sparkNLPReader)
     }
 
-    reader(path)
+    val partitionResult = reader(path)
+    if (hasChunkerStrategy) {
+      val chunker = new SemanticChunker(params.asScala.toMap)
+      partitionResult.withColumn(
+        "chunks",
+        chunker.chunkUDF()(partitionResult(sparkNLPReader.getOutputColumn)))
+    } else partitionResult
   }
 
   def partitionStringContent(
@@ -342,6 +348,11 @@ class Partition(params: java.util.Map[String, String] = new java.util.HashMap())
       .headOption
   }
 
+  /** Whether a chunking strategy entry is present in `params`, under either its snake_case or
+    * its camelCase key. Only the presence of the key is checked, not its value.
+    */
+  private def hasChunkerStrategy: Boolean =
+    params.containsKey("chunking_strategy") || params.containsKey("chunkingStrategy")
+
 }
 
 object Partition {