
Commit 0f6de01

Merge pull request #10567 from DevinTDHa/feature/sentence-detector-delimiters
[SentenceDetector] Added Flag for returning custom bounds
2 parents 4fa2435 + 81905ff commit 0f6de01

File tree: 6 files changed, +285 −17 lines


python/sparknlp/annotator/sentence/sentence_detector.py

Lines changed: 32 additions & 2 deletions
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Contains classes for the SentenceDetector."""

-
 from sparknlp.common import *


@@ -36,6 +35,11 @@ class SentenceDetectorParams:
                                "Only utilize custom bounds in sentence detection",
                                typeConverter=TypeConverters.toBoolean)

+    customBoundsStrategy = Param(Params._dummy(),
+                                 "customBoundsStrategy",
+                                 "How to return matched custom bounds",
+                                 typeConverter=TypeConverters.toString)
+
     explodeSentences = Param(Params._dummy(),
                              "explodeSentences",
                              "whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
@@ -106,6 +110,15 @@ class SentenceDetector(AnnotatorModel, SentenceDetectorParams):
         characters used to explicitly mark sentence bounds, by default []
     useCustomBoundsOnly
         Only utilize custom bounds in sentence detection, by default False
+    customBoundsStrategy
+        Sets how to return matched custom bounds, by default "none".
+
+        Will have no effect if no custom bounds are used.
+        Possible values are:
+
+        - "none" - Will not return the matched bound
+        - "prepend" - Prepends a sentence break to the match
+        - "append" - Appends a sentence break to the match
     explodeSentences
         whether to explode each sentence into a different row, for better
         parallelization, by default False
@@ -166,6 +179,23 @@ def setCustomBounds(self, value):
         """
         return self._set(customBounds=value)

+    def setCustomBoundsStrategy(self, value):
+        """Sets how to return matched custom bounds, by default "none".
+
+        Will have no effect if no custom bounds are used.
+        Possible values are:
+
+        - "none" - Will not return the matched bound
+        - "prepend" - Prepends a sentence break to the match
+        - "append" - Appends a sentence break to the match
+
+        Parameters
+        ----------
+        value : str
+            Strategy to use
+        """
+        return self._set(customBoundsStrategy=value)
+
     def setUseAbbreviations(self, value):
         """Sets whether to apply abbreviations at sentence detection, by default
         True
@@ -249,8 +279,8 @@ def __init__(self):
             detectLists=True,
             useCustomBoundsOnly=False,
             customBounds=[],
+            customBoundsStrategy="none",
             explodeSentences=False,
             minLength=0,
             maxLength=99999
         )
-

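Not part of the commit, but for orientation: a minimal end-to-end sketch of the new setter, mirroring the test case added below. It assumes Spark NLP is installed and a SparkSession can be started with sparknlp.start(); the expected output is the test's expected_append list.

    import sparknlp
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import SentenceDetector

    spark = sparknlp.start()  # assumes Spark NLP is available locally

    data = spark.createDataFrame(
        [["This is a sentence. This one uses custom bounds; As is this one;"]]
    ).toDF("text")

    document = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document") \
        .transform(data)

    # "append" keeps the matched delimiter at the end of each sentence
    detector = SentenceDetector() \
        .setInputCols("document") \
        .setOutputCol("sentence") \
        .setCustomBounds([r"\.", ";"]) \
        .setUseCustomBoundsOnly(True) \
        .setCustomBoundsStrategy("append")

    detector.transform(document).selectExpr("explode(sentence.result)").show(truncate=False)
    # This is a sentence.
    # This one uses custom bounds;
    # As is this one;
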
python/test/annotator/sentence/pragmatic_test.py

Lines changed: 80 additions & 2 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import textwrap
 import unittest

 import pytest
@@ -62,8 +63,9 @@ def runTest(self):
         lemmatizer = Lemmatizer() \
             .setInputCols(["token"]) \
             .setOutputCol("lemma") \
-            .setDictionary(path="file:///" + os.getcwd() + "/../src/test/resources/lemma-corpus-small/lemmas_small.txt",
-                           key_delimiter="->", value_delimiter="\t")
+            .setDictionary(
+                path="file:///" + os.getcwd() + "/../src/test/resources/lemma-corpus-small/lemmas_small.txt",
+                key_delimiter="->", value_delimiter="\t")
         sentiment_detector = SentimentDetector() \
             .setInputCols(["lemma", "sentence"]) \
             .setOutputCol("sentiment") \
@@ -76,3 +78,79 @@ def runTest(self):
         lemmatized = lemmatizer.fit(tokenized).transform(tokenized)
         sentiment_detector.fit(lemmatized).transform(lemmatized).show()

+
+@pytest.mark.fast
+class PragmaticSBDReturnCustomBoundsTestSpec(unittest.TestCase):
+
+    def create_data(self, data):
+        return SparkContextForTest.spark.createDataFrame([[data]]).toDF("text")
+
+    def runTest(self):
+        def assert_sentence_bounds(sent, sd, expected_sentence):
+            doc_assembler = DocumentAssembler() \
+                .setInputCol("text") \
+                .setOutputCol("document")
+
+            data = self.create_data(sent)
+            doc = doc_assembler.transform(data)
+
+            result = sd.transform(doc).select("sentence.result").first()["result"]
+
+            for sent, exp in zip(result, expected_sentence):
+                assert sent == exp
+
+        example = "This is a sentence. This one uses custom bounds; As is this one;"
+
+        sentence_detector_default = SentenceDetector() \
+            .setInputCols("document") \
+            .setOutputCol("sentence") \
+            .setCustomBounds([r"\.", ";"]) \
+            .setUseCustomBoundsOnly(True)
+
+        expected_default = ["This is a sentence", "This one uses custom bounds",
+                            "As is this one"]
+
+        assert_sentence_bounds(example, sentence_detector_default, expected_default)
+
+        sentence_detector = SentenceDetector() \
+            .setInputCols("document") \
+            .setOutputCol("sentence") \
+            .setCustomBounds([r"\.", ";"]) \
+            .setUseCustomBoundsOnly(True) \
+            .setCustomBoundsStrategy("append")
+
+        sentence_detector_mixed = SentenceDetector() \
+            .setInputCols("document") \
+            .setOutputCol("sentence") \
+            .setCustomBounds([";"]) \
+            .setCustomBoundsStrategy("append")
+
+        expected_append = ["This is a sentence.", "This one uses custom bounds;",
+                           "As is this one;"]
+
+        assert_sentence_bounds(example, sentence_detector, expected_append)
+        assert_sentence_bounds(example, sentence_detector_mixed, expected_append)
+
+        subHeaderList = textwrap.dedent(
+            """
+            1. This is a list
+            1.1 This is a subpoint
+            2. Second thing
+            2.2 Second subthing
+            """
+        )
+
+        sentence_detector_prepend = SentenceDetector() \
+            .setInputCols("document") \
+            .setOutputCol("sentence") \
+            .setCustomBounds([r"\n[\d\. ]+"]) \
+            .setUseCustomBoundsOnly(True) \
+            .setCustomBoundsStrategy("prepend")
+
+        expectedPrepend = [
+            "1. This is a list",
+            "1.1 This is a subpoint",
+            "2. Second thing",
+            "2.2 Second subthing"]
+        assert_sentence_bounds(subHeaderList, sentence_detector_prepend,
+                               expectedPrepend)

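A side note on the append test above (my reading, not part of the commit): sentence_detector and sentence_detector_mixed can share expected_append because in the mixed case only ";" is a custom bound, and the split after "sentence." comes from the default pragmatic rules, which keep sentence-final punctuation regardless of the strategy. Leaving the strategy at its "none" default should make the difference visible; the commented output below is inferred from the documented behavior, not asserted by the suite.

    # Mixed mode, strategy left at "none": matched semicolons are consumed,
    # while the default rules still keep the period.
    detector_mixed_none = SentenceDetector() \
        .setInputCols("document") \
        .setOutputCol("sentence") \
        .setCustomBounds([";"])
    # Inferred output for the example text:
    # ["This is a sentence.", "This one uses custom bounds", "As is this one"]
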
src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/SentenceDetectorParams.scala

Lines changed: 39 additions & 2 deletions
@@ -16,7 +16,7 @@

 package com.johnsnowlabs.nlp.annotators.sbd

-import org.apache.spark.ml.param.{BooleanParam, IntParam, Params, StringArrayParam}
+import org.apache.spark.ml.param.{BooleanParam, IntParam, Param, Params, StringArrayParam}

 import scala.collection.mutable.ArrayBuffer

@@ -93,13 +93,26 @@ trait SentenceDetectorParams extends Params {
   val maxLength =
     new IntParam(this, "maxLength", "Set the maximum allowed length for each sentence")

+  /** How to return matched custom bounds (Default: `none`). Will have no effect if no custom
+    * bounds are used. Possible values are:
+    *
+    *   - "none" - Will not return the matched bound
+    *   - "prepend" - Prepends a sentence break to the match
+    *   - "append" - Appends a sentence break to the match
+    *
+    * @group param
+    */
+  val customBoundsStrategy: Param[String] =
+    new Param[String](this, "customBoundsStrategy", "How to return matched custom bounds")
+
   setDefault(
     useAbbrevations -> true,
     detectLists -> true,
     useCustomBoundsOnly -> false,
     explodeSentences -> false,
     customBounds -> Array.empty[String],
-    minLength -> 0)
+    minLength -> 0,
+    customBoundsStrategy -> "none")

   /** Custom sentence separator text
     * @group setParam
@@ -123,6 +136,30 @@ trait SentenceDetectorParams extends Params {
     */
   def getUseCustomBoundsOnly: Boolean = $(useCustomBoundsOnly)

+  /** Sets how to return matched custom bounds (Default: `none`). Will have no effect if no custom
+    * bounds are used. Possible values are:
+    *
+    *   - "none" - Will not return the matched bound
+    *   - "prepend" - Prepends a sentence break to the match
+    *   - "append" - Appends a sentence break to the match
+    *
+    * @group setParam
+    */
+  def setCustomBoundsStrategy(value: String): this.type = {
+    val possibleValues = Array("none", "prepend", "append")
+    require(
+      possibleValues.contains(value),
+      s"$value is not a valid strategy for custom bounds. " +
+        s"Possible Values: (${possibleValues.mkString(", ")}).")
+
+    set(customBoundsStrategy, value)
+  }
+
+  /** Gets how to return matched custom bounds (Default: `none`).
+    * @group getParam
+    */
+  def getCustomBoundsStrategy: String = $(customBoundsStrategy)
+
   /** Whether to consider abbreviation strategies for better accuracy but slower performance.
     * Defaults to true.
     * @group setParam

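One asymmetry worth flagging: the Scala setter above validates eagerly through require, while the Python setter added in this commit only stores the string. From Python, an invalid value would therefore presumably surface later, when SentenceDetector builds its model on the JVM (see the match expression later in this commit). A sketch of that assumption, reusing the document DataFrame from the first sketch; the exact Py4J-wrapped exception type is not something this diff specifies.

    detector_bad = SentenceDetector() \
        .setInputCols("document") \
        .setOutputCol("sentence") \
        .setCustomBounds([";"]) \
        .setCustomBoundsStrategy("sideways")  # accepted as-is by the Python setter

    try:
        detector_bad.transform(document).collect()  # JVM-side model is built here
    except Exception as err:
        # expected to mention: "sideways is not a valid strategy for custom bounds"
        print(err)
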
src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticMethod.scala

Lines changed: 11 additions & 5 deletions
@@ -19,7 +19,10 @@ package com.johnsnowlabs.nlp.annotators.sbd.pragmatic
 import com.johnsnowlabs.nlp.annotators.common.Sentence
 import com.johnsnowlabs.nlp.util.regex.MatchStrategy.MATCH_ALL
 import com.johnsnowlabs.nlp.util.regex.RuleFactory
-import com.johnsnowlabs.nlp.util.regex.TransformStrategy.REPLACE_ALL_WITH_SYMBOL
+import com.johnsnowlabs.nlp.util.regex.TransformStrategy.{
+  REPLACE_ALL_WITH_SYMBOL,
+  TransformStrategy
+}

 protected trait PragmaticMethod {
   def extractBounds(content: String): Array[Sentence]
@@ -29,12 +32,14 @@ protected trait PragmaticMethod {
  * This approach extracts sentence bounds by first formatting the data with [[RuleSymbols]] and
  * then extracting bounds with a strong RegexBased rule application
  */
-class CustomPragmaticMethod(customBounds: Array[String])
+class CustomPragmaticMethod(
+    customBounds: Array[String],
+    transformStrategy: TransformStrategy = REPLACE_ALL_WITH_SYMBOL)
     extends PragmaticMethod
     with Serializable {
   override def extractBounds(content: String): Array[Sentence] = {

-    val customBoundsFactory = new RuleFactory(MATCH_ALL, REPLACE_ALL_WITH_SYMBOL)
+    val customBoundsFactory = new RuleFactory(MATCH_ALL, transformStrategy)
     customBounds.foreach(bound => customBoundsFactory.addRule(bound.r, s"split bound: $bound"))

     val symbolyzedData = new PragmaticContentFormatter(content)
@@ -74,10 +79,11 @@ class DefaultPragmaticMethod(useAbbreviations: Boolean = false, detectLists: Boo
 class MixedPragmaticMethod(
     useAbbreviations: Boolean = false,
     detectLists: Boolean = true,
-    customBounds: Array[String])
+    customBounds: Array[String],
+    transformStrategy: TransformStrategy = REPLACE_ALL_WITH_SYMBOL)
     extends PragmaticMethod
     with Serializable {
-  val customBoundsFactory = new RuleFactory(MATCH_ALL, REPLACE_ALL_WITH_SYMBOL)
+  val customBoundsFactory = new RuleFactory(MATCH_ALL, transformStrategy)
   customBounds.foreach(bound => customBoundsFactory.addRule(bound.r, s"split bound: $bound"))

   /** this is a hardcoded order of operations considered to go from those most specific

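To make the three TransformStrategy values concrete, here is a plain-Python analogy; it is not Spark NLP's implementation (RuleFactory and RuleSymbols are only mimicked by a visible break marker), just the same substitution idea in miniature.

    import re

    BREAK = "\u0001"  # stand-in for the internal sentence-break symbol

    def apply_strategy(text, bound, strategy):
        if strategy == "none":     # REPLACE_ALL_WITH_SYMBOL: the match is consumed
            return re.sub(bound, BREAK, text)
        if strategy == "prepend":  # PREPEND_WITH_SYMBOL: break goes before the match
            return re.sub(bound, lambda m: BREAK + m.group(0), text)
        if strategy == "append":   # APPEND_WITH_SYMBOL: break goes after the match
            return re.sub(bound, lambda m: m.group(0) + BREAK, text)
        raise ValueError(f"{strategy} is not a valid strategy for custom bounds.")

    text = "This one uses custom bounds; As is this one;"
    print(apply_strategy(text, ";", "none").split(BREAK))
    # ['This one uses custom bounds', ' As is this one', '']
    print(apply_strategy(text, ";", "append").split(BREAK))
    # ['This one uses custom bounds;', ' As is this one;', '']
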
src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetector.scala

Lines changed: 20 additions & 5 deletions
@@ -18,6 +18,7 @@ package com.johnsnowlabs.nlp.annotators.sbd.pragmatic

 import com.johnsnowlabs.nlp.annotators.common.{Sentence, SentenceSplit}
 import com.johnsnowlabs.nlp.annotators.sbd.SentenceDetectorParams
+import com.johnsnowlabs.nlp.util.regex.TransformStrategy
 import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
 import org.apache.spark.sql.{DataFrame, Dataset}
@@ -137,11 +138,25 @@ class SentenceDetector(override val uid: String)
   override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)

   lazy val model: PragmaticMethod =
-    if ($(customBounds).nonEmpty && $(useCustomBoundsOnly))
-      new CustomPragmaticMethod($(customBounds))
-    else if ($(customBounds).nonEmpty)
-      new MixedPragmaticMethod($(useAbbrevations), $(detectLists), $(customBounds))
-    else
+    if ($(customBounds).nonEmpty) {
+      val transformStrategy = $(customBoundsStrategy) match {
+        case "none" => TransformStrategy.REPLACE_ALL_WITH_SYMBOL
+        case "prepend" => TransformStrategy.PREPEND_WITH_SYMBOL
+        case "append" => TransformStrategy.APPEND_WITH_SYMBOL
+        case _ =>
+          throw new IllegalArgumentException(
+            s"${$(customBoundsStrategy)} is not a valid strategy for custom bounds. " +
+              s"Possible Values: (none, prepend, append).")
+      }
+
+      if ($(useCustomBoundsOnly)) new CustomPragmaticMethod($(customBounds), transformStrategy)
+      else
+        new MixedPragmaticMethod(
+          $(useAbbrevations),
+          $(detectLists),
+          $(customBounds),
+          transformStrategy)
+    } else
       new DefaultPragmaticMethod($(useAbbrevations), $(detectLists))

   def tag(document: String): Array[Sentence] = {

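Summing up the dispatch in the hunk above; the comments restate the new logic, nothing here is asserted beyond the diff.

    # Which PragmaticMethod backs the detector after this commit:
    SentenceDetector()                     # DefaultPragmaticMethod
    SentenceDetector() \
        .setCustomBounds([";"])            # MixedPragmaticMethod (default rules + custom bounds)
    SentenceDetector() \
        .setCustomBounds([";"]) \
        .setUseCustomBoundsOnly(True)      # CustomPragmaticMethod (custom bounds only)
    # In both custom cases, customBoundsStrategy selects the TransformStrategy:
    # "none" -> REPLACE_ALL_WITH_SYMBOL, "prepend" -> PREPEND_WITH_SYMBOL,
    # "append" -> APPEND_WITH_SYMBOL; any other value raises IllegalArgumentException
    # when the lazy model is first materialized.
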