
Commit 0f6de01

Merge pull request #10567 from DevinTDHa/feature/sentence-detector-delimiters
[SentenceDetector] Added Flag for returning custom bounds
2 parents 4fa2435 + 81905ff commit 0f6de01

File tree: 6 files changed, +285 −17 lines


python/sparknlp/annotator/sentence/sentence_detector.py

Lines changed: 32 additions & 2 deletions
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Contains classes for the SentenceDetector."""

-
 from sparknlp.common import *


@@ -36,6 +35,11 @@ class SentenceDetectorParams:
                                "Only utilize custom bounds in sentence detection",
                                typeConverter=TypeConverters.toBoolean)

+    customBoundsStrategy = Param(Params._dummy(),
+                                 "customBoundsStrategy",
+                                 "How to return matched custom bounds",
+                                 typeConverter=TypeConverters.toString)
+
     explodeSentences = Param(Params._dummy(),
                              "explodeSentences",
                              "whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
@@ -106,6 +110,15 @@ class SentenceDetector(AnnotatorModel, SentenceDetectorParams):
         characters used to explicitly mark sentence bounds, by default []
     useCustomBoundsOnly
         Only utilize custom bounds in sentence detection, by default False
+    customBoundsStrategy
+        Sets how to return matched custom bounds, by default "none".
+
+        Will have no effect if no custom bounds are used.
+        Possible values are:
+
+        - "none" - Will not return the matched bound
+        - "prepend" - Prepends a sentence break to the match
+        - "append" - Appends a sentence break to the match
     explodeSentences
         whether to explode each sentence into a different row, for better
         parallelization, by default False
@@ -166,6 +179,23 @@ def setCustomBounds(self, value):
         """
         return self._set(customBounds=value)

+    def setCustomBoundsStrategy(self, value):
+        """Sets how to return matched custom bounds, by default "none".
+
+        Will have no effect if no custom bounds are used.
+        Possible values are:
+
+        - "none" - Will not return the matched bound
+        - "prepend" - Prepends a sentence break to the match
+        - "append" - Appends a sentence break to the match
+
+        Parameters
+        ----------
+        value : str
+            Strategy to use
+        """
+        return self._set(customBoundsStrategy=value)
+
     def setUseAbbreviations(self, value):
         """Sets whether to apply abbreviations at sentence detection, by default
         True
@@ -249,8 +279,8 @@ def __init__(self):
             detectLists=True,
             useCustomBoundsOnly=False,
             customBounds=[],
+            customBoundsStrategy="none",
             explodeSentences=False,
             minLength=0,
             maxLength=99999
         )
-

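Not part of the commit, but for orientation: a minimal end-to-end sketch of the new setter, mirroring the test case added below. It assumes Spark NLP is installed and a SparkSession can be started with sparknlp.start(); the expected output is the test's expected_append list.

    import sparknlp
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import SentenceDetector

    spark = sparknlp.start()  # assumes Spark NLP is available locally

    data = spark.createDataFrame(
        [["This is a sentence. This one uses custom bounds; As is this one;"]]
    ).toDF("text")

    document = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document") \
        .transform(data)

    # "append" keeps the matched delimiter at the end of each sentence
    detector = SentenceDetector() \
        .setInputCols("document") \
        .setOutputCol("sentence") \
        .setCustomBounds([r"\.", ";"]) \
        .setUseCustomBoundsOnly(True) \
        .setCustomBoundsStrategy("append")

    detector.transform(document).selectExpr("explode(sentence.result)").show(truncate=False)
    # This is a sentence.
    # This one uses custom bounds;
    # As is this one;
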
python/test/annotator/sentence/pragmatic_test.py

Lines changed: 80 additions & 2 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import textwrap
 import unittest

 import pytest
@@ -62,8 +63,9 @@ def runTest(self):
         lemmatizer = Lemmatizer() \
             .setInputCols(["token"]) \
             .setOutputCol("lemma") \
-            .setDictionary(path="file:///" + os.getcwd() + "/../src/test/resources/lemma-corpus-small/lemmas_small.txt",
-                           key_delimiter="->", value_delimiter="\t")
+            .setDictionary(
+                path="file:///" + os.getcwd() + "/../src/test/resources/lemma-corpus-small/lemmas_small.txt",
+                key_delimiter="->", value_delimiter="\t")
         sentiment_detector = SentimentDetector() \
             .setInputCols(["lemma", "sentence"]) \
             .setOutputCol("sentiment") \
@@ -76,3 +78,79 @@ def runTest(self):
         lemmatized = lemmatizer.fit(tokenized).transform(tokenized)
         sentiment_detector.fit(lemmatized).transform(lemmatized).show()

+
+@pytest.mark.fast
+class PragmaticSBDReturnCustomBoundsTestSpec(unittest.TestCase):
+
+    def create_data(self, data):
+        return SparkContextForTest.spark.createDataFrame([[data]]).toDF("text")
+
+    def runTest(self):
+        def assert_sentence_bounds(sent, sd, expected_sentence):
+            doc_assembler = DocumentAssembler() \
+                .setInputCol("text") \
+                .setOutputCol("document")
+
+            data = self.create_data(sent)
+            doc = doc_assembler.transform(data)
+
+            result = sd.transform(doc).select("sentence.result").first()["result"]
+
+            for sent, exp in zip(result, expected_sentence):
+                assert sent == exp
+
+        example = "This is a sentence. This one uses custom bounds; As is this one;"
+
+        sentence_detector_default = SentenceDetector() \
+            .setInputCols("document") \
+            .setOutputCol("sentence") \
+            .setCustomBounds([r"\.", ";"]) \
+            .setUseCustomBoundsOnly(True)
+
+        expected_default = ["This is a sentence", "This one uses custom bounds",
+                            "As is this one"]
+
+        assert_sentence_bounds(example, sentence_detector_default, expected_default)
+
+        sentence_detector = SentenceDetector() \
+            .setInputCols("document") \
+            .setOutputCol("sentence") \
+            .setCustomBounds([r"\.", ";"]) \
+            .setUseCustomBoundsOnly(True) \
+            .setCustomBoundsStrategy("append")
+
+        sentence_detector_mixed = SentenceDetector() \
+            .setInputCols("document") \
+            .setOutputCol("sentence") \
+            .setCustomBounds([";"]) \
+            .setCustomBoundsStrategy("append")
+
+        expected_append = ["This is a sentence.", "This one uses custom bounds;",
+                           "As is this one;"]
+
+        assert_sentence_bounds(example, sentence_detector, expected_append)
+        assert_sentence_bounds(example, sentence_detector_mixed, expected_append)
+
+        subHeaderList = textwrap.dedent(
+            """
+            1. This is a list
+            1.1 This is a subpoint
+            2. Second thing
+            2.2 Second subthing
+            """
+        )
+
+        sentence_detector_prepend = SentenceDetector() \
+            .setInputCols("document") \
+            .setOutputCol("sentence") \
+            .setCustomBounds([r"\n[\d\. ]+"]) \
+            .setUseCustomBoundsOnly(True) \
+            .setCustomBoundsStrategy("prepend")
+
+        expectedPrepend = [
+            "1. This is a list",
+            "1.1 This is a subpoint",
+            "2. Second thing",
+            "2.2 Second subthing"]
+        assert_sentence_bounds(subHeaderList, sentence_detector_prepend,
+                               expectedPrepend)

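A side note on the append test above (my reading, not part of the commit): sentence_detector and sentence_detector_mixed can share expected_append because in the mixed case only ";" is a custom bound, and the split after "sentence." comes from the default pragmatic rules, which keep sentence-final punctuation regardless of the strategy. Leaving the strategy at its "none" default should make the difference visible; the commented output below is inferred from the documented behavior, not asserted by the suite.

    # Mixed mode, strategy left at "none": matched semicolons are consumed,
    # while the default rules still keep the period.
    detector_mixed_none = SentenceDetector() \
        .setInputCols("document") \
        .setOutputCol("sentence") \
        .setCustomBounds([";"])
    # Inferred output for the example text:
    # ["This is a sentence.", "This one uses custom bounds", "As is this one"]
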
src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/SentenceDetectorParams.scala

Lines changed: 39 additions & 2 deletions
@@ -16,7 +16,7 @@

 package com.johnsnowlabs.nlp.annotators.sbd

-import org.apache.spark.ml.param.{BooleanParam, IntParam, Params, StringArrayParam}
+import org.apache.spark.ml.param.{BooleanParam, IntParam, Param, Params, StringArrayParam}

 import scala.collection.mutable.ArrayBuffer

@@ -93,13 +93,26 @@ trait SentenceDetectorParams extends Params {
   val maxLength =
     new IntParam(this, "maxLength", "Set the maximum allowed length for each sentence")

+  /** How to return matched custom bounds (Default: `none`). Will have no effect if no custom
+    * bounds are used. Possible values are:
+    *
+    *   - "none" - Will not return the matched bound
+    *   - "prepend" - Prepends a sentence break to the match
+    *   - "append" - Appends a sentence break to the match
+    *
+    * @group param
+    */
+  val customBoundsStrategy: Param[String] =
+    new Param[String](this, "customBoundsStrategy", "How to return matched custom bounds")
+
   setDefault(
     useAbbrevations -> true,
     detectLists -> true,
     useCustomBoundsOnly -> false,
     explodeSentences -> false,
     customBounds -> Array.empty[String],
-    minLength -> 0)
+    minLength -> 0,
+    customBoundsStrategy -> "none")

   /** Custom sentence separator text
     * @group setParam
@@ -123,6 +136,30 @@ trait SentenceDetectorParams extends Params {
     */
   def getUseCustomBoundsOnly: Boolean = $(useCustomBoundsOnly)

+  /** Sets how to return matched custom bounds (Default: `none`). Will have no effect if no custom
+    * bounds are used. Possible values are:
+    *
+    *   - "none" - Will not return the matched bound
+    *   - "prepend" - Prepends a sentence break to the match
+    *   - "append" - Appends a sentence break to the match
+    *
+    * @group setParam
+    */
+  def setCustomBoundsStrategy(value: String): this.type = {
+    val possibleValues = Array("none", "prepend", "append")
+    require(
+      possibleValues.contains(value),
+      s"$value is not a valid strategy for custom bounds. " +
+        s"Possible Values: (${possibleValues.mkString(", ")}).")
+
+    set(customBoundsStrategy, value)
+  }
+
+  /** Gets how to return matched custom bounds (Default: `none`).
+    * @group getParam
+    */
+  def getCustomBoundsStrategy: String = $(customBoundsStrategy)
+
   /** Whether to consider abbreviation strategies for better accuracy but slower performance.
     * Defaults to true.
     * @group setParam

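One asymmetry worth flagging: the Scala setter above validates eagerly through require, while the Python setter added in this commit only stores the string. From Python, an invalid value would therefore presumably surface later, when SentenceDetector builds its model on the JVM (see the match expression later in this commit). A sketch of that assumption, reusing the document DataFrame from the first sketch; the exact Py4J-wrapped exception type is not something this diff specifies.

    detector_bad = SentenceDetector() \
        .setInputCols("document") \
        .setOutputCol("sentence") \
        .setCustomBounds([";"]) \
        .setCustomBoundsStrategy("sideways")  # accepted as-is by the Python setter

    try:
        detector_bad.transform(document).collect()  # JVM-side model is built here
    except Exception as err:
        # expected to mention: "sideways is not a valid strategy for custom bounds"
        print(err)
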
src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticMethod.scala

Lines changed: 11 additions & 5 deletions
@@ -19,7 +19,10 @@ package com.johnsnowlabs.nlp.annotators.sbd.pragmatic
 import com.johnsnowlabs.nlp.annotators.common.Sentence
 import com.johnsnowlabs.nlp.util.regex.MatchStrategy.MATCH_ALL
 import com.johnsnowlabs.nlp.util.regex.RuleFactory
-import com.johnsnowlabs.nlp.util.regex.TransformStrategy.REPLACE_ALL_WITH_SYMBOL
+import com.johnsnowlabs.nlp.util.regex.TransformStrategy.{
+  REPLACE_ALL_WITH_SYMBOL,
+  TransformStrategy
+}

 protected trait PragmaticMethod {
   def extractBounds(content: String): Array[Sentence]
@@ -29,12 +32,14 @@ protected trait PragmaticMethod {
  * This approach extracts sentence bounds by first formatting the data with [[RuleSymbols]] and
  * then extracting bounds with a strong RegexBased rule application
  */
-class CustomPragmaticMethod(customBounds: Array[String])
+class CustomPragmaticMethod(
+    customBounds: Array[String],
+    transformStrategy: TransformStrategy = REPLACE_ALL_WITH_SYMBOL)
     extends PragmaticMethod
     with Serializable {
   override def extractBounds(content: String): Array[Sentence] = {

-    val customBoundsFactory = new RuleFactory(MATCH_ALL, REPLACE_ALL_WITH_SYMBOL)
+    val customBoundsFactory = new RuleFactory(MATCH_ALL, transformStrategy)
     customBounds.foreach(bound => customBoundsFactory.addRule(bound.r, s"split bound: $bound"))

     val symbolyzedData = new PragmaticContentFormatter(content)
@@ -74,10 +79,11 @@ class DefaultPragmaticMethod(useAbbreviations: Boolean = false, detectLists: Boo
 class MixedPragmaticMethod(
     useAbbreviations: Boolean = false,
     detectLists: Boolean = true,
-    customBounds: Array[String])
+    customBounds: Array[String],
+    transformStrategy: TransformStrategy = REPLACE_ALL_WITH_SYMBOL)
     extends PragmaticMethod
     with Serializable {
-  val customBoundsFactory = new RuleFactory(MATCH_ALL, REPLACE_ALL_WITH_SYMBOL)
+  val customBoundsFactory = new RuleFactory(MATCH_ALL, transformStrategy)
   customBounds.foreach(bound => customBoundsFactory.addRule(bound.r, s"split bound: $bound"))

   /** this is a hardcoded order of operations considered to go from those most specific

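To make the three TransformStrategy values concrete, here is a plain-Python analogy; it is not Spark NLP's implementation (RuleFactory and RuleSymbols are only mimicked by a visible break marker), just the same substitution idea in miniature.

    import re

    BREAK = "\u0001"  # stand-in for the internal sentence-break symbol

    def apply_strategy(text, bound, strategy):
        if strategy == "none":     # REPLACE_ALL_WITH_SYMBOL: the match is consumed
            return re.sub(bound, BREAK, text)
        if strategy == "prepend":  # PREPEND_WITH_SYMBOL: break goes before the match
            return re.sub(bound, lambda m: BREAK + m.group(0), text)
        if strategy == "append":   # APPEND_WITH_SYMBOL: break goes after the match
            return re.sub(bound, lambda m: m.group(0) + BREAK, text)
        raise ValueError(f"{strategy} is not a valid strategy for custom bounds.")

    text = "This one uses custom bounds; As is this one;"
    print(apply_strategy(text, ";", "none").split(BREAK))
    # ['This one uses custom bounds', ' As is this one', '']
    print(apply_strategy(text, ";", "append").split(BREAK))
    # ['This one uses custom bounds;', ' As is this one;', '']
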
src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/SentenceDetector.scala

Lines changed: 20 additions & 5 deletions
@@ -18,6 +18,7 @@ package com.johnsnowlabs.nlp.annotators.sbd.pragmatic

 import com.johnsnowlabs.nlp.annotators.common.{Sentence, SentenceSplit}
 import com.johnsnowlabs.nlp.annotators.sbd.SentenceDetectorParams
+import com.johnsnowlabs.nlp.util.regex.TransformStrategy
 import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate}
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
 import org.apache.spark.sql.{DataFrame, Dataset}
@@ -137,11 +138,25 @@ class SentenceDetector(override val uid: String)
   override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)

   lazy val model: PragmaticMethod =
-    if ($(customBounds).nonEmpty && $(useCustomBoundsOnly))
-      new CustomPragmaticMethod($(customBounds))
-    else if ($(customBounds).nonEmpty)
-      new MixedPragmaticMethod($(useAbbrevations), $(detectLists), $(customBounds))
-    else
+    if ($(customBounds).nonEmpty) {
+      val transformStrategy = $(customBoundsStrategy) match {
+        case "none" => TransformStrategy.REPLACE_ALL_WITH_SYMBOL
+        case "prepend" => TransformStrategy.PREPEND_WITH_SYMBOL
+        case "append" => TransformStrategy.APPEND_WITH_SYMBOL
+        case _ =>
+          throw new IllegalArgumentException(
+            s"${$(customBoundsStrategy)} is not a valid strategy for custom bounds. " +
+              s"Possible Values: (none, prepend, append).")
+      }
+
+      if ($(useCustomBoundsOnly)) new CustomPragmaticMethod($(customBounds), transformStrategy)
+      else
+        new MixedPragmaticMethod(
+          $(useAbbrevations),
+          $(detectLists),
+          $(customBounds),
+          transformStrategy)
+    } else
       new DefaultPragmaticMethod($(useAbbrevations), $(detectLists))

   def tag(document: String): Array[Sentence] = {

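Summing up the dispatch in the hunk above; the comments restate the new logic, nothing here is asserted beyond the diff.

    # Which PragmaticMethod backs the detector after this commit:
    SentenceDetector()                     # DefaultPragmaticMethod
    SentenceDetector() \
        .setCustomBounds([";"])            # MixedPragmaticMethod (default rules + custom bounds)
    SentenceDetector() \
        .setCustomBounds([";"]) \
        .setUseCustomBoundsOnly(True)      # CustomPragmaticMethod (custom bounds only)
    # In both custom cases, customBoundsStrategy selects the TransformStrategy:
    # "none" -> REPLACE_ALL_WITH_SYMBOL, "prepend" -> PREPEND_WITH_SYMBOL,
    # "append" -> APPEND_WITH_SYMBOL; any other value raises IllegalArgumentException
    # when the lazy model is first materialized.
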