Commit 5ee7cde

update RegexTokenizer default settings
1 parent cdc7c05 commit 5ee7cde

File tree

3 files changed: 44 additions & 46 deletions


mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala

Lines changed: 10 additions & 8 deletions
@@ -26,6 +26,8 @@ import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
 /**
  * :: AlphaComponent ::
  * A tokenizer that converts the input string to lowercase and then splits it by white spaces.
+ *
+ * @see [[RegexTokenizer]]
  */
 @AlphaComponent
 class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[String], Tokenizer] {
@@ -45,9 +47,9 @@ class Tokenizer(override val uid: String) extends UnaryTransformer[String, Seq[S

 /**
  * :: AlphaComponent ::
- * A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default)
- * or using it to split the text (set matching to false). Optional parameters also allow filtering
- * tokens using a minimal length.
+ * A regex based tokenizer that extracts tokens either by using the provided regex pattern to split
+ * the text (default) or repeatedly matching the regex (if `gaps` is false).
+ * Optional parameters also allow filtering tokens using a minimal length.
  * It returns an array of strings that can be empty.
  */
 @AlphaComponent
@@ -71,8 +73,8 @@ class RegexTokenizer(override val uid: String)
   def getMinTokenLength: Int = $(minTokenLength)

   /**
-   * Indicates whether regex splits on gaps (true) or matching tokens (false).
-   * Default: false
+   * Indicates whether regex splits on gaps (true) or matches tokens (false).
+   * Default: true
    * @group param
    */
   val gaps: BooleanParam = new BooleanParam(this, "gaps", "Set regex to match gaps or tokens")
@@ -84,8 +86,8 @@ class RegexTokenizer(override val uid: String)
   def getGaps: Boolean = $(gaps)

   /**
-   * Regex pattern used by tokenizer.
-   * Default: `"\\p{L}+|[^\\p{L}\\s]+"`
+   * Regex pattern used to match delimiters if [[gaps]] is true or tokens if [[gaps]] is false.
+   * Default: `"\\s+"`
    * @group param
    */
   val pattern: Param[String] = new Param(this, "pattern", "regex pattern used for tokenizing")
@@ -96,7 +98,7 @@ class RegexTokenizer(override val uid: String)
   /** @group getParam */
   def getPattern: String = $(pattern)

-  setDefault(minTokenLength -> 1, gaps -> false, pattern -> "\\p{L}+|[^\\p{L}\\s]+")
+  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")

   override protected def createTransformFunc: String => Seq[String] = { str =>
     val re = $(pattern).r
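For readers of this commit, a minimal usage sketch (not part of the change itself) contrasting the new defaults with the old behaviour. The input string, column names, and the pre-existing sqlContext are illustrative assumptions; the setter and transform calls are the same ones exercised in the test suite below.

import org.apache.spark.ml.feature.RegexTokenizer

// Assumes an existing sqlContext, as in TokenizerSuite.
val df = sqlContext.createDataFrame(Seq(Tuple1("Te,st. punct"))).toDF("text")

// New defaults: gaps = true, pattern = "\\s+", i.e. split on runs of whitespace.
val splitter = new RegexTokenizer()
  .setInputCol("text")
  .setOutputCol("words")
splitter.transform(df).select("words").show()  // words = ["Te,st.", "punct"]

// The old behaviour is still available: match tokens instead of splitting on gaps.
val matcher = new RegexTokenizer()
  .setGaps(false)
  .setPattern("\\p{L}+|[^\\p{L}\\s]+")  // the previous default pattern
  .setInputCol("text")
  .setOutputCol("words")
matcher.transform(df).select("words").show()   // words = ["Te", ",", "st", ".", "punct"]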

mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala

Lines changed: 15 additions & 17 deletions
@@ -29,35 +29,34 @@ case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

 class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
   import org.apache.spark.ml.feature.RegexTokenizerSuite._
-
+
   test("RegexTokenizer") {
-    val tokenizer = new RegexTokenizer()
+    val tokenizer0 = new RegexTokenizer()
+      .setGaps(false)
+      .setPattern("\\w+|\\p{Punct}")
       .setInputCol("rawText")
       .setOutputCol("tokens")
-
     val dataset0 = sqlContext.createDataFrame(Seq(
       TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
       TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
     ))
-    testRegexTokenizer(tokenizer, dataset0)
+    testRegexTokenizer(tokenizer0, dataset0)

     val dataset1 = sqlContext.createDataFrame(Seq(
       TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")),
       TokenizerTestData("Te,st. punct", Array("punct"))
     ))
+    tokenizer0.setMinTokenLength(3)
+    testRegexTokenizer(tokenizer0, dataset1)

-    tokenizer.setMinTokenLength(3)
-    testRegexTokenizer(tokenizer, dataset1)
-
-    tokenizer
-      .setPattern("\\s")
-      .setGaps(true)
-      .setMinTokenLength(0)
+    val tokenizer2 = new RegexTokenizer()
+      .setInputCol("rawText")
+      .setOutputCol("tokens")
     val dataset2 = sqlContext.createDataFrame(Seq(
       TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
-      TokenizerTestData("Te,st.  punct", Array("Te,st.", "", "punct"))
+      TokenizerTestData("Te,st.  punct", Array("Te,st.", "punct"))
     ))
-    testRegexTokenizer(tokenizer, dataset2)
+    testRegexTokenizer(tokenizer2, dataset2)
   }
 }

@@ -67,9 +66,8 @@ object RegexTokenizerSuite extends FunSuite {
     t.transform(dataset)
       .select("tokens", "wantedTokens")
       .collect()
-      .foreach {
-        case Row(tokens, wantedTokens) =>
-          assert(tokens === wantedTokens)
-      }
+      .foreach { case Row(tokens, wantedTokens) =>
+        assert(tokens === wantedTokens)
+      }
   }
 }
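A note on the dataset2 change above: the raw text there almost certainly contains two consecutive spaces (consecutive whitespace is collapsed in this rendering), which is why the old expectation included an empty token. The old test split on "\\s" with setMinTokenLength(0), so the empty string between the two spaces survived; the new default pattern "\\s+" consumes the whole whitespace run. A quick sketch of the underlying split behaviour (String.split and scala.util.matching.Regex.split both delegate to java.util.regex.Pattern.split, which is the splitting path RegexTokenizer takes when gaps is true):

"Te,st.  punct".split("\\s")   // Array("Te,st.", "", "punct") -- empty token between the two spaces
"Te,st.  punct".split("\\s+")  // Array("Te,st.", "punct")     -- the whole run is one delimiter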

python/pyspark/ml/feature.py

Lines changed: 19 additions & 21 deletions
@@ -446,23 +446,25 @@ def getDegree(self):
 @ignore_unicode_prefix
 class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
     """
-    A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default)
-    or using it to split the text (set matching to false). Optional parameters also allow filtering
-    tokens using a minimal length.
+    A regex based tokenizer that extracts tokens either by using the
+    provided regex pattern (in Java dialect) to split the text
+    (default) or repeatedly matching the regex (if gaps is false).
+    Optional parameters also allow filtering tokens using a minimal
+    length.
     It returns an array of strings that can be empty.

-    >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])
+    >>> df = sqlContext.createDataFrame([("a b  c",)], ["text"])
     >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
     >>> reTokenizer.transform(df).head()
-    Row(text=u'a b c', words=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', words=[u'a', u'b', u'c'])
     >>> # Change a parameter.
     >>> reTokenizer.setParams(outputCol="tokens").transform(df).head()
-    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', tokens=[u'a', u'b', u'c'])
     >>> # Temporarily modify a parameter.
     >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()
-    Row(text=u'a b c', words=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', words=[u'a', u'b', u'c'])
     >>> reTokenizer.transform(df).head()
-    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', tokens=[u'a', u'b', u'c'])
     >>> # Must use keyword arguments to specify params.
     >>> reTokenizer.setParams("text")
     Traceback (most recent call last):
@@ -472,31 +474,27 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):

     # a placeholder to make it appear in the generated doc
     minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
-    gaps = Param(Params._dummy(), "gaps", "Set regex to match gaps or tokens")
-    pattern = Param(Params._dummy(), "pattern", "regex pattern used for tokenizing")
+    gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
+    pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")

     @keyword_only
-    def __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+",
-                 inputCol=None, outputCol=None):
+    def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
         """
-        __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", \
-                 inputCol=None, outputCol=None)
+        __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
         """
         super(RegexTokenizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
         self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
-        self.gaps = Param(self, "gaps", "Set regex to match gaps or tokens")
-        self.pattern = Param(self, "pattern", "regex pattern used for tokenizing")
-        self._setDefault(minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+")
+        self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens")
+        self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing")
+        self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)

     @keyword_only
-    def setParams(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+",
-                  inputCol=None, outputCol=None):
+    def setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
         """
-        setParams(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", \
-                  inputCol="input", outputCol="output")
+        setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
         Sets params for this RegexTokenizer.
         """
         kwargs = self.setParams._input_kwargs
