@@ -446,23 +446,25 @@ def getDegree(self):
 @ignore_unicode_prefix
 class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
     """
-    A regex based tokenizer that extracts tokens either by repeatedly matching the regex(default)
-    or using it to split the text (set matching to false). Optional parameters also allow filtering
-    tokens using a minimal length.
+    A regex based tokenizer that extracts tokens either by using the
+    provided regex pattern (in Java dialect) to split the text
+    (default) or repeatedly matching the regex (if gaps is false).
+    Optional parameters also allow filtering tokens using a minimal
+    length.
     It returns an array of strings that can be empty.
 
-    >>> df = sqlContext.createDataFrame([("a b c",)], ["text"])
+    >>> df = sqlContext.createDataFrame([("a b  c",)], ["text"])
     >>> reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
     >>> reTokenizer.transform(df).head()
-    Row(text=u'a b c', words=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', words=[u'a', u'b', u'c'])
     >>> # Change a parameter.
     >>> reTokenizer.setParams(outputCol="tokens").transform(df).head()
-    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', tokens=[u'a', u'b', u'c'])
     >>> # Temporarily modify a parameter.
     >>> reTokenizer.transform(df, {reTokenizer.outputCol: "words"}).head()
-    Row(text=u'a b c', words=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', words=[u'a', u'b', u'c'])
     >>> reTokenizer.transform(df).head()
-    Row(text=u'a b c', tokens=[u'a', u'b', u'c'])
+    Row(text=u'a b  c', tokens=[u'a', u'b', u'c'])
     >>> # Must use keyword arguments to specify params.
     >>> reTokenizer.setParams("text")
     Traceback (most recent call last):
@@ -472,31 +474,27 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
 
     # a placeholder to make it appear in the generated doc
     minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)")
-    gaps = Param(Params._dummy(), "gaps", "Set regex to match gaps or tokens")
-    pattern = Param(Params._dummy(), "pattern", "regex pattern used for tokenizing")
+    gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
+    pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing")
 
     @keyword_only
-    def __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+",
-                 inputCol=None, outputCol=None):
+    def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
         """
-        __init__(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", \
-                 inputCol=None, outputCol=None)
+        __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
         """
         super(RegexTokenizer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.RegexTokenizer", self.uid)
         self.minTokenLength = Param(self, "minTokenLength", "minimum token length (>= 0)")
-        self.gaps = Param(self, "gaps", "Set regex to match gaps or tokens")
-        self.pattern = Param(self, "pattern", "regex pattern used for tokenizing")
-        self._setDefault(minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+")
+        self.gaps = Param(self, "gaps", "whether regex splits on gaps (True) or matches tokens")
+        self.pattern = Param(self, "pattern", "regex pattern (Java dialect) used for tokenizing")
+        self._setDefault(minTokenLength=1, gaps=True, pattern="\\s+")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
-    def setParams(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+",
-                  inputCol=None, outputCol=None):
+    def setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None):
         """
-        setParams(self, minTokenLength=1, gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+", \
-                  inputCol="input", outputCol="output")
+        setParams(self, minTokenLength=1, gaps=True, pattern="\\s+", inputCol=None, outputCol=None)
         Sets params for this RegexTokenizer.
         """
         kwargs = self.setParams._input_kwargs
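
For reference, a minimal usage sketch (not part of the patch) of what the new defaults mean in practice. It assumes a live sqlContext, as in the doctests above; the tokenizer variable names are illustrative only. With the new defaults (gaps=True, pattern="\\s+") the regex is used to split the text on runs of whitespace; with gaps=False the pattern matches the tokens themselves, and minTokenLength can filter out short tokens.

    from pyspark.ml.feature import RegexTokenizer

    # Assumes `sqlContext` is already available, as in the doctests above.
    df = sqlContext.createDataFrame([("a b  c",)], ["text"])

    # New defaults (gaps=True, pattern="\\s+"): split on runs of whitespace.
    split_tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
    print(split_tokenizer.transform(df).head())
    # expected: Row(text=u'a b  c', words=[u'a', u'b', u'c'])

    # Former behavior (gaps=False): the pattern matches the tokens themselves.
    match_tokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                                     gaps=False, pattern="\\p{L}+|[^\\p{L}\\s]+")
    print(match_tokenizer.transform(df).head())
    # expected: Row(text=u'a b  c', words=[u'a', u'b', u'c'])

    # minTokenLength filters tokens, so the output array can be empty.
    filtered_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", minTokenLength=2)
    print(filtered_tokenizer.transform(df).head())
    # expected: Row(text=u'a b  c', words=[])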