apache · hujy · Jul 22, 2015 · Jul 23, 2015 · Jul 23, 2015 · Jul 23, 2015
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
@@ -553,6 +553,18 @@ def length(col):
     return Column(sc._jvm.functions.length(_to_java_column(col)))
 
 
+@ignore_unicode_prefix
+@since(1.5)
+def initcap(col):
+    """Translate the first letter of each word to upper case in the sentence.
+
+    >>> sqlContext.createDataFrame([('a b',)], ['a']).select(initcap('a b').alias('v')).collect()
+    [Row(v='A B')]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(sc._jvm.functions.initcap(_to_java_column(col)))
+
+
 @ignore_unicode_prefix
 @since(1.5)
 def format_number(col, d):

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -158,6 +158,7 @@ object FunctionRegistry {
     expression[Encode]("encode"),
     expression[Decode]("decode"),
     expression[FormatNumber]("format_number"),
+    expression[InitCap]("initcap"),
     expression[Lower]("lcase"),
     expression[Lower]("lower"),
     expression[Length]("length"),

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -591,6 +591,54 @@ case class FormatString(children: Expression*) extends Expression with ImplicitC
   override def prettyName: String = "format_string"
 }
 
+/**
+ * Returns string, with the first letter of each word in uppercase,
+ * all other letters in lowercase. Words are delimited by whitespace.
+ * Note: only the space character (' ') is currently recognised as a delimiter;
+ * the interpreted and generated code paths below must be kept in sync.
+ */
+case class InitCap(child: Expression) extends UnaryExpression
+  with ImplicitCastInputTypes {
+  override def dataType: DataType = StringType
+
+  override def inputTypes: Seq[DataType] = Seq(StringType)
+
+  override def nullSafeEval(string: Any): Any = {
+    // Title-case a char when it starts the string or follows a space; lower-case the rest.
+    // Character.toTitleCase is used so the result matches the generated code below.
+    val sb = new java.lang.StringBuilder(string.toString)
+    for (i <- 0 until sb.length) {
+      val c = sb.charAt(i)
+      val startsWord = i == 0 || sb.charAt(i - 1) == ' '
+      sb.setCharAt(i, if (startsWord) Character.toTitleCase(c) else Character.toLowerCase(c))
+    }
+    UTF8String.fromString(sb.toString)
+  }
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    nullSafeCodeGen(ctx, ev, (child) => {
+      val idx = ctx.freshName("idx")
+      val sb = ctx.freshName("sb")
+      val ch = ctx.freshName("ch")
+      val sbClass = classOf[java.lang.StringBuilder].getName
+      val character = classOf[Character].getName
+      // ev.primitive is assigned on every path, including for the empty string.
+      s"""
+        $sbClass $sb = new $sbClass($child.toString());
+        for (int $idx = 0; $idx < $sb.length(); $idx++) {
+          char $ch = $sb.charAt($idx);
+          if ($idx == 0 || $sb.charAt($idx - 1) == ' ') {
+            $sb.setCharAt($idx, $character.toTitleCase($ch));
+          } else {
+            $sb.setCharAt($idx, $character.toLowerCase($ch));
+          }
+        }
+        ${ev.primitive} = UTF8String.fromString($sb.toString());
+       """
+    })
+  }
+}
+
 /**
  * Returns the string which repeat the given string value n times.
  */

diff --git a/...yst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/...yst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -317,6 +317,18 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkEvaluation(Decode(b, Literal.create(null, StringType)), null, create_row(null))
   }
 
+  test("initcap unit test") {
+    checkEvaluation(InitCap(Literal.create(null, StringType)), null)
+    checkEvaluation(InitCap(Literal("a b")), "A B")
+    checkEvaluation(InitCap(Literal(" a")), " A")
+    checkEvaluation(InitCap(Literal("the test")), "The Test")
+    // scalastyle:off
+    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
+    checkEvaluation(InitCap(Literal("世界")), "世界")
+    // scalastyle:on
+  }
+
+
   test("Levenshtein distance") {
     checkEvaluation(Levenshtein(Literal.create(null, StringType), Literal("")), null)
     checkEvaluation(Levenshtein(Literal(""), Literal.create(null, StringType)), null)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -1752,6 +1752,24 @@ object functions {
     FormatString((lit(format) +: arguments).map(_.expr): _*)
   }
 
+  /**
+   * Returns a new string column in which the first letter of each word is converted
+   * to upper case. Words are delimited by a space, as implemented by the InitCap expression.
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def initcap(e: Column): Column = InitCap(e.expr)
+
+  /**
+   * Variant of `initcap` that takes a column name: resolves `columnName` to a Column and
+   * delegates to the Column overload (first letter of each space-delimited word upper-cased).
+   *
+   * @group string_funcs
+   * @since 1.5.0
+   */
+  def initcap(columnName: String): Column = initcap(Column(columnName))
+
   /**
    * Locate the position of the first occurrence of substr column in the given string.
    * Returns null if either of the arguments are null.

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -231,6 +231,15 @@ class StringFunctionsSuite extends QueryTest {
     }
   }
 
+  test("initcap function") {
+    val words = Seq(("ab", "a B")).toDF("l", "r")
+    val expected = Row("Ab", "A B")
+    // DataFrame API: exercises both overloads (Column and column name).
+    checkAnswer(words.select(initcap($"l"), initcap("r")), expected)
+    // Expression-string path: the function is looked up by name.
+    checkAnswer(words.selectExpr("InitCap(l)", "InitCap(r)"), expected)
+  }
+
   test("number format function") {
     val tuple =
       ("aa", 1.asInstanceOf[Byte], 2.asInstanceOf[Short],