
Commit a7af520

BryanCutler authored and hagerf committed
[SPARK-29367][DOC] Add compatibility note for Arrow 0.15.0 to SQL guide
Add documentation to the SQL programming guide on using PyArrow >= 0.15.0 with current versions of Spark. Arrow 0.15.0 introduced a change in the binary IPC format which requires an environment variable to maintain compatibility.

No user-facing change. Tested by running pandas_udf tests using PyArrow 0.15.0 with the environment variable set.

Closes apache#26045 from BryanCutler/arrow-document-legacy-IPC-fix-SPARK-29367.

Authored-by: Bryan Cutler <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
1 parent 2b3c379 · commit a7af520

File tree: 5 files changed, +140 −2 lines


docs/sql-pyspark-pandas-with-arrow.md

Lines changed: 17 additions & 0 deletions
````diff
@@ -219,3 +219,20 @@ Note that a standard UDF (non-Pandas) will load timestamp data as Python datetime
 different than a Pandas timestamp. It is recommended to use Pandas time series functionality when
 working with timestamps in `pandas_udf`s to get the best performance, see
 [here](https://pandas.pydata.org/pandas-docs/stable/timeseries.html) for details.
+
+### Compatibility Setting for PyArrow >= 0.15.0 and Spark 2.3.x, 2.4.x
+
+Since Arrow 0.15.0, a change in the binary IPC format requires an environment variable to be
+compatible with previous versions of Arrow <= 0.14.1. This is only necessary for PySpark
+users on versions 2.3.x and 2.4.x who have manually upgraded PyArrow to 0.15.0. The following
+can be added to `conf/spark-env.sh` to use the legacy Arrow IPC format:
+
+```
+ARROW_PRE_0_15_IPC_FORMAT=1
+```
+
+This will instruct PyArrow >= 0.15.0 to use the legacy IPC format with the older Arrow Java that
+is in Spark 2.3.x and 2.4.x. Not setting this environment variable will lead to an error like the
+one described in [SPARK-29367](https://issues.apache.org/jira/browse/SPARK-29367) when running
+`pandas_udf`s or `toPandas()` with Arrow enabled. More information about the Arrow IPC change can
+be found in the Arrow 0.15.0 release [blog](http://arrow.apache.org/blog/2019/10/06/0.15.0-release/#columnar-streaming-protocol-change-since-0140).
````
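As a practical aside (not part of this diff), the variable can also be set programmatically when editing `conf/spark-env.sh` is not an option. Below is a minimal PySpark sketch under the assumption that the variable is exported before the session, and any Arrow IPC, is created; `spark.executorEnv.*` forwards it to the Python worker processes on executors:

```python
import os

# Must be set in the driver process before any Arrow IPC takes place.
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    # Forward the variable to the Python worker processes on executors.
    .config("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
    # Arrow-based transfer is what triggers the format mismatch.
    .config("spark.sql.execution.arrow.enabled", "true")
    .getOrCreate()
)

# With the legacy IPC format forced, toPandas() interoperates with the
# older Arrow Java shipped in Spark 2.3.x / 2.4.x.
pdf = spark.range(10).toPandas()
```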

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala

Lines changed: 1 addition & 0 deletions
```diff
@@ -446,6 +446,7 @@ object FunctionRegistry {
     expression[Shuffle]("shuffle"),
     expression[ArrayMin]("array_min"),
     expression[ArrayMax]("array_max"),
+    expression[ArrayMedian]("array_median"),
     expression[Reverse]("reverse"),
     expression[Concat]("concat"),
     expression[Flatten]("flatten"),
```
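Registering the expression in `FunctionRegistry` binds the SQL name `array_median` to the new `ArrayMedian` expression, making it callable from SQL in any language binding, not just the Scala DSL. A quick smoke test from PySpark, assuming a build that includes this commit:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# NULL elements are skipped, so this computes the median of [1, 2, 3].
spark.sql("SELECT array_median(array(1, 2, NULL, 3))").show()
# Expected single value: 2.0
```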

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala

Lines changed: 83 additions & 2 deletions
```diff
@@ -18,10 +18,8 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.time.ZoneId
 import java.util.Comparator
-
 import scala.collection.mutable
 import scala.reflect.ClassTag
-
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion}
 import org.apache.spark.sql.catalyst.expressions.ArraySortLike.NullOrder
@@ -37,6 +35,7 @@ import org.apache.spark.unsafe.array.ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH
 import org.apache.spark.unsafe.types.{ByteArray, UTF8String}
 import org.apache.spark.unsafe.types.CalendarInterval
 import org.apache.spark.util.collection.OpenHashSet
+import scala.reflect.runtime.universe
 
 /**
  * Base trait for [[BinaryExpression]]s with two arrays of the same element type and implicit
@@ -900,6 +899,88 @@ case class SortArray(base: Expression, ascendingOrder: Expression)
   override def prettyName: String = "sort_array"
 }
 
+/**
+ * Returns the median value as double of an array of numeric values.
+ */
+@ExpressionDescription(
+  usage = """
+    _FUNC_(array) - Returns the median value in the array; only accepts arrays with numeric values.
+      NULL elements are skipped, and NULL is returned if the array is empty.""",
+  examples = """
+    Examples:
+      > SELECT _FUNC_(array(1, 2, null, 3));
+       2
+  """,
+  since = "3.0.0")
+case class ArrayMedian(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
+
+  override def checkInputDataTypes(): TypeCheckResult = child.dataType match {
+    case ArrayType(dt, _) => dt match {
+      case _: NumericType => TypeCheckResult.TypeCheckSuccess
+      case _ => TypeCheckResult.TypeCheckFailure(
+        s"$prettyName does not support arrays of type ${dt.catalogString}, which is not numeric.")
+    }
+    case _ =>
+      TypeCheckResult.TypeCheckFailure(s"$prettyName only supports array input.")
+  }
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType)
+
+  private def containsNulls: Boolean = child.dataType.asInstanceOf[ArrayType].containsNull
+
+  // Generates Java that copies the input ArrayData into a primitive array named `array`,
+  // filtering out nulls first when the element type is nullable.
+  private def assignArrayCodeGen(array: String, ctx: CodegenContext, c: String): String = {
+    val javaType = CodeGenerator.javaType(arrayType)
+    val primitiveTypeName = CodeGenerator.primitiveTypeName(arrayType)
+
+    if (containsNulls) {
+      val numElements = ctx.freshName("numElements")
+      val tempArray = ctx.freshName("tempArray")
+      val count = ctx.freshName("count")
+      val i = ctx.freshName("i")
+
+      s"""
+         |int $numElements = $c.numElements();
+         |$javaType[] $tempArray = new $javaType[$numElements];
+         |int $count = -1;
+         |for (int $i = 0; $i < $numElements; $i++) {
+         |  if (!$c.isNullAt($i)) {
+         |    $tempArray[++$count] = $c.get$primitiveTypeName($i);
+         |  }
+         |}
+         |$javaType[] $array = java.util.Arrays.copyOf($tempArray, $count + 1);
+       """.stripMargin
+    } else {
+      s"""
+         |$javaType[] $array = $c.to${primitiveTypeName}Array();
+       """.stripMargin
+    }
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    val size = ctx.freshName("size")
+    val array = ctx.freshName("array")
+
+    nullSafeCodeGen(ctx, ev, c =>
+      s"""
+         |${assignArrayCodeGen(array, ctx, c)}
+         |java.util.Arrays.sort($array);
+         |final int $size = $array.length;
+         |if ($size == 0) {
+         |  ${ev.isNull} = true;
+         |} else if ($size % 2 == 0) {
+         |  ${ev.value} = ($array[$size / 2] + $array[$size / 2 - 1]) / 2d;
+         |} else {
+         |  ${ev.value} = $array[$size / 2] / 1d;
+         |}
+       """.stripMargin)
+  }
+
+  @transient override val dataType: DataType = DoubleType
+
+  private val arrayType: DataType = child.dataType.asInstanceOf[ArrayType].elementType
+
+  override def prettyName: String = "array_median"
+}
+
 
 /**
  * Sorts the input array in ascending order according to the natural ordering of
```
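Stripped of the codegen plumbing, the generated Java does three things: drop NULLs, sort, then return the middle element, or the mean of the two middle elements, always as a double. A plain-Python rendering of those semantics for reference; `array_median` below is an illustrative helper, not a Spark API:

```python
from typing import Optional, Sequence

def array_median(values: Sequence[Optional[float]]) -> Optional[float]:
    """Reference semantics of the ArrayMedian codegen: skip NULLs, sort,
    take the middle (or the mean of the two middles) as a double;
    an empty input yields NULL."""
    xs = sorted(v for v in values if v is not None)
    n = len(xs)
    if n == 0:
        return None                                 # ev.isNull = true
    if n % 2 == 0:
        return (xs[n // 2] + xs[n // 2 - 1]) / 2.0  # "/ 2d" in the generated code
    return xs[n // 2] / 1.0                         # "/ 1d" forces a double

assert array_median([1, 2, None, 3]) == 2.0
assert array_median([6.0, 2.0, 3.0, 5.0, 4.0, 1.0]) == 3.5
assert array_median([]) is None
```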

sql/core/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 9 additions & 0 deletions
```diff
@@ -3902,6 +3902,15 @@ object functions {
    */
   def array_min(e: Column): Column = withExpr { ArrayMin(e.expr) }
 
+  /**
+   * Returns the median value in the array as a double. The array must contain numeric
+   * values; NULL elements are skipped. Returns null for empty arrays.
+   *
+   * @group collection_funcs
+   * @since 3.0.0
+   */
+  def array_median(e: Column): Column = withExpr { ArrayMedian(e.expr) }
+
   /**
    * Returns the maximum value in the array.
    *
```
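This commit adds no PySpark wrapper, but because the SQL function is registered, `expr()` reaches it from Python as well; a usage sketch with an illustrative DataFrame:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [([1.0, 3.0, 2.0],), ([1.2, -100.0, 2.5, None],)], ["a"])

# Equivalent to the Scala DSL call array_median($"a").
df.select(expr("array_median(a)").alias("median")).show()
# Expected: 2.0 for the first row, 1.2 for the second (NULL skipped)
```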

sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala

Lines changed: 30 additions & 0 deletions
```diff
@@ -884,6 +884,36 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
     checkAnswer(df.selectExpr("array_min(a)"), answer)
   }
 
+  test("array_median function") {
+    val doubles = Seq(
+      Seq(1.0, 3.0, 2.0).map(Option.apply),
+      Seq(Some(1.2), Some(-100.0), Some(2.5), Option.empty[Double]),
+      Seq(6.0, 2.0, 3.0, 5.0, 4.0, 1.0).map(Option.apply),
+      Seq.empty[Option[Double]]
+    ).toDF("a")
+
+    val answerDoubles = Seq(Row(2.0), Row(1.2), Row(3.5), Row(null))
+
+    val ints = Seq(
+      Seq(1, 3, 2),
+      Seq(1, -100, 2)
+    ).toDF("a")
+
+    val longs = Seq(
+      Seq(1L, 3L, 2L),
+      Seq(1L, -100L, 2L)
+    ).toDF("a")
+
+    val answerLongAndInt = Seq(Row(2.0), Row(1.0))
+
+    checkAnswer(doubles.select(array_median(doubles("a"))), answerDoubles)
+    checkAnswer(doubles.selectExpr("array_median(a)"), answerDoubles)
+    checkAnswer(ints.select(array_median(ints("a"))), answerLongAndInt)
+    checkAnswer(ints.selectExpr("array_median(a)"), answerLongAndInt)
+    checkAnswer(longs.select(array_median(longs("a"))), answerLongAndInt)
+    checkAnswer(longs.selectExpr("array_median(a)"), answerLongAndInt)
+  }
+
   test("array_max function") {
     val df = Seq(
       Seq[Option[Int]](Some(1), Some(3), Some(2)),
```
