Skip to content

Commit 47f405c

Browse files
committed
initial commit
1 parent 4ba9c6c commit 47f405c

File tree

4 files changed

+50
-7
lines changed

4 files changed

+50
-7
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ trait CodegenSupport extends SparkPlan {
7777
*/
7878
final def produce(ctx: CodegenContext, parent: CodegenSupport): String = executeQuery {
7979
this.parent = parent
80+
81+
// to track the existence of apply() call in the current produce-consume cycle
82+
// if apply is not called (e.g. in aggregation), we can skip shouldStop in the inner-most loop
83+
parent.shouldStopRequired = false
8084
ctx.freshNamePrefix = variablePrefix
8185
s"""
8286
|${ctx.registerComment(s"PRODUCE: ${this.simpleString}")}
@@ -206,6 +210,15 @@ trait CodegenSupport extends SparkPlan {
206210
def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = {
207211
throw new UnsupportedOperationException
208212
}
213+
214+
/* for optimization */
215+
var shouldStopRequired: Boolean = false
216+
217+
def isShouldStopRequired: Boolean = {
218+
if (shouldStopRequired) return true
219+
if (this.parent != null) return this.parent.isShouldStopRequired
220+
false
221+
}
209222
}
210223

211224

@@ -418,6 +431,7 @@ case class WholeStageCodegenExec(child: SparkPlan) extends UnaryExecNode with Co
418431
} else {
419432
""
420433
}
434+
shouldStopRequired = true
421435
s"""
422436
|${row.code}
423437
|append(${row.value}$doCopy);

sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -387,8 +387,8 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
387387
// How many values should be generated in the next batch.
388388
val nextBatchTodo = ctx.freshName("nextBatchTodo")
389389

390-
// The default size of a batch.
391-
val batchSize = 1000L
390+
// The default size of a batch, which must be a positive integer
391+
val batchSize = 1000
392392

393393
ctx.addNewFunction("initRange",
394394
s"""
@@ -434,6 +434,17 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
434434
val input = ctx.freshName("input")
435435
// Right now, Range is only used when there is one upstream.
436436
ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];")
437+
438+
val localIdx = ctx.freshName("localIdx")
439+
val localEnd = ctx.freshName("localEnd")
440+
val range = ctx.freshName("range")
441+
// we need to place consume() before calling isShouldStopRequired
442+
val body = consume(ctx, Seq(ev))
443+
val shouldStop = if (isShouldStopRequired) {
444+
s"if (shouldStop()) { $number = $value + ${step}L; return; }"
445+
} else {
446+
"// shouldStop check is eliminated"
447+
}
437448
s"""
438449
| // initialize Range
439450
| if (!$initTerm) {
@@ -442,11 +453,15 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
442453
| }
443454
|
444455
| while (true) {
445-
| while ($number != $batchEnd) {
446-
| long $value = $number;
447-
| $number += ${step}L;
448-
| ${consume(ctx, Seq(ev))}
449-
| if (shouldStop()) return;
456+
| long $range = $batchEnd - $number;
457+
| if ($range != 0L) {
458+
| int $localEnd = (int)($range / ${step}L);
459+
| for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) {
460+
| long $value = ((long)$localIdx * ${step}L) + $number;
461+
| $body
462+
| $shouldStop
463+
| }
464+
| $number = $batchEnd;
450465
| }
451466
|
452467
| if ($taskContext.isInterrupted()) {

sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ trait BaseLimitExec extends UnaryExecNode with CodegenSupport {
6969
override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = {
7070
val stopEarly = ctx.freshName("stopEarly")
7171
ctx.addMutableState("boolean", stopEarly, s"$stopEarly = false;")
72+
shouldStopRequired = true // the loop may break early even without an append in the loop body
7273

7374
ctx.addNewFunction("stopEarly", s"""
7475
@Override

sql/core/src/test/scala/org/apache/spark/sql/DataFrameRangeSuite.scala

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,19 @@ class DataFrameRangeSuite extends QueryTest with SharedSQLContext with Eventuall
8989
val n = 9L * 1000 * 1000 * 1000 * 1000 * 1000 * 1000
9090
val res13 = spark.range(-n, n, n / 9).select("id")
9191
assert(res13.count == 18)
92+
93+
// range with a non-aggregation operation
94+
val res14 = spark.range(0, 100, 2).toDF.filter("50 <= id")
95+
res14.collect
96+
assert(res14.count == 25)
97+
98+
val res15 = spark.range(100, -100, -2).toDF.filter("id <= 0")
99+
res15.collect
100+
assert(res15.count == 50)
101+
102+
val res16 = spark.range(-1500, 1500, 3).toDF.filter("0 <= id")
103+
res16.collect
104+
assert(res16.count == 500)
92105
}
93106

94107
test("Range with randomized parameters") {

0 commit comments

Comments
 (0)