@@ -22,14 +22,13 @@ import java.util.concurrent.RejectedExecutionException
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.execution.{CollectLimitExec, SQLExecution, TakeOrderedAndProjectExec}
+import org.apache.spark.sql.execution.{CollectLimitExec, SQLExecution}
+import org.apache.spark.sql.execution.arrow.{ArrowCollectLimitExec, KyuubiArrowUtils}
 import org.apache.spark.sql.kyuubi.SparkDatasetHelper
 import org.apache.spark.sql.types._
-import org.apache.kyuubi.{KyuubiSQLException, Logging}
-import org.apache.spark.sql.execution.arrow.{ArrowCollectLimitExec, KyuubiArrowUtils}
 
+import org.apache.kyuubi.{KyuubiSQLException, Logging}
 import org.apache.kyuubi.config.KyuubiConf.OPERATION_RESULT_MAX_ROWS
 import org.apache.kyuubi.engine.spark.KyuubiSparkUtil._
 import org.apache.kyuubi.operation.{ArrayFetchIterator, FetchIterator, IterableFetchIterator, OperationHandle, OperationState}
@@ -189,73 +188,70 @@ class ArrowBasedExecuteStatement(
     handle) {
 
   override protected def incrementalCollectResult(resultDF: DataFrame): Iterator[Any] = {
-    collectAsArrow(convertComplexType(resultDF)) { rdd =>
-      rdd.toLocalIterator
+    val df = convertComplexType(resultDF)
+    withNewExecutionId(df) {
+      SparkDatasetHelper.toArrowBatchRdd(df).toLocalIterator
     }
   }
 
   override protected def fullCollectResult(resultDF: DataFrame): Array[_] = {
-    collectAsArrow(convertComplexType(resultDF)) { rdd =>
-      rdd.collect()
-    }
+    executeCollect(convertComplexType(resultDF))
   }
 
   override protected def takeResult(resultDF: DataFrame, maxRows: Int): Array[_] = {
-    // this will introduce shuffle and hurt performance
-    val limitedResult = resultDF.limit(maxRows)
-    // collectAsArrow(convertComplexType(limitedResult)) { rdd =>
-    //   rdd.collect()
-    // }
-    val df = convertComplexType(limitedResult)
-    SQLExecution.withNewExecutionId(df.queryExecution, Some("collectAsArrow")) {
-      df.queryExecution.executedPlan.resetMetrics()
-      df.queryExecution.executedPlan match {
-        case collectLimit @ CollectLimitExec(limit, _) =>
-          val timeZoneId = spark.sessionState.conf.sessionLocalTimeZone
-          val batches = ArrowCollectLimitExec.takeAsArrowBatches(collectLimit, df.schema, 1000, 1024 * 1024, timeZoneId)
-          //   .map(_._1)
-          val result = ArrayBuffer[Array[Byte]]()
-          var i = 0
-          var rest = limit
-          println(s"batch....size...${batches.length}")
-          while (i < batches.length && rest > 0) {
-            val (batch, size) = batches(i)
-            if (size < rest) {
-              result += batch
-              // TODO: toInt
-              rest = rest - size.toInt
-            } else if (size == rest) {
-              result += batch
-              rest = 0
-            } else { // size > rest
-              println(s"size......${size}....rest......${rest}")
-              // result += KyuubiArrowUtils.slice(batch, 0, rest)
-              result += KyuubiArrowUtils.sliceV2(df.schema, timeZoneId, batch, 0, rest)
-              rest = 0
-            }
-            i += 1
-          }
-          result.toArray
-
-        case takeOrderedAndProjectExec @ TakeOrderedAndProjectExec(limit, _, _, _) =>
-          val timeZoneId = spark.sessionState.conf.sessionLocalTimeZone
-          ArrowCollectLimitExec.taskOrdered(takeOrderedAndProjectExec, df.schema, 1000, 1024 * 1024, timeZoneId)
-            .map(_._1)
-        case _ =>
-          println("yyyy")
-          SparkDatasetHelper.toArrowBatchRdd(df).collect()
-      }
-    }
+    executeCollect(convertComplexType(resultDF.limit(maxRows)))
   }
 
   /**
    * refer to org.apache.spark.sql.Dataset#withAction(), assign a new execution id for arrow-based
    * operation, so that we can track the arrow-based queries on the UI tab.
    */
-  private def collectAsArrow[T](df: DataFrame)(action: RDD[Array[Byte]] => T): T = {
+  private def withNewExecutionId[T](df: DataFrame)(body: => T): T = {
     SQLExecution.withNewExecutionId(df.queryExecution, Some("collectAsArrow")) {
       df.queryExecution.executedPlan.resetMetrics()
-      action(SparkDatasetHelper.toArrowBatchRdd(df))
+      body
+    }
+  }
+
+  def executeCollect(df: DataFrame): Array[Array[Byte]] = withNewExecutionId(df) {
+    executeArrowBatchCollect(df).getOrElse {
+      SparkDatasetHelper.toArrowBatchRdd(df).collect()
+    }
+  }
+
+  private def executeArrowBatchCollect(df: DataFrame): Option[Array[Array[Byte]]] = {
+    df.queryExecution.executedPlan match {
+      case collectLimit @ CollectLimitExec(limit, _) =>
+        val timeZoneId = spark.sessionState.conf.sessionLocalTimeZone
+        val maxRecordsPerBatch = spark.conf.getOption(
+          "spark.sql.execution.arrow.maxRecordsPerBatch").map(_.toInt).getOrElse(10000)
+        // val maxBatchSize =
+        //   (spark.sessionState.conf.getConf(SPARK_CONNECT_GRPC_ARROW_MAX_BATCH_SIZE) * 0.7).toLong
+        val maxBatchSize = 1024 * 1024 * 4
+        val batches = ArrowCollectLimitExec.takeAsArrowBatches(
+          collectLimit,
+          df.schema,
+          maxRecordsPerBatch,
+          maxBatchSize,
+          timeZoneId)
+        val result = ArrayBuffer[Array[Byte]]()
+        var i = 0
+        var rest = limit
+        while (i < batches.length && rest > 0) {
+          val (batch, size) = batches(i)
+          if (size <= rest) {
+            result += batch
+            // the returned ArrowRecordBatch has no more than `limit` rows, so the toInt conversion is safe
+            rest -= size.toInt
+          } else { // size > rest
+            result += KyuubiArrowUtils.sliceV2(df.schema, timeZoneId, batch, 0, rest)
+            rest = 0
+          }
+          i += 1
+        }
+        Option(result.toArray)
+      case _ =>
+        None
     }
   }
 
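
For reviewers, here is a self-contained sketch of the limit-trimming loop that `executeArrowBatchCollect` introduces above. The `trimToLimit` helper, the toy batches, and the byte-level `slice` stand-in are illustrative assumptions only; in the patch the `(batch, rowCount)` pairs come from `ArrowCollectLimitExec.takeAsArrowBatches` and the real slicing is done by `KyuubiArrowUtils.sliceV2`.

```scala
// Minimal sketch of the limit-trimming loop (illustrative, not part of the patch).
// `batches` stands in for the (serialized batch, row count) pairs returned by
// takeAsArrowBatches; `slice` stands in for sliceV2, which keeps the first `n` rows.
object LimitTrimSketch {

  def trimToLimit(
      batches: Seq[(Array[Byte], Long)],
      limit: Int,
      slice: (Array[Byte], Int) => Array[Byte]): Array[Array[Byte]] = {
    val result = scala.collection.mutable.ArrayBuffer[Array[Byte]]()
    var i = 0
    var rest = limit
    while (i < batches.length && rest > 0) {
      val (batch, size) = batches(i)
      if (size <= rest) {
        // the whole batch fits within the remaining row budget
        result += batch
        rest -= size.toInt
      } else {
        // only the first `rest` rows of this batch are needed
        result += slice(batch, rest)
        rest = 0
      }
      i += 1
    }
    result.toArray
  }

  def main(args: Array[String]): Unit = {
    // toy "batches": the payload bytes don't matter here, only the row counts do
    val batches = Seq(
      (Array[Byte](1, 1, 1), 3L),
      (Array[Byte](2, 2, 2, 2), 4L),
      (Array[Byte](3, 3), 2L))
    // pretend slice: keep one byte per kept row, just to make the truncation visible
    val trimmed = trimToLimit(batches, limit = 5, slice = (b, n) => b.take(n))
    // first batch kept whole (3 rows), second batch sliced to 2 rows, third dropped
    println(trimmed.map(_.length).mkString(", ")) // prints: 3, 2
  }
}
```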