
Commit c8cfaa2: "merge with master" (2 parents: 46df1bb + f6471dc)

File tree: 49 files changed, +745 −211 lines


core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala

Lines changed: 10 additions & 2 deletions

@@ -19,6 +19,7 @@ package org.apache.spark.scheduler
 
 import java.io.{DataInputStream, DataOutputStream}
 import java.nio.ByteBuffer
+import java.nio.charset.StandardCharsets
 import java.util.Properties
 
 import scala.collection.JavaConverters._
@@ -86,7 +87,10 @@ private[spark] object TaskDescription {
     dataOut.writeInt(taskDescription.properties.size())
     taskDescription.properties.asScala.foreach { case (key, value) =>
       dataOut.writeUTF(key)
-      dataOut.writeUTF(value)
+      // SPARK-19796 -- writeUTF doesn't work for long strings, which can happen for property values
+      val bytes = value.getBytes(StandardCharsets.UTF_8)
+      dataOut.writeInt(bytes.length)
+      dataOut.write(bytes)
     }
 
     // Write the task. The task is already serialized, so write it directly to the byte buffer.
@@ -124,7 +128,11 @@ private[spark] object TaskDescription {
     val properties = new Properties()
     val numProperties = dataIn.readInt()
     for (i <- 0 until numProperties) {
-      properties.setProperty(dataIn.readUTF(), dataIn.readUTF())
+      val key = dataIn.readUTF()
+      val valueLength = dataIn.readInt()
+      val valueBytes = new Array[Byte](valueLength)
+      dataIn.readFully(valueBytes)
+      properties.setProperty(key, new String(valueBytes, StandardCharsets.UTF_8))
     }
 
     // Create a sub-buffer for the serialized task into its own buffer (to be deserialized later).

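Note on the change above: DataOutputStream.writeUTF prefixes the payload with an unsigned 16-bit length, so any string whose UTF-8 encoding exceeds 65535 bytes throws UTFDataFormatException. The following standalone sketch (not part of the commit; the object name and test value are made up) round-trips an oversized value with the same length-prefix pattern the new code uses.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
import java.nio.charset.StandardCharsets

object LongPropertyRoundTrip {
  def main(args: Array[String]): Unit = {
    val value = "x" * 100000  // UTF-8 size is well over writeUTF's 65535-byte ceiling
    val buffer = new ByteArrayOutputStream()
    val out = new DataOutputStream(buffer)
    // out.writeUTF(value) would throw UTFDataFormatException here
    val bytes = value.getBytes(StandardCharsets.UTF_8)
    out.writeInt(bytes.length)  // 4-byte length prefix instead of writeUTF's 2-byte one
    out.write(bytes)
    out.close()

    val in = new DataInputStream(new ByteArrayInputStream(buffer.toByteArray))
    val read = new Array[Byte](in.readInt())
    in.readFully(read)
    assert(new String(read, StandardCharsets.UTF_8) == value)
  }
}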
core/src/test/scala/org/apache/spark/scheduler/TaskDescriptionSuite.scala

Lines changed: 16 additions & 0 deletions

@@ -17,6 +17,7 @@
 
 package org.apache.spark.scheduler
 
+import java.io.{ByteArrayOutputStream, DataOutputStream, UTFDataFormatException}
 import java.nio.ByteBuffer
 import java.util.Properties
 
@@ -36,6 +37,21 @@ class TaskDescriptionSuite extends SparkFunSuite {
     val originalProperties = new Properties()
     originalProperties.put("property1", "18")
     originalProperties.put("property2", "test value")
+    // SPARK-19796 -- large property values (like a large job description for a long sql query)
+    // can cause problems for DataOutputStream, make sure we handle correctly
+    val sb = new StringBuilder()
+    (0 to 10000).foreach(_ => sb.append("1234567890"))
+    val largeString = sb.toString()
+    originalProperties.put("property3", largeString)
+    // make sure we've got a good test case
+    intercept[UTFDataFormatException] {
+      val out = new DataOutputStream(new ByteArrayOutputStream())
+      try {
+        out.writeUTF(largeString)
+      } finally {
+        out.close()
+      }
+    }
 
     // Create a dummy byte buffer for the task.
     val taskBuffer = ByteBuffer.wrap(Array[Byte](1, 2, 3, 4))

external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala

Lines changed: 19 additions & 6 deletions

@@ -36,7 +36,11 @@ import org.apache.spark.util.NextIterator
 
 /** Class representing a range of Kinesis sequence numbers. Both sequence numbers are inclusive. */
 private[kinesis]
 case class SequenceNumberRange(
-    streamName: String, shardId: String, fromSeqNumber: String, toSeqNumber: String)
+    streamName: String,
+    shardId: String,
+    fromSeqNumber: String,
+    toSeqNumber: String,
+    recordCount: Int)
 
 /** Class representing an array of Kinesis sequence number ranges */
 private[kinesis]
@@ -136,6 +140,8 @@ class KinesisSequenceRangeIterator(
   private val client = new AmazonKinesisClient(credentials)
   private val streamName = range.streamName
   private val shardId = range.shardId
+  // AWS limits to maximum of 10k records per get call
+  private val maxGetRecordsLimit = 10000
 
   private var toSeqNumberReceived = false
   private var lastSeqNumber: String = null
@@ -153,12 +159,14 @@ class KinesisSequenceRangeIterator(
 
       // If the internal iterator has not been initialized,
       // then fetch records from starting sequence number
-      internalIterator = getRecords(ShardIteratorType.AT_SEQUENCE_NUMBER, range.fromSeqNumber)
+      internalIterator = getRecords(ShardIteratorType.AT_SEQUENCE_NUMBER, range.fromSeqNumber,
+        range.recordCount)
     } else if (!internalIterator.hasNext) {
 
       // If the internal iterator does not have any more records,
      // then fetch more records after the last consumed sequence number
-      internalIterator = getRecords(ShardIteratorType.AFTER_SEQUENCE_NUMBER, lastSeqNumber)
+      internalIterator = getRecords(ShardIteratorType.AFTER_SEQUENCE_NUMBER, lastSeqNumber,
+        range.recordCount)
     }
 
     if (!internalIterator.hasNext) {
@@ -191,9 +199,12 @@ class KinesisSequenceRangeIterator(
   /**
    * Get records starting from or after the given sequence number.
    */
-  private def getRecords(iteratorType: ShardIteratorType, seqNum: String): Iterator[Record] = {
+  private def getRecords(
+      iteratorType: ShardIteratorType,
+      seqNum: String,
+      recordCount: Int): Iterator[Record] = {
     val shardIterator = getKinesisIterator(iteratorType, seqNum)
-    val result = getRecordsAndNextKinesisIterator(shardIterator)
+    val result = getRecordsAndNextKinesisIterator(shardIterator, recordCount)
     result._1
   }
 
@@ -202,10 +213,12 @@ class KinesisSequenceRangeIterator(
    * to get records from Kinesis), and get the next shard iterator for next consumption.
    */
   private def getRecordsAndNextKinesisIterator(
-      shardIterator: String): (Iterator[Record], String) = {
+      shardIterator: String,
+      recordCount: Int): (Iterator[Record], String) = {
     val getRecordsRequest = new GetRecordsRequest
     getRecordsRequest.setRequestCredentials(credentials)
     getRecordsRequest.setShardIterator(shardIterator)
+    getRecordsRequest.setLimit(Math.min(recordCount, this.maxGetRecordsLimit))
     val getRecordsResult = retryOrTimeout[GetRecordsResult](
       s"getting records using shard iterator") {
       client.getRecords(getRecordsRequest)

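Note on the setLimit change above: Kinesis caps each GetRecords call at 10,000 records, and a range now carries how many records it actually holds, so the request never asks for more than either bound. A minimal sketch of that clamping follows, using the same AWS SDK v1 calls the iterator uses; the helper name and parameters are illustrative, not from the commit.

import scala.collection.JavaConverters._
import com.amazonaws.services.kinesis.AmazonKinesisClient
import com.amazonaws.services.kinesis.model.{GetRecordsRequest, Record}

object GetRecordsSketch {
  // Kinesis rejects GetRecords limits above 10,000
  private val maxGetRecordsLimit = 10000

  // Fetch at most `recordCount` records for one shard iterator, clamped to the AWS ceiling.
  def fetch(
      client: AmazonKinesisClient,
      shardIterator: String,
      recordCount: Int): (Seq[Record], String) = {
    val request = new GetRecordsRequest
    request.setShardIterator(shardIterator)
    request.setLimit(math.min(recordCount, maxGetRecordsLimit))
    val result = client.getRecords(request)
    (result.getRecords.asScala, result.getNextShardIterator)
  }
}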
external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala

Lines changed: 2 additions & 1 deletion

@@ -210,7 +210,8 @@ private[kinesis] class KinesisReceiver[T](
     if (records.size > 0) {
       val dataIterator = records.iterator().asScala.map(messageHandler)
       val metadata = SequenceNumberRange(streamName, shardId,
-        records.get(0).getSequenceNumber(), records.get(records.size() - 1).getSequenceNumber())
+        records.get(0).getSequenceNumber(), records.get(records.size() - 1).getSequenceNumber(),
+        records.size())
       blockGenerator.addMultipleDataWithCallback(dataIterator, metadata)
     }
   }

external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala

Lines changed: 2 additions & 2 deletions

@@ -51,7 +51,7 @@ abstract class KinesisBackedBlockRDDTests(aggregateTestData: Boolean)
     shardIdToSeqNumbers = shardIdToDataAndSeqNumbers.mapValues { _.map { _._2 }}
     shardIdToRange = shardIdToSeqNumbers.map { case (shardId, seqNumbers) =>
       val seqNumRange = SequenceNumberRange(
-        testUtils.streamName, shardId, seqNumbers.head, seqNumbers.last)
+        testUtils.streamName, shardId, seqNumbers.head, seqNumbers.last, seqNumbers.size)
       (shardId, seqNumRange)
     }
     allRanges = shardIdToRange.values.toSeq
@@ -181,7 +181,7 @@ abstract class KinesisBackedBlockRDDTests(aggregateTestData: Boolean)
 
     // Create the necessary ranges to use in the RDD
     val fakeRanges = Array.fill(numPartitions - numPartitionsInKinesis)(
-      SequenceNumberRanges(SequenceNumberRange("fakeStream", "fakeShardId", "xxx", "yyy")))
+      SequenceNumberRanges(SequenceNumberRange("fakeStream", "fakeShardId", "xxx", "yyy", 1)))
     val realRanges = Array.tabulate(numPartitionsInKinesis) { i =>
       val range = shardIdToRange(shardIds(i + (numPartitions - numPartitionsInKinesis)))
       SequenceNumberRanges(Array(range))

external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala

Lines changed: 2 additions & 2 deletions

@@ -119,13 +119,13 @@ abstract class KinesisStreamTests(aggregateTestData: Boolean) extends KinesisFun
 
     // Generate block info data for testing
     val seqNumRanges1 = SequenceNumberRanges(
-      SequenceNumberRange("fakeStream", "fakeShardId", "xxx", "yyy"))
+      SequenceNumberRange("fakeStream", "fakeShardId", "xxx", "yyy", 67))
     val blockId1 = StreamBlockId(kinesisStream.id, 123)
     val blockInfo1 = ReceivedBlockInfo(
       0, None, Some(seqNumRanges1), new BlockManagerBasedStoreResult(blockId1, None))
 
     val seqNumRanges2 = SequenceNumberRanges(
-      SequenceNumberRange("fakeStream", "fakeShardId", "aaa", "bbb"))
+      SequenceNumberRange("fakeStream", "fakeShardId", "aaa", "bbb", 89))
     val blockId2 = StreamBlockId(kinesisStream.id, 345)
     val blockInfo2 = ReceivedBlockInfo(
       0, None, Some(seqNumRanges2), new BlockManagerBasedStoreResult(blockId2, None))

mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala

Lines changed: 22 additions & 2 deletions

@@ -24,12 +24,13 @@ import breeze.linalg.{DenseVector => BDV}
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.classification.LinearSVCSuite._
 import org.apache.spark.ml.feature.{Instance, LabeledPoint}
-import org.apache.spark.ml.linalg.{Vector, Vectors}
+import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
 import org.apache.spark.ml.util.TestingUtils._
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.functions.udf
 
 
 class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
@@ -41,6 +42,9 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
   @transient var smallValidationDataset: Dataset[_] = _
   @transient var binaryDataset: Dataset[_] = _
 
+  @transient var smallSparseBinaryDataset: Dataset[_] = _
+  @transient var smallSparseValidationDataset: Dataset[_] = _
+
   override def beforeAll(): Unit = {
     super.beforeAll()
 
@@ -51,6 +55,13 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
     smallBinaryDataset = generateSVMInput(A, Array[Double](B, C), nPoints, 42).toDF()
     smallValidationDataset = generateSVMInput(A, Array[Double](B, C), nPoints, 17).toDF()
     binaryDataset = generateSVMInput(1.0, Array[Double](1.0, 2.0, 3.0, 4.0), 10000, 42).toDF()
+
+    // Dataset for testing SparseVector
+    val toSparse: Vector => SparseVector = _.asInstanceOf[DenseVector].toSparse
+    val sparse = udf(toSparse)
+    smallSparseBinaryDataset = smallBinaryDataset.withColumn("features", sparse('features))
+    smallSparseValidationDataset = smallValidationDataset.withColumn("features", sparse('features))
+
   }
 
   /**
@@ -68,13 +79,17 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
     val model = svm.fit(smallBinaryDataset)
     assert(model.transform(smallValidationDataset)
       .where("prediction=label").count() > nPoints * 0.8)
+    val sparseModel = svm.fit(smallSparseBinaryDataset)
+    checkModels(model, sparseModel)
   }
 
   test("Linear SVC binary classification with regularization") {
     val svm = new LinearSVC()
     val model = svm.setRegParam(0.1).fit(smallBinaryDataset)
     assert(model.transform(smallValidationDataset)
       .where("prediction=label").count() > nPoints * 0.8)
+    val sparseModel = svm.fit(smallSparseBinaryDataset)
+    checkModels(model, sparseModel)
   }
 
   test("params") {
@@ -235,7 +250,7 @@ object LinearSVCSuite {
     "aggregationDepth" -> 3
   )
 
-  // Generate noisy input of the form Y = signum(x.dot(weights) + intercept + noise)
+  // Generate noisy input of the form Y = signum(x.dot(weights) + intercept + noise)
   def generateSVMInput(
       intercept: Double,
       weights: Array[Double],
@@ -252,5 +267,10 @@ object LinearSVCSuite {
     y.zip(x).map(p => LabeledPoint(p._1, Vectors.dense(p._2)))
   }
 
+  def checkModels(model1: LinearSVCModel, model2: LinearSVCModel): Unit = {
+    assert(model1.intercept == model2.intercept)
+    assert(model1.coefficients.equals(model2.coefficients))
+  }
+
 }

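For reference on the sparse-path coverage added above, this is what the toSparse conversion inside the new udf does to a feature vector; a tiny standalone sketch with made-up values, not part of the commit.

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors}

object ToSparseSketch {
  def main(args: Array[String]): Unit = {
    val dense = Vectors.dense(0.0, 3.0, 0.0, 4.0).asInstanceOf[DenseVector]
    val sparse: SparseVector = dense.toSparse
    // Only the non-zero entries survive: size 4, indices [1, 3], values [3.0, 4.0]
    println(sparse)  // (4,[1,3],[3.0,4.0])
  }
}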
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 26 additions & 1 deletion

@@ -117,6 +117,8 @@ class Analyzer(
     Batch("Hints", fixedPoint,
       new ResolveHints.ResolveBroadcastHints(conf),
       ResolveHints.RemoveAllHints),
+    Batch("Simple Sanity Check", Once,
+      LookupFunctions),
     Batch("Substitution", fixedPoint,
       CTESubstitution,
       WindowsSubstitution,
@@ -604,7 +606,11 @@ class Analyzer(
 
    def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
      case i @ InsertIntoTable(u: UnresolvedRelation, parts, child, _, _) if child.resolved =>
-        i.copy(table = EliminateSubqueryAliases(lookupTableFromCatalog(u)))
+        lookupTableFromCatalog(u).canonicalized match {
+          case v: View =>
+            u.failAnalysis(s"Inserting into a view is not allowed. View: ${v.desc.identifier}.")
+          case other => i.copy(table = other)
+        }
      case u: UnresolvedRelation => resolveRelation(u)
    }
 
@@ -1038,6 +1044,25 @@ class Analyzer(
     }
   }
 
+  /**
+   * Checks whether a function identifier referenced by an [[UnresolvedFunction]] is defined in the
+   * function registry. Note that this rule doesn't try to resolve the [[UnresolvedFunction]]. It
+   * only performs simple existence check according to the function identifier to quickly identify
+   * undefined functions without triggering relation resolution, which may incur potentially
+   * expensive partition/schema discovery process in some cases.
+   *
+   * @see [[ResolveFunctions]]
+   * @see https://issues.apache.org/jira/browse/SPARK-19737
+   */
+  object LookupFunctions extends Rule[LogicalPlan] {
+    override def apply(plan: LogicalPlan): LogicalPlan = plan.transformAllExpressions {
+      case f: UnresolvedFunction if !catalog.functionExists(f.name) =>
+        withPosition(f) {
+          throw new NoSuchFunctionException(f.name.database.getOrElse("default"), f.name.funcName)
+        }
+    }
+  }
+
   /**
    * Replaces [[UnresolvedFunction]]s with concrete [[Expression]]s.
   */

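A hedged illustration of the new LookupFunctions batch (session setup, table, and function names below are placeholders, not from the commit): a query that references an undefined function now fails during analysis with NoSuchFunctionException, before relation resolution work such as partition or schema discovery is attempted.

import org.apache.spark.sql.SparkSession

object LookupFunctionsDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("lookup-functions-demo").getOrCreate()
    spark.range(10).createOrReplaceTempView("t")
    try {
      // Rejected in the "Simple Sanity Check" batch: no_such_fn is not in the function registry.
      spark.sql("SELECT no_such_fn(id) FROM t").collect()
    } catch {
      case e: org.apache.spark.sql.AnalysisException =>
        println(s"Rejected at analysis time: ${e.getMessage}")
    } finally {
      spark.stop()
    }
  }
}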
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala

Lines changed: 26 additions & 0 deletions

@@ -17,6 +17,8 @@
 
 package org.apache.spark.sql.catalyst.catalog
 
+import java.net.URI
+
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.util.Shell
 
@@ -162,6 +164,30 @@ object CatalogUtils {
     BucketSpec(numBuckets, normalizedBucketCols, normalizedSortCols)
   }
 
+  /**
+   * Convert URI to String.
+   * Since URI.toString does not decode the uri, e.g. change '%25' to '%'.
+   * Here we create a hadoop Path with the given URI, and rely on Path.toString
+   * to decode the uri
+   * @param uri the URI of the path
+   * @return the String of the path
+   */
+  def URIToString(uri: URI): String = {
+    new Path(uri).toString
+  }
+
+  /**
+   * Convert String to URI.
+   * Since new URI(string) does not encode string, e.g. change '%' to '%25'.
+   * Here we create a hadoop Path with the given String, and rely on Path.toUri
+   * to encode the string
+   * @param str the String of the path
+   * @return the URI of the path
+   */
+  def stringToURI(str: String): URI = {
+    new Path(str).toUri
+  }
+
   private def normalizeColumnName(
       tableName: String,
       tableCols: Seq[String],

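A small usage sketch for the two helpers added above (the warehouse paths are illustrative): routing through hadoop Path handles percent signs in both directions, whereas java.net.URI would either reject the raw string or keep the escape when converting back.

import java.net.URI
import org.apache.hadoop.fs.Path

object CatalogUriSketch {
  def main(args: Array[String]): Unit = {
    // new URI("/warehouse/a%b") throws URISyntaxException; Path percent-encodes for us.
    val uri: URI = new Path("/warehouse/a%b").toUri          // what stringToURI does
    assert(uri.toString == "/warehouse/a%25b")

    // URI.toString keeps the escape; Path.toString decodes it back to the raw path.
    assert(new Path(uri).toString == "/warehouse/a%b")       // what URIToString does
  }
}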
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala

Lines changed: 6 additions & 6 deletions

@@ -202,7 +202,7 @@ class InMemoryCatalog(
       tableDefinition.storage.locationUri.isEmpty
 
     val tableWithLocation = if (needDefaultTableLocation) {
-      val defaultTableLocation = new Path(catalog(db).db.locationUri, table)
+      val defaultTableLocation = new Path(new Path(catalog(db).db.locationUri), table)
       try {
         val fs = defaultTableLocation.getFileSystem(hadoopConfig)
         fs.mkdirs(defaultTableLocation)
@@ -211,7 +211,7 @@ class InMemoryCatalog(
           throw new SparkException(s"Unable to create table $table as failed " +
             s"to create its directory $defaultTableLocation", e)
       }
-      tableDefinition.withNewStorage(locationUri = Some(defaultTableLocation.toUri.toString))
+      tableDefinition.withNewStorage(locationUri = Some(defaultTableLocation.toUri))
     } else {
       tableDefinition
     }
@@ -274,7 +274,7 @@ class InMemoryCatalog(
         "Managed table should always have table location, as we will assign a default location " +
         "to it if it doesn't have one.")
       val oldDir = new Path(oldDesc.table.location)
-      val newDir = new Path(catalog(db).db.locationUri, newName)
+      val newDir = new Path(new Path(catalog(db).db.locationUri), newName)
       try {
         val fs = oldDir.getFileSystem(hadoopConfig)
         fs.rename(oldDir, newDir)
@@ -283,7 +283,7 @@ class InMemoryCatalog(
           throw new SparkException(s"Unable to rename table $oldName to $newName as failed " +
             s"to rename its directory $oldDir", e)
       }
-      oldDesc.table = oldDesc.table.withNewStorage(locationUri = Some(newDir.toUri.toString))
+      oldDesc.table = oldDesc.table.withNewStorage(locationUri = Some(newDir.toUri))
     }
 
     catalog(db).tables.put(newName, oldDesc)
@@ -389,7 +389,7 @@ class InMemoryCatalog(
 
       existingParts.put(
        p.spec,
-        p.copy(storage = p.storage.copy(locationUri = Some(partitionPath.toString))))
+        p.copy(storage = p.storage.copy(locationUri = Some(partitionPath.toUri))))
     }
   }
 
@@ -462,7 +462,7 @@ class InMemoryCatalog(
     }
     oldPartition.copy(
       spec = newSpec,
-      storage = oldPartition.storage.copy(locationUri = Some(newPartPath.toString)))
+      storage = oldPartition.storage.copy(locationUri = Some(newPartPath.toUri)))
    } else {
      oldPartition.copy(spec = newSpec)
    }
