From 5bd1f9a853db4444bbcad9660d5aa951340c576b Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sun, 31 May 2015 19:13:23 +0800 Subject: [PATCH 1/6] add require for one-based in loadLIBSVM --- mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 681f4c618d302..5ac3672a52d64 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -82,6 +82,8 @@ object MLUtils { val value = indexAndValue(1).toDouble (index, value) }.unzip + require(indices.size == 0 || indices(0) >= 0, + "indices should be one-based in LIBSVM format") (label, indices.toArray, values.toArray) } From 9956365ae578ac0e1bff8c84104e26e71bf5116c Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sun, 31 May 2015 22:42:37 +0800 Subject: [PATCH 2/6] add ut for 0-based loadlibsvm exception --- .../apache/spark/mllib/util/MLUtilsSuite.scala | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index cdece2c174be4..fcb7bb434d1c7 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -27,6 +27,7 @@ import breeze.linalg.{squaredDistance => breezeSquaredDistance} import com.google.common.base.Charsets import com.google.common.io.Files +import org.apache.spark.SparkException import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils._ @@ -109,6 +110,23 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { Utils.deleteRecursively(tempDir) } + test("loadLibSVMFile throws SparkException when passing a zero-based vector") { + val lines = + """ + |0 + |0 0:4.0 4:5.0 6:6.0 + """.stripMargin + val tempDir = Utils.createTempDir() + val file = new File(tempDir.getPath, "part-00000") + Files.write(lines, file, Charsets.US_ASCII) + val path = tempDir.toURI.toString + + intercept[SparkException] { + val pointsWithoutNumFeatures = loadLibSVMFile(sc, path).collect() + } + Utils.deleteRecursively(tempDir) + } + test("saveAsLibSVMFile") { val examples = sc.parallelize(Seq( LabeledPoint(1.1, Vectors.sparse(3, Seq((0, 1.23), (2, 4.56)))), From 6e4f8ca6ea4bab6ffb851d70c33741df714f1d5f Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Mon, 1 Jun 2015 19:22:17 +0800 Subject: [PATCH 3/6] add check for ascending order --- .../org/apache/spark/mllib/util/MLUtils.scala | 15 +++++++++++-- .../spark/mllib/util/MLUtilsSuite.scala | 21 +++++++++++++++++-- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 5ac3672a52d64..5e56945981f9c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -82,8 +82,19 @@ object MLUtils { val value = indexAndValue(1).toDouble (index, value) }.unzip - require(indices.size == 0 || indices(0) >= 0, - "indices should be one-based in LIBSVM format") + + // check if indices is one-based and in ascending order + var previous = -1 + var i = 0 + val indicesLength = indices.size + while (i < indicesLength) { + if (indices(i) <= previous) { + throw new IllegalArgumentException("indices should be one-based and in ascending order") + } + previous = indices(i) + i += 1 + } + (label, indices.toArray, values.toArray) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index fcb7bb434d1c7..27050bde16ef3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -110,7 +110,7 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { Utils.deleteRecursively(tempDir) } - test("loadLibSVMFile throws SparkException when passing a zero-based vector") { + test("loadLibSVMFile throws IllegalArgumentException when indices is zero-based") { val lines = """ |0 @@ -122,7 +122,24 @@ class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { val path = tempDir.toURI.toString intercept[SparkException] { - val pointsWithoutNumFeatures = loadLibSVMFile(sc, path).collect() + loadLibSVMFile(sc, path).collect() + } + Utils.deleteRecursively(tempDir) + } + + test("loadLibSVMFile throws IllegalArgumentException when indices is not in ascending order") { + val lines = + """ + |0 + |0 3:4.0 2:5.0 6:6.0 + """.stripMargin + val tempDir = Utils.createTempDir() + val file = new File(tempDir.getPath, "part-00000") + Files.write(lines, file, Charsets.US_ASCII) + val path = tempDir.toURI.toString + + intercept[SparkException] { + loadLibSVMFile(sc, path).collect() } Utils.deleteRecursively(tempDir) } From 20a2811cdb20c998016d96be2831630ffbc0532b Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 2 Jun 2015 17:57:19 +0800 Subject: [PATCH 4/6] use require --- .../main/scala/org/apache/spark/mllib/util/MLUtils.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 5e56945981f9c..8676a8983cf2a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -83,14 +83,12 @@ object MLUtils { (index, value) }.unzip - // check if indices is one-based and in ascending order + // check if indices are one-based and in ascending order var previous = -1 var i = 0 val indicesLength = indices.size while (i < indicesLength) { - if (indices(i) <= previous) { - throw new IllegalArgumentException("indices should be one-based and in ascending order") - } + require(indices(i) > previous, "indices should be one-based and in ascending order" ) previous = indices(i) i += 1 } From 43107100aaf294080136a2a7201eaf2e26a09c43 Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 3 Jun 2015 09:37:42 +0800 Subject: [PATCH 5/6] merge conflict --- .../scala/org/apache/spark/mllib/util/MLUtilsSuite.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index 7bb9d570a0b89..70219e9ad9d3e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -21,20 +21,19 @@ import java.io.File import scala.io.Source -import org.scalatest.FunSuite - import breeze.linalg.{squaredDistance => breezeSquaredDistance} import com.google.common.base.Charsets import com.google.common.io.Files import org.apache.spark.SparkException +import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils._ import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.util.Utils -class MLUtilsSuite extends FunSuite with MLlibTestSparkContext { +class MLUtilsSuite extends SparkFunSuite with MLlibTestSparkContext { test("epsilon computation") { assert(1.0 + EPSILON > 1.0, s"EPSILON is too small: $EPSILON.") From 79d9c110fe2d9beb6a146910c139e2ad60190ded Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Wed, 3 Jun 2015 15:12:05 +0800 Subject: [PATCH 6/6] optimization as respond to comments --- .../main/scala/org/apache/spark/mllib/util/MLUtils.scala | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 67d7d4d79f08b..52d6468a72af7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -86,10 +86,11 @@ object MLUtils { // check if indices are one-based and in ascending order var previous = -1 var i = 0 - val indicesLength = indices.size + val indicesLength = indices.length while (i < indicesLength) { - require(indices(i) > previous, "indices should be one-based and in ascending order" ) - previous = indices(i) + val current = indices(i) + require(current > previous, "indices should be one-based and in ascending order" ) + previous = current i += 1 }