Skip to content

Commit 14c7ce3

Browse files
committed
Fixing Scala style and test cases
1 parent 75192a7 commit 14c7ce3

File tree

3 files changed

+9
-5
lines changed

3 files changed

+9
-5
lines changed

mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -377,12 +377,12 @@ private object BisectingKMeans extends Serializable {
377377
internalIndex -= 1
378378
val leftIndex = leftChildIndex(rawIndex)
379379
val rightIndex = rightChildIndex(rawIndex)
380-
val height = math.sqrt(Seq(leftIndex, rightIndex).map { childIndex =>
380+
val indexes = Seq(leftIndex, rightIndex).filter(clusters.contains(_))
381+
val height = math.sqrt(indexes.map { childIndex =>
381382
KMeans.fastSquaredDistance(center, clusters(childIndex).center)
382383
}.max)
383-
val left = buildSubTree(leftIndex)
384-
val right = buildSubTree(rightIndex)
385-
new ClusteringTreeNode(index, size, center, cost, height, Array(left, right))
384+
val children = indexes.map(buildSubTree(_)).toArray
385+
new ClusteringTreeNode(index, size, center, cost, height, children)
386386
} else {
387387
val index = leafIndex
388388
leafIndex += 1

mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ class BisectingKMeansSuite
6666
// Verify fit does not fail on very sparse data
6767
val model = bkm.fit(sparseDataset)
6868
assert(model.hasSummary)
69+
val result = model.transform(sparseDataset)
70+
val numClusters = result.select("prediction").distinct().collect().length
71+
assert(numClusters <= k && numClusters >= 1)
6972
}
7073

7174
test("setter/getter") {

mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@
1717

1818
package org.apache.spark.ml.clustering
1919

20+
import scala.util.Random
21+
2022
import org.apache.spark.SparkFunSuite
2123
import org.apache.spark.ml.linalg.{Vector, Vectors}
2224
import org.apache.spark.ml.param.ParamMap
2325
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
2426
import org.apache.spark.mllib.clustering.{KMeans => MLlibKMeans}
2527
import org.apache.spark.mllib.util.MLlibTestSparkContext
2628
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
27-
import scala.util.Random
2829

2930
private[clustering] case class TestRow(features: Vector)
3031

0 commit comments

Comments (0)