[SPARK-7404][ml] Add RegressionEvaluator to spark.ml

Ram Sriharsha · Ram Sriharsha · commit 1b6ebb3eb902 · 2015-05-21T20:37:30.000-07:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.evaluation
+
+import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml.Evaluator
+import org.apache.spark.ml.param.Param
+import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.mllib.evaluation.RegressionMetrics
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.types.DoubleType
+
+/**
+ * :: AlphaComponent ::
+ *
+ * Evaluator for regression, which expects two input columns: prediction and label.
+ */
+@AlphaComponent
+class RegressionEvaluator(override val uid: String)
+  extends Evaluator with HasPredictionCol with HasLabelCol {
+
+  def this() = this(Identifiable.randomUID("regEval"))
+
+  /**
+   * param for metric name in evaluation; one of "mse", "rmse" (default), "r2" or "mae"
+   * @group param
+   */
+  val metricName: Param[String] = new Param(this, "metricName",
+    "metric name in evaluation (mse|rmse|r2|mae)")
+
+  /** @group getParam */
+  def getMetricName: String = $(metricName)
+
+  /** @group setParam */
+  def setMetricName(value: String): this.type = set(metricName, value)
+
+  /** @group setParam */
+  def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
+  /** @group setParam */
+  def setScoreCol(value: String): this.type = setPredictionCol(value)
+
+  /** @group setParam */
+  def setLabelCol(value: String): this.type = set(labelCol, value)
+
+  setDefault(metricName -> "rmse")
+
+  /**
+   * Returns the metric selected by `metricName`, computed over (prediction, label) pairs.
+   * @param dataset data with the prediction and label columns, each checked against DoubleType
+   * @throws IllegalArgumentException if `metricName` is not one of the supported names
+   */
+  override def evaluate(dataset: DataFrame): Double = {
+    val schema = dataset.schema
+    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
+    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
+
+    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
+      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
+    val metrics = new RegressionMetrics(predictionAndLabels)
+    $(metricName) match {
+      case "rmse" => metrics.rootMeanSquaredError
+      case "mse" => metrics.meanSquaredError
+      case "r2" => metrics.r2
+      case "mae" => metrics.meanAbsoluteError
+      case other => throw new IllegalArgumentException(s"Does not support metric $other.")
+    }
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.evaluation
+
+import org.scalatest.FunSuite
+
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
+import org.apache.spark.mllib.util.TestingUtils._
+import org.apache.spark.sql.DataFrame
+
+class RegressionEvaluatorSuite extends FunSuite with MLlibTestSparkContext {
+
+  @transient var dataset: DataFrame = _
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    /**
+     * Here is the instruction describing how to export the test data into CSV format
+     * so we can validate the metrics against scikit-learn's regression metrics package.
+     *
+     * import org.apache.spark.mllib.util.LinearDataGenerator
+     * val data = sc.parallelize(LinearDataGenerator.generateLinearInput(6.3,
+     *   Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1))
+     * data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1))
+     *   .saveAsTextFile("path")
+     */
+    dataset = sqlContext.createDataFrame(
+      sc.parallelize(LinearDataGenerator.generateLinearInput(
+        6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2))
+  }
+
+  test("Regression Evaluator: default params") {
+    /**
+     * Using the following python code to load the data and train the model using scikit learn.
+     *
+     * > from sklearn.linear_model import LinearRegression
+     * > from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+     * > import pandas as pd
+     * > from patsy import dmatrices
+     * > df = pd.read_csv("path")
+     * > y, X = dmatrices('label ~ x + y',df, return_type="dataframe")
+     * > regr = LinearRegression()
+     * > regr.fit(X, y)
+     * > print('Mean Squared Error: %.2f' % mean_squared_error(y, regr.predict(X)))
+     * > print('Mean Absolute Error: %.2f' % mean_absolute_error(y, regr.predict(X)))
+     * > print('R2 score: %.2f' % r2_score(y, regr.predict(X)))
+     * > Mean Squared Error: 0.01
+     * > Mean Absolute Error: 0.08
+     * > R2 score: 1.00
+     */
+    val trainer = new LinearRegression
+    val model = trainer.fit(dataset)
+    val predictions = model.transform(dataset)
+
+    // default = rmse: sqrt of the reference MSE (0.01), which is known to two decimals only
+    val evaluator = new RegressionEvaluator()
+    assert(evaluator.evaluate(predictions) ~== 0.1 absTol 0.01)
+
+    // r2 score: the reference value above is 1.00 (0.01 is the MSE, not r2)
+    evaluator.setMetricName("r2")
+    assert(evaluator.evaluate(predictions) ~== 1.0 absTol 0.01)
+
+    // mae
+    evaluator.setMetricName("mae")
+    assert(evaluator.evaluate(predictions) ~== 0.08 absTol 0.01)
+  }
+}