Added save, load to mllib.classification.LogisticRegressionModel, plus test suite

jkbradley · jkbradley · commit 418ba1b0dea6 · 2015-02-02T18:54:28.000-08:00
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -17,13 +17,16 @@
 
 package org.apache.spark.mllib.classification
 
+import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.mllib.linalg.BLAS.dot
 import org.apache.spark.mllib.linalg.{DenseVector, Vector}
 import org.apache.spark.mllib.optimization._
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.util.{DataValidators, MLUtils}
+import org.apache.spark.mllib.util.{Importable, DataValidators, Exportable}
 import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{Row, SQLContext, SchemaRDD}
 
 /**
  * Classification model trained using Multinomial/Binary Logistic Regression.
@@ -42,7 +45,8 @@ class LogisticRegressionModel (
     override val intercept: Double,
     val numFeatures: Int,
     val numClasses: Int)
-  extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable {
+  extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable
+  with Exportable {
 
   def this(weights: Vector, intercept: Double) = this(weights, intercept, weights.size, 2)
 
@@ -60,6 +64,13 @@ class LogisticRegressionModel (
     this
   }
 
+  /**
+   * :: Experimental ::
+   * Returns the threshold for converting raw scores into 0/1 predictions, or None if cleared.
+   */
+  @Experimental
+  def getThreshold: Option[Double] = threshold
+
   /**
    * :: Experimental ::
    * Clears the threshold so that `predict` will output raw prediction scores.
@@ -126,6 +137,106 @@ class LogisticRegressionModel (
       bestClass.toDouble
     }
   }
+
+  /**
+   * Saves this model under `path`: JSON metadata goes to `path/metadata` and one Parquet row
+   * to `path/data`.
+   *
+   * `numFeatures` and `numClasses` are stored next to the weights, intercept and threshold,
+   * because the two-argument constructor would otherwise rebuild every model as binary.
+   *
+   * @param sc  Spark context used to write the model files.
+   * @param path  Directory in which to save this model.
+   */
+  override def save(sc: SparkContext, path: String): Unit = {
+    val sqlContext = new SQLContext(sc)
+    import sqlContext._
+    // TODO: Do we need to use a SELECT statement to make the column ordering deterministic?
+    // Create JSON metadata.
+    val metadata =
+      LogisticRegressionModel.Metadata(clazz = this.getClass.getName, version = Exportable.version)
+    val metadataRDD: SchemaRDD = sc.parallelize(Seq(metadata))
+    metadataRDD.toJSON.saveAsTextFile(path + "/metadata")
+    // Create Parquet data.
+    val data =
+      LogisticRegressionModel.Data(weights, intercept, threshold, numFeatures, numClasses)
+    val dataRDD: SchemaRDD = sc.parallelize(Seq(data))
+    dataRDD.saveAsParquetFile(path + "/data")
+  }
+}
+
+object LogisticRegressionModel extends Importable[LogisticRegressionModel] {
+
+  /**
+   * Loads a model that was written by `save` on a [[LogisticRegressionModel]].
+   *
+   * The files' contents are validated with `require` rather than `assert`, so the checks
+   * still run when assertions are elided at compile time.
+   *
+   * @param sc  Spark context used to read the model files.
+   * @param path  Directory to which the model was saved.
+   * @return  Model with the saved weights, intercept, dimensions and threshold.
+   * @throws IllegalArgumentException if the metadata or data is missing or malformed, or names
+   *                                  a different model class or an unknown format version.
+   */
+  override def load(sc: SparkContext, path: String): LogisticRegressionModel = {
+    val sqlContext = new SQLContext(sc)
+    import sqlContext._
+
+    // Load JSON metadata.
+    val metadataPath = path + "/metadata"
+    val metadataArray =
+      sqlContext.jsonFile(metadataPath).select("clazz".attr, "version".attr).take(1)
+    require(metadataArray.size == 1,
+      s"Unable to load LogisticRegressionModel metadata from: $metadataPath")
+    metadataArray(0) match {
+      case Row(clazz: String, version: String) =>
+        require(clazz == classOf[LogisticRegressionModel].getName,
+          s"LogisticRegressionModel.load was given model file with metadata specifying a" +
+          s" different model class: $clazz")
+        require(version == Importable.version, // only 1 version exists currently
+          s"LogisticRegressionModel.load did not recognize model format version: $version")
+      case _ =>
+        // Fields present but not both strings; report it like any other unreadable metadata.
+        throw new IllegalArgumentException(
+          s"Unable to load LogisticRegressionModel metadata from: $metadataPath")
+    }
+
+    // Load Parquet data.
+    val dataPath = path + "/data"
+    val dataArray = sqlContext.parquetFile(dataPath)
+      .select("weights".attr, "intercept".attr, "threshold".attr, "numFeatures".attr,
+        "numClasses".attr)
+      .take(1)
+    require(dataArray.size == 1, s"Unable to load LogisticRegressionModel data from: $dataPath")
+    val data = dataArray(0)
+    val lr = data match {
+      case Row(weights: Vector, intercept: Double, _, numFeatures: Int, numClasses: Int) =>
+        new LogisticRegressionModel(weights, intercept, numFeatures, numClasses)
+      case _ =>
+        throw new IllegalArgumentException(
+          s"Unable to load LogisticRegressionModel data from: $dataPath")
+    }
+    // A null threshold column means the model was saved with its threshold cleared.
+    if (data.isNullAt(2)) {
+      lr.clearThreshold()
+    } else {
+      lr.setThreshold(data.getDouble(2))
+    }
+    lr
+  }
+
+  /** Contents of the JSON metadata file: the model class name and the format version. */
+  private case class Metadata(clazz: String, version: String)
+
+  /** Schema of the single Parquet row that holds the model itself. */
+  private case class Data(
+      weights: Vector,
+      intercept: Double,
+      threshold: Option[Double],
+      numFeatures: Int,
+      numClasses: Int)
+
+}
 
 /**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/modelImportExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/modelImportExport.scala
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.util
+
+import org.apache.spark.SparkContext
+import org.apache.spark.annotation.DeveloperApi
+
+
+/**
+ * :: DeveloperApi ::
+ *
+ * Trait for models and transformers which may be saved as files.
+ * Mix this into the model class; its companion object should extend [[Importable]].
+ */
+@DeveloperApi
+trait Exportable {
+
+  /**
+   * Save this model to the given path.
+   *
+   * This saves:
+   *  - human-readable (JSON) model metadata to path/metadata/
+   *  - Parquet formatted data to path/data/
+   *
+   * The model may be loaded using [[Importable.load]].
+   *
+   * @param sc  Spark context used to save model data.
+   * @param path  Path specifying the directory in which to save this model.
+   *              This directory and any intermediate directories will be created if needed.
+   */
+  def save(sc: SparkContext, path: String): Unit
+
+}
+
+object Exportable {
+
+  /** Current version of the model import/export format; recorded in the saved metadata. */
+  val version: String = "1.0"
+
+}
+
+/**
+ * :: DeveloperApi ::
+ *
+ * Trait for models and transformers which may be loaded from files.
+ * This should be inherited by the companion object of the model class.
+ */
+@DeveloperApi
+trait Importable[Model <: Exportable] {
+
+  /**
+   * Load a model from the given path.
+   *
+   * The model should have been saved by [[Exportable.save]].
+   *
+   * @param sc  Spark context used for loading model files.
+   * @param path  Path specifying the directory to which the model was saved.
+   * @return  Model instance rebuilt from the saved files
+   */
+  def load(sc: SparkContext, path: String): Model
+
+}
+
+object Importable {
+
+  /** Model format version expected on load; an alias of [[Exportable.version]]. */
+  val version: String = Exportable.version
+
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala
@@ -18,6 +18,8 @@
 package org.apache.spark.mllib.classification
 
 import scala.util.control.Breaks._
+import org.apache.spark.util.Utils
+
 import scala.util.Random
 import scala.collection.JavaConversions._
 
@@ -407,16 +409,16 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext with M
      *
      * First of all, using the following scala code to save the data into `path`.
      *
-     *    testRDD.map(x => x.label+ ", " + x.features(0) + ", " + x.features(1) + ", " +
-     *      x.features(2) + ", " + x.features(3)).saveAsTextFile("path")
+     * testRDD.map(x => x.label+ ", " + x.features(0) + ", " + x.features(1) + ", " +
+     * x.features(2) + ", " + x.features(3)).saveAsTextFile("path")
      *
      * Using the following R code to load the data and train the model using glmnet package.
      *
-     *    library("glmnet")
-     *    data <- read.csv("path", header=FALSE)
-     *    label = factor(data$V1)
-     *    features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
-     *    weights = coef(glmnet(features,label, family="multinomial", alpha = 0, lambda = 0))
+     * library("glmnet")
+     * data <- read.csv("path", header=FALSE)
+     * label = factor(data$V1)
+     * features = as.matrix(data.frame(data$V2, data$V3, data$V4, data$V5))
+     * weights = coef(glmnet(features,label, family="multinomial", alpha = 0, lambda = 0))
      *
      * The model weights of mutinomial logstic regression in R have `K` set of linear predictors
      * for `K` classes classification problem; however, only `K-1` set is required if the first
@@ -425,25 +427,25 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext with M
      * weights. The mathematical discussion and proof can be found here:
      * http://en.wikipedia.org/wiki/Multinomial_logistic_regression
      *
-     *    weights1 = weights$`1` - weights$`0`
-     *    weights2 = weights$`2` - weights$`0`
+     * weights1 = weights$`1` - weights$`0`
+     * weights2 = weights$`2` - weights$`0`
      *
-     *    > weights1
-     *    5 x 1 sparse Matrix of class "dgCMatrix"
-     *                    s0
-     *             2.6228269
-     *    data.V2 -0.5837166
-     *    data.V3  0.9285260
-     *    data.V4 -0.3783612
-     *    data.V5 -0.8123411
-     *    > weights2
-     *    5 x 1 sparse Matrix of class "dgCMatrix"
-     *                     s0
-     *             4.11197445
-     *    data.V2 -0.16918650
-     *    data.V3 -0.81104784
-     *    data.V4 -0.06463799
-     *    data.V5 -0.29198337
+     * > weights1
+     * 5 x 1 sparse Matrix of class "dgCMatrix"
+     * s0
+     * 2.6228269
+     * data.V2 -0.5837166
+     * data.V3  0.9285260
+     * data.V4 -0.3783612
+     * data.V5 -0.8123411
+     * > weights2
+     * 5 x 1 sparse Matrix of class "dgCMatrix"
+     * s0
+     * 4.11197445
+     * data.V2 -0.16918650
+     * data.V3 -0.81104784
+     * data.V4 -0.06463799
+     * data.V5 -0.29198337
      */
 
     val weightsR = Vectors.dense(Array(
@@ -459,7 +461,41 @@ class LogisticRegressionSuite extends FunSuite with MLlibTestSparkContext with M
     // very steep curve in logistic function so that when we draw samples from distribution, it's
     // very easy to assign to another labels. However, this prediction result is consistent to R.
     validatePrediction(model.predict(validationRDD.map(_.features)).collect(), validationData, 0.47)
+  }
+
+  test("model export/import") {
+    val nPoints = 20
+    val A = 2.0
+    val B = -1.5
 
+    val testData = LogisticRegressionSuite.generateLogisticInput(A, B, nPoints, 42)
+    val testRDD = sc.parallelize(testData, 2)
+    testRDD.cache()
+    val lr = new LogisticRegressionWithLBFGS().setIntercept(true)
+    lr.optimizer.setNumIterations(1)
+    val model = lr.run(testRDD)
+    model.clearThreshold()
+    assert(model.getThreshold.isEmpty)
+
+    val tempDir = Utils.createTempDir()
+    val path = tempDir.toURI.toString
+    try {
+      // Round trip with the threshold cleared.
+      model.save(sc, path)
+      val sameModel = LogisticRegressionModel.load(sc, path)
+      assert(model.weights == sameModel.weights)
+      assert(model.intercept == sameModel.intercept)
+      assert(sameModel.getThreshold.isEmpty)
+      Utils.deleteRecursively(tempDir)
+
+      // Round trip with a threshold set; comparing the Options fails cleanly if one is None.
+      model.setThreshold(0.7)
+      model.save(sc, path)
+      val sameModel2 = LogisticRegressionModel.load(sc, path)
+      assert(model.getThreshold == sameModel2.getThreshold)
+    } finally {
+      Utils.deleteRecursively(tempDir) // also runs when an assertion above fails
+    }
   }
 
 }