2 changes: 1 addition & 1 deletion mllib/src/main/scala/org/apache/spark/ml/Estimator.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.DataFrame
* Abstract class for estimators that fit models to data.
*/
@AlphaComponent
abstract class Estimator[M <: Model[M]] extends PipelineStage with Params {
abstract class Estimator[M <: Model[M]] extends PipelineStage {

/**
* Fits a single model to the input data with optional parameters.
2 changes: 1 addition & 1 deletion mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala
@@ -170,7 +170,7 @@ class Pipeline(override val uid: String) extends Estimator[PipelineModel] {

/**
* :: AlphaComponent ::
* Represents a compiled pipeline.
* Represents a fitted pipeline.
*/
@AlphaComponent
class PipelineModel private[ml] (
2 changes: 1 addition & 1 deletion mllib/src/main/scala/org/apache/spark/ml/Transformer.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.types._
* Abstract class for transformers that transform one dataset into another.
*/
@AlphaComponent
abstract class Transformer extends PipelineStage with Params {
abstract class Transformer extends PipelineStage {

/**
* Transforms the dataset with optional parameters
@@ -18,7 +18,7 @@
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.Evaluator
import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
@@ -15,7 +15,7 @@
* limitations under the License.
*/

package org.apache.spark.ml
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml.param.{ParamMap, Params}
@@ -29,7 +29,7 @@ import org.apache.spark.sql.DataFrame
abstract class Evaluator extends Params {

/**
* Evaluates the output.
* Evaluates model output and returns a scalar metric (larger is better).
*
* @param dataset a dataset that contains labels/observations and predictions.
* @param paramMap parameter map that specifies the input columns and output metrics
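For context, a hedged usage sketch of this contract from the Python side of the same change (the pyspark BinaryClassificationEvaluator and its metricName param are assumed here, since only the base class appears in full in this diff): passing a param map to evaluate overrides the embedded params for that call only, and the returned metric follows the larger-is-better convention.

# Assumed: `scored` is a DataFrame with "label" and "rawPrediction" columns.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
auc_roc = evaluator.evaluate(scored)                                        # embedded params
auc_pr = evaluator.evaluate(scored, {evaluator.metricName: "areaUnderPR"})  # per-call override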
@@ -75,7 +75,7 @@ class PolynomialExpansion(override val uid: String)
* To handle sparsity, if c is zero, we can skip all monomials that contain it. We remember the
* current index and increment it properly for sparse input.
*/
object PolynomialExpansion {
private[feature] object PolynomialExpansion {

private def choose(n: Int, k: Int): Int = {
Range(n, n - k, -1).product / Range(k, 1, -1).product
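As a quick illustration of the choose helper and the term counts involved (a plain Python sketch, not Spark code): if a coordinate c is zero, every monomial containing c is zero, so the expansion can skip all of them for sparse input. The assumption that the constant term is dropped matches the usual formulation but is not shown in this hunk.

def choose(n, k):
    # n choose k via the same falling-factorial trick as the Scala helper above
    num, den = 1, 1
    for i in range(k):
        num *= n - i
        den *= k - i
    return num // den

def expansion_size(n_features, degree):
    # monomials of degree <= `degree` over `n_features` variables,
    # excluding the constant term (assumption)
    return choose(n_features + degree, degree) - 1

print(expansion_size(3, 2))  # 9: x, y, z, x^2, xy, y^2, xz, yz, z^2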
@@ -78,8 +78,7 @@ class VectorAssembler(override val uid: String)
}
}

@AlphaComponent
object VectorAssembler {
private object VectorAssembler {

private[feature] def assemble(vv: Any*): Vector = {
val indices = ArrayBuilder.make[Int]
@@ -37,6 +37,7 @@ private[feature] trait Word2VecBase extends Params

/**
* The dimension of the code that you want to transform from words.
* @group param
*/
final val vectorSize = new IntParam(
this, "vectorSize", "the dimension of codes after transforming from words")
@@ -47,6 +48,7 @@ private[feature] trait Word2VecBase extends Params

/**
* Number of partitions for sentences of words.
* @group param
*/
final val numPartitions = new IntParam(
this, "numPartitions", "number of partitions for sentences of words")
@@ -58,6 +60,7 @@ private[feature] trait Word2VecBase extends Params
/**
* The minimum number of times a token must appear to be included in the word2vec model's
* vocabulary.
* @group param
*/
final val minCount = new IntParam(this, "minCount", "the minimum number of times a token must " +
"appear to be included in the word2vec model's vocabulary")
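The three params documented above also surface in the Python API; a hedged sketch assuming the pyspark Word2Vec transformer with the same param names (sqlContext and the toy corpus are assumptions, not part of this diff):

from pyspark.ml.feature import Word2Vec

doc = sqlContext.createDataFrame(
    [("a b c b a".split(" "),), ("a c d".split(" "),)], ["text"])
word2vec = Word2Vec(vectorSize=5,     # dimension of the learned code vectors
                    minCount=1,       # keep tokens that appear at least once
                    numPartitions=1,  # partitions for sentences of words
                    inputCol="text", outputCol="vectors")
model = word2vec.fit(doc)
model.transform(doc).show()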
6 changes: 5 additions & 1 deletion mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -24,7 +24,7 @@ import scala.annotation.varargs
import scala.collection.mutable
import scala.collection.JavaConverters._

import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
import org.apache.spark.ml.util.Identifiable

/**
@@ -92,9 +92,11 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali
}

/**
* :: DeveloperApi ::
* Factory methods for common validation functions for [[Param.isValid]].
* The numerical methods only support Int, Long, Float, and Double.
*/
@DeveloperApi
object ParamValidators {

/** (private[param]) Default validation always return true */
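There is no pyspark counterpart to ParamValidators in this change; purely as a conceptual sketch of the factory-of-validators idea (plain Python, not Spark API), each factory returns a predicate suitable for an isValid-style check:

def gt_eq(lower_bound):
    # validator requiring value >= lower_bound (numeric params only)
    return lambda value: value >= lower_bound

def in_range(lower, upper):
    # validator requiring lower <= value <= upper
    return lambda value: lower <= value <= upper

is_valid_min_count = gt_eq(0)
assert is_valid_min_count(3) and not is_valid_min_count(-1)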
@@ -530,11 +532,13 @@ trait Params extends Identifiable with Serializable {
}

/**
* :: DeveloperApi ::
* Java-friendly wrapper for [[Params]].
* Java developers who need to extend [[Params]] should use this class instead.
* If you need to extend a abstract class which already extends [[Params]], then that abstract
* class should be Java-friendly as well.
*/
@DeveloperApi
abstract class JavaParams extends Params

/**
@@ -22,6 +22,7 @@ import com.github.fommil.netlib.F2jBLAS
import org.apache.spark.Logging
import org.apache.spark.annotation.AlphaComponent
import org.apache.spark.ml._
import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.util.MLUtils
@@ -19,18 +19,14 @@ package org.apache.spark.ml.util

import scala.collection.immutable.HashMap

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.attribute._
import org.apache.spark.sql.types.StructField


/**
* :: Experimental ::
*
* Helper utilities for tree-based algorithms
*/
@Experimental
object MetadataUtils {
private[spark] object MetadataUtils {

/**
* Examine a schema to identify the number of classes in a label column.
@@ -17,15 +17,13 @@

package org.apache.spark.ml.util

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.types.{DataType, StructField, StructType}


/**
* :: DeveloperApi ::
* Utils for handling schemas.
*/
@DeveloperApi
object SchemaUtils {
private[spark] object SchemaUtils {

// TODO: Move the utility methods to SQL.

4 changes: 2 additions & 2 deletions python/pyspark/ml/__init__.py
@@ -15,6 +15,6 @@
# limitations under the License.
#

from pyspark.ml.pipeline import Transformer, Estimator, Model, Pipeline, PipelineModel, Evaluator
from pyspark.ml.pipeline import Transformer, Estimator, Model, Pipeline, PipelineModel

__all__ = ["Transformer", "Estimator", "Model", "Pipeline", "PipelineModel", "Evaluator"]
__all__ = ["Transformer", "Estimator", "Model", "Pipeline", "PipelineModel"]
Member:
Why not include Evaluator in this list?

Contributor Author:
It is moved to ml.evaluation.

Member:
But why is this only importing stuff from pipeline.py? Isn't Evaluator just as important a concept as the other items here?

Contributor Author:
On the Scala side, we put Classifier under ml.classification and Regressor under ml.regression. So I moved Evaluator to ml.evaluation to match them. Evaluator is now under pyspark.ml.evaluation and hence it is not imported here. Under pyspark.ml, we have Transformer, Estimator, Model, Pipeline, and PipelineModel.

Member:
Ohh, I see. Thanks for clarifying.
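Concretely, after this change the public names split across the two modules as follows (only the classes listed in the __all__ declarations in this diff are assumed):

# Pipeline abstractions stay under pyspark.ml:
from pyspark.ml import Transformer, Estimator, Model, Pipeline, PipelineModel

# Evaluator moves alongside its concrete implementations:
from pyspark.ml.evaluation import Evaluator, BinaryClassificationEvaluator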

63 changes: 61 additions & 2 deletions python/pyspark/ml/evaluation.py
@@ -15,13 +15,72 @@
# limitations under the License.
#

from pyspark.ml.wrapper import JavaEvaluator
from abc import abstractmethod, ABCMeta

from pyspark.ml.wrapper import JavaWrapper
from pyspark.ml.param import Param, Params
from pyspark.ml.param.shared import HasLabelCol, HasRawPredictionCol
from pyspark.ml.util import keyword_only
from pyspark.mllib.common import inherit_doc

__all__ = ['BinaryClassificationEvaluator']
__all__ = ['Evaluator', 'BinaryClassificationEvaluator']


@inherit_doc
class Evaluator(Params):
Member:
Add @inherit_doc?

Contributor Author:
This is actually not needed because Evaluator doesn't override any methods from Params. Added @inherit_doc to be future-proof.

"""
Base class for evaluators that compute metrics from predictions.
"""

__metaclass__ = ABCMeta

@abstractmethod
def _evaluate(self, dataset):
"""
Evaluates the output.

:param dataset: a dataset that contains labels/observations and
predictions
:return: metric
"""
raise NotImplementedError()

def evaluate(self, dataset, params={}):
"""
Evaluates the output with optional parameters.

:param dataset: a dataset that contains labels/observations and
predictions
:param params: an optional param map that overrides embedded
params
:return: metric
"""
if isinstance(params, dict):
if params:
return self.copy(params)._evaluate(dataset)
else:
return self._evaluate(dataset)
else:
raise ValueError("Params must be a param map but got %s." % type(params))


@inherit_doc
class JavaEvaluator(Evaluator, JavaWrapper):
"""
Base class for :py:class:`Evaluator`s that wrap Java/Scala
implementations.
"""

__metaclass__ = ABCMeta

def _evaluate(self, dataset):
"""
Evaluates the output.
:param dataset: a dataset that contains labels/observations and predictions.
:return: evaluation metric
"""
self._transfer_params_to_java()
return self._java_obj.evaluate(dataset._jdf)


@inherit_doc
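A hedged sketch of what the new base class enables: a pure-Python evaluator only needs to implement _evaluate and inherits the evaluate(dataset, params) param-map handling shown above. The column names and the metric below are illustrative assumptions, not part of this PR.

from pyspark.ml.evaluation import Evaluator
from pyspark.sql import functions as F

class MeanAbsoluteErrorEvaluator(Evaluator):
    """Toy evaluator: mean absolute error between assumed 'label' and
    'prediction' columns (note: smaller is better for this metric)."""

    def _evaluate(self, dataset):
        return dataset.select(
            F.avg(F.abs(dataset["prediction"] - dataset["label"]))).head()[0]

# evaluator = MeanAbsoluteErrorEvaluator()
# mae = evaluator.evaluate(predictions)  # predictions has both columns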
37 changes: 0 additions & 37 deletions python/pyspark/ml/pipeline.py
@@ -219,40 +219,3 @@ def _transform(self, dataset):
def copy(self, extra={}):
stages = [stage.copy(extra) for stage in self.stages]
return PipelineModel(stages)


class Evaluator(Params):
"""
Base class for evaluators that compute metrics from predictions.
"""

__metaclass__ = ABCMeta

@abstractmethod
def _evaluate(self, dataset):
"""
Evaluates the output.

:param dataset: a dataset that contains labels/observations and
predictions
:return: metric
"""
raise NotImplementedError()

def evaluate(self, dataset, params={}):
"""
Evaluates the output with optional parameters.

:param dataset: a dataset that contains labels/observations and
predictions
:param params: an optional param map that overrides embedded
params
:return: metric
"""
if isinstance(params, dict):
if params:
return self.copy(params)._evaluate(dataset)
else:
return self._evaluate(dataset)
else:
raise ValueError("Params must be a param map but got %s." % type(params))
21 changes: 1 addition & 20 deletions python/pyspark/ml/wrapper.py
@@ -20,7 +20,7 @@
from pyspark import SparkContext
from pyspark.sql import DataFrame
from pyspark.ml.param import Params
from pyspark.ml.pipeline import Estimator, Transformer, Evaluator, Model
from pyspark.ml.pipeline import Estimator, Transformer, Model
from pyspark.mllib.common import inherit_doc, _java2py, _py2java


@@ -185,22 +185,3 @@ def _call_java(self, name, *args):
sc = SparkContext._active_spark_context
java_args = [_py2java(sc, arg) for arg in args]
return _java2py(sc, m(*java_args))


@inherit_doc
class JavaEvaluator(Evaluator, JavaWrapper):
"""
Base class for :py:class:`Evaluator`s that wrap Java/Scala
implementations.
"""

__metaclass__ = ABCMeta

def _evaluate(self, dataset):
"""
Evaluates the output.
:param dataset: a dataset that contains labels/observations and predictions.
:return: evaluation metric
"""
self._transfer_params_to_java()
return self._java_obj.evaluate(dataset._jdf)