Skip to content
61 changes: 59 additions & 2 deletions R/pkg/R/mllib_classification.R
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,20 @@ function(object, path, overwrite = FALSE) {
#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
#' or the number of partitions are large, this param could be adjusted to a larger size.
#' This is an expert parameter. Default value should be good for most cases.
#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization.
#' The bound matrix must be compatible with the shape (1, number of features) for binomial
#' regression, or (number of classes, number of features) for multinomial regression.
#' It is an R matrix.
#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization.
#' The bound matrix must be compatible with the shape (1, number of features) for binomial
#' regression, or (number of classes, number of features) for multinomial regression.
#' It is an R matrix.
#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization.
#' The bounds vector size must be equal to 1 for binomial regression, or the number
#' of classes for multinomial regression.
#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization.
#' The bound vector size must be equal to 1 for binomial regression, or the number
#' of classes for multinomial regression.
#' @param ... additional arguments passed to the method.
#' @return \code{spark.logit} returns a fitted logistic regression model.
#' @rdname spark.logit
Expand Down Expand Up @@ -239,21 +253,64 @@ function(object, path, overwrite = FALSE) {
setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
                   tol = 1E-6, family = "auto", standardization = TRUE,
                   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
                   lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) {
            formula <- paste(deparse(formula), collapse = "")
            # Dimensions of the coefficient-bound matrices. Zero means "no
            # coefficient bounds set" on the JVM side.
            row <- 0
            col <- 0

            # An empty-string weight column is treated the same as no weight
            # column at all.
            if (!is.null(weightCol) && weightCol == "") {
              weightCol <- NULL
            } else if (!is.null(weightCol)) {
              weightCol <- as.character(weightCol)
            }

            # Intercept bounds are serialized to the JVM as arrays.
            if (!is.null(lowerBoundsOnIntercepts)) {
              lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
            }

            if (!is.null(upperBoundsOnIntercepts)) {
              upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts)
            }

            if (!is.null(lowerBoundsOnCoefficients)) {
              # Use is.matrix() rather than comparing class(): since R 4.0,
              # class(matrix(...)) returns c("matrix", "array"), so an equality
              # test against "matrix" yields a length-2 logical and misbehaves
              # inside if().
              if (!is.matrix(lowerBoundsOnCoefficients)) {
                stop("lowerBoundsOnCoefficients must be a matrix.")
              }
              row <- nrow(lowerBoundsOnCoefficients)
              col <- ncol(lowerBoundsOnCoefficients)
              # Flatten column-major; the wrapper rebuilds the matrix from
              # (row, col) and this array.
              lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients))
            }

            if (!is.null(upperBoundsOnCoefficients)) {
              if (!is.matrix(upperBoundsOnCoefficients)) {
                stop("upperBoundsOnCoefficients must be a matrix.")
              }

              # When both bounds are supplied they must have identical shape,
              # since a single (row, col) pair describes both matrices.
              if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients)
                || col != ncol(upperBoundsOnCoefficients))) {
                stop("dimension of upperBoundsOnCoefficients ",
                     "is not the same as lowerBoundsOnCoefficients")
              }

              # Only the upper bound was given; take dimensions from it.
              if (is.null(lowerBoundsOnCoefficients)) {
                row <- nrow(upperBoundsOnCoefficients)
                col <- ncol(upperBoundsOnCoefficients)
              }

              upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
            }

            jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                data@sdf, formula, as.numeric(regParam),
                                as.numeric(elasticNetParam), as.integer(maxIter),
                                as.numeric(tol), as.character(family),
                                as.logical(standardization), as.array(thresholds),
                                weightCol, as.integer(aggregationDepth),
                                # row/col default to double 0, so the explicit
                                # integer coercion is required for the JVM call.
                                as.integer(row), as.integer(col),
                                lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts)
            new("LogisticRegressionModel", jobj = jobj)
          })

Expand Down
40 changes: 40 additions & 0 deletions R/pkg/tests/fulltests/test_mllib_classification.R
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,46 @@ test_that("spark.logit", {
model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
prediction2 <- collect(select(predict(model2, df2), "prediction"))
expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))

# Test binomial logistic regression against two classes with upperBoundsOnCoefficients
# and upperBoundsOnIntercepts
u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4)
model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u,
upperBoundsOnIntercepts = 1.0)
summary <- summary(model)
coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000)
coefs <- summary$coefficients[, "Estimate"]
expect_true(all(abs(coefsR - coefs) < 0.1))
# Test upperBoundsOnCoefficients should be matrix
expect_error(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = as.array(c(1, 2)),
upperBoundsOnIntercepts = 1.0))

# Test binomial logistic regression against two classes with lowerBoundsOnCoefficients
# and lowerBoundsOnIntercepts
l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4)
model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l,
lowerBoundsOnIntercepts = 0.0)
summary <- summary(model)
coefsR <- c(0, 0, -1, 0, 1.902192)
coefs <- summary$coefficients[, "Estimate"]
expect_true(all(abs(coefsR - coefs) < 0.1))
# Test lowerBoundsOnCoefficients should be matrix
expect_error(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = as.array(c(1, 2)),
lowerBoundsOnIntercepts = 0.0))

# Test multinomial logistic regression with lowerBoundsOnCoefficients
# and lowerBoundsOnIntercepts
l <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4)
model <- spark.logit(training, Species ~ ., family = "multinomial",
lowerBoundsOnCoefficients = l,
lowerBoundsOnIntercepts = as.array(c(0.0, 0.0)))
summary <- summary(model)
versicolorCoefsR <- c(42.639465, 7.258104, 14.330814, 16.298243, 11.716429)
virginicaCoefsR <- c(0.0002970796, 4.79274, 7.65047, 25.72793, 30.0021)
versicolorCoefs <- summary$coefficients[, "versicolor"]
virginicaCoefs <- summary$coefficients[, "virginica"]
expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
})

test_that("spark.mlp", {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas

/**
* The lower bounds on intercepts if fitting under bound constrained optimization.
* The bounds vector size must be equal with 1 for binomial regression, or the number
* The bounds vector size must be equal to 1 for binomial regression, or the number
* of classes for multinomial regression. Otherwise, it throws exception.
* Default is none.
*
Expand All @@ -230,7 +230,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas

/**
* The upper bounds on intercepts if fitting under bound constrained optimization.
* The bound vector size must be equal with 1 for binomial regression, or the number
* The bound vector size must be equal to 1 for binomial regression, or the number
* of classes for multinomial regression. Otherwise, it throws exception.
* Default is none.
*
Expand Down Expand Up @@ -451,12 +451,12 @@ class LogisticRegression @Since("1.2.0") (
}
if (isSet(lowerBoundsOnIntercepts)) {
require($(lowerBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
"lowerBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
"lowerBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
s"classes for multinomial regression, but found: ${getLowerBoundsOnIntercepts.size}.")
}
if (isSet(upperBoundsOnIntercepts)) {
require($(upperBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
"upperBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
"upperBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
s"classes for multinomial regression, but found: ${getUpperBoundsOnIntercepts.size}.")
}
if (isSet(lowerBoundsOnCoefficients) && isSet(upperBoundsOnCoefficients)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import org.json4s.jackson.JsonMethods._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.feature.{IndexToString, RFormula}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors}
import org.apache.spark.ml.r.RWrapperUtils._
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset}
Expand Down Expand Up @@ -97,7 +97,13 @@ private[r] object LogisticRegressionWrapper
standardization: Boolean,
thresholds: Array[Double],
weightCol: String,
aggregationDepth: Int
aggregationDepth: Int,
numRowsOfBoundsOnCoefficients: Int,
numColsOfBoundsOnCoefficients: Int,
lowerBoundsOnCoefficients: Array[Double],
upperBoundsOnCoefficients: Array[Double],
lowerBoundsOnIntercepts: Array[Double],
upperBoundsOnIntercepts: Array[Double]
): LogisticRegressionWrapper = {

val rFormula = new RFormula()
Expand Down Expand Up @@ -133,6 +139,30 @@ private[r] object LogisticRegressionWrapper

if (weightCol != null) lr.setWeightCol(weightCol)

if (numRowsOfBoundsOnCoefficients != 0 &&
numColsOfBoundsOnCoefficients != 0 && lowerBoundsOnCoefficients != null) {
val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
numColsOfBoundsOnCoefficients, lowerBoundsOnCoefficients)
lr.setLowerBoundsOnCoefficients(coef)
}

if (numRowsOfBoundsOnCoefficients != 0 &&
numColsOfBoundsOnCoefficients != 0 && upperBoundsOnCoefficients != null) {
val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
numColsOfBoundsOnCoefficients, upperBoundsOnCoefficients)
lr.setUpperBoundsOnCoefficients(coef)
}

if (lowerBoundsOnIntercepts != null) {
val intercept = Vectors.dense(lowerBoundsOnIntercepts)
lr.setLowerBoundsOnIntercepts(intercept)
}

if (upperBoundsOnIntercepts != null) {
val intercept = Vectors.dense(upperBoundsOnIntercepts)
lr.setUpperBoundsOnIntercepts(intercept)
}

val idxToStr = new IndexToString()
.setInputCol(PREDICTED_LABEL_INDEX_COL)
.setOutputCol(PREDICTED_LABEL_COL)
Expand Down