Skip to content

Commit c273771

Browse files
author
Li Pu
committed
automatically determine SVD compute mode and parameters
1 parent 7148426 commit c273771

File tree

2 files changed

+104
-97
lines changed

2 files changed

+104
-97
lines changed

mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala

Lines changed: 91 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -250,70 +250,29 @@ class RowMatrix(
250250
}
251251

252252
/**
253-
* Computes singular value decomposition of this matrix using dense implementation.
254-
* Denote this matrix by A (m x n), this will compute matrices U, S, V such that A ~= U * S * V',
255-
* where S contains the leading singular values, U and V contain the corresponding singular
256-
* vectors.
253+
* Computes singular value decomposition of this matrix. Denote this matrix by A (m x n), this
254+
* will compute matrices U, S, V such that A ~= U * S * V', where S contains the leading k
255+
* singular values, U and V contain the corresponding singular vectors.
257256
*
258-
* This approach requires `n^2` doubles to fit in memory and `O(n^3)` time on the master node.
259-
* Further, n should be less than m. For problems with small n (e.g. n < 100 and n << m), the
260-
* dense implementation might be faster than the sparse implementation.
257+
* This approach assumes n is smaller than m, and invokes a dense matrix implementation when n is
258+
* small (n < 100) or the number of requested singular values is the same as n (k == n). For
259+
* problems with large n (n >= 100) and k < n, this approach invokes a sparse matrix
260+
* implementation that provides a function to ARPACK to multiply a vector with A'A. It iteratively
261+
* calls ARPACK-dsaupd on the master node, from which we recover S and V. Then we compute U via
262+
* easy matrix multiplication as U = A * (V * S^{-1}).
261263
*
262-
* At most k largest non-zero singular values and associated vectors are returned.
263-
* If there are k such values, then the dimensions of the return will be:
264+
* The dense implementation requires `n^2` doubles to fit in memory and `O(n^3)` time on the
265+
* master node.
264266
*
265-
* U is a RowMatrix of size m x k that satisfies U'U = eye(k),
266-
* s is a Vector of size k, holding the singular values in descending order,
267-
* and V is a Matrix of size n x k that satisfies V'V = eye(k).
267+
* The sparse implementation requires `n * (6 * k + 4)` doubles to fit in memory on the master
268+
* node and approximately `O(k * nnz(A))` time distributed on all worker nodes. There is no
269+
* restriction on m (number of rows).
268270
*
269-
* @param k number of leading singular values to keep (0 < k <= n). We might return less than
270-
* k if there are numerically zero singular values. See rCond.
271-
* @param computeU whether to compute U.
272-
* @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0)
273-
* are treated as zero, where sigma(0) is the largest singular value.
274-
* @return SingularValueDecomposition(U, s, V)
275-
*/
276-
def computeSVD(
277-
k: Int,
278-
computeU: Boolean,
279-
rCond: Double): SingularValueDecomposition[RowMatrix, Matrix] = {
280-
val n = numCols().toInt
281-
require(k > 0 && k <= n, s"Request up to n singular values k=$k n=$n.")
282-
val G = computeGramianMatrix()
283-
val (uFull: BDM[Double], sigmaSquaresFull: BDV[Double], vFull: BDM[Double]) =
284-
brzSvd(G.toBreeze.asInstanceOf[BDM[Double]])
285-
computeSVDEffectiveRank(k, n, computeU, rCond, sigmaSquaresFull, uFull)
286-
}
287-
288-
/**
289-
* Computes singular value decomposition of this matrix using dense implementation with default
290-
* reciprocal condition number (1e-9). See computeSVD for more details.
291-
*
292-
* @param k number of leading singular values to keep (0 < k <= n). We might return less than
293-
* k if there are numerically zero singular values.
294-
* @param computeU whether to compute U.
295-
* @return SingularValueDecomposition(U, s, V)
296-
*/
297-
def computeSVD(
298-
k: Int,
299-
computeU: Boolean = false): SingularValueDecomposition[RowMatrix, Matrix] = {
300-
computeSVD(k, computeU, 1e-9)
301-
}
302-
303-
/**
304-
* Computes singular value decomposition of this matrix using sparse implementation.
305-
* Denote this matrix by A (m x n), this will compute matrices U, S, V such that A ~= U * S * V',
306-
* where S contains the leading singular values, U and V contain the corresponding singular
307-
* vectors.
308-
*
309-
* The decomposition is computed by providing a function that multiples a vector with A'A to
310-
* ARPACK, and iteratively invoking ARPACK-dsaupd on master node, from which we recover S and V.
311-
* Then we compute U via easy matrix multiplication as U = A * (V * S^{-1}).
312-
* Note that this approach requires approximately `O(k * nnz(A))` time.
313-
*
314-
* There is no restriction on m, but we require `n*(6*k+4)` doubles to fit in memory on the master
315-
* node. Further, n should be less than m, and ARPACK requires k to be strictly less than n. If
316-
* the requested k = n, please use the dense implementation computeSVD.
271+
* Several internal parameters are set to default values. The reciprocal condition number rCond
272+
* is set to 1e-9. All singular values smaller than rCond * sigma(0) are treated as zeros, where
273+
* sigma(0) is the largest singular value. The maximum number of Arnoldi update iterations for
274+
* ARPACK is set to 300 or k * 3, whichever is larger. The numerical tolerance for ARPACK
275+
* eigen-decomposition is set to 1e-10.
317276
*
318277
* At most k largest non-zero singular values and associated vectors are returned.
319278
* If there are k such values, then the dimensions of the return will be:
@@ -322,44 +281,86 @@ class RowMatrix(
322281
* s is a Vector of size k, holding the singular values in descending order,
323282
* and V is a Matrix of size n x k that satisfies V'V = eye(k).
324283
*
325-
* @param k number of leading singular values to keep (0 < k < n). We might return less than
326-
* k if there are numerically zero singular values. See rCond.
284+
* @param k number of leading singular values to keep (0 < k <= n). It might return less than
285+
* k if there are numerically zero singular values or there are not enough Ritz values
286+
* converged before the maximum number of Arnoldi update iterations is reached (in case
287+
* that matrix A is ill-conditioned).
327288
* @param computeU whether to compute U.
328289
* @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0)
329290
* are treated as zero, where sigma(0) is the largest singular value.
330-
* @param tol the numerical tolerance of SVD computation. Larger tolerance means fewer iterations,
331-
* but less accurate result.
332-
* @param maxIterations the maximum number of Arnoldi update iterations.
333-
* @return SingularValueDecomposition(U, s, V)
291+
* @return SingularValueDecomposition(U, s, V), U = null if computeU = false
334292
*/
335-
def computeSparseSVD(
336-
k: Int,
337-
computeU: Boolean,
338-
rCond: Double,
339-
tol: Double,
340-
maxIterations: Int): SingularValueDecomposition[RowMatrix, Matrix] = {
341-
val n = numCols().toInt
342-
require(k > 0 && k < n, s"Request up to n - 1 singular values k=$k n=$n. " +
343-
s"For full SVD (i.e. k = n), please use dense implementation computeSVD.")
344-
val (sigmaSquares: BDV[Double], u: BDM[Double]) =
345-
EigenValueDecomposition.symmetricEigs(multiplyGramianMatrixBy, n, k, tol, maxIterations)
346-
computeSVDEffectiveRank(k, n, computeU, rCond, sigmaSquares, u)
293+
def computeSVD(k: Int,
294+
computeU: Boolean = false,
295+
rCond: Double = 1e-9): SingularValueDecomposition[RowMatrix, Matrix] = {
296+
// maximum number of Arnoldi update iterations for invoking ARPACK
297+
val maxIter = math.max(300, k * 3)
298+
// numerical tolerance for invoking ARPACK
299+
val tol = 1e-10
300+
computeSVD(k, computeU, rCond, maxIter, tol, "auto")
347301
}
348302

349303
/**
350-
* Computes singular value decomposition of this matrix using sparse implementation with default
351-
* reciprocal condition number (1e-9), tolerance (1e-10), and maximum number of Arnoldi iterations
352-
* (300). See computeSparseSVD for more details.
353-
*
354-
* @param k number of leading singular values to keep (0 < k < n). We might return less than
355-
* k if there are numerically zero singular values.
356-
* @param computeU whether to compute U.
357-
* @return SingularValueDecomposition(U, s, V)
304+
* Actual SVD computation, visible for testing.
358305
*/
359-
def computeSparseSVD(
360-
k: Int,
361-
computeU: Boolean = false): SingularValueDecomposition[RowMatrix, Matrix] = {
362-
computeSparseSVD(k, computeU, 1e-9, 1e-10, 300)
306+
private[mllib] def computeSVD(k: Int,
307+
computeU: Boolean,
308+
rCond: Double,
309+
maxIter: Int,
310+
tol: Double,
311+
mode: String): SingularValueDecomposition[RowMatrix, Matrix] = {
312+
val n = numCols().toInt
313+
314+
object SVDMode extends Enumeration {
315+
val DenseARPACK, DenseLAPACK, SparseARPACK = Value
316+
}
317+
318+
val derivedMode = mode match {
319+
case "auto" => if (n < 100 || k == n) {
320+
// invoke dense implementation when n is small or k == n (since ARPACK requires k < n)
321+
require(k > 0 && k <= n, s"Request up to n singular values k=$k n=$n.")
322+
"dense"
323+
} else {
324+
// invoke sparse implementation with ARPACK when n is large
325+
require(k > 0 && k < n, s"Request up to n - 1 singular values for ARPACK k=$k n=$n.")
326+
"sparse"
327+
}
328+
case "dense" => "dense"
329+
case "sparse" => "sparse"
330+
case _ => throw new IllegalArgumentException(s"Do not support mode $mode.")
331+
}
332+
333+
val computeMode = derivedMode match {
334+
case "dense" => if (k < n / 2) {
335+
// when k is small, call ARPACK
336+
require(k > 0 && k < n, s"Request up to n - 1 singular values for ARPACK k=$k n=$n.")
337+
SVDMode.DenseARPACK
338+
} else {
339+
// when k is large, call LAPACK
340+
SVDMode.DenseLAPACK
341+
}
342+
case "sparse" => SVDMode.SparseARPACK
343+
}
344+
345+
val (sigmaSquares: BDV[Double], u: BDM[Double]) = computeMode match {
346+
case SVDMode.DenseARPACK => {
347+
val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]]
348+
def multiplyDenseGramianMatrixBy(v: DenseVector): DenseVector = {
349+
Vectors.fromBreeze(G * v.toBreeze).asInstanceOf[DenseVector]
350+
}
351+
EigenValueDecomposition.symmetricEigs(multiplyDenseGramianMatrixBy, n, k, tol, maxIter)
352+
}
353+
case SVDMode.DenseLAPACK => {
354+
val G = computeGramianMatrix().toBreeze.asInstanceOf[BDM[Double]]
355+
val (uFull: BDM[Double], sigmaSquaresFull: BDV[Double], vFull: BDM[Double]) = brzSvd(G)
356+
(sigmaSquaresFull, uFull)
357+
}
358+
case SVDMode.SparseARPACK => {
359+
EigenValueDecomposition.symmetricEigs(multiplyGramianMatrixBy, n, k, tol, maxIter)
360+
}
361+
}
362+
363+
computeSVDEffectiveRank(k, n, computeU, rCond, sigmaSquares, u)
363364
}
364365

365366
/**

mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,14 @@ class RowMatrixSuite extends FunSuite with LocalSparkContext {
102102
val localV: BDM[Double] = localVt.t.toDenseMatrix
103103
for (k <- 1 to n) {
104104
val svd = if (k < n) {
105-
if (denseSVD) mat.computeSVD(k, computeU = true)
106-
else mat.computeSparseSVD(k, computeU = true)
105+
if (denseSVD) {
106+
mat.computeSVD(k, computeU = true, 1e-9, 300, 1e-10, mode = "dense")
107+
} else {
108+
mat.computeSVD(k, computeU = true, 1e-9, 300, 1e-10, mode = "sparse")
109+
}
107110
} else {
108111
// when k = n, always use dense SVD
109-
mat.computeSVD(k, computeU = true)
112+
mat.computeSVD(k, computeU = true, 1e-9, 300, 1e-10, mode = "dense")
110113
}
111114
val U = svd.U
112115
val s = svd.s
@@ -120,8 +123,11 @@ class RowMatrixSuite extends FunSuite with LocalSparkContext {
120123
assertColumnEqualUpToSign(V.toBreeze.asInstanceOf[BDM[Double]], localV, k)
121124
assert(closeToZero(s.toBreeze.asInstanceOf[BDV[Double]] - localSigma(0 until k)))
122125
}
123-
val svdWithoutU = if (denseSVD) mat.computeSVD(n - 1, computeU = false)
124-
else mat.computeSparseSVD(n - 1, computeU = false)
126+
val svdWithoutU = if (denseSVD) {
127+
mat.computeSVD(n - 1, computeU = false, 1e-9, 300, 1e-10, mode = "dense")
128+
} else {
129+
mat.computeSVD(n - 1, computeU = false, 1e-9, 300, 1e-10, mode = "sparse")
130+
}
125131
assert(svdWithoutU.U === null)
126132
}
127133
}
@@ -131,8 +137,8 @@ class RowMatrixSuite extends FunSuite with LocalSparkContext {
131137
for (denseSVD <- Seq(true, false)) {
132138
val rows = sc.parallelize(Array.fill(4)(Vectors.dense(1.0, 1.0, 1.0)), 2)
133139
val mat = new RowMatrix(rows, 4, 3)
134-
val svd = if (denseSVD) mat.computeSVD(2, computeU = true)
135-
else mat.computeSparseSVD(2, computeU = true)
140+
val svd = if (denseSVD) mat.computeSVD(2, computeU = true, 1e-9, 300, 1e-10, mode = "dense")
141+
else mat.computeSVD(2, computeU = true, 1e-9, 300, 1e-10, mode = "sparse")
136142
assert(svd.s.size === 1, "should not return zero singular values")
137143
assert(svd.U.numRows() === 4)
138144
assert(svd.U.numCols() === 1)

0 commit comments

Comments
 (0)