@@ -250,70 +250,29 @@ class RowMatrix(
250250 }
251251
252252 /**
253- * Computes singular value decomposition of this matrix using dense implementation.
254- * Denote this matrix by A (m x n), this will compute matrices U, S, V such that A ~= U * S * V',
255- * where S contains the leading singular values, U and V contain the corresponding singular
256- * vectors.
253+ * Computes singular value decomposition of this matrix. Denote this matrix by A (m x n), this
254+ * will compute matrices U, S, V such that A ~= U * S * V', where S contains the leading k
255+ * singular values, U and V contain the corresponding singular vectors.
257256 *
258- * This approach requires `n^2` doubles to fit in memory and `O(n^3)` time on the master node.
259- * Further, n should be less than m. For problems with small n (e.g. n < 100 and n << m), the
260- * dense implementation might be faster than the sparse implementation.
257+ * This approach assumes n is smaller than m, and invokes a dense matrix implementation when n is
258+ * small (n < 100) or the number of requested singular values is the same as n (k == n). For
259+ * problems with large n (n >= 100) and k < n, this approach invokes a sparse matrix
260+ * implementation that provides a function to ARPACK to multiply a vector with A'A. It iteratively
261+ * calls ARPACK-dsaupd on the master node, from which we recover S and V. Then we compute U via
262+ * easy matrix multiplication as U = A * (V * S^{-1}).
261263 *
262- * At most k largest non-zero singular values and associated vectors are returned.
263- * If there are k such values, then the dimensions of the return will be:
264+ * The dense implementation requires `n^2` doubles to fit in memory and `O(n^3)` time on the
265+ * master node.
264266 *
265- * U is a RowMatrix of size m x k that satisfies U'U = eye(k),
266- * s is a Vector of size k, holding the singular values in descending order,
267- * and V is a Matrix of size n x k that satisfies V'V = eye(k ).
267+ * The sparse implementation requires `n * (6 * k + 4)` doubles to fit in memory on the master
268+ * node and approximately `O(k * nnz(A))` time distributed on all worker nodes. There is no
269+ * restriction on m (number of rows ).
268270 *
269- * @param k number of leading singular values to keep (0 < k <= n). We might return less than
270- * k if there are numerically zero singular values. See rCond.
271- * @param computeU whether to compute U.
272- * @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0)
273- * are treated as zero, where sigma(0) is the largest singular value.
274- * @return SingularValueDecomposition(U, s, V)
275- */
276- def computeSVD (
277- k : Int ,
278- computeU : Boolean ,
279- rCond : Double ): SingularValueDecomposition [RowMatrix , Matrix ] = {
280- val n = numCols().toInt
281- require(k > 0 && k <= n, s " Request up to n singular values k= $k n= $n. " )
282- val G = computeGramianMatrix()
283- val (uFull : BDM [Double ], sigmaSquaresFull : BDV [Double ], vFull : BDM [Double ]) =
284- brzSvd(G .toBreeze.asInstanceOf [BDM [Double ]])
285- computeSVDEffectiveRank(k, n, computeU, rCond, sigmaSquaresFull, uFull)
286- }
287-
288- /**
289- * Computes singular value decomposition of this matrix using dense implementation with default
290- * reciprocal condition number (1e-9). See computeSVD for more details.
291- *
292- * @param k number of leading singular values to keep (0 < k <= n). We might return less than
293- * k if there are numerically zero singular values.
294- * @param computeU whether to compute U.
295- * @return SingularValueDecomposition(U, s, V)
296- */
297- def computeSVD (
298- k : Int ,
299- computeU : Boolean = false ): SingularValueDecomposition [RowMatrix , Matrix ] = {
300- computeSVD(k, computeU, 1e-9 )
301- }
302-
303- /**
304- * Computes singular value decomposition of this matrix using sparse implementation.
305- * Denote this matrix by A (m x n), this will compute matrices U, S, V such that A ~= U * S * V',
306- * where S contains the leading singular values, U and V contain the corresponding singular
307- * vectors.
308- *
309- * The decomposition is computed by providing a function that multiples a vector with A'A to
310- * ARPACK, and iteratively invoking ARPACK-dsaupd on master node, from which we recover S and V.
311- * Then we compute U via easy matrix multiplication as U = A * (V * S^{-1}).
312- * Note that this approach requires approximately `O(k * nnz(A))` time.
313- *
314- * There is no restriction on m, but we require `n*(6*k+4)` doubles to fit in memory on the master
315- * node. Further, n should be less than m, and ARPACK requires k to be strictly less than n. If
316- * the requested k = n, please use the dense implementation computeSVD.
271+ * Several internal parameters are set to default values. The reciprocal condition number rCond
272+ * is set to 1e-9. All singular values smaller than rCond * sigma(0) are treated as zeros, where
273+ * sigma(0) is the largest singular value. The maximum number of Arnoldi update iterations for
274+ * ARPACK is set to 300 or k * 3, whichever is larger. The numerical tolerance for ARPACK
275+ * eigen-decomposition is set to 1e-10.
317276 *
318277 * At most k largest non-zero singular values and associated vectors are returned.
319278 * If there are k such values, then the dimensions of the return will be:
@@ -322,44 +281,86 @@ class RowMatrix(
322281 * s is a Vector of size k, holding the singular values in descending order,
323282 * and V is a Matrix of size n x k that satisfies V'V = eye(k).
324283 *
325- * @param k number of leading singular values to keep (0 < k < n). We might return less than
326- * k if there are numerically zero singular values. See rCond.
284+ * @param k number of leading singular values to keep (0 < k <= n). It might return less than
285+ * k if there are numerically zero singular values or there are not enough Ritz values
286+ * converged before the maximum number of Arnoldi update iterations is reached (in case
287+ * that matrix A is ill-conditioned).
327288 * @param computeU whether to compute U.
328289 * @param rCond the reciprocal condition number. All singular values smaller than rCond * sigma(0)
329290 * are treated as zero, where sigma(0) is the largest singular value.
330- * @param tol the numerical tolerance of SVD computation. Larger tolerance means fewer iterations,
331- * but less accurate result.
332- * @param maxIterations the maximum number of Arnoldi update iterations.
333- * @return SingularValueDecomposition(U, s, V)
291+ * @return SingularValueDecomposition(U, s, V), U = null if computeU = false
334292 */
335- def computeSparseSVD (
336- k : Int ,
337- computeU : Boolean ,
338- rCond : Double ,
339- tol : Double ,
340- maxIterations : Int ): SingularValueDecomposition [RowMatrix , Matrix ] = {
341- val n = numCols().toInt
342- require(k > 0 && k < n, s " Request up to n - 1 singular values k= $k n= $n. " +
343- s " For full SVD (i.e. k = n), please use dense implementation computeSVD. " )
344- val (sigmaSquares : BDV [Double ], u : BDM [Double ]) =
345- EigenValueDecomposition .symmetricEigs(multiplyGramianMatrixBy, n, k, tol, maxIterations)
346- computeSVDEffectiveRank(k, n, computeU, rCond, sigmaSquares, u)
293+ def computeSVD (k : Int ,
294+ computeU : Boolean = false ,
295+ rCond : Double = 1e-9 ): SingularValueDecomposition [RowMatrix , Matrix ] = {
296+ // maximum number of Arnoldi update iterations for invoking ARPACK
297+ val maxIter = math.max(300 , k * 3 )
298+ // numerical tolerance for invoking ARPACK
299+ val tol = 1e-10
300+ computeSVD(k, computeU, rCond, maxIter, tol, " auto" )
347301 }
348302
349303 /**
350- * Computes singular value decomposition of this matrix using sparse implementation with default
351- * reciprocal condition number (1e-9), tolerance (1e-10), and maximum number of Arnoldi iterations
352- * (300). See computeSparseSVD for more details.
353- *
354- * @param k number of leading singular values to keep (0 < k < n). We might return less than
355- * k if there are numerically zero singular values.
356- * @param computeU whether to compute U.
357- * @return SingularValueDecomposition(U, s, V)
304+ * Actual SVD computation, visible for testing.
358305 */
359- def computeSparseSVD (
360- k : Int ,
361- computeU : Boolean = false ): SingularValueDecomposition [RowMatrix , Matrix ] = {
362- computeSparseSVD(k, computeU, 1e-9 , 1e-10 , 300 )
306+ private [mllib] def computeSVD (k : Int ,
307+ computeU : Boolean ,
308+ rCond : Double ,
309+ maxIter : Int ,
310+ tol : Double ,
311+ mode : String ): SingularValueDecomposition [RowMatrix , Matrix ] = {
312+ val n = numCols().toInt
313+
314+ object SVDMode extends Enumeration {
315+ val DenseARPACK, DenseLAPACK, SparseARPACK = Value
316+ }
317+
318+ val derivedMode = mode match {
319+ case " auto" => if (n < 100 || k == n) {
320+ // invoke dense implementation when n is small or k == n (since ARPACK requires k < n)
321+ require(k > 0 && k <= n, s " Request up to n singular values k= $k n= $n. " )
322+ " dense"
323+ } else {
324+ // invoke sparse implementation with ARPACK when n is large
325+ require(k > 0 && k < n, s " Request up to n - 1 singular values for ARPACK k= $k n= $n. " )
326+ " sparse"
327+ }
328+ case " dense" => " dense"
329+ case " sparse" => " sparse"
330+ case _ => throw new IllegalArgumentException (s " Do not support mode $mode. " )
331+ }
332+
333+ val computeMode = derivedMode match {
334+ case " dense" => if (k < n / 2 ) {
335+ // when k is small, call ARPACK
336+ require(k > 0 && k < n, s " Request up to n - 1 singular values for ARPACK k= $k n= $n. " )
337+ SVDMode .DenseARPACK
338+ } else {
339+ // when k is large, call LAPACK
340+ SVDMode .DenseLAPACK
341+ }
342+ case " sparse" => SVDMode .SparseARPACK
343+ }
344+
345+ val (sigmaSquares : BDV [Double ], u : BDM [Double ]) = computeMode match {
346+ case SVDMode .DenseARPACK => {
347+ val G = computeGramianMatrix().toBreeze.asInstanceOf [BDM [Double ]]
348+ def multiplyDenseGramianMatrixBy (v : DenseVector ): DenseVector = {
349+ Vectors .fromBreeze(G * v.toBreeze).asInstanceOf [DenseVector ]
350+ }
351+ EigenValueDecomposition .symmetricEigs(multiplyDenseGramianMatrixBy, n, k, tol, maxIter)
352+ }
353+ case SVDMode .DenseLAPACK => {
354+ val G = computeGramianMatrix().toBreeze.asInstanceOf [BDM [Double ]]
355+ val (uFull : BDM [Double ], sigmaSquaresFull : BDV [Double ], vFull : BDM [Double ]) = brzSvd(G )
356+ (sigmaSquaresFull, uFull)
357+ }
358+ case SVDMode .SparseARPACK => {
359+ EigenValueDecomposition .symmetricEigs(multiplyGramianMatrixBy, n, k, tol, maxIter)
360+ }
361+ }
362+
363+ computeSVDEffectiveRank(k, n, computeU, rCond, sigmaSquares, u)
363364 }
364365
365366 /**
0 commit comments