From 09cb849a23df7977164d44cbcaa397e38debd901 Mon Sep 17 00:00:00 2001 From: Angelika Schwarz Date: Sun, 25 Aug 2024 13:25:02 +0200 Subject: [PATCH] Use GEMMTR for SY/HE linear updates --- SRC/clahef.f | 55 +++++++++------------------------------------- SRC/clahef_rk.f | 54 ++++++++------------------------------------- SRC/clasyf.f | 49 +++++++---------------------------------- SRC/clasyf_rk.f | 48 ++++++---------------------------------- SRC/clasyf_rook.f | 50 +++++++++--------------------------------- SRC/dlasyf.f | 50 +++++++----------------------------------- SRC/dlasyf_rk.f | 50 +++++++----------------------------------- SRC/dlasyf_rook.f | 50 +++++++----------------------------------- SRC/slasyf.f | 50 +++++++----------------------------------- SRC/slasyf_rk.f | 48 ++++++---------------------------------- SRC/slasyf_rook.f | 50 +++++++----------------------------------- SRC/zlahef.f | 55 +++++++++------------------------------------- SRC/zlahef_rk.f | 56 +++++++++-------------------------------------- SRC/zlasyf.f | 49 +++++++---------------------------------- SRC/zlasyf_rk.f | 48 ++++++---------------------------------- SRC/zlasyf_rook.f | 50 +++++++++--------------------------------- 16 files changed, 136 insertions(+), 676 deletions(-) diff --git a/SRC/clahef.f b/SRC/clahef.f index 1372673027..1697a1d430 100644 --- a/SRC/clahef.f +++ b/SRC/clahef.f @@ -200,7 +200,7 @@ SUBROUTINE CLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, PARAMETER ( EIGHT = 8.0E+0, SEVTEN = 17.0E+0 ) * .. * .. Local Scalars .. - INTEGER IMAX, J, JB, JJ, JMAX, JP, K, KK, KKW, KP, + INTEGER IMAX, J, JJ, JMAX, JP, K, KK, KKW, KP, $ KSTEP, KW REAL ABSAKK, ALPHA, COLMAX, R1, ROWMAX, T COMPLEX D11, D21, D22, Z @@ -211,7 +211,7 @@ SUBROUTINE CLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, EXTERNAL LSAME, ICAMAX * .. * .. External Subroutines .. - EXTERNAL CCOPY, CGEMM, CGEMV, CLACGV, CSSCAL, + EXTERNAL CCOPY, CGEMMTR, CGEMV, CLACGV, CSSCAL, $ CSWAP * .. * .. Intrinsic Functions .. @@ -552,28 +552,11 @@ SUBROUTINE CLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**H = A11 - U12*W**H * -* computing blocks of NB columns at a time (note that conjg(W) is -* actually stored) +* (note that conjg(W) is actually stored) * - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - A( JJ, JJ ) = REAL( A( JJ, JJ ) ) - CALL CGEMV( 'No transpose', JJ-J+1, N-K, -CONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE, - $ A( J, JJ ), 1 ) - A( JJ, JJ ) = REAL( A( JJ, JJ ) ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - CALL CGEMM( 'No transpose', 'Transpose', J-1, JB, N-K, - $ -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, - $ CONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL CGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ CONE, A( 1, 1 ), LDA ) * * Put U12 in standard form by partially undoing the interchanges * in of rows in columns k+1:n looping backwards from k+1 to n @@ -916,29 +899,11 @@ SUBROUTINE CLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**H = A22 - L21*W**H * -* computing blocks of NB columns at a time (note that conjg(W) is -* actually stored) -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - A( JJ, JJ ) = REAL( A( JJ, JJ ) ) - CALL CGEMV( 'No transpose', J+JB-JJ, K-1, -CONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE, - $ A( JJ, JJ ), 1 ) - A( JJ, JJ ) = REAL( A( JJ, JJ ) ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block +* (note that conjg(W) is actually stored) * - IF( J+JB.LE.N ) - $ CALL CGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), - $ LDW, CONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL CGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ CONE, A( K, K ), LDA ) * * Put L21 in standard form by partially undoing the interchanges * of rows in columns 1:k-1 looping backwards from k-1 to 1 diff --git a/SRC/clahef_rk.f b/SRC/clahef_rk.f index b97d68b14a..baba1016f2 100644 --- a/SRC/clahef_rk.f +++ b/SRC/clahef_rk.f @@ -286,7 +286,7 @@ SUBROUTINE CLAHEF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * .. * .. Local Scalars .. LOGICAL DONE - INTEGER IMAX, ITEMP, II, J, JB, JJ, JMAX, K, KK, KKW, + INTEGER IMAX, ITEMP, II, J, JMAX, K, KK, KKW, $ KP, KSTEP, KW, P REAL ABSAKK, ALPHA, COLMAX, STEMP, R1, ROWMAX, T, $ SFMIN @@ -755,29 +755,11 @@ SUBROUTINE CLAHEF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**H = A11 - U12*W**H * -* computing blocks of NB columns at a time (note that conjg(W) is -* actually stored) +* (note that conjg(W) is actually stored) * - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - A( JJ, JJ ) = REAL( A( JJ, JJ ) ) - CALL CGEMV( 'No transpose', JJ-J+1, N-K, -CONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE, - $ A( J, JJ ), 1 ) - A( JJ, JJ ) = REAL( A( JJ, JJ ) ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - IF( J.GE.2 ) - $ CALL CGEMM( 'No transpose', 'Transpose', J-1, JB, N-K, - $ -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, - $ CONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL CGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ CONE, A( 1, 1 ), LDA ) * * Set KB to the number of columns factorized * @@ -1203,29 +1185,11 @@ SUBROUTINE CLAHEF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**H = A22 - L21*W**H * -* computing blocks of NB columns at a time (note that conjg(W) is -* actually stored) -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - A( JJ, JJ ) = REAL( A( JJ, JJ ) ) - CALL CGEMV( 'No transpose', J+JB-JJ, K-1, -CONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE, - $ A( JJ, JJ ), 1 ) - A( JJ, JJ ) = REAL( A( JJ, JJ ) ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block +* (note that conjg(W) is actually stored) * - IF( J+JB.LE.N ) - $ CALL CGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), - $ LDW, CONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL CGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ CONE, A( K, K ), LDA ) * * Set KB to the number of columns factorized * diff --git a/SRC/clasyf.f b/SRC/clasyf.f index 4de35fa3a4..46056a16d0 100644 --- a/SRC/clasyf.f +++ b/SRC/clasyf.f @@ -200,7 +200,7 @@ SUBROUTINE CLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) * .. * .. Local Scalars .. - INTEGER IMAX, J, JB, JJ, JMAX, JP, K, KK, KKW, KP, + INTEGER IMAX, J, JJ, JMAX, JP, K, KK, KKW, KP, $ KSTEP, KW REAL ABSAKK, ALPHA, COLMAX, ROWMAX COMPLEX D11, D21, D22, R1, T, Z @@ -211,7 +211,7 @@ SUBROUTINE CLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, EXTERNAL LSAME, ICAMAX * .. * .. External Subroutines .. - EXTERNAL CCOPY, CGEMM, CGEMV, CSCAL, CSWAP + EXTERNAL CCOPY, CGEMMTR, CGEMV, CSCAL, CSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, MIN, REAL, SQRT @@ -482,25 +482,9 @@ SUBROUTINE CLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time -* - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL CGEMV( 'No transpose', JJ-J+1, N-K, -CONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - CALL CGEMM( 'No transpose', 'Transpose', J-1, JB, N-K, - $ -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, - $ CONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL CGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ CONE, A( 1, 1 ), LDA ) * * Put U12 in standard form by partially undoing the interchanges * in columns k+1:n looping backwards from k+1 to n @@ -778,26 +762,9 @@ SUBROUTINE CLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL CGEMV( 'No transpose', J+JB-JJ, K-1, -CONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block -* - IF( J+JB.LE.N ) - $ CALL CGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), - $ LDW, CONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL CGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ CONE, A( K, K ), LDA ) * * Put L21 in standard form by partially undoing the interchanges * of rows in columns 1:k-1 looping backwards from k-1 to 1 diff --git a/SRC/clasyf_rk.f b/SRC/clasyf_rk.f index 654d0f0cce..ac437205b4 100644 --- a/SRC/clasyf_rk.f +++ b/SRC/clasyf_rk.f @@ -298,7 +298,7 @@ SUBROUTINE CLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, EXTERNAL LSAME, ICAMAX, SLAMCH * .. * .. External Subroutines .. - EXTERNAL CCOPY, CGEMM, CGEMV, CSCAL, CSWAP + EXTERNAL CCOPY, CGEMMTR, CGEMV, CSCAL, CSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, MAX, MIN, REAL, SQRT @@ -627,26 +627,9 @@ SUBROUTINE CLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time -* - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL CGEMV( 'No transpose', JJ-J+1, N-K, -CONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - IF( J.GE.2 ) - $ CALL CGEMM( 'No transpose', 'Transpose', J-1, JB, - $ N-K, -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), - $ LDW, CONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL CGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ CONE, A( 1, 1 ), LDA ) * * Set KB to the number of columns factorized * @@ -945,26 +928,9 @@ SUBROUTINE CLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL CGEMV( 'No transpose', J+JB-JJ, K-1, -CONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block -* - IF( J+JB.LE.N ) - $ CALL CGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), - $ LDW, CONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL CGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ CONE, A( K, K ), LDA ) * * Set KB to the number of columns factorized * diff --git a/SRC/clasyf_rook.f b/SRC/clasyf_rook.f index 3b76c09e15..e66f21d518 100644 --- a/SRC/clasyf_rook.f +++ b/SRC/clasyf_rook.f @@ -208,7 +208,7 @@ SUBROUTINE CLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * .. * .. Local Scalars .. LOGICAL DONE - INTEGER IMAX, ITEMP, J, JB, JJ, JMAX, JP1, JP2, K, KK, + INTEGER IMAX, ITEMP, J, JJ, JMAX, JP1, JP2, K, KK, $ KW, KKW, KP, KSTEP, P, II REAL ABSAKK, ALPHA, COLMAX, ROWMAX, STEMP, SFMIN COMPLEX D11, D12, D21, D22, R1, T, Z @@ -220,7 +220,7 @@ SUBROUTINE CLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, EXTERNAL LSAME, ICAMAX, SLAMCH * .. * .. External Subroutines .. - EXTERNAL CCOPY, CGEMM, CGEMV, CSCAL, CSWAP + EXTERNAL CCOPY, CGEMMTR, CGEMV, CSCAL, CSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT, AIMAG, REAL @@ -525,26 +525,11 @@ SUBROUTINE CLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time +* (note that conjg(W) is actually stored) * - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL CGEMV( 'No transpose', JJ-J+1, N-K, -CONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - IF( J.GE.2 ) - $ CALL CGEMM( 'No transpose', 'Transpose', J-1, JB, - $ N-K, -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, - $ CONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL CGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ CONE, A( 1, 1 ), LDA ) * * Put U12 in standard form by partially undoing the interchanges * in columns k+1:n @@ -846,26 +831,11 @@ SUBROUTINE CLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL CGEMV( 'No transpose', J+JB-JJ, K-1, -CONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block +* (note that conjg(W) is actually stored) * - IF( J+JB.LE.N ) - $ CALL CGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), LDW, - $ CONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL CGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ CONE, A( K, K ), LDA ) * * Put L21 in standard form by partially undoing the interchanges * in columns 1:k-1 diff --git a/SRC/dlasyf.f b/SRC/dlasyf.f index 5b1ca4e564..c4f8d1b31e 100644 --- a/SRC/dlasyf.f +++ b/SRC/dlasyf.f @@ -197,7 +197,7 @@ SUBROUTINE DLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, PARAMETER ( EIGHT = 8.0D+0, SEVTEN = 17.0D+0 ) * .. * .. Local Scalars .. - INTEGER IMAX, J, JB, JJ, JMAX, JP, K, KK, KKW, KP, + INTEGER IMAX, J, JJ, JMAX, JP, K, KK, KKW, KP, $ KSTEP, KW DOUBLE PRECISION ABSAKK, ALPHA, COLMAX, D11, D21, D22, R1, $ ROWMAX, T @@ -208,7 +208,7 @@ SUBROUTINE DLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, EXTERNAL LSAME, IDAMAX * .. * .. External Subroutines .. - EXTERNAL DCOPY, DGEMM, DGEMV, DSCAL, DSWAP + EXTERNAL DCOPY, DGEMMTR, DGEMV, DSCAL, DSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -474,26 +474,9 @@ SUBROUTINE DLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time -* - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL DGEMV( 'No transpose', JJ-J+1, N-K, -ONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, ONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - CALL DGEMM( 'No transpose', 'Transpose', J-1, JB, N-K, - $ -ONE, - $ A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, ONE, - $ A( 1, J ), LDA ) - 50 CONTINUE + CALL DGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -ONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ ONE, A( 1, 1 ), LDA ) * * Put U12 in standard form by partially undoing the interchanges * in columns k+1:n looping backwards from k+1 to n @@ -770,26 +753,9 @@ SUBROUTINE DLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL DGEMV( 'No transpose', J+JB-JJ, K-1, -ONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, ONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block -* - IF( J+JB.LE.N ) - $ CALL DGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -ONE, A( J+JB, 1 ), LDA, W( J, 1 ), LDW, - $ ONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL DGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -ONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ ONE, A( K, K ), LDA ) * * Put L21 in standard form by partially undoing the interchanges * of rows in columns 1:k-1 looping backwards from k-1 to 1 diff --git a/SRC/dlasyf_rk.f b/SRC/dlasyf_rk.f index 3de5cdb7f1..7e6e5ba632 100644 --- a/SRC/dlasyf_rk.f +++ b/SRC/dlasyf_rk.f @@ -283,7 +283,7 @@ SUBROUTINE DLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * .. * .. Local Scalars .. LOGICAL DONE - INTEGER IMAX, ITEMP, J, JB, JJ, JMAX, K, KK, KW, KKW, + INTEGER IMAX, ITEMP, J, JMAX, K, KK, KW, KKW, $ KP, KSTEP, P, II DOUBLE PRECISION ABSAKK, ALPHA, COLMAX, D11, D12, D21, D22, $ DTEMP, R1, ROWMAX, T, SFMIN @@ -295,7 +295,7 @@ SUBROUTINE DLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, EXTERNAL LSAME, IDAMAX, DLAMCH * .. * .. External Subroutines .. - EXTERNAL DCOPY, DGEMM, DGEMV, DSCAL, DSWAP + EXTERNAL DCOPY, DGEMMTR, DGEMV, DSCAL, DSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -618,26 +618,9 @@ SUBROUTINE DLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time -* - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL DGEMV( 'No transpose', JJ-J+1, N-K, -ONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, ONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - IF( J.GE.2 ) - $ CALL DGEMM( 'No transpose', 'Transpose', J-1, JB, - $ N-K, -ONE, A( 1, K+1 ), LDA, W( J, KW+1 ), - $ LDW, ONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL DGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -ONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ ONE, A( 1, 1 ), LDA ) * * Set KB to the number of columns factorized * @@ -936,26 +919,9 @@ SUBROUTINE DLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL DGEMV( 'No transpose', J+JB-JJ, K-1, -ONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, ONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block -* - IF( J+JB.LE.N ) - $ CALL DGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -ONE, A( J+JB, 1 ), LDA, W( J, 1 ), - $ LDW, ONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL DGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -ONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ ONE, A( K, K ), LDA ) * * Set KB to the number of columns factorized * diff --git a/SRC/dlasyf_rook.f b/SRC/dlasyf_rook.f index f9d34c9a87..87e961e5d8 100644 --- a/SRC/dlasyf_rook.f +++ b/SRC/dlasyf_rook.f @@ -205,7 +205,7 @@ SUBROUTINE DLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * .. * .. Local Scalars .. LOGICAL DONE - INTEGER IMAX, ITEMP, J, JB, JJ, JMAX, JP1, JP2, K, KK, + INTEGER IMAX, ITEMP, J, JJ, JMAX, JP1, JP2, K, KK, $ KW, KKW, KP, KSTEP, P, II DOUBLE PRECISION ABSAKK, ALPHA, COLMAX, D11, D12, D21, D22, @@ -218,7 +218,7 @@ SUBROUTINE DLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, EXTERNAL LSAME, IDAMAX, DLAMCH * .. * .. External Subroutines .. - EXTERNAL DCOPY, DGEMM, DGEMV, DSCAL, DSWAP + EXTERNAL DCOPY, DGEMMTR, DGEMV, DSCAL, DSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -517,26 +517,9 @@ SUBROUTINE DLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time -* - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL DGEMV( 'No transpose', JJ-J+1, N-K, -ONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, ONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - IF( J.GE.2 ) - $ CALL DGEMM( 'No transpose', 'Transpose', J-1, JB, - $ N-K, -ONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, - $ ONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL DGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -ONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ ONE, A( 1, 1 ), LDA ) * * Put U12 in standard form by partially undoing the interchanges * in columns k+1:n @@ -838,26 +821,9 @@ SUBROUTINE DLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL DGEMV( 'No transpose', J+JB-JJ, K-1, -ONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, ONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block -* - IF( J+JB.LE.N ) - $ CALL DGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -ONE, A( J+JB, 1 ), LDA, W( J, 1 ), LDW, - $ ONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL DGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -ONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ ONE, A( K, K ), LDA ) * * Put L21 in standard form by partially undoing the interchanges * in columns 1:k-1 diff --git a/SRC/slasyf.f b/SRC/slasyf.f index adde278267..17d3ee7f01 100644 --- a/SRC/slasyf.f +++ b/SRC/slasyf.f @@ -197,7 +197,7 @@ SUBROUTINE SLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, PARAMETER ( EIGHT = 8.0E+0, SEVTEN = 17.0E+0 ) * .. * .. Local Scalars .. - INTEGER IMAX, J, JB, JJ, JMAX, JP, K, KK, KKW, KP, + INTEGER IMAX, J, JJ, JMAX, JP, K, KK, KKW, KP, $ KSTEP, KW REAL ABSAKK, ALPHA, COLMAX, D11, D21, D22, R1, $ ROWMAX, T @@ -208,7 +208,7 @@ SUBROUTINE SLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, EXTERNAL LSAME, ISAMAX * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEMM, SGEMV, SSCAL, SSWAP + EXTERNAL SCOPY, SGEMMTR, SGEMV, SSCAL, SSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -474,26 +474,9 @@ SUBROUTINE SLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time -* - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL SGEMV( 'No transpose', JJ-J+1, N-K, -ONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, ONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - CALL SGEMM( 'No transpose', 'Transpose', J-1, JB, N-K, - $ -ONE, - $ A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, ONE, - $ A( 1, J ), LDA ) - 50 CONTINUE + CALL SGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -ONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ ONE, A( 1, 1 ), LDA ) * * Put U12 in standard form by partially undoing the interchanges * in columns k+1:n looping backwards from k+1 to n @@ -770,26 +753,9 @@ SUBROUTINE SLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL SGEMV( 'No transpose', J+JB-JJ, K-1, -ONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, ONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block -* - IF( J+JB.LE.N ) - $ CALL SGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -ONE, A( J+JB, 1 ), LDA, W( J, 1 ), LDW, - $ ONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL SGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -ONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ ONE, A( K, K ), LDA ) * * Put L21 in standard form by partially undoing the interchanges * of rows in columns 1:k-1 looping backwards from k-1 to 1 diff --git a/SRC/slasyf_rk.f b/SRC/slasyf_rk.f index a19a39b7ad..97293a41d1 100644 --- a/SRC/slasyf_rk.f +++ b/SRC/slasyf_rk.f @@ -295,7 +295,7 @@ SUBROUTINE SLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, EXTERNAL LSAME, ISAMAX, SLAMCH * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEMM, SGEMV, SSCAL, SSWAP + EXTERNAL SCOPY, SGEMMTR, SGEMV, SSCAL, SSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -618,26 +618,9 @@ SUBROUTINE SLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time -* - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL SGEMV( 'No transpose', JJ-J+1, N-K, -ONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, ONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - IF( J.GE.2 ) - $ CALL SGEMM( 'No transpose', 'Transpose', J-1, JB, - $ N-K, -ONE, A( 1, K+1 ), LDA, W( J, KW+1 ), - $ LDW, ONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL SGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -ONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ ONE, A( 1, 1 ), LDA ) * * Set KB to the number of columns factorized * @@ -936,26 +919,9 @@ SUBROUTINE SLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL SGEMV( 'No transpose', J+JB-JJ, K-1, -ONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, ONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block -* - IF( J+JB.LE.N ) - $ CALL SGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -ONE, A( J+JB, 1 ), LDA, W( J, 1 ), - $ LDW, ONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL SGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -ONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ ONE, A( K, K ), LDA ) * * Set KB to the number of columns factorized * diff --git a/SRC/slasyf_rook.f b/SRC/slasyf_rook.f index c214ae0a85..d2d445e7a8 100644 --- a/SRC/slasyf_rook.f +++ b/SRC/slasyf_rook.f @@ -205,7 +205,7 @@ SUBROUTINE SLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * .. * .. Local Scalars .. LOGICAL DONE - INTEGER IMAX, ITEMP, J, JB, JJ, JMAX, JP1, JP2, K, KK, + INTEGER IMAX, ITEMP, J, JJ, JMAX, JP1, JP2, K, KK, $ KW, KKW, KP, KSTEP, P, II REAL ABSAKK, ALPHA, COLMAX, D11, D12, D21, D22, @@ -218,7 +218,7 @@ SUBROUTINE SLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, EXTERNAL LSAME, ISAMAX, SLAMCH * .. * .. External Subroutines .. - EXTERNAL SCOPY, SGEMM, SGEMV, SSCAL, SSWAP + EXTERNAL SCOPY, SGEMMTR, SGEMV, SSCAL, SSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT @@ -517,26 +517,9 @@ SUBROUTINE SLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time -* - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL SGEMV( 'No transpose', JJ-J+1, N-K, -ONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, ONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - IF( J.GE.2 ) - $ CALL SGEMM( 'No transpose', 'Transpose', J-1, JB, - $ N-K, -ONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, - $ ONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL SGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -ONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ ONE, A( 1, 1 ), LDA ) * * Put U12 in standard form by partially undoing the interchanges * in columns k+1:n @@ -838,26 +821,9 @@ SUBROUTINE SLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL SGEMV( 'No transpose', J+JB-JJ, K-1, -ONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, ONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block -* - IF( J+JB.LE.N ) - $ CALL SGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -ONE, A( J+JB, 1 ), LDA, W( J, 1 ), LDW, - $ ONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL SGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -ONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ ONE, A( K, K ), LDA ) * * Put L21 in standard form by partially undoing the interchanges * in columns 1:k-1 diff --git a/SRC/zlahef.f b/SRC/zlahef.f index 1df25bea06..1e124bccb6 100644 --- a/SRC/zlahef.f +++ b/SRC/zlahef.f @@ -200,7 +200,7 @@ SUBROUTINE ZLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, PARAMETER ( EIGHT = 8.0D+0, SEVTEN = 17.0D+0 ) * .. * .. Local Scalars .. - INTEGER IMAX, J, JB, JJ, JMAX, JP, K, KK, KKW, KP, + INTEGER IMAX, J, JJ, JMAX, JP, K, KK, KKW, KP, $ KSTEP, KW DOUBLE PRECISION ABSAKK, ALPHA, COLMAX, R1, ROWMAX, T COMPLEX*16 D11, D21, D22, Z @@ -211,7 +211,7 @@ SUBROUTINE ZLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, EXTERNAL LSAME, IZAMAX * .. * .. External Subroutines .. - EXTERNAL ZCOPY, ZDSCAL, ZGEMM, ZGEMV, ZLACGV, + EXTERNAL ZCOPY, ZDSCAL, ZGEMMTR, ZGEMV, ZLACGV, $ ZSWAP * .. * .. Intrinsic Functions .. @@ -551,28 +551,11 @@ SUBROUTINE ZLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**H = A11 - U12*W**H * -* computing blocks of NB columns at a time (note that conjg(W) is -* actually stored) +* (note that conjg(W) is actually stored) * - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - A( JJ, JJ ) = DBLE( A( JJ, JJ ) ) - CALL ZGEMV( 'No transpose', JJ-J+1, N-K, -CONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE, - $ A( J, JJ ), 1 ) - A( JJ, JJ ) = DBLE( A( JJ, JJ ) ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - CALL ZGEMM( 'No transpose', 'Transpose', J-1, JB, N-K, - $ -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, - $ CONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL ZGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ CONE, A( 1, 1 ), LDA ) * * Put U12 in standard form by partially undoing the interchanges * in columns k+1:n looping backwards from k+1 to n @@ -915,29 +898,11 @@ SUBROUTINE ZLAHEF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**H = A22 - L21*W**H * -* computing blocks of NB columns at a time (note that conjg(W) is -* actually stored) -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - A( JJ, JJ ) = DBLE( A( JJ, JJ ) ) - CALL ZGEMV( 'No transpose', J+JB-JJ, K-1, -CONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE, - $ A( JJ, JJ ), 1 ) - A( JJ, JJ ) = DBLE( A( JJ, JJ ) ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block +* (note that conjg(W) is actually stored) * - IF( J+JB.LE.N ) - $ CALL ZGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), - $ LDW, CONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL ZGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ CONE, A( K, K ), LDA ) * * Put L21 in standard form by partially undoing the interchanges * of rows in columns 1:k-1 looping backwards from k-1 to 1 diff --git a/SRC/zlahef_rk.f b/SRC/zlahef_rk.f index 3e9b2dcc9b..03417895a5 100644 --- a/SRC/zlahef_rk.f +++ b/SRC/zlahef_rk.f @@ -287,7 +287,7 @@ SUBROUTINE ZLAHEF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * .. * .. Local Scalars .. LOGICAL DONE - INTEGER IMAX, ITEMP, II, J, JB, JJ, JMAX, K, KK, KKW, + INTEGER IMAX, ITEMP, II, J, JMAX, K, KK, KKW, $ KP, KSTEP, KW, P DOUBLE PRECISION ABSAKK, ALPHA, COLMAX, DTEMP, R1, ROWMAX, T, $ SFMIN @@ -300,7 +300,7 @@ SUBROUTINE ZLAHEF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, EXTERNAL LSAME, IZAMAX, DLAMCH * .. * .. External Subroutines .. - EXTERNAL ZCOPY, ZDSCAL, ZGEMM, ZGEMV, ZLACGV, + EXTERNAL ZCOPY, ZDSCAL, ZGEMMTR, ZGEMV, ZLACGV, $ ZSWAP * .. * .. Intrinsic Functions .. @@ -755,29 +755,11 @@ SUBROUTINE ZLAHEF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**H = A11 - U12*W**H * -* computing blocks of NB columns at a time (note that conjg(W) is -* actually stored) +* (note that conjg(W) is actually stored) * - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - A( JJ, JJ ) = DBLE( A( JJ, JJ ) ) - CALL ZGEMV( 'No transpose', JJ-J+1, N-K, -CONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE, - $ A( J, JJ ), 1 ) - A( JJ, JJ ) = DBLE( A( JJ, JJ ) ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - IF( J.GE.2 ) - $ CALL ZGEMM( 'No transpose', 'Transpose', J-1, JB, N-K, - $ -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, - $ CONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL ZGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ CONE, A( 1, 1 ), LDA ) * * Set KB to the number of columns factorized * @@ -1203,29 +1185,11 @@ SUBROUTINE ZLAHEF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**H = A22 - L21*W**H * -* computing blocks of NB columns at a time (note that conjg(W) is -* actually stored) -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - A( JJ, JJ ) = DBLE( A( JJ, JJ ) ) - CALL ZGEMV( 'No transpose', J+JB-JJ, K-1, -CONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE, - $ A( JJ, JJ ), 1 ) - A( JJ, JJ ) = DBLE( A( JJ, JJ ) ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block +* (note that conjg(W) is actually stored) * - IF( J+JB.LE.N ) - $ CALL ZGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), - $ LDW, CONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL ZGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ CONE, A( K, K ), LDA ) * * Set KB to the number of columns factorized * diff --git a/SRC/zlasyf.f b/SRC/zlasyf.f index d5728b00a5..ca70f451cc 100644 --- a/SRC/zlasyf.f +++ b/SRC/zlasyf.f @@ -200,7 +200,7 @@ SUBROUTINE ZLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. - INTEGER IMAX, J, JB, JJ, JMAX, JP, K, KK, KKW, KP, + INTEGER IMAX, J, JJ, JMAX, JP, K, KK, KKW, KP, $ KSTEP, KW DOUBLE PRECISION ABSAKK, ALPHA, COLMAX, ROWMAX COMPLEX*16 D11, D21, D22, R1, T, Z @@ -211,7 +211,7 @@ SUBROUTINE ZLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, EXTERNAL LSAME, IZAMAX * .. * .. External Subroutines .. - EXTERNAL ZCOPY, ZGEMM, ZGEMV, ZSCAL, ZSWAP + EXTERNAL ZCOPY, ZGEMMTR, ZGEMV, ZSCAL, ZSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX, MIN, SQRT @@ -481,25 +481,9 @@ SUBROUTINE ZLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time -* - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL ZGEMV( 'No transpose', JJ-J+1, N-K, -CONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - CALL ZGEMM( 'No transpose', 'Transpose', J-1, JB, N-K, - $ -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, - $ CONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL ZGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ CONE, A( 1, 1 ), LDA ) * * Put U12 in standard form by partially undoing the interchanges * in columns k+1:n looping backwards from k+1 to n @@ -776,26 +760,9 @@ SUBROUTINE ZLASYF( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL ZGEMV( 'No transpose', J+JB-JJ, K-1, -CONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block -* - IF( J+JB.LE.N ) - $ CALL ZGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), - $ LDW, CONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL ZGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ CONE, A( K, K ), LDA ) * * Put L21 in standard form by partially undoing the interchanges * of rows in columns 1:k-1 looping backwards from k-1 to 1 diff --git a/SRC/zlasyf_rk.f b/SRC/zlasyf_rk.f index 29a6684b67..67fe8c848c 100644 --- a/SRC/zlasyf_rk.f +++ b/SRC/zlasyf_rk.f @@ -298,7 +298,7 @@ SUBROUTINE ZLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, EXTERNAL LSAME, IZAMAX, DLAMCH * .. * .. External Subroutines .. - EXTERNAL ZCOPY, ZGEMM, ZGEMV, ZSCAL, ZSWAP + EXTERNAL ZCOPY, ZGEMMTR, ZGEMV, ZSCAL, ZSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX, MIN, SQRT @@ -627,26 +627,9 @@ SUBROUTINE ZLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time -* - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL ZGEMV( 'No transpose', JJ-J+1, N-K, -CONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - IF( J.GE.2 ) - $ CALL ZGEMM( 'No transpose', 'Transpose', J-1, JB, - $ N-K, -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), - $ LDW, CONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL ZGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ CONE, A( 1, 1 ), LDA ) * * Set KB to the number of columns factorized * @@ -945,26 +928,9 @@ SUBROUTINE ZLASYF_RK( UPLO, N, NB, KB, A, LDA, E, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL ZGEMV( 'No transpose', J+JB-JJ, K-1, -CONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block -* - IF( J+JB.LE.N ) - $ CALL ZGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), - $ LDW, CONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL ZGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ CONE, A( K, K ), LDA ) * * Set KB to the number of columns factorized * diff --git a/SRC/zlasyf_rook.f b/SRC/zlasyf_rook.f index 3d1c1c9cdd..b5b2f58288 100644 --- a/SRC/zlasyf_rook.f +++ b/SRC/zlasyf_rook.f @@ -208,7 +208,7 @@ SUBROUTINE ZLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * .. * .. Local Scalars .. LOGICAL DONE - INTEGER IMAX, ITEMP, J, JB, JJ, JMAX, JP1, JP2, K, KK, + INTEGER IMAX, ITEMP, J, JJ, JMAX, JP1, JP2, K, KK, $ KW, KKW, KP, KSTEP, P, II DOUBLE PRECISION ABSAKK, ALPHA, COLMAX, ROWMAX, DTEMP, SFMIN COMPLEX*16 D11, D12, D21, D22, R1, T, Z @@ -220,7 +220,7 @@ SUBROUTINE ZLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, EXTERNAL LSAME, IZAMAX, DLAMCH * .. * .. External Subroutines .. - EXTERNAL ZCOPY, ZGEMM, ZGEMV, ZSCAL, ZSWAP + EXTERNAL ZCOPY, ZGEMMTR, ZGEMV, ZSCAL, ZSWAP * .. * .. Intrinsic Functions .. INTRINSIC ABS, MAX, MIN, SQRT, DIMAG, DBLE @@ -525,26 +525,11 @@ SUBROUTINE ZLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A11 := A11 - U12*D*U12**T = A11 - U12*W**T * -* computing blocks of NB columns at a time +* (note that conjg(W) is actually stored) * - DO 50 J = ( ( K-1 ) / NB )*NB + 1, 1, -NB - JB = MIN( NB, K-J+1 ) -* -* Update the upper triangle of the diagonal block -* - DO 40 JJ = J, J + JB - 1 - CALL ZGEMV( 'No transpose', JJ-J+1, N-K, -CONE, - $ A( J, K+1 ), LDA, W( JJ, KW+1 ), LDW, CONE, - $ A( J, JJ ), 1 ) - 40 CONTINUE -* -* Update the rectangular superdiagonal block -* - IF( J.GE.2 ) - $ CALL ZGEMM( 'No transpose', 'Transpose', J-1, JB, - $ N-K, -CONE, A( 1, K+1 ), LDA, W( J, KW+1 ), LDW, - $ CONE, A( 1, J ), LDA ) - 50 CONTINUE + CALL ZGEMMTR( 'Upper', 'No transpose', 'Transpose', K, N-K, + $ -CONE, A( 1, K+1 ), LDA, W( 1, KW+1 ), LDW, + $ CONE, A( 1, 1 ), LDA ) * * Put U12 in standard form by partially undoing the interchanges * in columns k+1:n @@ -846,26 +831,11 @@ SUBROUTINE ZLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, * * A22 := A22 - L21*D*L21**T = A22 - L21*W**T * -* computing blocks of NB columns at a time -* - DO 110 J = K, N, NB - JB = MIN( NB, N-J+1 ) -* -* Update the lower triangle of the diagonal block -* - DO 100 JJ = J, J + JB - 1 - CALL ZGEMV( 'No transpose', J+JB-JJ, K-1, -CONE, - $ A( JJ, 1 ), LDA, W( JJ, 1 ), LDW, CONE, - $ A( JJ, JJ ), 1 ) - 100 CONTINUE -* -* Update the rectangular subdiagonal block +* (note that conjg(W) is actually stored) * - IF( J+JB.LE.N ) - $ CALL ZGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, - $ K-1, -CONE, A( J+JB, 1 ), LDA, W( J, 1 ), LDW, - $ CONE, A( J+JB, J ), LDA ) - 110 CONTINUE + CALL ZGEMMTR( 'Lower', 'No transpose', 'Transpose', N-K+1, + $ K-1, -CONE, A( K, 1 ), LDA, W( K, 1 ), LDW, + $ CONE, A( K, K ), LDA ) * * Put L21 in standard form by partially undoing the interchanges * in columns 1:k-1