diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 676397e768..86ee3dc4f2 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -108,7 +108,7 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarf1f.f slarf1l.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f + slarf.f slarf1f.f slarf1l.f slarfb.f slarfb0c2.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f slarmm.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f @@ -116,6 +116,8 @@ set(SLASRC slauu2.f slauum.f sopgtr.f sopmtr.f sorg2l.f sorg2r.f sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f sorgrq.f sorgtr.f sorgtsqr.f sorgtsqr_row.f sorm2l.f sorm2r.f sorm22.f + sorgkr.f sorgrk.f sorgkl.f sorglk.f + slumm.f strtrm.f strmmoop.f sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f spbstf.f spbsv.f spbsvx.f @@ -220,7 +222,7 @@ set(CLASRC claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarf1f.f clarf1l.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f + clarf.f clarf1f.f clarf1l.f clarfb.f clarfb0c2.f clarfb_gett.f clarfg.f clarfgp.f clarft.f clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 claswp.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f @@ -248,6 +250,8 @@ set(CLASRC ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f ctrsyl.f ctrsyl3.f ctrti2.f ctrtri.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f + cungkr.f cungrk.f cungkl.f cunglk.f + clumm.f ctrtrm.f 
ctrmmoop.f cacxpy.f cungrq.f cungtr.f cungtsqr.f cungtsqr_row.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f @@ -309,13 +313,15 @@ set(DLASRC dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqp2rk.f dlaqp3rk.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f dlarfb0c2.f dlargv.f dlarmm.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlaswp.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrs3.f dlatrz.f dlauu2.f dlauum.f dopgtr.f dopmtr.f dorg2l.f dorg2r.f dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f + dorgkr.f dorgrk.f dorgkl.f dorglk.f + dlumm.f dtrtrm.f dtrmmoop.f dorgrq.f dorgtr.f dorgtsqr.f dorgtsqr_row.f dorm2l.f dorm2r.f dorm22.f dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f @@ -420,7 +426,7 @@ set(ZLASRC zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqp2rk.f zlaqp3rk.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f zlarf1f.f zlarf1l.f + zlarcm.f zlarf.f zlarfb.f zlarfb0c2.f zlarfb_gett.f zlarf1f.f zlarf1l.f zlarfg.f zlarfgp.f zlarft.f zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f @@ -450,6 +456,8 @@ set(ZLASRC ztrsyl.f ztrsyl3.f ztrti2.f ztrtri.f ztrtrs.f ztzrzf.f zung2l.f zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f zungrq.f zungtr.f zungtsqr.f zungtsqr_row.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f + 
zungkr.f zungrk.f zungkl.f zunglk.f + zlumm.f ztrtrm.f ztrmmoop.f zacxpy.f zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f zunmtr.f zupgtr.f zupmtr.f izmax1.f dzsum1.f zstemr.f diff --git a/SRC/Makefile b/SRC/Makefile index 0191626f0e..ec1e8451bf 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -137,7 +137,7 @@ SLASRC = \ slaqgb.o slaqge.o slaqp2.o slaqps.o slaqp2rk.o slaqp3rk.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ - slarf.o slarf1f.o slarf1l.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \ + slarf.o slarf1f.o slarf1l.o slarfb.o slarfb0c2.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \ slargv.o slarmm.o slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ @@ -145,6 +145,8 @@ SLASRC = \ slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ sorgrq.o sorgtr.o sorgtsqr.o sorgtsqr_row.o sorm2l.o sorm2r.o sorm22.o \ + sorgkr.o sorgrk.o sorgkl.o sorglk.o \ + slumm.o strtrm.o strmmoop.o \ sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ spbstf.o spbsv.o spbsvx.o \ @@ -249,7 +251,7 @@ CLASRC = \ claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqp2rk.o claqp3rk.o claqsb.o \ claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ - clarf.o clarf1f.o clarf1l.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \ + clarf.o clarf1f.o clarf1l.o clarfb.o clarfb0c2.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \ clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \ @@ -277,6 +279,8 @@ CLASRC = \ ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o 
ctrsna.o \ ctrsyl.o ctrsyl3.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \ cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ + cungkr.o cungrk.o cungkl.o cunglk.o \ + clumm.o ctrtrm.o ctrmmoop.o cacxpy.o \ cungrq.o cungtr.o cungtsqr.o cungtsqr_row.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ @@ -339,12 +343,14 @@ DLASRC = \ dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqp2rk.o dlaqp3rk.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ - dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o\ + dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o dlarfb0c2.o\ dlargv.o dlarmm.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrs3.o dlatrz.o dlauu2.o \ dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ + dorgkr.o dorgrk.o dorgkl.o dorglk.o \ + dlumm.o dtrtrm.o dtrmmoop.o \ dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ dorgrq.o dorgtr.o dorgtsqr.o dorgtsqr_row.o dorm2l.o dorm2r.o dorm22.o \ dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ @@ -453,7 +459,7 @@ ZLASRC = \ zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqp2rk.o zlaqp3rk.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ - zlarcm.o zlarf.o zlarfb.o zlarfb_gett.o zlarf1f.o zlarf1l.o \ + zlarcm.o zlarf.o zlarfb.o zlarfb0c2.o zlarfb_gett.o zlarf1f.o zlarf1l.o \ zlarfg.o zlarft.o zlarfgp.o \ zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ @@ -483,6 +489,8 @@ ZLASRC = \ ztrsyl.o ztrsyl3.o ztrti2.o ztrtri.o ztrtrs.o 
ztzrzf.o zung2l.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ zungrq.o zungtr.o zungtsqr.o zungtsqr_row.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ + zungkr.o zungrk.o zungkl.o zunglk.o \ + zlumm.o ztrtrm.o ztrmmoop.o zacxpy.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ zunmtr.o zupgtr.o \ zupmtr.o izmax1.o dzsum1.o zstemr.o \ diff --git a/SRC/cacxpy.f b/SRC/cacxpy.f new file mode 100644 index 0000000000..b27bd5b149 --- /dev/null +++ b/SRC/cacxpy.f @@ -0,0 +1,134 @@ +*> \brief \b CACXPY +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CACXPY(N,CA,CX,INCX,CY,INCY) +* +* .. Scalar Arguments .. +* COMPLEX CA +* INTEGER INCX,INCY,N +* .. +* .. Array Arguments .. +* COMPLEX CX(*),CY(*) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CACXPY constant times a conjugated vector plus a vector. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> number of elements in input vector(s) +*> \endverbatim +*> +*> \param[in] CA +*> \verbatim +*> CA is COMPLEX +*> On entry, CA specifies the scalar alpha. +*> \endverbatim +*> +*> \param[in] CX +*> \verbatim +*> CX is COMPLEX array, dimension ( 1 + ( N - 1 )*abs( INCX ) ) +*> \endverbatim +*> +*> \param[in] INCX +*> \verbatim +*> INCX is INTEGER +*> storage spacing between elements of CX +*> \endverbatim +*> +*> \param[in,out] CY +*> \verbatim +*> CY is COMPLEX array, dimension ( 1 + ( N - 1 )*abs( INCY ) ) +*> \endverbatim +*> +*> \param[in] INCY +*> \verbatim +*> INCY is INTEGER +*> storage spacing between elements of CY +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \ingroup axpy +*> +* ===================================================================== + SUBROUTINE CACXPY(N,CA,CX,INCX,CY,INCY) +* +* -- Reference BLAS level1 routine -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + COMPLEX CA + INTEGER INCX,INCY,N +* .. +* .. Array Arguments .. + COMPLEX CX(*),CY(*) +* .. +* +* ===================================================================== +* +* .. Local Scalars .. + INTEGER I,IX,IY +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG +* .. +* .. External Functions .. + REAL SCABS1 + EXTERNAL SCABS1 +* .. + IF (N.LE.0) RETURN + IF (SCABS1(CA).EQ.0.0d0) RETURN + IF (INCX.EQ.1 .AND. INCY.EQ.1) THEN +* +* code for both increments equal to 1 +* + DO I = 1,N + CY(I) = CY(I) + CA*CONJG(CX(I)) + END DO + ELSE +* +* code for unequal increments or equal increments +* not equal to 1 +* + IX = 1 + IY = 1 + IF (INCX.LT.0) IX = (-N+1)*INCX + 1 + IF (INCY.LT.0) IY = (-N+1)*INCY + 1 + DO I = 1,N + CY(IY) = CY(IY) + CA*CONJG(CX(IX)) + IX = IX + INCX + IY = IY + INCY + END DO + END IF +* + RETURN +* +* End of CACXPY +* + END + diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f new file mode 100644 index 0000000000..28fd002eee --- /dev/null +++ b/SRC/clarfb0c2.f @@ -0,0 +1,564 @@ +*> \brief \b CLARFB0C2 applies a block reflector or its conjugate-transpose +* to a rectangular matrix with a 0 block while constructing the explicit Q +* factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! 
and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! Array arguments +* COMPLEX V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLARFB0C2 applies a real block reflector H or its transpose H**H to a +*> complex m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**H from the Left +*> = 'R': apply H or H**H from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'C': apply H**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. 
+*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> See Further Details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, dimension (LDT,K) +*> The triangular K-by-K matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension (LDC,N) +*> On entry, the M-by-N matrix C. +*> On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. 
+*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) + ! Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + ! Array arguments + COMPLEX V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J + ! Intrinsic Functions + INTRINSIC CONJG + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! External Subroutines + EXTERNAL CGEMM, CTRMM + ! Parameters + COMPLEX ONE, ZERO, NEG_ONE + PARAMETER(ONE=(1.0E+0, 0.0E+0), + $ ZERO = (0.0E+0, 0.0E+0), + $ NEG_ONE = (-1.0E+0, 0.0E+0)) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'C') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! 
RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + ! RQ = (.NOT.DIRF).AND.(.NOT.COLV) + ! Since we have exactly one of these 4 modes, we don't need to actually + ! store the value of RQ, instead we assume this is the case if we fail + ! the above 3 checks. + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\C^{K\times K} + ! V2\in\C^{M-K\times K} + ! C1=0\in\C^{K\times N} + ! C2\in\C^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('CLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = CONJG(V(K+J,I)) + END DO + END DO + ELSE + CALL CGEMM('Conjugate', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! 
+ IF (C2I) THEN + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL CTRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\C^{K\times K} + ! V2\in\C^{K\times N-K} + ! C1=0\in\C^{M\times K} + ! C2\in\C^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ] + ! + ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = CONJG(V(J,K+I)) + END DO + END DO + ELSE + CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF + ! + ! C1 = C1*T' + ! + IF( TRANST ) THEN + CALL CTRMM('Right', 'Upper', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C, LDC) + ELSE + CALL CTRMM('Right', 'Lower', 'No Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C, LDC) + END IF + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! 
+ IF( C2I ) THEN + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL CTRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\C^{K\times K} + ! V2\in\C^{M-K\times K} + ! C1=0\in\C^{K\times N} + ! C2\in\C^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('CLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL CGEMM('Conjugate', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! 
+ IF( C2I ) THEN + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL CTRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE ! IF (RQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\C^{K\times K} + ! V2\in\C^{K\times N-K} + ! C1=0\in\C^{M\times K} + ! C2\in\C^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! + ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('CLARFB0C2', 2) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF + ! + ! C1 = C1*T' + ! + IF( TRANST ) THEN + CALL CTRMM('Right', 'Lower', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ELSE + CALL CTRMM('Right', 'Upper', 'No Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC) + END IF + ! + ! 
C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL CTRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/clarft.f b/SRC/clarft.f index c757f3c172..d9d5b99114 100644 --- a/SRC/clarft.f +++ b/SRC/clarft.f @@ -6,11 +6,11 @@ * http://www.netlib.org/lapack/explore-html/ * *> Download CLARFT + dependencies -*> +*> *> [TGZ] -*> +*> *> [ZIP] -*> +*> *> [TXT] * * Definition: @@ -23,7 +23,7 @@ * INTEGER K, LDT, LDV, N * .. * .. Array Arguments .. -* COMPLEX T( LDT, * ), TAU( * ), V( LDV, * ) +* COMPLEX T( LDT, * ), TAU( * ), V( LDV, * ) * .. * * @@ -32,7 +32,7 @@ *> *> \verbatim *> -*> CLARFT forms the triangular factor T of a complex block reflector H +*> CLARFT forms the triangular factor T of a real block reflector H *> of order n, which is defined as a product of k elementary reflectors. *> *> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; @@ -42,12 +42,12 @@ *> If STOREV = 'C', the vector which defines the elementary reflector *> H(i) is stored in the i-th column of the array V, and *> -*> H = I - V * T * V**H +*> H = I - V * T * V**T *> *> If STOREV = 'R', the vector which defines the elementary reflector *> H(i) is stored in the i-th row of the array V, and *> -*> H = I - V**H * T * V +*> H = I - V**T * T * V *> \endverbatim * * Arguments: @@ -166,23 +166,25 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * * .. Scalar Arguments * - CHARACTER DIRECT, STOREV - INTEGER K, LDT, LDV, N + CHARACTER DIRECT, STOREV + INTEGER K, LDT, LDV, N * .. * .. Array Arguments .. 
* - COMPLEX T( LDT, * ), TAU( * ), V( LDV, * ) + COMPLEX T( LDT, * ), TAU( * ), V( LDV, * ) * .. * * .. Parameters .. * - COMPLEX ONE, NEG_ONE, ZERO - PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE=-1.0E+0) + COMPLEX ONE, NEG_ONE, ZERO + PARAMETER(ONE=(1.0E+0,0.0E+0), + $ ZERO = (0.0E+0,0.0E+0), + $ NEG_ONE=(-1.0E+0,0.0E+0)) * * .. Local Scalars .. * INTEGER I,J,L - LOGICAL QR,LQ,QL,DIRF,COLV + LOGICAL QR,LQ,QL,RQ,LQT,RQT,DIRF,COLV,TDIRF,TCOLV * * .. External Subroutines .. * @@ -193,10 +195,10 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, LOGICAL LSAME EXTERNAL LSAME * -* .. Intrinsic Functions.. +* .. Instrinsic Functions.. * INTRINSIC CONJG -* +* * The general scheme used is inspired by the approach inside DGEQRT3 * which was (at the time of writing this code): * Based on the algorithm of Elmroth and Gustavson, @@ -210,13 +212,6 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, RETURN END IF * -* Base case -* - IF(N.EQ.1.OR.K.EQ.1) THEN - T(1,1) = TAU(1) - RETURN - END IF -* * Beginning of executable statements * L = K / 2 @@ -227,26 +222,48 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * 'C' for STOREV, then they meant to provide 'R' * DIRF = LSAME(DIRECT,'F') + TDIRF = LSAME(DIRECT,'T') COLV = LSAME(STOREV,'C') + TCOLV = LSAME(STOREV,'T') * * QR happens when we have forward direction in column storage * QR = DIRF.AND.COLV * -* LQ happens when we have forward direction in row storage +* LQT happens when we have forward direction in row storage and want to compute the transpose of +* the T we would normally compute +* + LQT = DIRF.AND.TCOLV +* +* LQ happens when we have forward direction in row storage and want to compute the T we would +* normally compute * - LQ = DIRF.AND.(.NOT.COLV) + LQ = DIRF.AND.(.NOT.LQT) * * QL happens when we have backward direction in column storage * QL = (.NOT.DIRF).AND.COLV * -* The last case is RQ. 
Due to how we structured this, if the -* above 3 are false, then RQ must be true, so we never store -* this -* RQ happens when we have backward direction in row storage -* RQ = (.NOT.DIRF).AND.(.NOT.COLV) +* RQT happens when we have backward direction in row storage and want to compute the transpose +* of the T we would normally compute +* + RQT = TDIRF.AND.(.NOT.COLV) +* +* RQ happens when we have backward direction in row storage and want to compute the T that we +* would normally compute +* + RQ = (.NOT.RQT).AND.(.NOT.COLV) * +* Base case +* + IF(N.EQ.1.OR.K.EQ.1) THEN + IF( RQT.OR.LQT ) THEN + T(1,1) = CONJG(TAU(1)) + ELSE + T(1,1) = TAU(1) + END IF + RETURN + END IF IF(QR) THEN * * Break V apart into 6 components @@ -260,17 +277,17 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{1,1}\in\C^{l,l} unit lower triangular * V_{2,1}\in\C^{k-l,l} rectangular * V_{3,1}\in\C^{n-k,l} rectangular -* +* * V_{2,2}\in\C^{k-l,k-l} unit lower triangular * V_{3,2}\in\C^{n-k,k-l} rectangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} T_{1,2}| * |0 T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 * @@ -281,17 +298,17 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* +* * (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2') * = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2' -* -* Define T{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2} -* -* Then, we can define the matrix V as +* +* Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2} +* +* Then, we can define the matrix V as * V = |-------| * |V_1 V_2| * |-------| -* +* * So, our product is equivalent to the matrix product * I - V*T*V' * This means, we can compute T_{1,1} and T_{2,2}, then use this information @@ -303,15 +320,15 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{2,2} recursively * - CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, - $ TAU(L+1), T(L+1, L+1), LDT) + CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) * -* Compute T_{1,2} +* Compute T_{1,2} * T_{1,2} = V_{2,1}' * DO J = 1, L DO I = 1, K-L - T(J, L+I) = CONJG(V(L+I, J)) + T(J,L+I) = CONJG(V(L+I,J)) END DO END DO * @@ -324,9 +341,8 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2} * Note: We assume K <= N, and GEMM will do nothing if N=K * - CALL CGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE, - $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE, T(1, L+1), - $ LDT) + CALL CGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE, + $ V(K+1, 1), LDV, V(K+1,L+1), LDV, ONE, T(1, L+1), LDT) * * At this point, we have that T_{1,2} = V_1'*V_2 * All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} @@ -339,8 +355,8 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * * T_{1,2} = T_{1,2}*T_{2,2} * - CALL CTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + CALL CTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, 
T(L+1,L+1), LDT, T(1, L+1), LDT) ELSE IF(LQ) THEN * @@ -354,19 +370,19 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{1,1}\in\C^{l,l} unit upper triangular * V_{1,2}\in\C^{l,k-l} rectangular * V_{1,3}\in\C^{l,n-k} rectangular -* +* * V_{2,2}\in\C^{k-l,k-l} unit upper triangular * V_{2,3}\in\C^{k-l,n-k} rectangular * * Where l = floor(k/2) * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} T_{1,2}| * |0 T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 * @@ -375,20 +391,20 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{1,2}\in\C^{l, k-l} rectangular * * Then, consider the product: -* -* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) -* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 -* +* +* (I - V_1'*T_{1,1}'*V_1)*(I - V_2'*T_{2,2}'*V_2) +* = I - V_1'*T_{1,1}'*V_1 - V_2'*T_{2,2}'*V_2 + V_1'*T_{1,1}'*V_1*V_2'*T_{2,2}'*V_2 +* * Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |---| * |V_1| * |V_2| * |---| -* +* * So, our product is equivalent to the matrix product -* I - V'*T*V +* I - V'*T'*V * This means, we can compute T_{1,1} and T_{2,2}, then use this information * to compute T_{1,2} * @@ -398,14 +414,14 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{2,2} recursively * - CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, - $ TAU(L+1), T(L+1, L+1), LDT) + CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) * * Compute T_{1,2} * T_{1,2} = V_{1,2} * - CALL CLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT) + CALL CLACPY('All', L, K - L, V(1,L+1), LDV, 
T(1, L+1), LDT) * * T_{1,2} = T_{1,2}*V_{2,2}' * @@ -433,6 +449,99 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * CALL CTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT) + ELSE IF(LQT) THEN +* +* Break V apart into 6 components +* +* V = |----------------------| +* |V_{1,1} V_{1,2} V{1,3}| +* |0 V_{2,2} V{2,3}| +* |----------------------| +* +* V_{1,1}\in\C^{l,l} unit upper triangular +* V_{1,2}\in\C^{l,k-l} rectangular +* V_{1,3}\in\C^{l,n-k} rectangular +* +* V_{2,2}\in\C^{k-l,k-l} unit upper triangular +* V_{2,3}\in\C^{k-l,n-k} rectangular +* +* Where l = floor(k/2) +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} 0 | +* |T_{2,1} T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\C^{l, l} lower triangular +* T_{2,2}\in\C^{k-l, k-l} lower triangular +* T_{2,1}\in\C^{k-l, l} rectangular +* +* Then, consider the product: +* +* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) +* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 +* +* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{2,1} +* +* Compute T_{1,1} recursively +* + CALL CLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) + +* +* Compute T_{2,1} +* T_{2,1} = V_{1,2}' +* + DO I = 1, K-L + DO J = 1, L + T(L+I,J) = CONJG(V(J,L+I)) + END DO + END DO +* +* T_{2,1} = V_{2,2}*T_{2,1} +* + CALL CTRMM('Left', 'Upper', 'No Transpose', 'Unit', K-L, 
L, + $ ONE, V(L+1,L+1), LDV, T(L+1,1), LDT) +* +* T_{2,1} = V_{2,3}*V_{1,3}' + T_{2,1} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL CGEMM('No Transpose', 'Conjugate', K-L, L, N-K, ONE, + $ V(L+1,K+1), LDV, V(1, K+1), LDV, ONE, T(L+1,1), LDT) +* +* At this point, we have that T_{2,1} = V_2*V_1' +* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} +* respectively. +* +* T_{2,1} = -T_{2,2}*T_{2,1} +* + CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K-L, L, NEG_ONE, T(L+1,L+1), LDT, T(L+1,1), LDT) +* +* T_{2,1} = T_{2,1}*T_{1,1} +* + CALL CTRMM('Right', 'Lower', 'No Transpose', 'Non-unit', + $ K-L, L, ONE, T, LDT, T(L+1,1), LDT) ELSE IF(QL) THEN * * Break V apart into 6 components @@ -445,18 +554,18 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * * V_{1,1}\in\C^{n-k,k-l} rectangular * V_{2,1}\in\C^{k-l,k-l} unit upper triangular -* +* * V_{1,2}\in\C^{n-k,l} rectangular * V_{2,2}\in\C^{k-l,l} rectangular * V_{3,2}\in\C^{l,l} unit upper triangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} 0 | * |T_{2,1} T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 * @@ -467,17 +576,17 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* +* * (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1') * = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1' -* +* * Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |-------| * |V_1 V_2| * |-------| -* +* * So, our product is equivalent to the matrix product * I - V*T*V' * This means, we can compute T_{1,1} and T_{2,2}, then use this information @@ -485,34 +594,34 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{1,1} recursively * - CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) + CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) * * Compute T_{2,2} recursively * CALL CLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV, - $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) + $ TAU(K-L+1), T(K-L+1,K-L+1), LDT) * * Compute T_{2,1} * T_{2,1} = V_{2,2}' * DO J = 1, K-L DO I = 1, L - T(K-L+I, J) = CONJG(V(N-K+J, K-L+I)) + T(K-L+I,J) = CONJG(V(N-K+J, K-L+I)) END DO END DO * * T_{2,1} = T_{2,1}*V_{2,1} * CALL CTRMM('Right', 'Upper', 'No transpose', 'Unit', L, - $ K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT) + $ K-L, ONE, V(N-K+1,1), LDV, T(K-L+1,1), LDT) * * T_{2,1} = V_{2,2}'*V_{2,1} + T_{2,1} * Note: We assume K <= N, and GEMM will do nothing if N=K * CALL CGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE, - $ V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1), - $ LDT) + $ V(1,K-L+1), LDV, V, LDV, ONE, T(K-L+1,1), LDT) * * At this point, we have that T_{2,1} = V_2'*V_1 * All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} @@ -521,17 +630,13 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{2,1} = -T_{2,2}*T_{2,1} * CALL CTRMM('Left', 'Lower', 'No transpose', 'Non-unit', L, - $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, - 
$ T(K-L+1, 1), LDT) + $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT) * * T_{2,1} = T_{2,1}*T_{1,1} * CALL CTRMM('Right', 'Lower', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) - ELSE -* -* Else means RQ case -* + $ K-L, ONE, T, LDT, T(K-L+1,1), LDT) + ELSE IF(RQ) THEN * Break V apart into 6 components * * V = |-----------------------| @@ -546,13 +651,13 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{2,2}\in\C^{l,k-l} rectangular * V_{2,3}\in\C^{l,l} unit lower triangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} 0 | * |T_{2,1} T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 * @@ -563,51 +668,51 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* -* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1) -* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1 -* +* +* (I - V_2'*T_{2,2}'*V_2)*(I - V_1'*T_{1,1}'*V_1) +* = I - V_2'*T_{2,2}'*V_2 - V_1'*T_{1,1}'*V_1 + V_2'*T_{2,2}'*V_2*V_1'*T_{1,1}'*V_1 +* * Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |---| * |V_1| * |V_2| * |---| -* +* * So, our product is equivalent to the matrix product -* I - V'*T*V +* I - V'*T'*V * This means, we can compute T_{1,1} and T_{2,2}, then use this information * to compute T_{2,1} * * Compute T_{1,1} recursively * - CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) + CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) * * Compute T_{2,2} recursively * CALL CLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV, - $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) + $ 
TAU(K-L+1), T(K-L+1,K-L+1), LDT) * * Compute T_{2,1} * T_{2,1} = V_{2,2} * - CALL CLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV, - $ T(K-L+1, 1), LDT) + CALL CLACPY('All', L, K-L, V(K-L+1,N-K+1), LDV, T(K-L+1,1), + $ LDT) * * T_{2,1} = T_{2,1}*V_{1,2}' * CALL CTRMM('Right', 'Lower', 'Conjugate', 'Unit', L, K-L, - $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT) + $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT) * * T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1} * Note: We assume K <= N, and GEMM will do nothing if N=K * - CALL CGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE, - $ V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1), - $ LDT) + CALL CGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE, + $ V(K-L+1,1), LDV, V, LDV, ONE, T(K-L+1,1), LDT) * * At this point, we have that T_{2,1} = V_2*V_1' @@ -617,13 +722,103 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{2,1} = -T_{2,2}*T_{2,1} * CALL CTRMM('Left', 'Lower', 'No tranpose', 'Non-unit', L, - $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, - $ T(K-L+1, 1), LDT) + $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT) * * T_{2,1} = T_{2,1}*T_{1,1} * CALL CTRMM('Right', 'Lower', 'No tranpose', 'Non-unit', L, - $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) + $ K-L, ONE, T, LDT, T(K-L+1,1), LDT) + ELSE IF(RQT) THEN +* Break V apart into 6 components +* +* V = |-----------------------| +* |V_{1,1} V_{1,2} 0 | +* |V_{2,1} V_{2,2} V_{2,3}| +* |-----------------------| +* +* V_{1,1}\in\C^{k-l,n-k} rectangular +* V_{1,2}\in\C^{k-l,k-l} unit lower triangular +* +* V_{2,1}\in\C^{l,n-k} rectangular +* V_{2,2}\in\C^{l,k-l} rectangular +* V_{2,3}\in\C^{l,l} unit lower triangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* | 0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\C^{k-l, k-l} non-unit upper triangular +* T_{2,2}\in\C^{l, l} non-unit upper triangular +* T_{1,2}\in\C^{k-l, l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1) +* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1 +* +* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) +* +* Compute T_{2,2} recursively +* + CALL CLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV, + $ TAU(K-L+1), T(K-L+1,K-L+1), LDT) +* +* Compute T_{1,2} +* T_{1,2} = V_{2,2}' +* + DO I = 1, K-L + DO J = 1, L + T(I,K-L+J) = CONJG(V(K-L+J, N-K+I)) + END DO + END DO +* +* T_{1,2} = V_{1,2}T_{1,2} +* + CALL CTRMM('Left', 'Lower', 'No Transpose', 'Unit', K-L, L, + $ ONE, V(1,N-K+1), LDV, T(1,K-L+1), LDT) +* +* T_{1,2} = V_{1,1}V_{2,1}' + T_{1,2} +* + CALL CGEMM('No Tranpose', 'Conjugate', K-L, L, N-K, ONE, V, + $ LDV, V(K-L+1,1), LDV, ONE, T(1, K-L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1*V_2' +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. 
+* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit', + $ K-L, L, NEG_ONE, T, LDT, T(1, K-L+1), LDT) +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL CTRMM('Right', 'Upper', 'No Transpose', 'Non-Unit', + $ K-L, L, ONE, T(K-L+1,K-L+1), LDT, T(1, K-L+1), LDT) END IF END SUBROUTINE diff --git a/SRC/clumm.f b/SRC/clumm.f new file mode 100644 index 0000000000..671263ac9b --- /dev/null +++ b/SRC/clumm.f @@ -0,0 +1,350 @@ +*> \brief \b CLUMM computes an in place triangular times triangluar matrix multiplication +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE CLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA, +* $ A, LDA) +* +* .. Scalar Arguments .. +* INTEGER N, LDA +* CHARACTER SIDEL, DIAGL, DIAGU +* COMPLEX ALPHA +* +* .. Array Arguments .. +* COMPLEX A(LDA,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLUMM performs one of the matrix-matrix operations +*> +*> C = \alpha L * U +*> or +*> C = \alpha U * L +*> +*> where \alpha is a scalar, L is a unit, or non-unit, lower triangular matrix, and U is a unit, or +*> non-unit, upper triangular matrix, and at most one of L and U are non-unit +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDEL +*> \verbatim +*> SIDEL is CHARACTER*1 +*> On entry, SIDE specifies whether L multiplies U from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' A = \alpha L * U +*> +*> SIDE = 'R' or 'r' A = \alpha U * L +*> \endverbatim +*> +*> \param[in] DIAGL +*> \verbatim +*> DIAGL is CHARACTER*1 +*> On entry, DIAGL specifies whether or not L is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' L is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' L is not assumed to be unit +*> triangular. 
+*> \endverbatim +*> +*> \param[in] DIAGU +*> \verbatim +*> DIAGU is CHARACTER*1 +*> On entry, DIAGU specifies whether or not U is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' U is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' U is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> M is INTEGER +*> On entry, N specifies the number of rows and columns of L and U. M must be at +*> least zero. +*> \endverbatim +*> +*> \param[in] ALPHA +*> \verbatim +*> ALPHA is COMPLEX . +*> On entry, ALPHA specifies the scalar alpha. When alpha is +*> zero then A is not referenced, and A need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension ( LDA, N ) where +*> Before entry the leading n-by-n strictly upper triangular part of the array +*> A must contain the upper triangular matrix U and the strictly lower triangular part of +*> the leading n-by-n submatrix must contain the lower triangular matrix L. +*> If DIAGL != 'U', then the diagonal is assumed to be part of L, and if +*> DIAGU != 'U', then the diagonal is assumed to be part of U. +*> Note: At most one of DIAGL and DIAGU can be not equal to 'U'. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> On entry, LDA specifies the first dimension of A as declared +*> in the calling (sub) program. LDA must be at least max( 1, n ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== +* Cost: 2/3 * (n^3 - n) + RECURSIVE SUBROUTINE CLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA, + $ A, LDA) +* +* .. Scalar Arguments .. + INTEGER N, LDA + CHARACTER SIDEL, DIAGL, DIAGU + COMPLEX ALPHA +* +* .. Array Arguments .. + COMPLEX A(LDA,*) +* .. 
+* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CTRMM, CLASET, + $ XERBLA +* .. +* .. Local Scalars .. + INTEGER K + LOGICAL LLEFT, LUNIT, UUNIT +* .. +* .. Local Parameters .. + COMPLEX ONE, ZERO + PARAMETER(ONE=(1.0E+0,0.0E+0), ZERO=(0.0E+0,0.0E+0)) +* .. +* +* Determine if our flags are valid or not. We can have at +* most one of DIAGU, DIAGL not equal to 'U' +* + LUNIT = LSAME(DIAGL, 'U') + UUNIT = LSAME(DIAGU, 'U') +* +* If both of the above are false, then it is impossible to have the +* structure that we are exploiting in this routine +* Note: It is possible to allow the matrices to share a non-unit +* diagonal as long as the values are the exact same, but there is +* currently no use case for this that I am aware of. +* + IF ((.NOT.LUNIT).AND.(.NOT.UUNIT)) THEN +* +* We say the error is in the last set DIAG value as we cannot know +* what the user actually meant. 
+* + CALL XERBLA( 'CLUMM', 3 ) + RETURN + END IF +* +* Determine which side L is on +* + LLEFT = LSAME(SIDEL, 'L') +* +* Early exit if possible +* + IF (N.EQ.0) THEN + RETURN + END IF + IF (ALPHA.EQ.ZERO) THEN + CALL CLASET('All', N, N, ZERO, ZERO, A, LDA) + RETURN + END IF +* +* Terminating Case +* + IF (N.EQ.1) THEN +* +* Since at most one of L and U are non-unit triangular, whatever side L is on, we are still +* always computing one of +* +* 1) A(1,1) = ALPHA*A(1,1) +* 2) A(1,1) = ALPHA +* +* Where the first case happens when exactly one of L and U are unit triangular, while the +* second case happens when both L and U are unit triangular +* + IF (LUNIT.AND.UUNIT) THEN + A(1,1) = ALPHA + ELSE + A(1,1) = ALPHA*A(1,1) + END IF + RETURN + END IF +* +* Recursive Case +* + K = N/2 +* +* Regardless of us computing A = L*U or A = U*L, break break A apart as follows: +* +* |---| +* A = | U | +* | L | +* |---| +* +* Further break down L as +* |---------------| +* L = | L_{11} 0 | +* | L_{21} L_{22} | +* |---------------| +* +* Where: +* +* L_{11}\in\C^{k\times k} is lower triangular (assumed unit iff DIAGL == 'U') +* L_{21}\in\C^{n-k\times n} is rectangular +* L_{22}\in\C^{n-k\times n-k} is lower triangular (assumed unit iff DIAGL == 'U') +* +* Further break down U as +* |---------------| +* U = | U_{11} U_{21} | +* | 0 U_{22} | +* |---------------| +* +* Where: +* +* U_{11}\in\C^{k\times k} is upper triangular (assumed unit iff DIAGU == 'U') +* U_{12}\in\C^{n\times n-k} is rectangular +* U_{22}\in\C^{n-k\times n-k} is upper triangular (assumed unit iff DIAGU == 'U') + IF (LLEFT) THEN +* +* This means we are computing +* |---------------| |---------------| +* A = L*U = \alpha | L_{11} 0 | * | U_{11} U_{12} | +* | L_{21} L_{22} | | 0 U_{22} | +* |---------------| |---------------| +* +* |---------------------------------------------| +* = \alpha | L_{11}*U_{11} L_{11}*U_{12} | +* | L_{21}*U_{11} L_{21}*U_{12} + L_{22}*U_{22} | +* 
|---------------------------------------------| +* +* We compute these in the following order +* +* A_{22} = \alpha*L_{22}*U_{22} (This routine) +* A_{22} = \alpha*L_{21}*U_{12} + A_{22} (GEMM) +* +* A_{12} = \alpha*L_{11}*U_{12} (TRMM) +* A_{21} = \alpha*L_{21}*U_{11} (TRMM) +* +* A_{11} = \alpha*L_{11}*U_{11} (This routine) +* +* Compute A_{22} +* +* A_{22} = \alpha*L_{22}*U_{22} +* + CALL CLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA, + $ A(K+1, K+1), LDA) +* +* A_{22} = \alpha L_{21}*U_{12} + A_{22} +* + CALL CGEMM('No Transpose', 'No Transpose', N-K, N-K, K, + $ ALPHA, A(K+1,1), LDA, A(1,K+1), LDA, ONE, A(K+1,K+1), + $ LDA) +* +* Compute A_{12} +* +* A_{12} = \alpha*L_{11}*U_{12} +* + CALL CTRMM('Left', 'Lower', 'No Transpose', DIAGL, K, N-K, + $ ALPHA, A, LDA, A(1,K+1), LDA) +* +* Compute A_{21} +* +* A_{21} = \alpha*L_{21}*U_{11} +* + CALL CTRMM('Right', 'Upper', 'No Transpose', DIAGU, N-K, K, + $ ALPHA, A, LDA, A(K+1,1), LDA) +* +* Compute A_{11} +* +* A_{11} = \alpha*L_{11}*U_{11} +* + CALL CLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA) + ELSE +* +* This means we are computing +* |---------------| |---------------| +* A = U*L = \alpha | U_{11} U_{12} | * | L_{11} 0 | +* | 0 U_{22} | | L_{21} L_{22} | +* |---------------| |---------------| +* +* |---------------------------------------------| +* = \alpha | U_{11}*L_{11} + U_{12}*L_{21} U_{12}*L_{22} | +* | U_{22}*L_{21} U_{22}*L_{22} | +* |---------------------------------------------| +* +* We compute these in the following order +* +* A_{11} = \alpha*U_{11}*L_{11} (This routine) +* A_{11} = \alpha*U_{12}*L_{21} + A_{11} (GEMM) +* +* A_{12} = \alpha*U_{12}*L_{22} (TRMM) +* A_{21} = \alpha*U_{22}*L_{21} (TRMM) +* +* A_{22} = \alpha*U_{22}*L_{22} (This routine) +* +* Compute A_{11} +* +* A_{11} = \alpha*U_{11}*L_{11} +* + CALL CLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA) +* +* A_{11} = \alpha*U_{12}*L_{21} + A_{11} +* + CALL CGEMM('No Transpose', 'No Transpose', K, K, N-K, + $ ALPHA, A(1,K+1), LDA, A(K+1,1), LDA, ONE, 
A, LDA) +* +* Compute A_{12} +* +* A_{12} = \alpha*U_{12}*L_{22} +* + CALL CTRMM('Right', 'Lower', 'No Transpose', DIAGL, K, N-K, + $ ALPHA, A(K+1,K+1), LDA, A(1,K+1), LDA) +* +* Compute A_{21} +* +* A_{21} = \alpha*U_{22}*L_{21} +* + CALL CTRMM('Left', 'Upper', 'No Transpose', DIAGU, N-K, K, + $ ALPHA, A(K+1, K+1), LDA, A(K+1,1), LDA) +* +* Compute A_{22} +* +* A_{22} = \alpha*U_{22}*L_{22} +* + CALL CLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA, + $ A(K+1, K+1), LDA) + END IF + END SUBROUTINE diff --git a/SRC/ctrmmoop.f b/SRC/ctrmmoop.f new file mode 100644 index 0000000000..a0ec301121 --- /dev/null +++ b/SRC/ctrmmoop.f @@ -0,0 +1,2572 @@ +*> \brief \b CTRMMOOP computes an out of place triangular times general matrix multiplication +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, +* $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* .. Scalar Arguments .. +* COMPLEX ALPHA, BETA +* INTEGER M, N, LDA, LDB, LDC +* CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG +* .. +* .. Array Arguments .. +* COMPLEX A(LDA,*), B(LDB,*), C(LDC,*) +* .. 
+* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CTRMMOOP performs one of the matrix-matrix operations +*> +*> C = \alpha op(A) * op(B) + \beta C +*> or +*> C = \alpha op(B) * op(A) + \beta C +*> +*> where \alpha and \beta are scalars, C is an m-by-n matrix, A is +*> a unit, or non-unit, upper or lower triangular matrix, and op(A) is +*> is one of +*> +*> op(A) = A or op(A) = A**T op(A) = A**H +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> On entry, SIDE specifies whether op(A) multiplies op(B) from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' C = \alpha op(A) * op(B) + \beta C +*> +*> SIDE = 'R' or 'r' C = \alpha op(B) * op(A) + \beta C +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> On entry, UPLO specifies whether the matrix A is an upper or +*> lower triangular matrix as follows: +*> UPLO = 'U' or 'u' A is upper triangular +*> +*> UPLO = 'L' or 'l' A is lower triangular +*> \Endverbatim +*> +*> \param[in] TRANSA +*> \verbatim +*> TRANSA is CHARACTER*1 +*> On entry, TRANSA specifies the form of op(A) to be used in +*> the matrix multiplication as follows: +*> TRANSA = 'N' or 'n' op(A) = A +*> +*> TRANSA = 'T' or 't' op(A) = A**T +*> +*> TRANSA = 'C' or 'c' op(A) = A**H +*> \endverbatim +*> +*> \param[in] TRANSB +*> \verbatim +*> TRANSB is CHARACTER*1 +*> On entry, TRANSB specifies the form of op(B) to be used in +*> the matrix multiplication as follows: +*> TRANSB = 'N' or 'n' op(B) = B +*> +*> TRANSB = 'T' or 't' op(B) = B**T +*> +*> TRANSB = 'C' or 'c' op(B) = B**H +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> On entry, DIAG specifies whether or not A is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' A is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' A is not assumed to be unit +*> triangular. 
+*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> On entry, M specifies the number of rows of C. M must be at +*> least zero. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> On entry, N specifies the number of columns of C. N must be +*> at least zero. +*> \endverbatim +*> +*> \param[in] ALPHA +*> \verbatim +*> ALPHA is COMPLEX . +*> On entry, ALPHA specifies the scalar alpha. When alpha is +*> zero then A and B are not referenced, and A and B need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension ( LDA, K ) where +*> K is M when SIDE = 'L' and K is N when SIDE='R' +*> Before entry with UPLO = 'U' or 'u', the leading k-by-k +*> upper triangular part of the array A must contain the upper +*> triangular matrix and the strictly lower triangular part of +*> A is not referenced. +*> Before entry with UPLO = 'L' or 'l', the leading k-by-k +*> lower triangular part of the array A must contain the lower +*> triangular matrix and the strictly upper triangular part of +*> A is not referenced. +*> Note that when DIAG = 'U' or 'u', the diagonal elements of +*> A are not referenced either, but are assumed to be unity. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> On entry, LDA specifies the first dimension of A as declared +*> in the calling (sub) program. When SIDE = 'L' or 'l' then +*> LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +*> then LDA must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX array, dimension ( LDB, K ), where K is M +*> If SIDE='R' and TRANSA='N', or SIDE='L' and TRANSA='T' and N +*> otherwise. On entry, the leading k-by-k submatrix must contain +*> B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> On entry, LDB specifies the first dimension of B as declared +*> in the calling (sub) program. 
When SIDE = 'R' and TRANSB='N' +*> then LDB must be at least max( 1, m ), when SIDE = 'R' +*> and TRANSB = 'T' then LDB must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] BETA +*> \verbatim +*> BETA is COMPLEX . +*> On entry, BETA specifies the scalar beta. When beta is +*> zero then C is not referenced on entry, and C need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension ( LDC, N ) +*> Before entry, the leading m-by-n part of the array C must +*> contain the matrix C, and on exit is overwritten by the +*> transformed matrix. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> On entry, LDC specifies the first dimension of C as declared +*> in the calling (sub) program. LDC must be at least +*> max( 1, m ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + RECURSIVE SUBROUTINE CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, + $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER M, N, LDA, LDB, LDC + CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG +* .. +* .. Array Arguments .. + COMPLEX A(LDA,*), B(LDB,*), C(LDC,*) +* .. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + COMPLEX CDOTC, CDOTU + EXTERNAL LSAME, CDOTC, CDOTU +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CAXPY, CACXPY, + $ CSCAL, CLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG, MIN +* .. +* .. Local Scalars .. + INTEGER I, J, L, K, INCB + LOGICAL LSIDE, UPPER, UNIT, TRANST, TRANSG, + $ CONJA, CONJB +* .. +* .. Local Parameters .. + COMPLEX ONE, ZERO + PARAMETER(ONE=(1.0E+0,0.0E+0), ZERO=(0.0E+0,0.0E+0)) +* .. 
+* +* Beginning of Executable Statements +* + LSIDE = LSAME(SIDE, 'L') + UPPER = LSAME(UPLO, 'U') +* +* If we are transposing the triangular matrix (A) +* + CONJA = LSAME(TRANSA, 'C') + TRANST= LSAME(TRANSA, 'T').OR.CONJA +* +* If we are transposing the general matrix (B) +* + CONJB = LSAME(TRANSB, 'C') + TRANSG= LSAME(TRANSB, 'T').OR.CONJB +* +* Terminating Case +* + UNIT = LSAME(DIAG, 'U') + IF (M.EQ.1.AND.N.EQ.1) THEN +* +* This case is the simplest as we are just computing C = \alpha A*B + +* \beta C where all components are 1-by-1 matrices +* + + IF (BETA.EQ.ZERO) THEN + C(1,1) = ZERO + ELSE + C(1,1) = C(1,1) * BETA + END IF +* +* Now, we compute C = \alpha op(A)*op(B) +* + IF(ALPHA.NE.ZERO) THEN +* +* A = 1, so we do not care if A is conjugated or not +* + IF (UNIT) THEN + IF (CONJB) THEN + C(1,1) = C(1,1) + ALPHA*CONJG(B(1,1)) + ELSE + C(1,1) = C(1,1) + ALPHA*B(1,1) + END IF + ELSE +* +* A is not assumed unit, so we need to keep op(A) in mind +* + IF (CONJA) THEN + IF (CONJB) THEN + C(1,1) = C(1,1) + + $ ALPHA*CONJG(B(1,1))*CONJG(A(1,1)) + ELSE + C(1,1) = C(1,1) + ALPHA*B(1,1)*CONJG(A(1,1)) + END IF + ELSE + IF (CONJB) THEN + C(1,1) = C(1,1) + ALPHA*CONJG(B(1,1))*A(1,1) + ELSE + C(1,1) = C(1,1) + ALPHA*B(1,1)*A(1,1) + END IF + END IF + END IF + END IF + RETURN + ELSE IF (M.EQ.1) THEN +* +* This means that C is a row vector. 
If BETA is 0, then we +* set it explicitly, otherwise we overwrite it with BETA*C +* + IF (BETA.EQ.ZERO) THEN +* +* This ensures we don't reference C unless we need to +* + CALL CLASET('All', M, N, ZERO, ZERO, C, LDC) + ELSE + CALL CSCAL(N, BETA, C, LDC) + END IF + IF (ALPHA.NE.ZERO) THEN + IF (LSIDE) THEN +* +* We are computing C = \alpha op(A)*op(B) + \beta C +* Note: This means that A is a scalar +* + IF (CONJA) THEN +* +* op(A) = CONJG(A) +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL CACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL CACXPY(N, ALPHA*CONJG(A(1,1)), B, 1, + $ C, LDC) + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL CAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL CAXPY(N, ALPHA*CONJG(A(1,1)), B, 1, + $ C, LDC) + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL CAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL CAXPY(N, ALPHA*CONJG(A(1,1)), B, + $ LDB, C, LDC) + END IF + END IF + ELSE +* +* op(A) = A or op(A) = A**T = A +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL CACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL CACXPY(N, ALPHA*A(1,1), B, 1, + $ C, LDC) + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL CAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL CAXPY(N, ALPHA*A(1,1), B, 1, C, LDC) + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL CAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL CAXPY(N, ALPHA*A(1,1), B, LDB, + $ C, LDC) + END IF + END IF + END IF + ELSE +* +* We are 
computing C = \alpha op(B)*op(A) + \beta C +* + IF (UPPER) THEN +* +* A is upper triangular +* + IF (CONJA) THEN +* +* op(A) = CONJG(A) +* This is lower triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CONJG(CDOTU(N-J, + $ A(J,J+1), LDA, B(J+1,1), 1)) + + $ C(1,J) + END DO + CALL CACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CONJG(CDOTU(N-J+1, + $ A(J,J), LDA, B(J,1), 1)) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(N-J, + $ A(J,J+1), LDA, B(J+1,1), 1) + + $ C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(N-J+1, + $ A(J,J), LDA, B(J,1), 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(N-J, + $ A(J,J+1), LDA, B(1,J+1), LDB) + + $ C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(N-J+1, + $ A(J,J), LDA, B(1,J), LDB) + C(1,J) + END DO + END IF + END IF + ELSE IF (TRANST) THEN +* +* op(A) = A**T +* This is lower triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(N-J, + $ B(J+1,1), 1, A(J,J+1), LDA) + + $ C(1,J) + END DO + CALL CACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(N-J+1, + $ B(J,1), 1, A(J,J), LDA) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(N-J, + $ A(J,J+1), LDA, B(J+1,1), 1) + + $ C(1,J) + END DO + CALL CAXPY(N, 
ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(N-J+1, + $ A(J,J), LDA, B(J,1), 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(N-J, + $ A(J,J+1), LDA, B(1,J+1), LDB) + + $ C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(N-J+1, + $ A(J,J), LDA, B(1,J), LDB) + C(1,J) + END DO + END IF + END IF + ELSE +* +* op(A) = A +* This is upper triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(J-1, B, 1, + $ A(1,J), 1) + C(1,J) + END DO + CALL CACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(J, B, 1, + $ A(1,J), 1) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(J-1, + $ A(1,J), 1, B, 1) + + $ C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(J, + $ A(1,J), 1, B, 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(J-1, + $ A(1,J), 1, B, LDB) + + $ C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(J, + $ A(1,J), 1, B, LDB) + C(1,J) + END DO + END IF + END IF + END IF + ELSE +* +* A is lower triangular +* + IF (CONJA) THEN +* +* op(A) = CONJG(A) +* This is upper triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CONJG(CDOTU(J-1, + $ B, 1, A(J,1), LDA)) + C(1,J) + END DO + CALL 
CACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CONJG(CDOTU(J, B, + $ 1, A(J,1), LDA)) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(J-1, + $ A(J,1), LDA, B, 1) + C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(J, + $ A(J,1), LDA, B, 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(J-1, + $ A(J,1), LDA, B, LDB) + C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(J, + $ A(J,1), LDA, B, LDB) + C(1,J) + END DO + END IF + END IF + ELSE IF (TRANST) THEN +* +* op(A) = A**T +* This is upper triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(J-1, + $ B, 1, A(J,1), LDA) + C(1,J) + END DO + CALL CACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(J, B, + $ 1, A(J,1), LDA) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(J-1, + $ A(J,1), LDA, B, 1) + C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(J, + $ A(J,1), LDA, B, 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(J-1, + $ A(J,1), LDA, B, LDB) + C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(J, + $ 
A(J,1), LDA, B, LDB) + C(1,J) + END DO + END IF + END IF + ELSE +* +* op(A) = A +* This is lower triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(N-J, + $ B(J+1,1), 1, A(J+1,J), 1) + C(1,J) + END DO + CALL CACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTC(N-J+1, + $ B(J,1), 1, A(J,J), 1) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(N-J, + $ B(J+1,1), 1, A(J+1,J), 1) + C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(N-J+1, + $ B(J,1), 1, A(J,J), 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(N-J, + $ B(1,J+1), LDB, A(J+1,J), 1) + C(1,J) + END DO + CALL CAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CDOTU(N-J+1, + $ B(1,J), LDB, A(J,J), 1) + C(1,J) + END DO + END IF + END IF + END IF + END IF + END IF + END IF + RETURN + ELSE IF (N.EQ.1) THEN +* +* This means that C is a column vector. 
If BETA is 0, then we +* set it explicitly, otherwise we overwrite it with BETA*C +* + IF (BETA.EQ.ZERO) THEN +* +* This ensures we don't reference C unless we need to +* + CALL CLASET('All', M, N, ZERO, ZERO, C, LDC) + ELSE + CALL CSCAL(M, BETA, C, 1) + END IF + +* +* If alpha is 0, we are done +* + IF (ALPHA.NE.ZERO) THEN + IF (TRANSG) THEN + INCB = LDB + ELSE + INCB = 1 + END IF + IF (LSIDE) THEN +* +* This means we are computing +* C = \alpha op(A) * op(B) + \beta C +* + IF (UPPER) THEN +* +* This means A is upper triangular +* + IF (CONJA) THEN +* +* This means op(A) = CONJG(A) +* This is lower triangular +* + IF (CONJB) THEN +* +* This means that we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CONJG(CDOTU(I-1, B, + $ INCB, A(1,I), 1)) + C(I,1) + END DO + CALL CACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CONJG(CDOTU(I, B, + $ INCB, A(1,I), 1)) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTC(I-1, A(1,I), + $ 1, B, INCB) + C(I,1) + END DO + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTC(I, A(1,I), + $ 1, B, INCB) + C(I,1) + END DO + END IF + END IF + ELSE IF (TRANST) THEN +* +* This means op(A) = A**T +* This is lower triangular +* + IF (CONJB) THEN +* +* This means that we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTC(I-1, B, INCB, + $ A(1,I), 1) + C(I,1) + END DO + CALL CACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTC(I, B, INCB, + $ A(1,I), 1) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTU(I-1, B, INCB, + 
$ A(1,I), 1) + C(I,1) + END DO + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTU(I, B, INCB, + $ A(1,I), 1) + C(I,1) + END DO + END IF + END IF + ELSE +* +* This means op(A) = A +* This is upper triangular +* + IF (CONJB) THEN +* +* This means that we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*CDOTC(M-I, B(1,I+1), + $ INCB, A(I,I+1), LDA) + C(I,1) + END DO + CALL CACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTC(M-I+1, B(1,I), + $ INCB, A(I,I), LDA) + C(I,1) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* This means that B is a row vector but not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*CDOTU(M-I, B(1,I+1), + $ INCB, A(I,I+1), LDA) + C(I,1) + END DO + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTU(M-I+1, B(1,I), + $ INCB, A(I,I), LDA) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is a column vector and not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*CDOTU(M-I, B(I+1,1), + $ INCB, A(I,I+1), LDA) + C(I,1) + END DO + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTU(M-I+1, B(I,1), + $ INCB, A(I,I), LDA) + C(I,1) + END DO + END IF + END IF + END IF +* +* This means A is lower triangular +* + ELSE + IF (CONJA) THEN +* +* This means op(A) = CONJG(A) +* This is upper triangular +* + IF (CONJB) THEN +* +* This means that we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*CONJG(CDOTU(M-I, + $ B(1,I+1), INCB, A(I+1,I), 1)) + & + C(I,1) + END DO + CALL CACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = 
ALPHA*CONJG(CDOTU(M-I+1, + $ B(1,I), INCB, A(I,I), 1)) + & + C(I,1) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* This means that B is a row vector but not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*CDOTC(M-I, A(I+1,I), + $ 1, B(1,I+1), INCB) + C(I,1) + END DO + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTC(M-I+1, A(I,I), + $ 1, B(1,I), INCB) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is a column vector and not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*CDOTC(M-I, A(I+1,I), + $ 1, B(I+1,1), INCB) + C(I,1) + END DO + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTC(M-I+1, A(I,I), + $ 1, B(I,1), INCB) + C(I,1) + END DO + END IF + END IF + ELSE IF (TRANST) THEN +* +* This means op(A) = A**T +* This is upper triangular +* + IF (CONJB) THEN +* +* This means that we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*CDOTC(M-I, B(1,I+1), + $ INCB, A(I+1,I), 1) + C(I,1) + END DO + CALL CACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTC(M-I+1, B(1,I), + $ INCB, A(I,I), 1) + C(I,1) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* This means that B is a row vector but not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*CDOTU(M-I, B(1,I+1), + $ INCB, A(I+1,I), 1) + C(I,1) + END DO + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTU(M-I+1, B(1,I), + $ INCB, A(I,I), 1) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is a column vector and not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*CDOTU(M-I, B(I+1,1), + $ INCB, 
A(I+1,I), 1) + C(I,1) + END DO + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTU(M-I+1, B(I,1), + $ INCB, A(I,I), 1) + C(I,1) + END DO + END IF + END IF +* +* This means op(A) = A +* This is lower triangular[:w + +* + ELSE + IF (CONJB) THEN +* +* This means that B is conjugated and transposed +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTC(I-1, B, INCB, + $ A(I,1), LDA) + C(I,1) + END DO + CALL CACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTC(I, B, INCB, + $ A(I,1), LDA) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTU(I-1, B, INCB, + $ A(I,1), LDA) + C(I,1) + END DO + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CDOTU(I, B, INCB, + $ A(I,1), LDA) + C(I,1) + END DO + END IF + END IF + END IF + END IF + ELSE +* +* This means we are computing +* C = \alpha op(B) * op(A) + \beta C +* Note: This means A is a scalar +* + IF (CONJA) THEN +* +* This means op(A) = CONJG(A) +* + IF (CONJB) THEN +* +* This means we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL CACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + CALL CACXPY(M, ALPHA*CONJG(A(1,1)), B, + $ INCB, C, 1) + END IF + ELSE +* +* This means B is not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + CALL CAXPY(M, ALPHA*CONJG(A(1,1)), B, + $ INCB, C, 1) + END IF + END IF + ELSE +* +* This means op(A) = A or op(A) = A**T = A +* + IF (CONJB) THEN +* +* This means B is conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL CACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A 
is not assumed unit triangular +* + CALL CACXPY(M, ALPHA*A(1,1), B, INCB, C, + $ 1) + END IF + ELSE +* +* This means B is not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL CAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + CALL CAXPY(M, ALPHA*A(1,1), B, INCB, C, + $ 1) + END IF + END IF + END IF + END IF + END IF + RETURN + END IF +* +* Recursive Case +* + L = MIN(M,N)/2 + IF (LSIDE) THEN +* +* We are multiplying A from the left IE we are computing +* C = \alpha op(A)*op(B) + \beta C +* + IF (UPPER) THEN +* +* A is upper triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing +* +* So we are computing +* C = \alpha A**T * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11}**T + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{21}**T + \beta C_{12} +* C_{21} = \alpha A_{12}**T * B_{11}**T + \alpha A_{22}**T * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{12}**T * B_{21}**T + \alpha A_{22}**T * B_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{12}**T * B_{11}**T + \beta C_{21} 
(GEMM call) +* C_{21} = \alpha A_{22}**T * B_{12}**T + C_{21} (This routine) +* +* C_{22} = \alpha A_{12}**T * B_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22}**T * B_{22}**T + C_{22} (This routine) +* +* C_{11} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, + $ LDC) +* +* C_{12} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL CGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(1, L+1), LDA, B, LDB, BETA, C(L+1,1), + $ LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1,L+1), LDA, B(1,L+1), + $ LDB, ONE, C(L+1,1), LDC) +* +* C_{22} +* + CALL CGEMM(TRANSA, TRANSB, M-L, N-L, L, ALPHA, + $ A(1, L+1), LDA, B(L+1,1), LDB, BETA, + $ C(L+1,L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1,L+1), LDA, + $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A**T * B + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11} + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{12} + \beta C_{12} +* C_{21} = \alpha A_{12}**T * B_{11} + \alpha A_{22}**T * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{12}**T * B_{12} + \alpha A_{22}**T * B_{22} + \beta C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{12}**T * B_{11} + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22}**T * B_{21} + C_{21} (This routine) +* +* C_{22} = \alpha A_{12}**T * B_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22}**T * B_{22} + C_{22} (This routine) +* +* C_{11} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, BETA, + $ C(1, L+1), LDC) +* +* C_{21} +* + CALL CGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(1, L+1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(L+1, 1), LDC) +* +* C_{22} +* + CALL CGEMM(TRANSA, TRANSB, M-L, N-L, L, + $ ALPHA, A(1, L+1), LDA, B(1, L+1), LDB, BETA, + $ 
C(L+1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11}**T + \alpha A_{12} * B_{12}**T + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12} +* C_{21} = \alpha A_{22} * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{22} * B_{22}**T + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{12} * B_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11} * B_{11}**T + C_{11} (This routine) +* +* C_{12} = \alpha A_{12} * B_{22}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11} * B_{21}**T + C_{12} (This routine) +* +* C_{11} +* + CALL CGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(1, L+1), LDA, B(1, L+1), LDB, BETA, C, LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL CGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA, + $ A(1, L+1), LDA, 
B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1,1), LDB, ONE, + $ C(1, L+1), LDC) +* +* C_{21} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(L+1, 1), LDC) +* +* C_{22} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A * B + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11} + \alpha A_{12} * B_{21} + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{12} + \alpha A_{12} * B_{22} + \beta C_{12} +* C_{21} = \alpha A_{22} * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{22} * B_{22} + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{12} * B_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11} * B_{11} + C_{11} (This routine) +* +* C_{12} = \alpha A_{12} * B_{22} + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11} * B_{12} + C_{12} (This routine) +* +* C_{11} +* + CALL CGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(1, L+1), LDA, B(L+1, 1), LDB, 
BETA, C, LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL CGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA, + $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, + $ ONE, C(1, L+1), LDC) +* +* C_{21} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(L+1, 1), LDC) +* +* C_{22} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + END IF + ELSE +* +* A is lower triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A**T * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11}**T + \alpha A_{21}**T * B_{12}**T + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{21}**T + \alpha A_{21}**T * B_{22}**T + \beta C_{12} +* C_{21} = \alpha A_{22}**T * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{22}**T * B_{22}**T + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can 
break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{21}**T * B_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11}**T * B_{11}**T + C_{11} (This routine) +* +* C_{12} = \alpha A_{21}**T * B_{22}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11}**T * B_{21}**T + C_{12} (This routine) +* +* C_{11} +* + CALL CGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(1, L+1), LDB, BETA, C, LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL CGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE, + $ C(1, L+1), LDC) +* +* C_{21} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(L+1, 1), LDC) +* +* C_{22} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A**T * B + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11} + \alpha A_{21}**T * B_{21} + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{12} + \alpha A_{21}**T * B_{22} + \beta C_{12} +* C_{21} = \alpha A_{22}**T * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{22}**T * B_{22} + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{21}**T * B_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11}**T * B_{11} + C_{11} (This routine) +* +* C_{12} = \alpha A_{21}**T * B_{22} + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11}**T * B_{12} + C_{12} (This routine) +* +* C_{11} +* + CALL CGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, C, LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL CGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, ONE, + $ C(1, L+1), LDC) +* +* C_{21} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(L+1, 1), LDC) 
+* +* C_{22} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11}**T + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{21}**T + \beta C_{12} +* C_{21} = \alpha A_{21} * B_{11}**T + \alpha A_{22} * B_{12}**T + \beta * C_{21} +* C_{22} = \alpha A_{21} * B_{21}**T + \alpha A_{22} * B_{22}**T + \beta * C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{21} * B_{11}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22} * B_{12}**T + C_{21} (This routine) +* +* C_{22} = \alpha A_{21} * B_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22} * B_{22}**T + C_{22} (This routine) +* +* C_{11} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL CGEMM(TRANSA, 
TRANSB, M-L, L, L, ALPHA, + $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, ONE, C(L+1, 1), LDC) +* +* C_{22} +* + CALL CGEMM(TRANSA, TRANSB, M-L, N-L, L, + $ ALPHA, A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A * B + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11} + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{12} + \beta C_{12} +* C_{21} = \alpha A_{21} * B_{11} + \alpha A_{22} * B_{21} + \beta * C_{21} +* C_{22} = \alpha A_{21} * B_{12} + \alpha A_{22} * B_{22} + \beta * C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{21} * B_{11} + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22} * B_{21} + C_{21} (This routine) +* +* C_{22} = \alpha A_{21} * B_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22} * B_{22} + C_{22} (This routine) +* +* C_{11} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, 
LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL CGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(L+1, 1), LDC) +* +* C_{22} +* + CALL CGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, A(L+1, 1), LDA, B(1, L+1), LDB, BETA, + $ C(L+1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + END IF + END IF + ELSE +* +* We are multiplying A from the right IE we are computing +* C = \alpha op(B)*op(A) + \beta C +* + IF (UPPER) THEN +* +* A is upper triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A**T + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11}**T + \alpha B_{21}**T * A_{12}**T + \beta C_{11} +* C_{12} = \alpha B_{21}**T * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11}**T + \alpha B_{22}**T * A_{12}**T + \beta C_{21} +* C_{22} = \alpha B_{22}**T * A_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{21}**T * A_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11}**T * A_{11}**T + C_{11} (This routine) +* +* C_{21} = \alpha B_{22}**T * A_{12}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{12}**T * A_{11}**T + C_{21} (This routine) +* +* C_{11} +* + CALL CGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(L+1, 1), LDB, A(1, L+1), LDA, BETA, C, LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL CGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, 
ALPHA, A, LDA, B(1, L+1), LDB, + $ ONE, C(L+1, 1), LDC) +* +* C_{22} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11}**T + \alpha B_{12} * A_{12}**T + \beta C_{11} +* C_{12} = \alpha B_{12} * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11}**T + \alpha B_{22} * A_{12}**T + \beta C_{21} +* C_{22} = \alpha B_{22} * A_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{12} * A_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11} * A_{11}**T + C_{11} (This routine) +* +* C_{21} = \alpha B_{22} * A_{12}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{21} * A_{11}**T + C_{21} (This routine) +* +* C_{11} +* + CALL CGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(1,L+1), LDB, A(1,L+1), LDA, BETA, C, LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, 
A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL CGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ ONE, C(L+1, 1), LDC) +* +* C_{22} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11}**T * A_{12} + \alpha B_{21}**T * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11} + \beta C_{21} +* C_{22} = \alpha B_{12}**T * A_{12} + \alpha B_{22}**T * A_{22} + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11}**T * A_{12} + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{21}**T * A_{22} + C_{12} (This routine) +* +* C_{22} = \alpha B_{12}**T * A_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22}**T * A_{22} + C_{22} 
(This routine) +* +* C_{11} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL CGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(1, L+1), LDC) +* +* C_{21} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(L+1, 1), LDC) +* +* C_{22} +* + CALL CGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, B(1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11} * A_{12} + \alpha B_{12} * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11} + \beta C_{21} +* C_{22} = \alpha B_{21} * A_{12} + \alpha B_{22} * A_{22} + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11} * A_{12} + 
\beta C_{12} (GEMM call) +* C_{12} = \alpha B_{12} * A_{22} + C_{12} (This routine) +* +* C_{22} = \alpha B_{21} * A_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22} * A_{22} + C_{22} (This routine) +* +* C_{11} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL CGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, ONE, C(1, L+1), LDC) +* +* C_{21} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, 1), LDC) +* +* C_{22} +* + CALL CGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, B(L+1, 1), LDB, A(1, L+1), LDA, + $ BETA, C(L+1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + END IF + ELSE +* +* A is lower triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A**T + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11}**T + \beta C_{11} +* C_{12} = \alpha B_{11}**T * A_{21}**T + \alpha B_{21}**T * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11}**T + \beta C_{21} +* C_{22} = \alpha B_{12}**T * A_{21}**T + \alpha B_{22}**T * A_{22}**T + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11}**T * A_{21}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{21}**T * A_{22}**T + C_{12} (This routine) +* +* C_{22} = \alpha B_{12}**T * A_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22}**T * A_{22}**T + C_{22} (This routine) +* +* C_{11} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL CGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(1, L+1), LDC) +* +* C_{21} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(L+1, 1), LDC) +* +* C_{22} +* + CALL CGEMM(TRANSB, TRANSA, M-L, N-L, L, 
ALPHA, + $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11}**T + \beta C_{11} +* C_{12} = \alpha B_{11} * A_{21}**T + \alpha B_{12} * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11}**T + \beta C_{21} +* C_{22} = \alpha B_{21} * A_{21}**T + \alpha B_{22} * A_{22}**T + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11} * A_{21}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{12} * A_{22}**T + C_{12} (This routine) +* +* C_{22} = \alpha B_{21} * A_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22} * A_{22}**T + C_{22} (This routine) +* +* C_{11} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL CGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1,
L+1), LDA, B(1, L+1), + $ LDB, ONE, C(1, L+1), LDC) +* +* C_{21} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, 1), LDC) +* +* C_{22} +* + CALL CGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA, + $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11} + \alpha B_{21}**T * A_{21} + \beta C_{11} +* C_{12} = \alpha B_{21}**T * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11} + \alpha B_{22}**T * A_{21} + \beta C_{21} +* C_{22} = \alpha B_{22}**T * A_{22} + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{21}**T * A_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11}**T * A_{11} + C_{11}(This routine) +* +* C_{21} = \alpha B_{22}**T * A_{21} + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{12}**T * A_{11} + C_{21} (This 
routine) +* +* C_{11} +* + CALL CGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, C, LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL CGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, ONE, + $ C(L+1, 1), LDC) +* +* C_{22} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \alpha B_{12} * A_{21} + \beta C_{11} +* C_{12} = \alpha B_{12} * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11} + \alpha B_{22} * A_{21} + \beta C_{21} +* C_{22} = \alpha B_{22} * A_{22} + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{12} * A_{21} + \beta 
C_{11} (GEMM call) +* C_{11} = \alpha B_{11} * A_{11} + C_{11}(This routine) +* +* C_{21} = \alpha B_{22} * A_{21} + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{21} * A_{11} + C_{21} (This routine) +* +* C_{11} +* + CALL CGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, C, LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL CGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE, + $ C(L+1, 1), LDC) +* +* C_{22} +* + CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + END IF + END IF + END IF + END SUBROUTINE diff --git a/SRC/ctrtrm.f b/SRC/ctrtrm.f new file mode 100644 index 0000000000..659ab49e7d --- /dev/null +++ b/SRC/ctrtrm.f @@ -0,0 +1,584 @@ +*> \brief \b CTRTRM computes an in place triangular-triangular matrix product +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE CTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, +* $ N, ALPHA, T, LDT, V, LDV) +* +* .. Scalar Arguments .. +* INTEGER N, LDT, LDV +* CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV +* COMPLEX ALPHA +* .. +* .. Array Arguments .. +* COMPLEX T(LDT,*), V(LDV,*) +* .. 
+* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CTRTRM performs one of the matrix-matrix operations +*> +*> T = \alpha op(V) * T +*> or +*> T = \alpha T * op(V) +*> where \alpha is a scalar, T and V are unit, or non-unit, upper or +*> lower triangular matrix, and op(V) is one of +*> +*> op(V) = V or op(V) = V**T or op(V) = V**H +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> On entry, SIDE specifies whether op(V) multiplies T from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' T = \alpha op(V) * T +*> +*> SIDE = 'R' or 'r' T = \alpha T * op(V) +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> On entry, UPLO specifies whether T an op(V) are upper or +*> lower triangular as follows: +*> UPLO = 'U' or 'u' T and op(V) are upper triangular +*> +*> UPLO = 'L' or 'l' T and op(V) are lower triangular +*> \Endverbatim +*> +*> \param[in] TRANSV +*> \verbatim +*> TRANSV is CHARACTER*1 +*> On entry, TRANSV specifies the form of op(V) to be used in +*> the matrix multiplication as follows: +*> TRANSV = 'N' or 'n' op(V) = V +*> +*> TRANSV = 'T' or 't' op(V) = V**T +*> +*> TRANSV = 'C' or 'c' op(V) = V**H +*> \endverbatim +*> +*> \param[in] DIAGT +*> \verbatim +*> DIAGT is CHARACTER*1 +*> On entry, DIAGT specifies whether or not T is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' T is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' T is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] DIAGV +*> \verbatim +*> DIAGV is CHARACTER*1 +*> On entry, DIAGV specifies whether or not V is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' V is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' V is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> On entry, N specifies the number of rows and columns of T. +*> N must be at least zero. 
+*> \endverbatim +*> +*> \param[in] ALPHA +*> \verbatim +*> ALPHA is COMPLEX. +*> On entry, ALPHA specifies the scalar alpha. When alpha is +*> zero then T and V are not referenced, and T and V need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, dimension ( LDT, N ) +*> Before entry with UPLO = 'U' or 'u', the leading k-by-k +*> upper triangular part of the array T must contain the upper +*> triangular matrix and the strictly lower triangular part of +*> T is not referenced. +*> Before entry with UPLO = 'L' or 'l', the leading k-by-k +*> lower triangular part of the array T must contain the lower +*> triangular matrix and the strictly upper triangular part of +*> T is not referenced. +*> Note that when DIAGT = 'U' or 'u', the diagonal elements of +*> T are not referenced either, but are assumed to be unity. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> On entry, LDT specifies the first dimension of T as declared +*> in the calling (sub) program. LDT must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX array, dimension ( LDV, N ) +*> Before entry with UPLO = 'U' or 'u', the leading k-by-k +*> upper triangular part of the array op(V) must contain the upper +*> triangular matrix and the strictly lower triangular part of +*> V is not referenced. +*> Before entry with UPLO = 'L' or 'l', the leading k-by-k +*> lower triangular part of the array op(V) must contain the lower +*> triangular matrix and the strictly upper triangular part of +*> V is not referenced. +*> Note that when DIAGV = 'U' or 'u', the diagonal elements of +*> V are not referenced either, but are assumed to be unity. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> On entry, LDV specifies the first dimension of T as declared +*> in the calling (sub) program. LDV must be at least max( 1, n ). 
+*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + RECURSIVE SUBROUTINE CTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, + $ N, ALPHA, T, LDT, V, LDV) +* +* .. Scalar Arguments .. + INTEGER N, LDT, LDV + CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV + COMPLEX ALPHA +* .. +* .. Array Arguments .. + COMPLEX T(LDT,*), V(LDV,*) +* .. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CTRMM, CTRMMOOP, CLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG +* .. +* .. Local Scalars .. + INTEGER K, INFO + LOGICAL TLEFT, TUPPER, VTRANS, VUNIT, TUNIT, CONJV +* .. +* .. Local Parameters .. + COMPLEX ONE, ZERO + PARAMETER(ONE=(1.0E+0,0.E+0), ZERO=(0.0E+0,0.0E+0)) +* .. +* +* Beginning of Executable Statements +* +* +* Early Termination Criteria +* + IF (ALPHA.EQ.ZERO) THEN +* +* If ALPHA is 0, then we are just setting T to be the 0 matrix +* + CALL CLASET(UPLO, N, N, ZERO, ZERO, T, LDT) + RETURN + END IF + TUNIT = LSAME(DIAGT, 'U') + VUNIT = LSAME(DIAGV, 'U') +* +* Terminating Case +* + IF (N.EQ.1) THEN + IF (VUNIT.AND.TUNIT) THEN + T(1,1) = ALPHA + ELSE IF (VUNIT) THEN + T(1,1) = ALPHA*T(1,1) + ELSE IF (TUNIT) THEN + IF (CONJV) THEN + T(1,1) = ALPHA*CONJG(V(1,1)) + ELSE + T(1,1) = ALPHA*V(1,1) + END IF + ELSE + T(1,1) = ALPHA*T(1,1)*V(1,1) + END IF + RETURN + ELSE IF(N.LE.0) THEN + RETURN + END IF +* +* Recursive case +* + TLEFT = LSAME(SIDE, 'R') + TUPPER = LSAME(UPLO, 'U') + CONJV = LSAME(TRANSV, 'C') + VTRANS = CONJV.OR.LSAME(TRANSV, 'T') + + K = N / 2 + IF(TUPPER) THEN +* +* T is upper triangular +* + IF(TLEFT) THEN +* +* Compute T = T*op(V) +* + IF(VTRANS) THEN +* +* We are computing T = T*V**T, which we break down as follows +* 
|--------------| |--------------| |--------------------| +* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11}**T V_{21}**T| +* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}**T| +* |--------------| |--------------| |--------------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11}**T +* T_{12} = \alpha T_{11}*V_{21}**T + \alpha T_{12}*V_{22}**T +* T_{22} = \alpha T_{22}*V_{22}**T +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha T_{12}*V_{22}**T (CTRMM) +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} (CTRMMOOP) +* +* T_{12} = \alpha T_{12}*V_{22}**T +* + CALL CTRMM('Right', 'Lower', TRANSV, DIAGV, K, + $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} +* + CALL CTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT, + $ V(K+1, 1), LDV, ONE, T(1, K+1), LDT) + ELSE +* +* We are computing T = T*V, which we break down as follows +* |--------------| |--------------| |-------------| +* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11} V_{12}| +* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}| +* |--------------| |--------------| |-------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11} +* T_{12} = \alpha T_{11}*V_{12} + \alpha T_{12}*V_{22} +* T_{22} = \alpha T_{22}*V_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha T_{12}*V_{22} (CTRMM) +* T_{12} = \alpha T_{11}*V_{12} + T_{12} (CTRMMOOP) +* +* T_{12} = \alpha 
T_{12}*V_{22} +* + CALL CTRMM('Right', 'Upper', TRANSV, DIAGV, K, + $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} +* + CALL CTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT, + $ V(1, K+1), LDV, ONE, T(1, K+1), LDT) + END IF + ELSE +* +* Compute T = op(V)*T +* + IF(VTRANS) THEN +* +* We are computing T = V**T*T, which we break down as follows +* |--------------| |--------------------| |--------------| +* |T_{11} T_{12}| |V_{11}**T V_{21}**T| |T_{11} T_{12}| +* |0 T_{22}| = \alpha |0 V_{22}**T| * |0 T_{22}| +* |--------------| |--------------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}**T*T_{11} +* T_{12} = \alpha V_{11}**T*T_{12} + \alpha V_{21}**T*T_{22} +* T_{22} = \alpha V_{22}**T*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha V_{11}**T*T_{12} (CTRMM) +* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} (CTRMMOOP) +* +* T_{12} = \alpha V_{11}**T*T_{12} +* + CALL CTRMM('Left', 'Lower', TRANSV, DIAGV, K, + $ N-K, ALPHA, V, LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} +* + CALL CTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1), + $ LDT, V(K+1, 1), LDV, ONE, T(1, K+1), LDT) + ELSE +* +* We are computing T = V*T, which we break down as follows +* |--------------| |--------------| |--------------| +* |T_{11} T_{12}| |V_{11} V_{12}| |T_{11} T_{12}| +* |0 T_{22}| = \alpha |0 V_{22}| * |0 T_{22}| +* |--------------| |--------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* 
V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}*T_{11} +* T_{12} = \alpha V_{11}*T_{12} + \alpha V_{12}*T_{22} +* T_{22} = \alpha V_{22}*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha V_{11}*T_{12} (CTRMM) +* T_{12} = \alpha V_{12}*T_{22} + T_{12} (CTRMMOOP) +* +* T_{12} = \alpha V_{11}*T_{12} +* + CALL CTRMM('Left', 'Upper', TRANSV, DIAGV, K, + $ N-K, ALPHA, V, LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha V_{12}*T_{22} + T_{12} (CTRMMOOP) +* + CALL CTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1), + $ LDT, V(1, K+1), LDV, ONE, T(1, K+1), LDT) + END IF + END IF + ELSE +* +* T is lower triangular +* + IF(TLEFT) THEN +* +* Compute T = T*op(V) +* + IF(VTRANS) THEN +* +* We are computing T = T*V**T, which we break down as follows +* |--------------| |--------------| |--------------------| +* |T_{11} 0 | |T_{11} 0 | |V_{11}**T 0 | +* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{12}**T V_{22}**T| +* |--------------| |--------------| |--------------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11}**T +* T_{21} = \alpha T_{21}*V_{11}**T + \alpha T_{22}*V_{12}**T +* T_{22} = \alpha T_{22}*V_{22}**T +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha T_{21}*V_{11}**T (CTRMM) +* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} (CTRMMOOP) +* +* T_{21} = \alpha T_{21}*V_{11}**T +* + CALL CTRMM('Right', 'Upper', TRANSV, DIAGV, N-K, + $ K, ALPHA, V, LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} +* + CALL CTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, 
T(K+1, K+1), + $ LDT, V(1, K+1), LDV, ONE, T(K+1, 1), LDT) + ELSE +* +* We are computing T = T*V, which we break down as follows +* |--------------| |--------------| |-------------| +* |T_{11} 0 | |T_{11} 0 | |V_{11} 0 | +* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{21} V_{22}| +* |--------------| |--------------| |-------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11} +* T_{21} = \alpha T_{21}*V_{11} + \alpha T_{22}*V_{21} +* T_{22} = \alpha T_{22}*V_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha T_{21}*V_{11} (CTRMM) +* T_{21} = \alpha T_{22}*V_{21} + T_{21} (CTRMMOOP) +* +* T_{21} = \alpha T_{21}*V_{11} +* + CALL CTRMM('Right', 'Lower', TRANSV, DIAGV, N-K, + $ K, ALPHA, V, LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha T_{22}*V_{12} + T_{21} +* + CALL CTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1), + $ LDT, V(K+1, 1), LDV, ONE, T(K+1, 1), LDT) + END IF + ELSE +* +* Compute T = op(V)*T +* + IF(VTRANS) THEN +* +* We are computing T = V**T*T, which we break down as follows +* |--------------| |--------------------| |--------------| +* |T_{11} 0 | |V_{11}**T 0 | |T_{11} 0 | +* |T_{21} T_{22}| = \alpha |V_{12}**T V_{22}**T| * |T_{21} T_{22}| +* |--------------| |--------------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}**T*T_{11} +* T_{21} = \alpha V_{12}**T*T_{11} + \alpha V_{22}**T*T_{21} +* T_{22} = \alpha V_{22}**T*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* 
routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha V_{22}**T*T_{21} (CTRMM) +* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} (CTRMMOOP) +* +* T_{21} = \alpha V_{22}**T*T_{21} +* + CALL CTRMM('Left', 'Upper', TRANSV, DIAGV, N-K, K, + $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} +* + CALL CTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT, + $ V(1, K+1), LDV, ONE, T(K+1, 1), LDT) + ELSE +* +* We are computing T = V*T, which we break down as follows +* |--------------| |-------------| |--------------| +* |T_{11} 0 | |V_{11} 0 | |T_{11} 0 | +* |T_{21} T_{22}| = \alpha |V_{21} V_{22}| * |T_{21} T_{22}| +* |--------------| |-------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}*T_{11} +* T_{21} = \alpha V_{21}*T_{11} + \alpha V_{22}*T_{21} +* T_{22} = \alpha V_{22}*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{21} = \alpha V_{22}*T_{21} (CTRMM) +* T_{21} = \alpha V_{12}*T_{11} + T_{21} (CTRMMOOP) +* +* T_{21} = \alpha V_{22}*T_{12} +* + CALL CTRMM('Left', 'Lower', TRANSV, DIAGV, N-K, K, + $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha V_{12}*T_{11} + T_{21} +* + CALL CTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT, + $ V(K+1, 1), LDV, ONE, T(K+1, 1), LDT) + END IF + END IF + END IF +* +* Since in all the above cases, we compute T_{11} and T_{22} +* the same, we pass in our flags and call this routine recursively +* +* Compute T_{11} recursively +* + CALL CTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, K, ALPHA, + $ T, LDT, V, LDV) +* +* Compute T_{22} recursively +* + CALL CTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, 
N-K, ALPHA, + $ T(K+1, K+1), LDT, V(K+1, K+1), LDV) + END SUBROUTINE diff --git a/SRC/cungkl.f b/SRC/cungkl.f new file mode 100644 index 0000000000..ffc4feec10 --- /dev/null +++ b/SRC/cungkl.f @@ -0,0 +1,174 @@ +*> \brief \b CUNGKL computes the explicit Q factor from ZGEQLF and CLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CUNGKL(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* COMPLEX Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNGKL generates an m by n complex matrix Q with orthonormal columns, +*> which is defined as the last n columns of the product of n +*> elementary reflectors +*> +*> Q = I - V*T*V**H = H(n) . . . H(2) H(1) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by ZGEQLF and T is the n by n matrix returned by CLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V, and the order of T. +*> N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is COMPLEX array, dimension (LDQ,N) +*> On entry, Q(1:m-n+i-1,i) contains the vector which defines the +*> elementary reflector H(i), for i=1,...,n as returned by ZGEQLF. +*> In addition, the lower triangular portion of the submatrix given +*> by Q(m-n+1:m,1:n) will contain the arry T as returned by CLARFT. +*> See further details for more information. +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The storage of the V and T components inside Q is best illustrated by +*> the following example with m = 5, n = 3. +*> +*> Q = |----------| +*> | V1 V2 V3 | +*> | V1 V2 V3 | +*> | T1 V2 V3 | +*> | T1 T2 V3 | +*> | T1 T2 T3 | +*> |----------| +*> +*> \endverbatim +*> +* ===================================================================== + + SUBROUTINE CUNGKL(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + COMPLEX Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX NEG_ONE, ONE + PARAMETER(NEG_ONE=(-1.0E+0,0.0E+0), ONE=(1.0E+0,0.0E+0)) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL CTRMM, CTRTRM, CLUMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Executable Statements .. +* +* Break Q apart as follows +* +* |---| +* Q = | V | +* | T | +* |---| +* +* Where T is an n-by-n lower triangular matrix, and V is as described +* in the Further Details section +* +* In turn, break apart V as follows +* +* |-----| +* V = | V_2 | +* | V_1 | +* |-----| +* +* Where: +* +* V_1 \in \C^{n\times n} assumed unit upper triangular +* V_2 \in \C^{m-n\times n} +* +* Compute T = T*V_1**T +* + CALL CTRTRM('Right', 'Lower', 'Conjugate Transpose', + $ 'Non-Unit', 'Unit', N, ONE, Q(M-N+1,1), LDQ, Q(M-N+1,1), + $ LDQ) +* +* Compute Q = -VT. 
This means that we need to break apart +* Our computation in two parts +* +* |--------| +* Q = | -V_2*T | +* | -V_1*T | +* |--------| +* +* Q_2 = -V_2*T (TRMM) but only when necessary +* + IF (M.GT.N) THEN + CALL CTRMM('Right', 'Lower', 'No Transpose', 'Non-Unit', + $ M-N, N, NEG_ONE, Q(M-N+1,1), LDQ, Q, LDQ) + END IF +* +* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication) +* + CALL CLUMM('Right', 'Non-Unit', 'Unit', N, NEG_ONE, + $ Q(M-N+1,1), LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(M-N+I,I) = Q(M-N+I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/cungkr.f b/SRC/cungkr.f new file mode 100644 index 0000000000..cb7c1d74a0 --- /dev/null +++ b/SRC/cungkr.f @@ -0,0 +1,154 @@ +*> \brief \b CUNGKR computes the explicit Q factor from CGEQRF and CLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CUNGKR(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* COMPLEX Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNGKR generates an m by n complex matrix Q with orthonormal columns, +*> which is defined as the first n columns of the product of n +*> elementary reflectors +*> +*> Q = I - V*T*V**H = H(1) H(2) . . . H(n) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by CGEQRF and T is the n by n matrix returned by CLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V, and the order of T. +*> N >= 0. 
+*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is COMPLEX array, dimension (LDQ,N) +*> On entry, the upper triangular part and diagonal contains +*> The array T as returned from CLARFT. In addition, the +*> strictly lower triangular portion of the i-th column contains +*> the vector which defines the elementary reflector H(i), +*> for i = 1,2,...,n, as returned by CGEQRF +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== +* Cost: (2mn**2 + n**2 - n)/2 + SUBROUTINE CUNGKR(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + COMPLEX Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX NEG_ONE, ONE + PARAMETER(NEG_ONE=(-1.0E+0,0.0E+0), ONE=(1.0E+0,0.0E+0)) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL CTRMM, CTRTRM, CLUMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Executable Statements .. +* +* Break Q apart as follows +* +* |---| +* Q = | T | +* | V | +* |---| +* +* Where T is an n-by-n upper triangular matrix, and V is an +* m-by-n assumed unit lower trapezoidal matrix +* +* In turn, break apart V as follows +* +* |-----| +* V = | V_1 | +* | V_2 | +* |-----| +* +* Where: +* +* V_1 \in \C^{n\times n} assumed unit lower triangular +* V_2 \in \C^{m-n\times n} +* +* Compute T = T*V_1**H +* + CALL CTRTRM('Right', 'Upper', 'Conjugate Transpose', + $ 'Non-unit', 'Unit', N, ONE, Q, LDQ, Q, LDQ) +* +* Compute Q = -VT. 
This means that we need to break apart +* Our computation in two parts +* +* |--------| +* Q = | -V_1*T | +* | -V_2*T | +* |--------| +* +* Q_2 = -V_2*T (TRMM) but only when necessary +* + IF (M.GT.N) THEN + CALL CTRMM('Right', 'Upper', 'No Transpose', 'Non-unit', + $ M-N, N, NEG_ONE, Q, LDQ, Q(N+1,1), LDQ) + END IF +* +* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication) +* + CALL CLUMM('Left', 'Unit', 'Non-Unit', N, NEG_ONE, Q, LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,I) = Q(I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/cunglk.f b/SRC/cunglk.f new file mode 100644 index 0000000000..3a1a5dfc89 --- /dev/null +++ b/SRC/cunglk.f @@ -0,0 +1,149 @@ +*> \brief \b CUNGLK computes the explicit Q factor from CGELQF and CLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CUNGLK(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* COMPLEX Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNGLK generates an m by n complex matrix Q with orthonormal columns, +*> which is defined as the first n rows of the product of n +*> elementary reflectors +*> +*> Q = I - V'*T*V = H(1) H(2) . . . H(n) +*> +*> Where V is an m by n matrix whose rows are householder reflectors +*> as returned by CGELQF and T is the n by n matrix returned by CLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V, and the order of T. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V. N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is COMPLEX array, dimension (LDQ,N) +*> On entry, the lower triangular part and diagonal contains +*> The array T as returned from CLARFT. 
In addition, the +*> strictly upper triangular portion of the i-th row contains +*> the vector which defines the elementary reflector H(i), +*> for i = 1,2,...,m, as returned by CGELQF +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + SUBROUTINE CUNGLK(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + COMPLEX Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX NEG_ONE, ONE + PARAMETER(NEG_ONE=(-1.0D+0,0.0D+0), ONE=(1.0D+0,0.0D+0)) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL CTRMM, CTRTRM, CLUMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Executable Statements .. +* +* Break Q apart as follows +* +* |-----| +* Q = | T V | +* |-----| +* +* Where T is an m-by-m lower triangular matrix, and V is an +* m-by-n assumed unit upper trapezoidal matrix +* +* In turn, break apart V as follows +* +* |---------| +* V = | V_1 V_2 | +* |---------| +* +* Where: +* +* V_1 \in \R^{m\times m} assumed unit upper triangular +* V_2 \in \R^{m\times n-m} +* +* Compute T = V_1'*T +* + CALL CTRTRM('Left', 'Lower', 'Conjugate Transpose', + $ 'Non-unit', 'Unit', M, ONE, Q, LDQ, Q, LDQ) +* +* Compute Q = -TV. 
This means that we need to break apart +* Our computation in two parts +* +* |---------------| +* Q = | -T*V_1 -T*V_2 | +* |---------------| +* +* Q_2 = -T*V_2 (TRMM) but only when necessary +* + IF (N.GT.M) THEN + CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ M, N-M, NEG_ONE, Q, LDQ, Q(1,M+1), LDQ) + END IF +* +* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication) +* + CALL CLUMM('Left', 'Non-unit', 'Unit', M, NEG_ONE, Q, LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,I) = Q(I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/cunglq.f b/SRC/cunglq.f index 7f42c3dcb0..6361766d1c 100644 --- a/SRC/cunglq.f +++ b/SRC/cunglq.f @@ -136,17 +136,14 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK, + $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNGL2, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNGL2, + $ CUNGLK, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -190,95 +187,97 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) RETURN END IF * - NBMIN = 2 - NX = 0 + NBMIN = MAX( 2, ILAENV( 2, 'CUNGLQ', ' ', M, N, K, -1 ) ) + NX = MAX( 0, ILAENV( 3, 'CUNGLQ', ' ', M, N, K, -1 ) ) IWS = M - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. * - NX = MAX( 0, ILAENV( 3, 'CUNGLQ', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. 
+* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - LDWORK = M - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked version * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'CUNGLQ', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL CUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. +* Factor the last block assuming that our first application +* will be on the Identity matrix * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + I = KK + 1 + IB = NB * -* Set A(kk+1:m,1:kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL CLARFT( 'Forward', 'Transpose', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), A( I, I ), LDA ) * -* Use unblocked code for the last or only block. +* Apply H to A(i+ib:m,i:n) from the right * - IF( KK.LT.M ) - $ CALL CUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL CLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I+IB,I), LDA) * - IF( KK.GT.0 ) THEN +* Apply H to columns i:n of current block + + CALL CUNGLK( IB, N-I+1, A( I, I ), LDA) * * Use blocked code * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + DO I = KI + 1, 1, -NB + IB = NB +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . 
H(i+ib-1) +* + CALL CLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I), + $ LDA, TAU( I ), A( I, I ), LDA ) * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Apply H to A(i+ib:m,i:n) from the right * - CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL CLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I+IB,I), LDA) * -* Apply H**H to A(i+ib:m,i:n) from the right +* Apply H to columns i:n of current block * - CALL CLARFB( 'Right', 'Conjugate transpose', - $ 'Forward', - $ 'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ), - $ LDA, WORK, LDWORK, A( I+IB, I ), LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL CUNGLK( IB, N-I+1, A( I, I ), LDA) + END DO * -* Apply H**H to columns i:n of current block +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary * - CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * -* Set columns 1:i-1 of current block to zero +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL CLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I), + $ LDA, TAU( I ), A( I, I ), LDA ) +* +* Apply H to A(i+ib:m,i:n) from the right +* + CALL CLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I+IB,I), LDA) +* +* Apply H to columns i:n of current block +* + CALL CUNGLK( IB, N-I+1, A( I, I ), LDA) + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/cungql.f b/SRC/cungql.f index 3da2702d8e..c4d228495c 100644 --- a/SRC/cungql.f +++ b/SRC/cungql.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. 
LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -137,17 +135,14 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN, + $ NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNG2L, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNG2L, + $ CUNGKL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -178,7 +173,11 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB +* +* Only need a workspace for cung2l in case of bailout +* and for the panel factorization +* + LWKOPT = N END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * @@ -201,88 +200,75 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 )) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We use blocked code for the entire construction * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. 
* - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'CUNGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL CUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB * -* Set A(m-kk+1:m,1:n-kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * -* Use unblocked code for the first or only block. +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Exploit the fact that we are applying to an identity * - CALL CUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows 1:m-k+i+ib-1 of current block * -* Use blocked code + CALL CUNGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA) + +* Use blocked code on the remaining blocks if there are any. * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + DO I = NB+1, K, NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* The last block may be less than size NB * - CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + IB = MIN(NB, K-I+1) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . 
H(i+1) H(i) * - CALL CLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) * -* Apply H to rows 1:m-k+i+ib-1 of current block +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, - $ TAU( I ), WORK, IINFO ) + CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * -* Set rows m-k+i+ib:m of current block to zero +* Apply H to rows 1:m-k+i+ib-1 of current block * - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL CUNGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA) + END DO END IF * WORK( 1 ) = CMPLX( IWS ) diff --git a/SRC/cungqr.f b/SRC/cungqr.f index eb49d2fed5..655810939a 100644 --- a/SRC/cungqr.f +++ b/SRC/cungqr.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -137,20 +135,17 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, NB, + $ NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNG2R, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNG2R, + $ CUNGKR, XERBLA * .. * .. 
Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX * .. * .. External Functions .. INTEGER ILAENV @@ -163,7 +158,11 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB +* +* Only need a workspace for zung2r in case of bailout +* and for the panel factorization +* + LWKOPT = MAX( 1, N ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -192,92 +191,99 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 +* Determine when to cross over from unblocked to blocked + NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) ) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'CUNGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL CUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Factor the last block assuming that our first application +* will be on the Identity matrix * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + I = KK + 1 + IB = NB * -* Set A(1:kk,kk+1:n) to zero. 
+* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Use unblocked code for the last or only block. +* Apply H to A(i:m,i+ib:n) from the left * - IF( KK.LT.N ) - $ CALL CUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows i:m of current block * -* Use blocked code + CALL CUNGKR(M-I+1, IB, A(I,I), LDA) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Use our standard blocking method after the last block * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + DO I = KI + 1, 1, -NB + IB = NB * - CALL CLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . 
H(i+ib-1) * -* Apply H to A(i:m,i+ib:n) from the left + CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - CALL CLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* Apply H to A(i:m,i+ib:n) from the left +* + CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * * Apply H to rows i:m of current block * - CALL CUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + CALL CUNGKR(M-I+1, IB, A(I,I), LDA) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * -* Set rows 1:i-1 of current block to zero +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) +* +* Apply H to A(i:m,i+ib:n) from the left +* + CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + +* +* Apply H to rows i:m of current block +* + CALL CUNGKR(M-I+1, IB, A(I,I), LDA) + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/cungrk.f b/SRC/cungrk.f new file mode 100644 index 0000000000..72e304176e --- /dev/null +++ b/SRC/cungrk.f @@ -0,0 +1,169 @@ +*> \brief \b CUNGRK computes the explicit Q factor from CGERQF and ZLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CUNGRK(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. 
Array Arguments .. +* COMPLEX Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNGRK generates an m by n complex matrix Q with orthonormal rows, +*> which is defined as the last m rows of the product of m +*> elementary reflectors +*> +*> Q = I - V'*T*V = H(m) . . . H(2) H(1) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by CGERQF and T is the n by n matrix returned by ZLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V, and the order of T. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V. N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is COMPLEX array, dimension (LDQ,N) +*> On entry, Q(i,1:n-m-1+i) contains the vector which defines the +*> elementary reflector H(i), for i=1,...,n as returned by ZGERKF. +*> In addition, the upper triangular portion of the submatrix given +*> by Q(1:m,n-m:n) will contain the array T as returned by ZLARFT. +*> See further details for more information. +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The storage of the V and T components inside Q is best illustrated by +*> the following example with m = 3, n = 5. +*> +*> Q = |----------------| +*> | V1 V1 T1 T1 T1 | +*> | V2 V2 V2 T2 T2 | +*> | V3 V3 V3 V3 T3 | +*> |----------------| +*> +*> \endverbatim +*> +* ===================================================================== + + SUBROUTINE CUNGRK(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + COMPLEX Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX NEG_ONE, ONE + PARAMETER(NEG_ONE=(-1.0E+0,0.0E+0), ONE=(1.0E+0,0.0E+0)) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL CTRMM, CTRTRM, CLUMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Executable Statements .. +* +* Break Q apart as follows +* +* |-----| +* Q = | V T | +* |-----| +* +* Where T is an m-by-m upper triangular matrix, and V is as described +* in the Further Details section +* +* In turn, break apart V as follows +* +* |---------| +* V = | V_2 V_1 | +* |---------| +* +* Where: +* +* V_1 \in \R^{m\times m} assumed unit lower triangular +* V_2 \in \R^{m\times n-m} +* +* Compute T = V_1'*T +* + CALL CTRTRM('Left', 'Upper', 'Conjugate Transpose', + $ 'Non-Unit', 'Unit', M, ONE, Q(1,N-M+1), LDQ, Q(1,N-M+1), + & LDQ) +* +* Compute Q = -TV. This means that we need to break apart +* Our computation in two parts +* +* |---------------| +* Q = | -T*V_2 -T*V_1 | +* |---------------| +* +* Q_2 = -T*V_2 (TRMM) but only when necessary +* + IF (N.GT.M) THEN + CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit', + $ M, N-M, NEG_ONE, Q(1,N-M+1), LDQ, Q, LDQ) + END IF +* +* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication) +* + CALL CLUMM('Right', 'Unit', 'Non-Unit', M, NEG_ONE, + $ Q(1,N-M+1), LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,N-M+I) = Q(I,N-M+I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/cungrq.f b/SRC/cungrq.f index 7e812b839f..5448e55228 100644 --- a/SRC/cungrq.f +++ b/SRC/cungrq.f @@ -137,17 +137,14 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX ZERO - PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. 
* .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK, + INTEGER I, IB, II, IINFO, IWS, KK, LDWORK, $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL CLARFB, CLARFT, CUNGR2, XERBLA + EXTERNAL CLARFB0C2, CLARFT, CUNGR2, + $ CUNGRK, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -200,92 +197,74 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) RETURN END IF * - NBMIN = 2 - NX = 0 + NBMIN = MAX( 2, ILAENV( 2, 'CUNGRQ', ' ', M, N, K, -1 ) ) + NX = MAX( 0, ILAENV( 3, 'CUNGRQ', ' ', M, N, K, -1 ) ) IWS = M - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'CUNGRQ', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN -* -* Determine if workspace is large enough for blocked code. -* - LDWORK = M - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN -* -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. -* - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'CUNGRQ', ' ', M, N, K, - $ -1 ) ) - END IF - END IF - END IF * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * * Use blocked code after the first block. * The last kk rows are handled by the block method. * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) -* -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. 
+* Potentially bail to the unblocked code * - CALL CUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL CUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + I = 1 + IB = NB + II = M - K + I +* +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) +* + CALL CLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA ) * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) + CALL CLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, + $ A( II, N-K+I ), LDA, A, LDA) +* +* Apply H to columns 1:n-k+i+ib-1 of current block +* + CALL CUNGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA ) + + DO I = NB+1, K, NB +* +* The last block may be less than size NB +* + IB = MIN(NB, K-I+1) + II = M - K + I * -* Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . 
H(i+1) H(i) * - CALL CLARFB( 'Right', 'Conjugate transpose', - $ 'Backward', - $ 'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ), - $ LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ), - $ LDWORK ) - END IF + CALL CLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA ) * -* Apply H**H to columns 1:n-k+i+ib-1 of current block +* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL CLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), + $ LDA, A( II, N-K+I ), LDA, A, LDA) * -* Set columns n-k+i+ib:n of current block to zero +* Apply H to columns 1:n-k+i+ib-1 of current block * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL CUNGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA ) + END DO END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f new file mode 100644 index 0000000000..4609263849 --- /dev/null +++ b/SRC/dlarfb0c2.f @@ -0,0 +1,556 @@ +*> \brief \b DLARFB0C2 applies a block reflector or its transpose to a +* rectangular matrix with a 0 block while constructing the explicit Q factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! 
Array arguments +* DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLARFB0C2 applies a real block reflector H or its transpose H**T to a +*> real m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**T from the Left +*> = 'R': apply H or H**T from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'T': apply H**T (Transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is DOUBLE PRECISION array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> The matrix V. See Further Details. 
+*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, dimension (LDT,K) +*> The triangular k by k matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the m by n matrix C. +*> On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. 
+*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) + ! Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + ! Array arguments + DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, RQ, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! External subroutines + EXTERNAL DGEMM, DTRMM, XERBLA + ! Parameters + DOUBLE PRECISION ONE, ZERO, NEG_ONE + PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE = -1.0D+0) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'T') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 
'first' reflector stored in the last row + RQ = (.NOT.DIRF).AND.(.NOT.COLV) + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = V(K+J,I) + END DO + END DO + ELSE + CALL DGEMM('Transpose', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF (C2I) THEN + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! 
We are computing C = C op(H) = C(I-V' op(T) V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = C op(H) = [ 0, C2 ](I - [ V1' ]op(T)[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]op(T)[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*op(T)[ V1, V2 ] + ! + ! = [ -C2*V2'*op(T)*V1, C2 - C2*V2'*op(T)*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*op(T) + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = V(J,K+I) + END DO + END DO + ELSE + CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF + ! + ! C1 = C1*T' + ! + IF (TRANST) THEN + CALL DTRMM('Right', 'Upper', 'Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C, LDC) + ELSE + CALL DTRMM('Right', 'Lower', 'No Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C, LDC) + END IF + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL DTRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! 
We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('DLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = V(J,I) + END DO + END DO + ELSE + CALL DGEMM('Transpose', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF( C2I ) THEN + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE IF (RQ) THEN + ! We are computing C = C op(H) = C(I-V' op(T) V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! 
with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = C op(H) = [ C2, 0 ] (I - [ V2' ]op(T)[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]op(T)[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*op(T)[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*op(T)*V2, C2*V2'*op(T)*V1 ] + ! + ! = [ C2 - C2*V2'*op(T)*V2, -C2*V2'*op(T)*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*op(T) + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! Check to ensure side has the expected value + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('DLARFB0C2', 2) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = V(J,I) + END DO + END DO + ELSE + CALL DGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF + ! + ! C1 = C1*op(T) + ! + IF( TRANST ) THEN + CALL DTRMM('Right', 'Lower', 'Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ELSE + CALL DTRMM('Right', 'Upper', 'No Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC) + END IF + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF + ! + ! C1 = -C1*V1 + ! 
+ CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/dlarft.f b/SRC/dlarft.f index e6149ded7b..bf9e3d5334 100644 --- a/SRC/dlarft.f +++ b/SRC/dlarft.f @@ -182,7 +182,7 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * .. Local Scalars .. * INTEGER I,J,L - LOGICAL QR,LQ,QL,DIRF,COLV + LOGICAL QR,LQ,QL,RQ,LQT,RQT,DIRF,COLV,TDIRF,TCOLV * * .. External Subroutines .. * @@ -192,7 +192,7 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * LOGICAL LSAME EXTERNAL LSAME -* +* * The general scheme used is inspired by the approach inside DGEQRT3 * which was (at the time of writing this code): * Based on the algorithm of Elmroth and Gustavson, @@ -223,26 +223,37 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * 'C' for STOREV, then they meant to provide 'R' * DIRF = LSAME(DIRECT,'F') + TDIRF = LSAME(DIRECT,'T') COLV = LSAME(STOREV,'C') + TCOLV = LSAME(STOREV,'T') * * QR happens when we have forward direction in column storage * QR = DIRF.AND.COLV * -* LQ happens when we have forward direction in row storage +* LQT happens when we have forward direction in row storage and want to compute the transpose of +* the T we would normally compute +* + LQT = DIRF.AND.TCOLV +* +* LQ happens when we have forward direction in row storage and want to compute the T we would +* normally compute * - LQ = DIRF.AND.(.NOT.COLV) + LQ = DIRF.AND.(.NOT.LQT) * * QL happens when we have backward direction in column storage * QL = (.NOT.DIRF).AND.COLV * -* The last case is RQ. 
Due to how we structured this, if the -* above 3 are false, then RQ must be true, so we never store -* this -* RQ happens when we have backward direction in row storage -* RQ = (.NOT.DIRF).AND.(.NOT.COLV) +* RQT happens when we have backward direction in row storage and want to compute the transpose +* of the T we would normally compute * + RQT = TDIRF.AND.(.NOT.COLV) +* +* RQ happens when we have backward direction in row storage and want to compute the T that we +* would normally compute +* + RQ = (.NOT.RQT).AND.(.NOT.COLV) IF(QR) THEN * * Break V apart into 6 components @@ -256,17 +267,17 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{1,1}\in\R^{l,l} unit lower triangular * V_{2,1}\in\R^{k-l,l} rectangular * V_{3,1}\in\R^{n-k,l} rectangular -* +* * V_{2,2}\in\R^{k-l,k-l} unit lower triangular * V_{3,2}\in\R^{n-k,k-l} rectangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} T_{1,2}| * |0 T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 * @@ -277,17 +288,17 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* +* * (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2') * = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2' -* +* * Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |-------| * |V_1 V_2| * |-------| -* +* * So, our product is equivalent to the matrix product * I - V*T*V' * This means, we can compute T_{1,1} and T_{2,2}, then use this information @@ -299,30 +310,29 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{2,2} recursively * - CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, - $ TAU(L+1), T(L+1, L+1), LDT) + CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) * -* Compute T_{1,2} +* Compute T_{1,2} * T_{1,2} = V_{2,1}' * DO J = 1, L DO I = 1, K-L - T(J, L+I) = V(L+I, J) + T(J,L+I) = V(L+I,J) END DO END DO * * T_{1,2} = T_{1,2}*V_{2,2} * CALL DTRMM('Right', 'Lower', 'No transpose', 'Unit', L, - $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) * * T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2} * Note: We assume K <= N, and GEMM will do nothing if N=K * - CALL DGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE, - $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE, - $ T(1, L+1), LDT) + CALL DGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE, + $ V(K+1, 1), LDV, V(K+1,L+1), LDV, ONE, T(1, L+1), LDT) * * At this point, we have that T_{1,2} = V_1'*V_2 * All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} @@ -331,12 +341,12 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{1,2} = -T_{1,1}*T_{1,2} * CALL DTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) * * 
T_{1,2} = T_{1,2}*T_{2,2} * - CALL DTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + CALL DTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT) ELSE IF(LQ) THEN * @@ -350,19 +360,19 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{1,1}\in\R^{l,l} unit upper triangular * V_{1,2}\in\R^{l,k-l} rectangular * V_{1,3}\in\R^{l,n-k} rectangular -* +* * V_{2,2}\in\R^{k-l,k-l} unit upper triangular * V_{2,3}\in\R^{k-l,n-k} rectangular * * Where l = floor(k/2) * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} T_{1,2}| * |0 T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 * @@ -371,20 +381,20 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{1,2}\in\R^{l, k-l} rectangular * * Then, consider the product: -* -* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) -* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 -* +* +* (I - V_1'*T_{1,1}'*V_1)*(I - V_2'*T_{2,2}'*V_2) +* = I - V_1'*T_{1,1}'*V_1 - V_2'*T_{2,2}'*V_2 + V_1'*T_{1,1}'*V_1*V_2'*T_{2,2}'*V_2 +* * Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |---| * |V_1| * |V_2| * |---| -* +* * So, our product is equivalent to the matrix product -* I - V'*T*V +* I - V'*T'*V * This means, we can compute T_{1,1} and T_{2,2}, then use this information * to compute T_{1,2} * @@ -394,27 +404,26 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{2,2} recursively * - CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, - $ TAU(L+1), T(L+1, L+1), LDT) + CALL DLARFT(DIRECT, 
STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) * * Compute T_{1,2} * T_{1,2} = V_{1,2} * - CALL DLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT) + CALL DLACPY('All', L, K - L, V(1,L+1), LDV, T(1, L+1), LDT) * * T_{1,2} = T_{1,2}*V_{2,2}' * CALL DTRMM('Right', 'Upper', 'Transpose', 'Unit', L, K-L, - $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) * * T_{1,2} = V_{1,3}*V_{2,3}' + T_{1,2} * Note: We assume K <= N, and GEMM will do nothing if N=K * CALL DGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE, - $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, - $ T(1, L+1), LDT) + $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, T(1, L+1), LDT) * * At this point, we have that T_{1,2} = V_1*V_2' * All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} @@ -423,13 +432,106 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{1,2} = -T_{1,1}*T_{1,2} * CALL DTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) * * T_{1,2} = T_{1,2}*T_{2,2} * CALL DTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT) + ELSE IF(LQT) THEN +* +* Break V apart into 6 components +* +* V = |----------------------| +* |V_{1,1} V_{1,2} V{1,3}| +* |0 V_{2,2} V{2,3}| +* |----------------------| +* +* V_{1,1}\in\R^{l,l} unit upper triangular +* V_{1,2}\in\R^{l,k-l} rectangular +* V_{1,3}\in\R^{l,n-k} rectangular +* +* V_{2,2}\in\R^{k-l,k-l} unit upper triangular +* V_{2,3}\in\R^{k-l,n-k} rectangular +* +* Where l = floor(k/2) +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} 0 | +* |T_{2,1} T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\R^{l, l} lower triangular +* T_{2,2}\in\R^{k-l, k-l} lower triangular +* T_{2,1}\in\R^{k-l, l} rectangular +* +* Then, consider the product: +* +* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) +* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 +* +* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{2,1} +* +* Compute T_{1,1} recursively +* + CALL DLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) + +* +* Compute T_{2,1} +* T_{2,1} = V_{1,2}' +* + DO I = 1, K-L + DO J = 1, L + T(L+I,J) = V(J,L+I) + END DO + END DO +* +* T_{2,1} = V_{2,2}*T_{2,1} +* + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit', K-L, L, + $ ONE, V(L+1,L+1), LDV, T(L+1,1), LDT) +* +* T_{2,1} = V_{2,3}*V_{1,3}' + T_{2,1} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL DGEMM('No Transpose', 'Transpose', K-L, L, N-K, ONE, + $ V(L+1,K+1), LDV, V(1, K+1), LDV, ONE, T(L+1,1), LDT) +* +* At this point, we have that T_{2,1} = V_2*V_1' +* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} +* respectively. 
+* +* T_{2,1} = -T_{2,2}*T_{2,1} +* + CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K-L, L, NEG_ONE, T(L+1,L+1), LDT, T(L+1,1), LDT) +* +* T_{2,1} = T_{2,1}*T_{1,1} +* + CALL DTRMM('Right', 'Lower', 'No Transpose', 'Non-unit', + $ K-L, L, ONE, T, LDT, T(L+1,1), LDT) ELSE IF(QL) THEN * * Break V apart into 6 components @@ -442,18 +544,18 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * * V_{1,1}\in\R^{n-k,k-l} rectangular * V_{2,1}\in\R^{k-l,k-l} unit upper triangular -* +* * V_{1,2}\in\R^{n-k,l} rectangular * V_{2,2}\in\R^{k-l,l} rectangular * V_{3,2}\in\R^{l,l} unit upper triangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} 0 | * |T_{2,1} T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 * @@ -464,17 +566,17 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* +* * (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1') * = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1' -* +* * Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |-------| * |V_1 V_2| * |-------| -* +* * So, our product is equivalent to the matrix product * I - V*T*V' * This means, we can compute T_{1,1} and T_{2,2}, then use this information @@ -482,34 +584,34 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{1,1} recursively * - CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) + CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) * * Compute T_{2,2} recursively * CALL DLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV, - $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) + $ 
TAU(K-L+1), T(K-L+1,K-L+1), LDT) * * Compute T_{2,1} * T_{2,1} = V_{2,2}' * DO J = 1, K-L DO I = 1, L - T(K-L+I, J) = V(N-K+J, K-L+I) + T(K-L+I,J) = V(N-K+J, K-L+I) END DO END DO * * T_{2,1} = T_{2,1}*V_{2,1} * CALL DTRMM('Right', 'Upper', 'No transpose', 'Unit', L, - $ K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT) + $ K-L, ONE, V(N-K+1,1), LDV, T(K-L+1,1), LDT) * * T_{2,1} = V_{2,2}'*V_{2,1} + T_{2,1} * Note: We assume K <= N, and GEMM will do nothing if N=K * CALL DGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE, - $ V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1), - $ LDT) + $ V(1,K-L+1), LDV, V, LDV, ONE, T(K-L+1,1), LDT) * * At this point, we have that T_{2,1} = V_2'*V_1 * All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} @@ -518,17 +620,13 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{2,1} = -T_{2,2}*T_{2,1} * CALL DTRMM('Left', 'Lower', 'No transpose', 'Non-unit', L, - $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, - $ T(K-L+1, 1), LDT) + $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT) * * T_{2,1} = T_{2,1}*T_{1,1} * CALL DTRMM('Right', 'Lower', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) - ELSE -* -* Else means RQ case -* + $ K-L, ONE, T, LDT, T(K-L+1,1), LDT) + ELSE IF(RQ) THEN * Break V apart into 6 components * * V = |-----------------------| @@ -543,13 +641,13 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{2,2}\in\R^{l,k-l} rectangular * V_{2,3}\in\R^{l,l} unit lower triangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} 0 | * |T_{2,1} T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 * @@ -560,51 +658,51 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* -* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1) -* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1 -* +* +* (I - V_2'*T_{2,2}'*V_2)*(I - V_1'*T_{1,1}'*V_1) +* = I - V_2'*T_{2,2}'*V_2 - V_1'*T_{1,1}'*V_1 + V_2'*T_{2,2}'*V_2*V_1'*T_{1,1}'*V_1 +* * Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |---| * |V_1| * |V_2| * |---| -* +* * So, our product is equivalent to the matrix product -* I - V'*T*V +* I - V'*T'*V * This means, we can compute T_{1,1} and T_{2,2}, then use this information * to compute T_{2,1} * * Compute T_{1,1} recursively * - CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) + CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) * * Compute T_{2,2} recursively * - CALL DLARFT(DIRECT, STOREV, N, L, V(K-L+1, 1), LDV, - $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) + CALL DLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV, + $ TAU(K-L+1), T(K-L+1,K-L+1), LDT) * * Compute T_{2,1} * T_{2,1} = V_{2,2} * - CALL DLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV, - $ T(K-L+1, 1), LDT) + CALL DLACPY('All', L, K-L, V(K-L+1,N-K+1), LDV, T(K-L+1,1), + $ LDT) * * T_{2,1} = T_{2,1}*V_{1,2}' * CALL DTRMM('Right', 'Lower', 'Transpose', 'Unit', L, K-L, - $ ONE, V(1, N-K+1), LDV, T(K-L+1, 1), LDT) + $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT) * -* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1} +* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1} * Note: We assume K <= N, and GEMM will do nothing if N=K * - CALL DGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE, - $ V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1), - $ LDT) + CALL DGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE, + $ V(K-L+1,1), LDV, V, LDV, ONE, T(K-L+1,1), LDT) * * At this point, we have that T_{2,1} = V_2*V_1' @@ -614,13 
+712,103 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{2,1} = -T_{2,2}*T_{2,1} * CALL DTRMM('Left', 'Lower', 'No tranpose', 'Non-unit', L, - $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, - $ T(K-L+1, 1), LDT) + $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT) * * T_{2,1} = T_{2,1}*T_{1,1} * CALL DTRMM('Right', 'Lower', 'No tranpose', 'Non-unit', L, - $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) + $ K-L, ONE, T, LDT, T(K-L+1,1), LDT) + ELSE IF(RQT) THEN +* Break V apart into 6 components +* +* V = |-----------------------| +* |V_{1,1} V_{1,2} 0 | +* |V_{2,1} V_{2,2} V_{2,3}| +* |-----------------------| +* +* V_{1,1}\in\R^{k-l,n-k} rectangular +* V_{1,2}\in\R^{k-l,k-l} unit lower triangular +* +* V_{2,1}\in\R^{l,n-k} rectangular +* V_{2,2}\in\R^{l,k-l} rectangular +* V_{2,3}\in\R^{l,l} unit lower triangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* | 0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\R^{k-l, k-l} non-unit upper triangular +* T_{2,2}\in\R^{l, l} non-unit upper triangular +* T_{1,2}\in\R^{k-l, l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1) +* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1 +* +* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) +* +* Compute T_{2,2} recursively +* + CALL DLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV, + $ TAU(K-L+1), T(K-L+1,K-L+1), LDT) +* +* Compute T_{1,2} +* T_{1,2} = V_{2,2}' +* + DO I = 1, K-L + DO J = 1, L + T(I,K-L+J) = V(K-L+J, N-K+I) + END DO + END DO +* +* T_{1,2} = V_{1,2}*T_{1,2} +* + CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit', K-L, L, + $ ONE, V(1,N-K+1), LDV, T(1,K-L+1), LDT) +* +* T_{1,2} = V_{1,1}*V_{2,1}' + T_{1,2} +* + CALL DGEMM('No Transpose', 'Transpose', K-L, L, N-K, ONE, V, + $ LDV, V(K-L+1,1), LDV, ONE, T(1, K-L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1*V_2' +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively.
+* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit', + $ K-L, L, NEG_ONE, T, LDT, T(1, K-L+1), LDT) +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL DTRMM('Right', 'Upper', 'No Transpose', 'Non-Unit', + $ K-L, L, ONE, T(K-L+1,K-L+1), LDT, T(1, K-L+1), LDT) END IF END SUBROUTINE diff --git a/SRC/dlumm.f b/SRC/dlumm.f new file mode 100644 index 0000000000..ca572f6505 --- /dev/null +++ b/SRC/dlumm.f @@ -0,0 +1,349 @@ +*> \brief \b DLUMM computes an in place triangular times triangluar matrix multiplication +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE DLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA, +* $ A, LDA) +* +* .. Scalar Arguments .. +* INTEGER N, LDA +* CHARACTER SIDEL, DIAGL, DIAGU +* DOUBLE PRECISION ALPHA +* +* .. Array Arguments .. +* DOUBLE PRECISION A(LDA,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLUMM performs one of the matrix-matrix operations +*> +*> C = \alpha L * U +*> or +*> C = \alpha U * L +*> +*> where \alpha is a scalar, L is a unit, or non-unit, lower triangular matrix, and U is a unit, or +*> non-unit, upper triangular matrix, and at most one of L and U are non-unit +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDEL +*> \verbatim +*> SIDEL is CHARACTER*1 +*> On entry, SIDE specifies whether L multiplies U from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' A = \alpha L * U +*> +*> SIDE = 'R' or 'r' A = \alpha U * L +*> \endverbatim +*> +*> \param[in] DIAGL +*> \verbatim +*> DIAGL is CHARACTER*1 +*> On entry, DIAGL specifies whether or not L is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' L is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' L is not assumed to be unit +*> triangular. 
+*> \endverbatim +*> +*> \param[in] DIAGU +*> \verbatim +*> DIAGU is CHARACTER*1 +*> On entry, DIAGU specifies whether or not U is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' U is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' U is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> On entry, N specifies the number of rows and columns of L and U. N must be at +*> least zero. +*> \endverbatim +*> +*> \param[in] ALPHA +*> \verbatim +*> ALPHA is DOUBLE PRECISION. +*> On entry, ALPHA specifies the scalar alpha. When alpha is +*> zero then A is not referenced, and A need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension ( LDA, N ) where +*> Before entry the leading n-by-n strictly upper triangular part of the array +*> A must contain the upper triangular matrix U and the strictly lower triangular part of +*> the leading n-by-n submatrix must contain the lower triangular matrix L. +*> If DIAGL != 'U', then the diagonal is assumed to be part of L, and if +*> DIAGU != 'U', then the diagonal is assumed to be part of U. +*> Note: At most one of DIAGL and DIAGU can be not equal to 'U'. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> On entry, LDA specifies the first dimension of A as declared +*> in the calling (sub) program. LDA must be at least max( 1, n ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== +c Cost: 2/3 * (n^3 - n) + RECURSIVE SUBROUTINE DLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA, + $ A, LDA) +* +* .. Scalar Arguments .. + INTEGER N, LDA + CHARACTER SIDEL, DIAGL, DIAGU + DOUBLE PRECISION ALPHA +* +* .. Array Arguments .. + DOUBLE PRECISION A(LDA,*) +* ..
+* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DTRMM, DLASET, XERBLA +* .. +* .. Local Scalars .. + INTEGER K + LOGICAL LLEFT, LUNIT, UUNIT +* .. +* .. Local Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER(ONE=1.0D+0, ZERO=0.0D+0) +* .. +* +* Determine if our flags are valid or not. We can have at +* most one of DIAGU, DIAGL not equal to 'U' +* + LUNIT = LSAME(DIAGL, 'U') + UUNIT = LSAME(DIAGU, 'U') +* +* If both of the above are false, then it is impossible to have the +* structure that we are exploiting in this routine +* Note: It is possible to allow the matrices to share a non-unit +* diagonal as long as the values are the exact same, but there is +* currently no use case for this that I am aware of. +* + IF ((.NOT.LUNIT).AND.(.NOT.UUNIT)) THEN +* +* We say the error is in the last set DIAG value as we cannot know +* what the user actually meant. 
+* + CALL XERBLA( 'DLUMM', 3 ) + RETURN + END IF +* +* Determine which side L is on +* + LLEFT = LSAME(SIDEL, 'L') +* +* Early exit if possible +* + IF (N.EQ.0) THEN + RETURN + END IF + IF (ALPHA.EQ.ZERO) THEN + CALL DLASET('All', N, N, ZERO, ZERO, A, LDA) + RETURN + END IF +* +* Terminating Case +* + IF (N.EQ.1) THEN +* +* Since at most one of L and U are non-unit triangular, whatever side L is on, we are still +* always computing one of +* +* 1) A(1,1) = ALPHA*A(1,1) +* 2) A(1,1) = ALPHA +* +* Where the first case happens when exactly one of L and U are unit triangular, while the +* second case happens when both L and U are unit triangular +* + IF (LUNIT.AND.UUNIT) THEN + A(1,1) = ALPHA + ELSE + A(1,1) = ALPHA*A(1,1) + END IF + RETURN + END IF +* +* Recursive Case +* + K = N/2 +* +* Regardless of us computing A = L*U or A = U*L, break A apart as follows: +* +* |---| +* A = | U | +* | L | +* |---| +* +* Further break down L as +* |---------------| +* L = | L_{11} 0 | +* | L_{21} L_{22} | +* |---------------| +* +* Where: +* +* L_{11}\in\R^{k\times k} is lower triangular (assumed unit iff DIAGL == 'U') +* L_{21}\in\R^{n-k\times k} is rectangular +* L_{22}\in\R^{n-k\times n-k} is lower triangular (assumed unit iff DIAGL == 'U') +* +* Further break down U as +* |---------------| +* U = | U_{11} U_{12} | +* | 0 U_{22} | +* |---------------| +* +* Where: +* +* U_{11}\in\R^{k\times k} is upper triangular (assumed unit iff DIAGU == 'U') +* U_{12}\in\R^{k\times n-k} is rectangular +* U_{22}\in\R^{n-k\times n-k} is upper triangular (assumed unit iff DIAGU == 'U') + IF (LLEFT) THEN +* +* This means we are computing +* |---------------| |---------------| +* A = L*U = \alpha | L_{11} 0 | * | U_{11} U_{12} | +* | L_{21} L_{22} | | 0 U_{22} | +* |---------------| |---------------| +* +* |---------------------------------------------| +* = \alpha | L_{11}*U_{11} L_{11}*U_{12} | +* | L_{21}*U_{11} L_{21}*U_{12} + L_{22}*U_{22} | +*
|---------------------------------------------| +* +* We compute these in the following order +* +* A_{22} = \alpha*L_{22}*U_{22} (This routine) +* A_{22} = \alpha*L_{21}*U_{12} + A_{22} (GEMM) +* +* A_{12} = \alpha*L_{11}*U_{12} (TRMM) +* A_{21} = \alpha*L_{21}*U_{11} (TRMM) +* +* A_{11} = \alpha*L_{11}*U_{11} (This routine) +* +* Compute A_{22} +* +* A_{22} = \alpha*L_{22}*U_{22} +* + CALL DLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA, + $ A(K+1, K+1), LDA) +* +* A_{22} = \alpha L_{21}*U_{12} + A_{22} +* + CALL DGEMM('No Transpose', 'No Transpose', N-K, N-K, K, + $ ALPHA, A(K+1,1), LDA, A(1,K+1), LDA, ONE, + $ A(K+1,K+1), LDA) +* +* Compute A_{12} +* +* A_{12} = \alpha*L_{11}*U_{12} +* + CALL DTRMM('Left', 'Lower', 'No Transpose', DIAGL, K, + $ N-K, ALPHA, A, LDA, A(1,K+1), LDA) +* +* Compute A_{21} +* +* A_{21} = \alpha*L_{21}*U_{11} +* + CALL DTRMM('Right', 'Upper', 'No Transpose', DIAGU, N-K, + $ K, ALPHA, A, LDA, A(K+1,1), LDA) +* +* Compute A_{11} +* +* A_{11} = \alpha*L_{11}*U_{11} +* + CALL DLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA) + ELSE +* +* This means we are computing +* |---------------| |---------------| +* A = U*L = \alpha | U_{11} U_{12} | * | L_{11} 0 | +* | 0 U_{22} | | L_{21} L_{22} | +* |---------------| |---------------| +* +* |---------------------------------------------| +* = \alpha | U_{11}*L_{11} + U_{12}*L_{21} U_{12}*L_{22} | +* | U_{22}*L_{21} U_{22}*L_{22} | +* |---------------------------------------------| +* +* We compute these in the following order +* +* A_{11} = \alpha*U_{11}*L_{11} (This routine) +* A_{11} = \alpha*U_{12}*L_{21} + A_{11} (GEMM) +* +* A_{12} = \alpha*U_{12}*L_{22} (TRMM) +* A_{21} = \alpha*U_{22}*L_{21} (TRMM) +* +* A_{22} = \alpha*U_{22}*L_{22} (This routine) +* +* Compute A_{11} +* +* A_{11} = \alpha*U_{11}*L_{11} +* + CALL DLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA) +* +* A_{11} = \alpha*U_{12}*L_{21} + A_{11} +* + CALL DGEMM('No Transpose', 'No Transpose', K, K, N-K, + $ ALPHA, A(1,K+1), LDA, A(K+1,1), LDA, ONE, 
A, LDA) +* +* Compute A_{12} +* +* A_{12} = \alpha*U_{12}*L_{22} +* + CALL DTRMM('Right', 'Lower', 'No Transpose', DIAGL, K, + $ N-K, ALPHA, A(K+1,K+1), LDA, A(1,K+1), LDA) +* +* Compute A_{21} +* +* A_{21} = \alpha*U_{22}*L_{21} +* + CALL DTRMM('Left', 'Upper', 'No Transpose', DIAGU, N-K, + $ K, ALPHA, A(K+1, K+1), LDA, A(K+1,1), LDA) +* +* Compute A_{22} +* +* A_{22} = \alpha*U_{22}*L_{22} +* + CALL DLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA, + $ A(K+1, K+1), LDA) + END IF + END SUBROUTINE diff --git a/SRC/dorgkl.f b/SRC/dorgkl.f new file mode 100644 index 0000000000..5223dc74b8 --- /dev/null +++ b/SRC/dorgkl.f @@ -0,0 +1,173 @@ +*> \brief \b DORGKL computes the explicit Q factor from DGEQLF and DLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DORGKL(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* DOUBLE PRECISION Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORGKL generates an m by n real matrix Q with orthonormal columns, +*> which is defined as the last n columns of the product of n +*> elementary reflectors +*> +*> Q = I - V*T*V**T = H(n) . . . H(2) H(1) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by DGEQLF and T is the n by n matrix returned by DLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V, and the order of T. +*> N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is DOUBLE PRECISION array, dimension (LDQ,N) +*> On entry, Q(1:m-n+i-1,i) contains the vector which defines the +*> elementary reflector H(i), for i=1,...,n as returned by DGEQLF. 
+*> In addition, the lower triangular portion of the submatrix given +*> by Q(m-n+1:m,1:n) will contain the array T as returned by DLARFT. +*> See further details for more information. +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The storage of the V and T components inside Q is best illustrated by +*> the following example with m = 5, n = 3. +*> +*> Q = |----------| +*> | V1 V2 V3 | +*> | V1 V2 V3 | +*> | T1 V2 V3 | +*> | T1 T2 V3 | +*> | T1 T2 T3 | +*> |----------| +*> +*> \endverbatim +*> +* ===================================================================== + + SUBROUTINE DORGKL(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + DOUBLE PRECISION Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION NEG_ONE, ONE + PARAMETER(NEG_ONE=-1.0D+0, ONE=1.0D+0) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL DTRMM, DTRTRM, DLUMM +* .. +* .. Intrinsic Functions.. + INTRINSIC MIN +* .. +* .. Executable Statements ..
+* +* Break Q apart as follows +* +* |---| +* Q = | V | +* | T | +* |---| +* +* Where T is an n-by-n lower triangular matrix, and V is as described +* in the Further Details section +* +* In turn, break apart V as follows +* +* |-----| +* V = | V_2 | +* | V_1 | +* |-----| +* +* Where: +* +* V_1 \in \R^{n\times n} assumed unit upper triangular +* V_2 \in \R^{m-n\times n} +* +* Compute T = T*V_1**T +* + CALL DTRTRM('Right', 'Lower', 'Transpose', 'Non-Unit', 'Unit', + $ N, ONE, Q(M-N+1,1), LDQ, Q(M-N+1,1), LDQ) +* +* Compute Q = -VT. This means that we need to break apart +* Our computation in two parts +* +* |--------| +* Q = | -V_2*T | +* | -V_1*T | +* |--------| +* +* Q_2 = -V_2*T (TRMM) but only when necessary +* + IF (M.GT.N) THEN + CALL DTRMM('Right', 'Lower', 'No Transpose', 'Non-Unit', + $ M-N, N, NEG_ONE, Q(M-N+1,1), LDQ, Q, LDQ) + END IF +* +* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication) +* + CALL DLUMM('Right', 'Non-Unit', 'Unit', N, NEG_ONE, + $ Q(M-N+1,1), LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(M-N+I,I) = Q(M-N+I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/dorgkr.f b/SRC/dorgkr.f new file mode 100644 index 0000000000..51b79c3772 --- /dev/null +++ b/SRC/dorgkr.f @@ -0,0 +1,153 @@ +*> \brief \b DORGKR computes the explicit Q factor from DGEQRF and DLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DORGKR(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* DOUBLE PRECISION Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORGKR generates an m by n real matrix Q with orthonormal columns, +*> which is defined as the first n columns of the product of n +*> elementary reflectors +*> +*> Q = I - V*T*V**T = H(1) H(2) . . . 
H(n) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by DGEQRF and T is the n by n matrix returned by DLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V, and the order of T. +*> N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is DOUBLE PRECISION array, dimension (LDQ,N) +*> On entry, the upper triangular part and diagonal contains +*> The array T as returned from DLARFT. In addition, the +*> strictly lower triangular portion of the i-th column contains +*> the vector which defines the elementary reflector H(i), +*> for i = 1,2,...,n, as returned by DGEQRF +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + SUBROUTINE DORGKR(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + DOUBLE PRECISION Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION NEG_ONE, ONE + PARAMETER(NEG_ONE=-1.0D+0, ONE=1.0D+0) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL DTRMM, DTRTRM, DLUMM +* .. +* .. Intrinsic Functions.. + INTRINSIC MIN +* .. +* .. Executable Statements .. 
+* +* Break Q apart as follows +* +* |---| +* Q = | T | +* | V | +* |---| +* +* Where T is an n-by-n upper triangular matrix, and V is an +* m-by-n assumed unit lower trapezoidal matrix +* +* In turn, break apart V as follows +* +* |-----| +* V = | V_1 | +* | V_2 | +* |-----| +* +* Where: +* +* V_1 \in \R^{n\times n} assumed unit lower triangular +* V_2 \in \R^{m-n\times n} +* +* Compute T = T*V_1**T +* + CALL DTRTRM('Right', 'Upper', 'Transpose', 'Non-unit', 'Unit', + $ N, ONE, Q, LDQ, Q, LDQ) +* +* Compute Q = -VT. This means that we need to break apart +* Our computation in two parts +* +* |--------| +* Q = | -V_1*T | +* | -V_2*T | +* |--------| +* +* Q_2 = -V_2*T (TRMM) but only when necessary +* + IF (M.GT.N) THEN + CALL DTRMM('Right', 'Upper', 'No Transpose', 'Non-unit', + $ M-N, N, NEG_ONE, Q, LDQ, Q(N+1,1), LDQ) + END IF +* +* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication) +* + CALL DLUMM('Left', 'Unit', 'Non-Unit', N, NEG_ONE, Q, LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,I) = Q(I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/dorglk.f b/SRC/dorglk.f new file mode 100644 index 0000000000..e68939aa8c --- /dev/null +++ b/SRC/dorglk.f @@ -0,0 +1,149 @@ +*> \brief \b DORGLK computes the explicit Q factor from DGELQF and DLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DORGLK(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* DOUBLE PRECISION Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORGLK generates an m by n real matrix Q with orthonormal rows, +*> which is defined as the first m rows of the product of m +*> elementary reflectors +*> +*> Q = I - V'*T*V = H(1) H(2) . . . 
H(m) +*> +*> Where V is an m by n matrix whose rows are householder reflectors +*> as returned by DGELQF and T is the m by m matrix returned by DLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V, and the order of T. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V. N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is DOUBLE PRECISION array, dimension (LDQ,N) +*> On entry, the lower triangular part and diagonal contains +*> The array T as returned from DLARFT. In addition, the +*> strictly upper triangular portion of the i-th row contains +*> the vector which defines the elementary reflector H(i), +*> for i = 1,2,...,m, as returned by DGELQF +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + SUBROUTINE DORGLK(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + DOUBLE PRECISION Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION NEG_ONE, ONE + PARAMETER(NEG_ONE=-1.0D+0, ONE=1.0D+0) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL DTRMM, DTRTRM, DLUMM +* .. +* .. Intrinsic Functions.. + INTRINSIC MIN +* .. +* .. Executable Statements ..
+* +* Break Q apart as follows +* +* |-----| +* Q = | T V | +* |-----| +* +* Where T is an m-by-m lower triangular matrix, and V is an +* m-by-n assumed unit upper trapezoidal matrix +* +* In turn, break apart V as follows +* +* |---------| +* V = | V_1 V_2 | +* |---------| +* +* Where: +* +* V_1 \in \R^{m\times m} assumed unit upper triangular +* V_2 \in \R^{m\times n-m} +* +* Compute T = V_1'*T +* + CALL DTRTRM('Left', 'Lower', 'Transpose', 'Non-unit', 'Unit', + $ M, ONE, Q, LDQ, Q, LDQ) +* +* Compute Q = -TV. This means that we need to break apart +* Our computation in two parts +* +* |---------------| +* Q = | -T*V_1 -T*V_2 | +* |---------------| +* +* Q_2 = -T*V_2 (TRMM) but only when necessary +* + IF (N.GT.M) THEN + CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ M, N-M, NEG_ONE, Q, LDQ, Q(1,M+1), LDQ) + END IF +* +* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication) +* + CALL DLUMM('Left', 'Non-unit', 'Unit', M, NEG_ONE, Q, LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,I) = Q(I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/dorglq.f b/SRC/dorglq.f index 71ff93fa66..152aa11c6a 100644 --- a/SRC/dorglq.f +++ b/SRC/dorglq.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGLQ + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -136,17 +138,14 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LDWORK, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORGL2, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORGL2, + $ DORGLK, XERBLA * .. * .. 
Intrinsic Functions .. INTRINSIC MAX, MIN @@ -161,7 +160,8 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'DORGLQ', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, M )*NB + ! Only need a workspace for calls to dorgl2 + LWKOPT = MAX( 1, M ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -190,94 +190,92 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX( 0, ILAENV( 3, 'DORGLQ', ' ', M, N, K, -1 ) ) IWS = M - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. * - NX = MAX( 0, ILAENV( 3, 'DORGLQ', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * - LDWORK = M - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked version * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'DORGLQ', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL DORGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + CALL DLARFT( 'Forward', 'Transpose', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), A( I, I ), LDA ) * -* Set A(kk+1:m,1:kk) to zero. 
+* Apply H to A(i+ib:m,i:n) from the right * - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL DLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I+IB,I), LDA) * -* Use unblocked code for the last or only block. +* Apply H to columns i:n of current block + + CALL DORGLK( IB, N-I+1, A( I, I ), LDA) * - IF( KK.LT.M ) - $ CALL DORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) +* Use blocked code * - IF( KK.GT.0 ) THEN + DO I = KI + 1, 1, -NB + IB = NB * -* Use blocked code +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + CALL DLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I), + $ LDA, TAU( I ), A( I, I ), LDA ) * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) +* Apply H to A(i+ib:m,i:n) from the right * - CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL DLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I+IB,I), LDA) * -* Apply H**T to A(i+ib:m,i:n) from the right +* Apply H to columns i:n of current block * - CALL DLARFB( 'Right', 'Transpose', 'Forward', - $ 'Rowwise', - $ M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK, - $ LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ), - $ LDWORK ) - END IF + CALL DORGLK( IB, N-I+1, A( I, I ), LDA) + END DO * -* Apply H**T to columns i:n of current block +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary * - CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * -* Set columns 1:i-1 of current block to zero +* Form the triangular factor of the block 
reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL DLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I), + $ LDA, TAU( I ), A( I, I ), LDA ) +* +* Apply H to A(i+ib:m,i:n) from the right +* + CALL DLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I+IB,I), LDA) +* +* Apply H to columns i:n of current block +* + CALL DORGLK( IB, N-I+1, A( I, I ), LDA) + END IF END IF * WORK( 1 ) = IWS diff --git a/SRC/dorgql.f b/SRC/dorgql.f index c7bba63c22..7eb8cd6468 100644 --- a/SRC/dorgql.f +++ b/SRC/dorgql.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGQL + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -95,8 +97,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -137,17 +137,13 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORG2L, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORG2L, + $ DORGKL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -177,7 +173,8 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'DORGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB + ! 
Only need a workspace for calls to dorg2l + LWKOPT = N END IF WORK( 1 ) = LWKOPT * @@ -200,88 +197,75 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. * - NX = MAX( 0, ILAENV( 3, 'DORGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We want to use the blocking method as long as our matrix is big enough * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'DORGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL DORG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN +* Use our blocked code for everything +* + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB * -* Set A(m-kk+1:m,1:n-kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * -* Use unblocked code for the first or only block. 
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL DORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows 1:m-k+i+ib-1 of current block * -* Use blocked code + CALL DORGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA) + +* Use blocked code on the remaining blocks if there are any. * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + DO I = NB+1, K, NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* The last block may be less than size NB * - CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + IB = MIN(NB, K-I+1) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL DLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) * -* Apply H to rows 1:m-k+i+ib-1 of current block +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, - $ TAU( I ), WORK, IINFO ) + CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * -* Set rows m-k+i+ib:m of current block to zero +* Apply H to rows 1:m-k+i+ib-1 of current block * - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL DORGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA) + END DO END IF * WORK( 1 ) = IWS diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f 
index 83e26588e3..c09b48990f 100644 --- a/SRC/dorgqr.f +++ b/SRC/dorgqr.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGQR + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -95,8 +97,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -119,7 +119,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup ungqr +*> \ingroup doubleOTHERcomputational * * ===================================================================== SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) @@ -137,20 +137,17 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, KI, KK, LWKOPT, + $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL DLARFB, DLARFT, DORG2R, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORG2R, + $ DORGKR, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX * .. * .. External Functions .. 
INTEGER ILAENV @@ -162,7 +159,10 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'DORGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB +* +* Only need a workspace for dorg2r in case of bail out +* + LWKOPT = MAX( 1, N ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -191,95 +191,93 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 - IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN + NX = MAX(0, ILAENV(3, 'DORGQR', ' ', M, N, K, -1)) * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'DORGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'DORGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Set A(1:kk,kk+1:n) to zero. 
+* Apply H to A(i:m,i+ib:n) from the left * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * -* Use unblocked code for the last or only block. +* Apply H to rows i:m of current block * - IF( KK.LT.N ) - $ CALL DORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL DORGKR(M-I+1, IB, A(I,I), LDA) + DO I = KI + 1, 1, -NB + IB = NB * - IF( KK.GT.0 ) THEN +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Use blocked code + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Apply H to A(i:m,i+ib:n) from the left * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * - CALL DLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Apply H to rows i:m of current block * -* Apply H to A(i:m,i+ib:n) from the left + CALL DORGKR(M-I+1, IB, A(I,I), LDA) + END DO * - CALL DLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary * -* Apply H to rows i:m of current block + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * - CALL DORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . 
H(i+ib-1) * -* Set rows 1:i-1 of current block to zero + CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE +* Apply H to A(i:m,i+ib:n) from the left +* + CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + +* +* Apply H to rows i:m of current block +* + CALL DORGKR(M-I+1, IB, A(I,I), LDA) + END IF END IF * - WORK( 1 ) = IWS + WORK( 1 ) = N RETURN * * End of DORGQR diff --git a/SRC/dorgrk.f b/SRC/dorgrk.f new file mode 100644 index 0000000000..4017352d3f --- /dev/null +++ b/SRC/dorgrk.f @@ -0,0 +1,168 @@ +*> \brief \b DORGRK computes the explicit Q factor from DGERQF and DLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DORGRK(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* DOUBLE PRECISION Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORGRK generates an m by n real matrix Q with orthonormal rows, +*> which is defined as the last m rows of the product of m +*> elementary reflectors +*> +*> Q = I - V'*T*V = H(m) . . . H(2) H(1) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by DGERQF and T is the n by n matrix returned by DLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V, and the order of T. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V. N >= 0. 
+*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is DOUBLE PRECISION array, dimension (LDQ,N) +*> On entry, Q(i,1:n-m-1+i) contains the vector which defines the +*> elementary reflector H(i), for i=1,...,n as returned by DGERQF. +*> In addition, the upper triangular portion of the submatrix given +*> by Q(1:m,n-m:n) will contain the array T as returned by DLARFT. +*> See further details for more information. +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The storage of the V and T components inside Q is best illustrated by +*> the following example with m = 3, n = 5. +*> +*> Q = |----------------| +*> | V1 V1 T1 T1 T1 | +*> | V2 V2 V2 T2 T2 | +*> | V3 V3 V3 V3 T3 | +*> |----------------| +*> +*> \endverbatim +*> +* ===================================================================== + + SUBROUTINE DORGRK(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + DOUBLE PRECISION Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION NEG_ONE, ONE + PARAMETER(NEG_ONE=-1.0D+0, ONE=1.0D+0) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL DTRMM, DTRTRM, DLUMM +* .. +* .. Intrinsic Functions.. + INTRINSIC MIN +* .. +* .. Executable Statements .. 
+* +* Break Q apart as follows +* +* |-----| +* Q = | V T | +* |-----| +* +* Where T is an m-by-m upper triangular matrix, and V is as described +* in the Further Details section +* +* In turn, break apart V as follows +* +* |---------| +* V = | V_2 V_1 | +* |---------| +* +* Where: +* +* V_1 \in \R^{m\times m} assumed unit lower triangular +* V_2 \in \R^{m\times n-m} +* +* Compute T = V_1'*T +* + CALL DTRTRM('Left', 'Upper', 'Transpose', 'Non-Unit', 'Unit', + $ M, ONE, Q(1,N-M+1), LDQ, Q(1,N-M+1), LDQ) +* +* Compute Q = -TV. This means that we need to break apart +* Our computation in two parts +* +* |---------------| +* Q = | -T*V_2 -T*V_1 | +* |---------------| +* +* Q_2 = -T*V_2 (TRMM) but only when necessary +* + IF (N.GT.M) THEN + CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit', + $ M, N-M, NEG_ONE, Q(1,N-M+1), LDQ, Q, LDQ) + END IF +* +* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication) +* + CALL DLUMM('Right', 'Unit', 'Non-Unit', M, NEG_ONE, + $ Q(1,N-M+1), LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,N-M+I) = Q(I,N-M+I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f index 126489bbe9..9b945df4ea 100644 --- a/SRC/dorgrq.f +++ b/SRC/dorgrq.f @@ -5,6 +5,7 @@ * Online html documentation available at * http://www.netlib.org/lapack/explore-html/ * +*> \htmlonly *> Download DORGRQ + dependencies *> *> [TGZ] @@ -12,6 +13,7 @@ *> [ZIP] *> *> [TXT] +*> \endhtmlonly * * Definition: * =========== @@ -137,17 +139,14 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - DOUBLE PRECISION ZERO - PARAMETER ( ZERO = 0.0D+0 ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK, + INTEGER I, IB, II, IINFO, IWS, KK, LDWORK, $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. 
- EXTERNAL DLARFB, DLARFT, DORGR2, XERBLA + EXTERNAL DLARFB0C2, DLARFT, DORGR2, + $ DORGRK, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -177,7 +176,8 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'DORGRQ', ' ', M, N, K, -1 ) - LWKOPT = M*NB + ! Only need a workspace for calls to dorgr2 + LWKOPT = M END IF WORK( 1 ) = LWKOPT * @@ -207,83 +207,72 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * Determine when to cross over from blocked to unblocked code. * NX = MAX( 0, ILAENV( 3, 'DORGRQ', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN -* -* Determine if workspace is large enough for blocked code. -* - LDWORK = M - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN -* -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. -* - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'DORGRQ', ' ', M, N, K, - $ -1 ) ) - END IF - END IF END IF * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the first block. -* The last kk rows are handled by the block method. -* - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) +* We want to use the blocking method as long as our matrix is big enough +* and it's deemed worthwhile * -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. +* Potentially bail to the unblocked code * - CALL DORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL DORGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + I = 1 + IB = NB + II = M - K + I +* +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . 
. . H(i+1) H(i) +* + CALL DLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA ) +* +* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) + CALL DLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, + $ A( II, N-K+I ), LDA, A, LDA) * - CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) +* Apply H to columns 1:n-k+i+ib-1 of current block +* + CALL DORGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA ) + + DO I = NB + 1, K, NB +* +* The last block may be less than size NB +* + IB = MIN(NB, K-I+1) + II = M - K + I * -* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL DLARFB( 'Right', 'Transpose', 'Backward', - $ 'Rowwise', - $ II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK, - $ LDWORK, A, LDA, WORK( IB+1 ), LDWORK ) - END IF + CALL DLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA ) * -* Apply H**T to columns 1:n-k+i+ib-1 of current block +* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL DLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), + $ LDA, A( II, N-K+I ), LDA, A, LDA) * -* Set columns n-k+i+ib:n of current block to zero +* Apply H to columns 1:n-k+i+ib-1 of current block * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL DORGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA ) + END DO END IF * WORK( 1 ) = IWS diff --git a/SRC/dtrmmoop.f b/SRC/dtrmmoop.f new file mode 100644 index 0000000000..3052d2eef3 --- /dev/null +++ b/SRC/dtrmmoop.f @@ -0,0 +1,1625 @@ +*> \brief \b DTRMMOOP computes 
an out of place triangular times general matrix multiplication +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, +* $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* .. Scalar Arguments .. +* DOUBLE PRECISION ALPHA, BETA +* INTEGER M, N, LDA, LDB, LDC +* CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A(LDA,*), B(LDB,*), C(LDC,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DTRMMOOP performs one of the matrix-matrix operations +*> +*> C = \alpha op(A) * op(B) + \beta C +*> or +*> C = \alpha op(B) * op(A) + \beta C +*> +*> where \alpha and \beta are scalars, C is an m-by-n matrix, A is +*> a unit, or non-unit, upper or lower triangular matrix, and op(A) is +*> is one of +*> +*> op(A) = A or op(A) = A**T +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> On entry, SIDE specifies whether op(A) multiplies op(B) from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' C = \alpha op(A) * op(B) + \beta C +*> +*> SIDE = 'R' or 'r' C = \alpha op(B) * op(A) + \beta C +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> On entry, UPLO specifies whether the matrix A is an upper or +*> lower triangular matrix as follows: +*> UPLO = 'U' or 'u' A is upper triangular +*> +*> UPLO = 'L' or 'l' A is lower triangular +*> \Endverbatim +*> +*> \param[in] TRANSA +*> \verbatim +*> TRANSA is CHARACTER*1 +*> On entry, TRANSA specifies the form of op(A) to be used in +*> the matrix multiplication as follows: +*> TRANSA = 'N' or 'n' op(A) = A +*> +*> TRANSA = 'T' or 't' op(A) = A**T +*> +*> TRANSA = 'C' or 'c' op(A) = A**T +*> \endverbatim +*> +*> \param[in] TRANSB +*> \verbatim +*> TRANSB is CHARACTER*1 +*> On entry, TRANSB specifies the 
form of op(B) to be used in +*> the matrix multiplication as follows: +*> TRANSB = 'N' or 'n' op(B) = B +*> +*> TRANSB = 'T' or 't' op(B) = B**T +*> +*> TRANSB = 'C' or 'c' op(B) = B**T +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> On entry, DIAG specifies whether or not A is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' A is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' A is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> On entry, M specifies the number of rows of C. M must be at +*> least zero. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> On entry, N specifies the number of columns of C. N must be +*> at least zero. +*> \endverbatim +*> +*> \param[in] ALPHA +*> \verbatim +*> ALPHA is DOUBLE PRECISION. +*> On entry, ALPHA specifies the scalar alpha. When alpha is +*> zero then A and B are not referenced, and A and B need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension ( LDA, K ) where +*> K is M when SIDE = 'L' and K is N when SIDE='R' +*> Before entry with UPLO = 'U' or 'u', the leading k-by-k +*> upper triangular part of the array A must contain the upper +*> triangular matrix and the strictly lower triangular part of +*> A is not referenced. +*> Before entry with UPLO = 'L' or 'l', the leading k-by-k +*> lower triangular part of the array A must contain the lower +*> triangular matrix and the strictly upper triangular part of +*> A is not referenced. +*> Note that when DIAG = 'U' or 'u', the diagonal elements of +*> A are not referenced either, but are assumed to be unity. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> On entry, LDA specifies the first dimension of A as declared +*> in the calling (sub) program. 
When SIDE = 'L' or 'l' then +*> LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +*> then LDA must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension ( LDB, K ), where K is M +*> If SIDE='R' and TRANSA='N', or SIDE='L' and TRANSA='T' and N +*> otherwise. On entry, the leading k-by-k submatrix must contain +*> B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> On entry, LDB specifies the first dimension of B as declared +*> in the calling (sub) program. When SIDE = 'R' and TRANSB='N' +*> then LDB must be at least max( 1, m ), when SIDE = 'R' +*> and TRANSB = 'T' then LDB must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] BETA +*> \verbatim +*> BETA is DOUBLE PRECISION. +*> On entry, BETA specifies the scalar beta. When beta is +*> zero then C is not referenced on entry, and C need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension ( LDC, N ) +*> Before entry, the leading m-by-n part of the array C must +*> contain the matrix C, and on exit is overwritten by the +*> transformed matrix. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> On entry, LDC specifies the first dimension of C as declared +*> in the calling (sub) program. LDC must be at least +*> max( 1, m ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + RECURSIVE SUBROUTINE DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, + $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER M, N, LDA, LDB, LDC + CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG +* .. +* .. Array Arguments .. + DOUBLE PRECISION A(LDA,*), B(LDB,*), C(LDC,*) +* .. 
+* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DDOT + EXTERNAL LSAME, DDOT +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DAXPY, DLASET, DSCAL +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Local Scalars .. + INTEGER I, L, K, INCB + LOGICAL LSIDE, UPPER, UNIT, TRANST, TRANSG +* .. +* .. Local Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER(ONE=1.0D+0, ZERO=0.0D+0) +* .. +* +* Beginning of Executable Statements +* + LSIDE = LSAME(SIDE, 'L') + UPPER = LSAME(UPLO, 'U') + ! If we are transposing the triangular matrix (A) + TRANST= LSAME(TRANSA, 'T').OR.LSAME(TRANSA, 'C') + ! If we are transposing the general matrix (B) + TRANSG= LSAME(TRANSB, 'T').OR.LSAME(TRANSB, 'C') +* +* Terminating Case +* + UNIT = LSAME(DIAG, 'U') + IF (M.EQ.1.AND.N.EQ.1) THEN +* +* This case is the simplest as we are just computing C = \alpha A*B + +* \beta C where all components are 1-by-1 matrices +* + + IF (BETA.EQ.ZERO) THEN + C(1,1) = ZERO + ELSE + C(1,1) = C(1,1) * BETA + END IF + IF(ALPHA.NE.ZERO) THEN + IF(UNIT) THEN + C(1,1) = C(1,1) + ALPHA*B(1,1) + ELSE + C(1,1) = C(1,1) + ALPHA*A(1,1)*B(1,1) + END IF + END IF + RETURN + ELSE IF (M.EQ.1) THEN +* +* This means that C is a row vector. If BETA is 0, then we +* set it explicitly, otherwise we overwrite it with BETA*C +* + IF (BETA.EQ.ZERO) THEN + ! This ensures we don't reference C unless we need to + CALL DLASET('All', M, N, ZERO, ZERO, C, LDC) + ELSE + CALL DSCAL(N, BETA, C, LDC) + END IF + IF (ALPHA.NE.ZERO) THEN +* +* Recall that the number of columns of A is determined by SIDE +* + IF (LSIDE) THEN +* +* Determine if B is a row or column vector +* + IF (TRANSG) THEN + INCB = 1 ! This means that B is a column vector + ELSE + INCB = LDB ! 
This means that B is a row vector + END IF +* +* This means that A is a scalar, so it is either assumed to be +* ONE or explicitly stored in A(1,1) +* + IF (UNIT) THEN + CALL DAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + CALL DAXPY(N, ALPHA * A(1,1), B, INCB, C, LDC) + END IF + ELSE ! A is on the right +* +* Determine if B is a row or column vector +* + IF (TRANSG) THEN + INCB = 1 ! This means that B is a column vector + ELSE + INCB = LDB ! This means that B is a row vector + END IF +* +* This means that A is an n-by-n matrix +* + IF (UPPER) THEN + IF (TRANST) THEN + IF (TRANSG) THEN + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * DDOT(N-I, + $ B(I+1,1), 1, A(I,I+1), LDA) + + $ C(1,I) + END DO + CALL DAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * DDOT(N-I+1, + $ B(I,1), 1, A(I,I), LDA) + + $ C(1,I) + END DO + END IF + ELSE ! Not transposing B + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * DDOT(N-I, + $ B(1,I+1), LDB, A(I,I+1), LDA) + + $ C(1,I) + END DO + CALL DAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * DDOT(N-I+1, + $ B(1,I), LDB, A(I,I), LDA) + + $ C(1,I) + END DO + END IF + END IF + ELSE ! Not transposing A + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * DDOT(I-1, B, INCB, + $ A(1,I), 1) + C(1,I) + END DO + + CALL DAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * DDOT(I, B, INCB, + $ A(1,I), 1) + C(1,I) + END DO + END IF + END IF + ELSE ! A is lower + IF (TRANST) THEN + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * DDOT(I-1, B, INCB, + $ A(I,1), LDA) + C(1,I) + END DO + + CALL DAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * DDOT(I, B, INCB, + $ A(I,1), LDA) + C(1,I) + END DO + END IF + ELSE ! 
A is not transposed + IF (TRANSG) THEN + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * DDOT(N-I, + $ B(I+1,1), 1, A(I+1,I), 1) + + $ C(1,I) + END DO + CALL DAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * DDOT(N-I+1, + $ B(I,1), 1, A(I,I), 1) + + $ C(1,I) + END DO + END IF + ELSE! B is not transposed + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * DDOT(N-I, + $ B(1,I+1), LDB, A(I+1,I), 1) + + $ C(1,I) + END DO + CALL DAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * DDOT(N-I+1, + $ B(1,I), LDB, A(I,I), 1) + + $ C(1,I) + END DO + END IF + END IF + END IF + END IF + END IF + END IF + RETURN + ELSE IF (N.EQ.1) THEN +* +* This means that C is a column vector. If BETA is 0, then we +* set it explicitly, otherwise we overwrite it with BETA*C +* + IF (BETA.EQ.ZERO) THEN + ! This ensures we don't reference C unless we need to + CALL DLASET('All', M, N, ZERO, ZERO, C, LDC) + ELSE + CALL DSCAL(M, BETA, C, 1) + END IF + + IF (ALPHA.NE.ZERO) THEN + IF (TRANSG) THEN + INCB = LDB ! B is a row vector + ELSE + INCB = 1 ! B is a column vector + END IF + IF (LSIDE) THEN + IF (UPPER) THEN + IF (TRANST) THEN + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * DDOT(I-1, A(1, I), + $ 1, B, INCB) + C(I,1) + END DO + CALL DAXPY(M, ALPHA, B, INCB, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * DDOT(I, A(1, I), 1, + $ B, INCB) + C(I,1) + END DO + END IF + ELSE ! A is not transposed + IF (TRANSG) THEN + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * DDOT(M-I, + $ A(I,I+1), LDA, B(1, I+1), LDB) + + $ C(I,1) + END DO + + CALL DAXPY(M, ALPHA, B, LDB, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * DDOT(M-I+1, + $ A(I,I), LDA, B(1,I), LDB) + + $ C(I,1) + END DO + END IF + ELSE ! 
B is not transposed + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * DDOT(M-I, + $ A(I,I+1), LDA, B(I+1,1), 1) + + $ C(I,1) + END DO + + CALL DAXPY(M, ALPHA, B, 1, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * DDOT(M-I+1, + $ A(I,I), LDA, B(I,1), 1) + + $ C(I,1) + END DO + END IF + END IF + END IF + ELSE ! A is lower + IF (TRANST) THEN + IF (TRANSG) THEN + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * DDOT(M-I, + $ A(I+1,I), 1, B(1,I+1), LDB) + + $ C(I,1) + END DO + + CALL DAXPY(M, ALPHA, B, LDB, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * DDOT(M-I+1, + $ A(I,I), 1, B(1,I), LDB) + + $ C(I,1) + END DO + END IF + ELSE ! A is not transposed + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * DDOT(M-I, + $ A(I+1,I), 1, B(I+1,1), 1) + + $ C(I,1) + END DO + + CALL DAXPY(M, ALPHA, B, 1, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * DDOT(M-I+1, + $ A(I,I), 1, B(I,1), 1) + + $ C(I,1) + END DO + END IF + END IF + ELSE ! A is not transposed + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * DDOT(I-1, A(I,1), + $ LDA, B, INCB) + C(I,1) + END DO + CALL DAXPY(M, ALPHA, B, INCB, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * DDOT(I, A(I,1), LDA, + $ B, INCB) + C(I,1) + END DO + END IF + END IF + END IF + ELSE ! A is on the right + ! Since the trailing dimension of op(A) must be 1, + ! we know that A must be a scalar + IF (UNIT) THEN + CALL DAXPY(M, ALPHA, B, INCB, C, 1) + ELSE + CALL DAXPY(M, ALPHA*A(1,1), B, INCB, C, 1) + END IF + END IF + END IF + RETURN + END IF +* +* Recursive Case +* + L = MIN(M,N)/2 + IF (LSIDE) THEN +* +* We are multiplying A from the left IE we are computing +* C = \alpha op(A)*op(B) + \beta C +* + IF (UPPER) THEN +* +* A is upper triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing +* +* So we are computing +* C = \alpha A**T * B**T + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11}**T + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{21}**T + \beta C_{12} +* C_{21} = \alpha A_{12}**T * B_{11}**T + \alpha A_{22}**T * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{12}**T * B_{21}**T + \alpha A_{22}**T * B_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{12}**T * B_{11}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22}**T * B_{12}**T + C_{21} (This routine) +* +* C_{22} = \alpha A_{12}**T * B_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22}**T * B_{22}**T + C_{22} (This routine) +* + ! C_{11} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, + $ LDC) + ! C_{12} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ BETA, C(1, L+1), LDC) + ! C_{21} + CALL DGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(1, L+1), LDA, B, LDB, BETA, C(L+1,1), + $ LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1,L+1), LDA, B(1,L+1), + $ LDB, ONE, C(L+1,1), LDC) + ! 
C_{22} + CALL DGEMM(TRANSA, TRANSB, M-L, N-L, L, ALPHA, + $ A(1, L+1), LDA, B(L+1,1), LDB, BETA, + $ C(L+1,L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1,L+1), LDA, + $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A**T * B + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11} + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{12} + \beta C_{12} +* C_{21} = \alpha A_{12}**T * B_{11} + \alpha A_{22}**T * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{12}**T * B_{12} + \alpha A_{22}**T * B_{22} + \beta C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{12}**T * B_{11} + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22}**T * B_{21} + C_{21} (This routine) +* +* C_{22} = \alpha A_{12}**T * B_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22}**T * B_{22} + C_{22} (This routine) +* + ! C_{11} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + ! 
C_{21} + CALL DGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(1, L+1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(L+1, 1), LDC) + ! C_{22} + CALL DGEMM(TRANSA, TRANSB, M-L, N-L, L, + $ ALPHA, A(1, L+1), LDA, B(1, L+1), LDB, BETA, + $ C(L+1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11}**T + \alpha A_{12} * B_{12}**T + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12} +* C_{21} = \alpha A_{22} * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{22} * B_{22}**T + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{12} * B_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11} * B_{11}**T + C_{11} (This routine) +* +* C_{12} = \alpha A_{12} * B_{22}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha 
A_{11} * B_{21}**T + C_{12} (This routine) +* + ! C_{11} + CALL DGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(1, L+1), LDA, B(1, L+1), LDB, BETA, C, LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL DGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA, + $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1,1), LDB, ONE, + $ C(1, L+1), LDC) + ! C_{21} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(L+1, 1), LDC) + ! C_{22} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A * B + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11} + \alpha A_{12} * B_{21} + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{12} + \alpha A_{12} * B_{22} + \beta C_{12} +* C_{21} = \alpha A_{22} * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{22} * B_{22} + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{12} * 
B_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11} * B_{11} + C_{11} (This routine) +* +* C_{12} = \alpha A_{12} * B_{22} + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11} * B_{12} + C_{12} (This routine) +* + ! C_{11} + CALL DGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(1, L+1), LDA, B(L+1, 1), LDB, BETA, C, LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL DGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA, + $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, + $ ONE, C(1, L+1), LDC) + ! C_{21} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(L+1, 1), LDC) + ! C_{22} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + END IF + ELSE +* +* A is lower triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A**T * B**T + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11}**T + \alpha A_{21}**T * B_{12}**T + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{21}**T + \alpha A_{21}**T * B_{22}**T + \beta C_{12} +* C_{21} = \alpha A_{22}**T * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{22}**T * B_{22}**T + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{21}**T * B_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11}**T * B_{11}**T + C_{11} (This routine) +* +* C_{12} = \alpha A_{21}**T * B_{22}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11}**T * B_{21}**T + C_{12} (This routine) +* + ! C_{11} + CALL DGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(1, L+1), LDB, BETA, C, LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL DGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE, + $ C(1, L+1), LDC) + ! 
C_{21} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(L+1, 1), LDC) + ! C_{22} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A**T * B + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11} + \alpha A_{21}**T * B_{21} + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{12} + \alpha A_{21}**T * B_{22} + \beta C_{12} +* C_{21} = \alpha A_{22}**T * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{22}**T * B_{22} + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{21}**T * B_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11}**T * B_{11} + C_{11} (This routine) +* +* C_{12} = \alpha A_{21}**T * B_{22} + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11}**T * B_{12} + C_{12} (This routine) +* + ! C_{11} + CALL DGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, C, LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! 
C_{12} + CALL DGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, ONE, + $ C(1, L+1), LDC) + ! C_{21} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(L+1, 1), LDC) + ! C_{22} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11}**T + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{21}**T + \beta C_{12} +* C_{21} = \alpha A_{21} * B_{11}**T + \alpha A_{22} * B_{12}**T + \beta * C_{21} +* C_{22} = \alpha A_{21} * B_{21}**T + \alpha A_{22} * B_{22}**T + \beta * C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{21} * B_{11}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22} * B_{12}**T + C_{21} (This routine) +* +* C_{22} = \alpha A_{21} * B_{21}**T + 
\beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22} * B_{22}**T + C_{22} (This routine) +* + ! C_{11} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ BETA, C(1, L+1), LDC) + ! C_{21} + CALL DGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, ONE, C(L+1, 1), LDC) + ! C_{22} + CALL DGEMM(TRANSA, TRANSB, M-L, N-L, L, + $ ALPHA, A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A * B + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11} + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{12} + \beta C_{12} +* C_{21} = \alpha A_{21} * B_{11} + \alpha A_{22} * B_{21} + \beta * C_{21} +* C_{22} = \alpha A_{21} * B_{12} + \alpha A_{22} * B_{22} + \beta * C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and 
C_{22} as follows +* +* C_{21} = \alpha A_{21} * B_{11} + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22} * B_{21} + C_{21} (This routine) +* +* C_{22} = \alpha A_{21} * B_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22} * B_{22} + C_{22} (This routine) +* + ! C_{11} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(1, L+1), LDC) + ! C_{21} + CALL DGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(L+1, 1), LDC) + ! C_{22} + CALL DGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, A(L+1, 1), LDA, B(1, L+1), LDB, BETA, + $ C(L+1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + END IF + END IF + ELSE +* +* We are multiplying A from the right IE we are computing +* C = \alpha op(B)*op(A) + \beta C +* + IF (UPPER) THEN +* +* A is upper triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A**T + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11}**T + \alpha B_{21}**T * A_{12}**T + \beta C_{11} +* C_{12} = \alpha B_{21}**T * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11}**T + \alpha B_{22}**T * A_{12}**T + \beta C_{21} +* C_{22} = \alpha B_{22}**T * A_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{21}**T * A_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11}**T * A_{11}**T + C_{11} (This routine) +* +* C_{21} = \alpha B_{22}**T * A_{12}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{12}**T * A_{11}**T + C_{21} (This routine) +* + ! C_{11} + CALL DGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(L+1, 1), LDB, A(1, L+1), LDA, BETA, C, LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(1, L+1), LDC) + ! 
C_{21} + CALL DGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, + $ ONE, C(L+1, 1), LDC) + ! C_{22} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11}**T + \alpha B_{12} * A_{12}**T + \beta C_{11} +* C_{12} = \alpha B_{12} * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11}**T + \alpha B_{22} * A_{12}**T + \beta C_{21} +* C_{22} = \alpha B_{22} * A_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{12} * A_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11} * A_{11}**T + C_{11} (This routine) +* +* C_{21} = \alpha B_{22} * A_{12}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{21} * A_{11}**T + C_{21} (This routine) +* + ! 
C_{11} + CALL DGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(1,L+1), LDB, A(1,L+1), LDA, BETA, C, LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(1, L+1), LDC) + ! C_{21} + CALL DGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ ONE, C(L+1, 1), LDC) + ! C_{22} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11}**T * A_{12} + \alpha B_{21}**T * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11} + \beta C_{21} +* C_{22} = \alpha B_{12}**T * A_{12} + \alpha B_{22}**T * A_{22} + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} 
and C_{22} as follows +* +* C_{12} = \alpha B_{11}**T * A_{12} + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{21}**T * A_{22} + C_{12} (This routine) +* +* C_{22} = \alpha B_{12}**T * A_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22}**T * A_{22} + C_{22} (This routine) +* + ! C_{11} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL DGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(1, L+1), LDC) + ! C_{21} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(L+1, 1), LDC) + ! C_{22} + CALL DGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, B(1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A + \beta C. 
We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11} * A_{12} + \alpha B_{12} * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11} + \beta C_{21} +* C_{22} = \alpha B_{21} * A_{12} + \alpha B_{22} * A_{22} + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11} * A_{12} + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{12} * A_{22} + C_{12} (This routine) +* +* C_{22} = \alpha B_{21} * A_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22} * A_{22} + C_{22} (This routine) +* + ! C_{11} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL DGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, ONE, C(1, L+1), LDC) + ! C_{21} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, 1), LDC) + ! 
C_{22} + CALL DGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, B(L+1, 1), LDB, A(1, L+1), LDA, + $ BETA, C(L+1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + END IF + ELSE +* +* A is lower triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11}**T + \beta C_{11} +* C_{12} = \alpha B_{11}**T * A_{21}**T + \alpha B_{21}**T * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11}**T + \beta C_{21} +* C_{22} = \alpha B_{12}**T * A_{21}**T + \alpha B_{22}**T * A_{22}**T + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11}**T * A_{21}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{21}**T * A_{22}**T + C_{12} (This routine) +* +* C_{22} = \alpha B_{12}**T * A_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22}**T * A_{22}**T + C_{22} (This routine) +* + ! 
C_{11} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL DGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(1, L+1), LDC) + ! C_{21} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(L+1, 1), LDC) + ! C_{22} + CALL DGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA, + $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11} * A_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11}**T + \beta C_{21} +* C_{22} = \alpha B_{21} * A_{21}**T + \alpha A_{22} * B_{22}**T + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11} * 
A_{21}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{12} * A_{22}**T + C_{12} (This routine) +* +* C_{22} = \alpha B_{21} * A_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22} * A_{22}**T + C_{22} (This routine) +* + ! C_{11} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL DGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, ONE, C(1, L+1), LDC) + ! C_{21} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, 1), LDC) + ! C_{22} + CALL DGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA, + $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A + \beta C. 
We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11} + \alpha B_{21}**T * A_{21} + \beta C_{11} +* C_{12} = \alpha B_{21}**T * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11} + \alpha B_{22}**T * A_{21} + \beta C_{21} +* C_{22} = \alpha B_{22}**T * A_{22} + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{21}**T * A_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11}**T * A_{11} + C_{11}(This routine) +* +* C_{21} = \alpha B_{22}**T * A_{21} + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{12}**T * A_{11} + C_{21} (This routine) +* + ! C_{11} + CALL DGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, C, LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(1, L+1), LDC) + ! C_{21} + CALL DGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, ONE, + $ C(L+1, 1), LDC) + ! 
C_{22} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \alpha B_{12} * A_{21} + \beta C_{11} +* C_{12} = \alpha B_{12} * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11} + \alpha B_{22} * A_{21} + \beta C_{21} +* C_{22} = \alpha B_{22} * A_{22} + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{12} * A_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11} * A_{11} + C_{11}(This routine) +* +* C_{21} = \alpha B_{22} * A_{21} + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{21} * A_{11} + C_{21} (This routine) +* + ! C_{11} + CALL DGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, C, LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(1, L+1), LDC) + ! 
C_{21} + CALL DGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE, + $ C(L+1, 1), LDC) + ! C_{22} + CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + END IF + END IF + END IF + END SUBROUTINE diff --git a/SRC/dtrtrm.f b/SRC/dtrtrm.f new file mode 100644 index 0000000000..6a5aabb3b7 --- /dev/null +++ b/SRC/dtrtrm.f @@ -0,0 +1,577 @@ +*> \brief \b DTRTRM computes an in place triangular-triangular matrix product +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE DTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, +* $ N, ALPHA, T, LDT, V, LDV) +* +* .. Scalar Arguments .. +* INTEGER N, LDT, LDV +* CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV +* DOUBLE PRECISION ALPHA +* .. +* .. Array Arguments .. +* DOUBLE PRECISION T(LDT,*), V(LDV,*) +* .. 
+* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DTRTRM performs one of the matrix-matrix operations +*> +*> T = \alpha op(V) * T +*> or +*> T = \alpha T * op(V) +*> where \alpha is a scalar, T and V are unit, or non-unit, upper or +*> lower triangular matrices, and op(V) is one of +*> +*> op(V) = V or op(V) = V**T +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> On entry, SIDE specifies whether op(V) multiplies T from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' T = \alpha op(V) * T +*> +*> SIDE = 'R' or 'r' T = \alpha T * op(V) +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> On entry, UPLO specifies whether the matrix T is an upper or +*> lower triangular matrix as follows: +*> UPLO = 'U' or 'u' T is upper triangular +*> +*> UPLO = 'L' or 'l' T is lower triangular +*> \endverbatim +*> +*> \param[in] TRANSV +*> \verbatim +*> TRANSV is CHARACTER*1 +*> On entry, TRANSV specifies the form of op(V) to be used in +*> the matrix multiplication as follows: +*> TRANSV = 'N' or 'n' op(V) = V +*> +*> TRANSV = 'T' or 't' op(V) = V**T +*> +*> TRANSV = 'C' or 'c' op(V) = V**T +*> \endverbatim +*> +*> \param[in] DIAGT +*> \verbatim +*> DIAGT is CHARACTER*1 +*> On entry, DIAGT specifies whether or not T is unit triangular +*> as follows: +*> +*> DIAGT = 'U' or 'u' T is assumed to be unit triangular. +*> +*> DIAGT = 'N' or 'n' T is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] DIAGV +*> \verbatim +*> DIAGV is CHARACTER*1 +*> On entry, DIAGV specifies whether or not V is unit triangular +*> as follows: +*> +*> DIAGV = 'U' or 'u' V is assumed to be unit triangular. +*> +*> DIAGV = 'N' or 'n' V is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> On entry, N specifies the number of rows and columns of T. +*> N must be at least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*>          ALPHA is DOUBLE PRECISION.
+*>          On entry, ALPHA specifies the scalar alpha. When alpha is
+*>          zero then V is not referenced and T need not be set before
+*>          entry; on exit T is overwritten with zeros in that case.
+*> \endverbatim
+*>
+*> \param[in,out] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array, dimension ( LDT, N )
+*>          Before entry with UPLO = 'U' or 'u', the leading n-by-n
+*>          upper triangular part of the array T must contain the upper
+*>          triangular matrix and the strictly lower triangular part of
+*>          T is not referenced.
+*>          Before entry with UPLO = 'L' or 'l', the leading n-by-n
+*>          lower triangular part of the array T must contain the lower
+*>          triangular matrix and the strictly upper triangular part of
+*>          T is not referenced.
+*>          Note that when DIAGT = 'U' or 'u', the diagonal elements of
+*>          T are not referenced either, but are assumed to be unity.
+*>          On exit, T is overwritten by the triangular product.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          On entry, LDT specifies the first dimension of T as declared
+*>          in the calling (sub) program. LDT must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*>          V is DOUBLE PRECISION array, dimension ( LDV, N )
+*>          Before entry with UPLO = 'U' or 'u', the leading n-by-n
+*>          upper triangular part of the array op(V) must contain the upper
+*>          triangular matrix and the strictly lower triangular part of
+*>          V is not referenced.
+*>          Before entry with UPLO = 'L' or 'l', the leading n-by-n
+*>          lower triangular part of the array op(V) must contain the lower
+*>          triangular matrix and the strictly upper triangular part of
+*>          V is not referenced.
+*>          Note that when DIAGV = 'U' or 'u', the diagonal elements of
+*>          V are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          On entry, LDV specifies the first dimension of V as declared
+*>          in the calling (sub) program. LDV must be at least max( 1, n ).
+*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + RECURSIVE SUBROUTINE DTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, + $ N, ALPHA, T, LDT, V, LDV) +* +* .. Scalar Arguments .. + INTEGER N, LDT, LDV + CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV + DOUBLE PRECISION ALPHA +* .. +* .. Array Arguments .. + DOUBLE PRECISION T(LDT,*), V(LDV,*) +* .. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DTRMM, DTRMMOOP, DLASET +* .. +* .. Local Scalars .. + INTEGER K, INFO + LOGICAL TLEFT, TUPPER, VTRANS, VUNIT, TUNIT +* .. +* .. Local Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER(ONE=1.0D+0, ZERO=0.0D+0) +* .. +* +* Beginning of Executable Statements +* +* +* Early Termination Criteria +* + IF (ALPHA.EQ.ZERO) THEN +* +* If ALPHA is 0, then we are just setting T to be the 0 matrix +* + CALL DLASET(UPLO, N, N, ZERO, ZERO, T, LDT) + RETURN + END IF + TUNIT = LSAME(DIAGT, 'U') + VUNIT = LSAME(DIAGV, 'U') +* +* Terminating Case +* + IF (N.EQ.1) THEN + IF (VUNIT.AND.TUNIT) THEN + T(1,1) = ALPHA + ELSE IF (VUNIT) THEN + T(1,1) = ALPHA*T(1,1) + ELSE IF (TUNIT) THEN + T(1,1) = ALPHA*V(1,1) + ELSE + T(1,1) = ALPHA*T(1,1)*V(1,1) + END IF + RETURN + ELSE IF(N.LE.0) THEN + RETURN + END IF +* +* Recursive case +* + TUPPER = LSAME(UPLO,'U') + TLEFT = LSAME(SIDE,'R') + VTRANS = LSAME(TRANSV,'T').OR.LSAME(TRANSV,'C') + + K = N / 2 + IF(TUPPER) THEN +* +* T is upper triangular +* + IF(TLEFT) THEN +* +* Compute T = T*op(V) +* + IF(VTRANS) THEN +* +* We are computing T = T*V**T, which we break down as follows +* |--------------| |--------------| |--------------------| +* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11}**T V_{21}**T| +* |0 T_{22}| = \alpha |0 T_{22}| 
* |0 V_{22}**T| +* |--------------| |--------------| |--------------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11}**T +* T_{12} = \alpha T_{11}*V_{21}**T + \alpha T_{12}*V_{22}**T +* T_{22} = \alpha T_{22}*V_{22}**T +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha T_{12}*V_{22}**T (DTRMM) +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} (DTRMMOOP) +* +* T_{12} = \alpha T_{12}*V_{22}**T +* + CALL DTRMM('Right', 'Lower', TRANSV, DIAGV, K, + $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} +* + CALL DTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT, + $ V(K+1, 1), LDV, ONE, T(1, K+1), LDT) + ELSE +* +* We are computing T = T*V, which we break down as follows +* |--------------| |--------------| |-------------| +* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11} V_{12}| +* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}| +* |--------------| |--------------| |-------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11} +* T_{12} = \alpha T_{11}*V_{12} + \alpha T_{12}*V_{22} +* T_{22} = \alpha T_{22}*V_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha T_{12}*V_{22} (DTRMM) +* T_{12} = \alpha T_{11}*V_{12} + T_{12} (DTRMMOOP) +* +* T_{12} = \alpha T_{12}*V_{22} +* + CALL DTRMM('Right', 'Upper', TRANSV, DIAGV, K, + $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha T_{11}*V_{21}**T + 
T_{12} +* + CALL DTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT, + $ V(1, K+1), LDV, ONE, T(1, K+1), LDT) + END IF + ELSE +* +* Compute T = op(V)*T +* + IF(VTRANS) THEN +* +* We are computing T = V**T*T, which we break down as follows +* |--------------| |--------------------| |--------------| +* |T_{11} T_{12}| |V_{11}**T V_{21}**T| |T_{11} T_{12}| +* |0 T_{22}| = \alpha |0 V_{22}**T| * |0 T_{22}| +* |--------------| |--------------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}**T*T_{11} +* T_{12} = \alpha V_{11}**T*T_{12} + \alpha V_{21}**T*T_{22} +* T_{22} = \alpha V_{22}**T*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha V_{11}**T*T_{12} (DTRMM) +* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} (DTRMMOOP) +* +* T_{12} = \alpha V_{11}**T*T_{12} +* + CALL DTRMM('Left', 'Lower', TRANSV, DIAGV, K, + $ N-K, ALPHA, V, LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} +* + CALL DTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1), + $ LDT, V(K+1, 1), LDV, ONE, T(1, K+1), LDT) + ELSE +* +* We are computing T = V*T, which we break down as follows +* |--------------| |--------------| |--------------| +* |T_{11} T_{12}| |V_{11} V_{12}| |T_{11} T_{12}| +* |0 T_{22}| = \alpha |0 V_{22}| * |0 T_{22}| +* |--------------| |--------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}*T_{11} +* T_{12} = \alpha V_{11}*T_{12} + \alpha V_{12}*T_{22} +* T_{22} = \alpha 
V_{22}*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha V_{11}*T_{12} (DTRMM) +* T_{12} = \alpha V_{12}*T_{22} + T_{12} (DTRMMOOP) +* +* T_{12} = \alpha V_{11}*T_{12} +* + CALL DTRMM('Left', 'Upper', TRANSV, DIAGV, K, + $ N-K, ALPHA, V, LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha V_{12}*T_{22} + T_{12} (DTRMMOOP) +* + CALL DTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1), + $ LDT, V(1, K+1), LDV, ONE, T(1, K+1), LDT) + END IF + END IF + ELSE +* +* T is lower triangular +* + IF(TLEFT) THEN +* +* Compute T = T*op(V) +* + IF(VTRANS) THEN +* +* We are computing T = T*V**T, which we break down as follows +* |--------------| |--------------| |--------------------| +* |T_{11} 0 | |T_{11} 0 | |V_{11}**T 0 | +* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{12}**T V_{22}**T| +* |--------------| |--------------| |--------------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11}**T +* T_{21} = \alpha T_{21}*V_{11}**T + \alpha T_{22}*V_{12}**T +* T_{22} = \alpha T_{22}*V_{22}**T +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha T_{21}*V_{11}**T (DTRMM) +* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} (DTRMMOOP) +* +* T_{21} = \alpha T_{21}*V_{11}**T +* + CALL DTRMM('Right', 'Upper', TRANSV, DIAGV, N-K, + $ K, ALPHA, V, LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} +* + CALL DTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1), + $ LDT, V(1, K+1), LDV, ONE, T(K+1, 1), LDT) + ELSE +* +* We are computing T = T*V, which we break down as follows +* |--------------| |--------------| 
|-------------| +* |T_{11} 0 | |T_{11} 0 | |V_{11} 0 | +* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{21} V_{22}| +* |--------------| |--------------| |-------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11} +* T_{21} = \alpha T_{21}*V_{11} + \alpha T_{22}*V_{21} +* T_{22} = \alpha T_{22}*V_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha T_{21}*V_{11} (DTRMM) +* T_{21} = \alpha T_{22}*V_{21} + T_{21} (DTRMMOOP) +* +* T_{21} = \alpha T_{21}*V_{11} +* + CALL DTRMM('Right', 'Lower', TRANSV, DIAGV, N-K, + $ K, ALPHA, V, LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha T_{22}*V_{12} + T_{21} +* + CALL DTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1), + $ LDT, V(K+1, 1), LDV, ONE, T(K+1, 1), LDT) + END IF + ELSE +* +* Compute T = op(V)*T +* + IF(VTRANS) THEN +* +* We are computing T = V**T*T, which we break down as follows +* |--------------| |--------------------| |--------------| +* |T_{11} 0 | |V_{11}**T 0 | |T_{11} 0 | +* |T_{21} T_{22}| = \alpha |V_{12}**T V_{22}**T| * |T_{21} T_{22}| +* |--------------| |--------------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}**T*T_{11} +* T_{21} = \alpha V_{12}**T*T_{11} + \alpha V_{22}**T*T_{21} +* T_{22} = \alpha V_{22}**T*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha V_{22}**T*T_{21} (DTRMM) +* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} (DTRMMOOP) +* +* 
T_{21} = \alpha V_{22}**T*T_{21} +* + CALL DTRMM('Left', 'Upper', TRANSV, DIAGV, N-K, K, + $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} +* + CALL DTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT, + $ V(1, K+1), LDV, ONE, T(K+1, 1), LDT) + ELSE +* +* We are computing T = V*T, which we break down as follows +* |--------------| |-------------| |--------------| +* |T_{11} 0 | |V_{11} 0 | |T_{11} 0 | +* |T_{21} T_{22}| = \alpha |V_{21} V_{22}| * |T_{21} T_{22}| +* |--------------| |-------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}*T_{11} +* T_{21} = \alpha V_{21}*T_{11} + \alpha V_{22}*T_{21} +* T_{22} = \alpha V_{22}*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{21} = \alpha V_{22}*T_{21} (DTRMM) +* T_{21} = \alpha V_{12}*T_{11} + T_{21} (DTRMMOOP) +* +* T_{21} = \alpha V_{22}*T_{12} +* + CALL DTRMM('Left', 'Lower', TRANSV, DIAGV, N-K, K, + $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha V_{12}*T_{11} + T_{21} +* + CALL DTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT, + $ V(K+1, 1), LDV, ONE, T(K+1, 1), LDT) + END IF + END IF + END IF +* +* Since in all the above cases, we compute T_{11} and T_{22} +* the same, we pass in our flags and call this routine recursively +* +* Compute T_{11} recursively +* + CALL DTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, K, ALPHA, + $ T, LDT, V, LDV) +* +* Compute T_{22} recursively +* + CALL DTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, N-K, ALPHA, + $ T(K+1, K+1), LDT, V(K+1, K+1), LDV) + + END SUBROUTINE diff --git a/SRC/lapack_64.h b/SRC/lapack_64.h index e8000bf2c4..f899d642a8 100644 --- 
a/SRC/lapack_64.h
+++ b/SRC/lapack_64.h
@@ -331,6 +331,7 @@
 #define CLARF1F CLARF1F_64
 #define CLARF1L CLARF1L_64
 #define CLARFB CLARFB_64
+#define CLARFB0C2 CLARFB0C2_64
 #define CLARFB_GETT CLARFB_GETT_64
 #define CLARFG CLARFG_64
 #define CLARFGP CLARFGP_64
@@ -551,9 +552,17 @@
 #define CUNGQR CUNGQR_64
 #define CUNGR2 CUNGR2_64
 #define CUNGRQ CUNGRQ_64
+#define CUNGLK CUNGLK_64
+#define CUNGKL CUNGKL_64
+#define CUNGKR CUNGKR_64
+#define CUNGRK CUNGRK_64
 #define CUNGTR CUNGTR_64
 #define CUNGTSQR CUNGTSQR_64
 #define CUNGTSQR_ROW CUNGTSQR_ROW_64
+#define CLUMM CLUMM_64
+#define CTRTRM CTRTRM_64
+#define CACXPY CACXPY_64
+#define CTRMMOOP CTRMMOOP_64
 #define CUNHR_COL CUNHR_COL_64
 #define CUNM22 CUNM22_64
 #define CUNM2L CUNM2L_64
@@ -805,6 +814,7 @@
 #define DLARF1F DLARF1F_64
 #define DLARF1L DLARF1L_64
 #define DLARFB DLARFB_64
+#define DLARFB0C2 DLARFB0C2_64
 #define DLARFB_GETT DLARFB_GETT_64
 #define DLARFG DLARFG_64
 #define DLARFGP DLARFGP_64
@@ -905,9 +915,16 @@
 #define DORGQR DORGQR_64
 #define DORGR2 DORGR2_64
 #define DORGRQ DORGRQ_64
+#define DORGLK DORGLK_64
+#define DORGKL DORGKL_64
+#define DORGKR DORGKR_64
+#define DORGRK DORGRK_64
 #define DORGTR DORGTR_64
 #define DORGTSQR DORGTSQR_64
 #define DORGTSQR_ROW DORGTSQR_ROW_64
+#define DLUMM DLUMM_64
+#define DTRTRM DTRTRM_64
+#define DTRMMOOP DTRMMOOP_64
 #define DORHR_COL DORHR_COL_64
 #define DORM22 DORM22_64
 #define DORM2L DORM2L_64
@@ -1400,6 +1417,7 @@
 #define SLARF1F SLARF1F_64
 #define SLARF1L SLARF1L_64
 #define SLARFB SLARFB_64
+#define SLARFB0C2 SLARFB0C2_64
 #define SLARFB_GETT SLARFB_GETT_64
 #define SLARFG SLARFG_64
 #define SLARFGP SLARFGP_64
@@ -1499,9 +1517,16 @@
 #define SORGQR SORGQR_64
 #define SORGR2 SORGR2_64
 #define SORGRQ SORGRQ_64
+#define SORGLK SORGLK_64
+#define SORGKL SORGKL_64
+#define SORGKR SORGKR_64
+#define SORGRK SORGRK_64
 #define SORGTR SORGTR_64
 #define SORGTSQR SORGTSQR_64
 #define SORGTSQR_ROW SORGTSQR_ROW_64
+#define SLUMM SLUMM_64
+#define STRTRM STRTRM_64
+#define STRMMOOP STRMMOOP_64
 #define SORHR_COL SORHR_COL_64
 #define SORM22 SORM22_64
 #define SORM2L SORM2L_64
@@ -2046,6 +2072,7 @@
 #define ZLARF1F ZLARF1F_64
 #define ZLARF1L ZLARF1L_64
 #define ZLARFB ZLARFB_64
+#define ZLARFB0C2 ZLARFB0C2_64
 #define ZLARFB_GETT ZLARFB_GETT_64
 #define ZLARFG ZLARFG_64
 #define ZLARFGP ZLARFGP_64
@@ -2264,9 +2291,17 @@
 #define ZUNGQR ZUNGQR_64
 #define ZUNGR2 ZUNGR2_64
 #define ZUNGRQ ZUNGRQ_64
+#define ZUNGLK ZUNGLK_64
+#define ZUNGKL ZUNGKL_64
+#define ZUNGKR ZUNGKR_64
+#define ZUNGRK ZUNGRK_64
 #define ZUNGTR ZUNGTR_64
 #define ZUNGTSQR ZUNGTSQR_64
 #define ZUNGTSQR_ROW ZUNGTSQR_ROW_64
+#define ZLUMM ZLUMM_64
+#define ZTRTRM ZTRTRM_64
+#define ZACXPY ZACXPY_64
+#define ZTRMMOOP ZTRMMOOP_64
 #define ZUNHR_COL ZUNHR_COL_64
 #define ZUNM22 ZUNM22_64
 #define ZUNM2L ZUNM2L_64
diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f
new file mode 100644
index 0000000000..f4c8ddd36f
--- /dev/null
+++ b/SRC/slarfb0c2.f
@@ -0,0 +1,556 @@
+*> \brief \b SLARFB0C2 applies a block reflector or its transpose to a
+* rectangular matrix with a 0 block while constructing the explicit Q factor
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+*    $                     K, V, LDV, T, LDT, C, LDC)
+*        ! Scalar arguments
+*        INTEGER           M, N, K, LDV, LDC, LDT
+*        CHARACTER         SIDE, TRANS, DIRECT, STOREV
+*        ! True means that we are assuming C2 is the identity matrix
+*        ! and thus don't reference whatever is present in C2
+*        ! at the beginning.
+*        LOGICAL           C2I
+*        !
Array arguments +* REAL V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLARFB0C2 applies a real block reflector H or its transpose H**T to a +*> real m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**T from the Left +*> = 'R': apply H or H**T from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'T': apply H**T (Transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is REAL array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> The matrix V. See Further Details. 
+*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, dimension (LDT,K) +*> The triangular k by k matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the m by n matrix C. +*> On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) + ! 
Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + ! Array arguments + REAL V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, RQ, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! External subroutines + EXTERNAL SGEMM, STRMM, XERBLA + ! Parameters + REAL ONE, ZERO, NEG_ONE + PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE = -1.0E+0) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'T') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + RQ = (.NOT.DIRF).AND.(.NOT.COLV) + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! 
[ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('SLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('SLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = V(K+J,I) + END DO + END DO + ELSE + CALL SGEMM('Transpose', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF (C2I) THEN + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = C op(H) = C(I-V' op(T) V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = C op(H) = [ 0, C2 ](I - [ V1' ]op(T)[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]op(T)[ V1, V2 ] + ! [ V2' ] + ! + ! 
= [ 0, C2 ] - C2*V2'*op(T)[ V1, V2 ] + ! + ! = [ -C2*V2'*op(T)*V1, C2 - C2*V2'*op(T)*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*op(T) + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('SLARFB0C2', 2) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = V(J,K+I) + END DO + END DO + ELSE + CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF + ! + ! C1 = C1*T' + ! + IF (TRANST) THEN + CALL STRMM('Right', 'Upper', 'Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C, LDC) + ELSE + CALL STRMM('Right', 'Lower', 'No Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C, LDC) + END IF + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL STRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{M-K\times K} + ! C1=0\in\R^{K\times N} + ! C2\in\R^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! 
[ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('SLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('SLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = V(J,I) + END DO + END DO + ELSE + CALL SGEMM('Transpose', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF( C2I ) THEN + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE IF (RQ) THEN + ! We are computing C = C op(H) = C(I-V' op(T) V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\R^{K\times K} + ! V2\in\R^{K\times N-K} + ! C1=0\in\R^{M\times K} + ! C2\in\R^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = C op(H) = [ C2, 0 ] (I - [ V2' ]op(T)[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]op(T)[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*op(T)[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*op(T)*V2, C2*V2'*op(T)*V1 ] + ! + ! 
= [ C2 - C2*V2'*op(T)*V2, -C2*V2'*op(T)*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*op(T) + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! Check to ensure side has the expected value + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('SLARFB0C2', 2) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = V(J,I) + END DO + END DO + ELSE + CALL SGEMM('No Transpose', 'Transpose', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF + ! + ! C1 = C1*op(T) + ! + IF( TRANST ) THEN + CALL STRMM('Right', 'Lower', 'Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ELSE + CALL STRMM('Right', 'Upper', 'No Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC) + END IF + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/slarft.f b/SRC/slarft.f index 7a67fa57f4..515dcff5d1 100644 --- a/SRC/slarft.f +++ b/SRC/slarft.f @@ -176,23 +176,23 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * * .. Parameters .. * - REAL ONE, NEG_ONE, ZERO + REAL ONE, NEG_ONE, ZERO PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE=-1.0E+0) * * .. Local Scalars .. * - INTEGER I,J,L - LOGICAL QR,LQ,QL,DIRF,COLV + INTEGER I,J,L + LOGICAL QR,LQ,QL,RQ,LQT,RQT,DIRF,COLV,TDIRF,TCOLV * * .. External Subroutines .. * - EXTERNAL STRMM,SGEMM,SLACPY + EXTERNAL STRMM,SGEMM,SLACPY * * .. External Functions.. 
* - LOGICAL LSAME - EXTERNAL LSAME -* + LOGICAL LSAME + EXTERNAL LSAME +* * The general scheme used is inspired by the approach inside DGEQRT3 * which was (at the time of writing this code): * Based on the algorithm of Elmroth and Gustavson, @@ -223,26 +223,37 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * 'C' for STOREV, then they meant to provide 'R' * DIRF = LSAME(DIRECT,'F') + TDIRF = LSAME(DIRECT,'T') COLV = LSAME(STOREV,'C') + TCOLV = LSAME(STOREV,'T') * * QR happens when we have forward direction in column storage * QR = DIRF.AND.COLV * -* LQ happens when we have forward direction in row storage +* LQT happens when we have forward direction in row storage and want to compute the transpose of +* the T we would normally compute +* + LQT = DIRF.AND.TCOLV +* +* LQ happens when we have forward direction in row storage and want to compute the T we would +* normally compute * - LQ = DIRF.AND.(.NOT.COLV) + LQ = DIRF.AND.(.NOT.LQT) * * QL happens when we have backward direction in column storage * QL = (.NOT.DIRF).AND.COLV * -* The last case is RQ. 
Due to how we structured this, if the -* above 3 are false, then RQ must be true, so we never store -* this -* RQ happens when we have backward direction in row storage -* RQ = (.NOT.DIRF).AND.(.NOT.COLV) +* RQT happens when we have backward direction in row storage and want to compute the transpose +* of the T we would normally compute * + RQT = TDIRF.AND.(.NOT.COLV) +* +* RQ happens when we have backward direction in row storage and want to compute the T that we +* would normally compute +* + RQ = (.NOT.RQT).AND.(.NOT.COLV) IF(QR) THEN * * Break V apart into 6 components @@ -256,17 +267,17 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{1,1}\in\R^{l,l} unit lower triangular * V_{2,1}\in\R^{k-l,l} rectangular * V_{3,1}\in\R^{n-k,l} rectangular -* +* * V_{2,2}\in\R^{k-l,k-l} unit lower triangular * V_{3,2}\in\R^{n-k,k-l} rectangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} T_{1,2}| * |0 T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 * @@ -277,17 +288,17 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* +* * (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2') * = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2' -* +* * Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |-------| * |V_1 V_2| * |-------| -* +* * So, our product is equivalent to the matrix product * I - V*T*V' * This means, we can compute T_{1,1} and T_{2,2}, then use this information @@ -299,30 +310,29 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{2,2} recursively * - CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, - $ TAU(L+1), T(L+1, L+1), LDT) + CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) * -* Compute T_{1,2} +* Compute T_{1,2} * T_{1,2} = V_{2,1}' * DO J = 1, L DO I = 1, K-L - T(J, L+I) = V(L+I, J) + T(J,L+I) = V(L+I,J) END DO END DO * * T_{1,2} = T_{1,2}*V_{2,2} * CALL STRMM('Right', 'Lower', 'No transpose', 'Unit', L, - $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) * * T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2} * Note: We assume K <= N, and GEMM will do nothing if N=K * - CALL SGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE, - $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE, - $ T(1, L+1), LDT) + CALL SGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE, + $ V(K+1, 1), LDV, V(K+1,L+1), LDV, ONE, T(1, L+1), LDT) * * At this point, we have that T_{1,2} = V_1'*V_2 * All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} @@ -331,12 +341,12 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{1,2} = -T_{1,1}*T_{1,2} * CALL STRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) * * 
T_{1,2} = T_{1,2}*T_{2,2} * - CALL STRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + CALL STRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT) ELSE IF(LQ) THEN * @@ -350,19 +360,19 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{1,1}\in\R^{l,l} unit upper triangular * V_{1,2}\in\R^{l,k-l} rectangular * V_{1,3}\in\R^{l,n-k} rectangular -* +* * V_{2,2}\in\R^{k-l,k-l} unit upper triangular * V_{2,3}\in\R^{k-l,n-k} rectangular * * Where l = floor(k/2) * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} T_{1,2}| * |0 T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 * @@ -371,20 +381,20 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{1,2}\in\R^{l, k-l} rectangular * * Then, consider the product: -* -* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) -* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 -* +* +* (I - V_1'*T_{1,1}'*V_1)*(I - V_2'*T_{2,2}'*V_2) +* = I - V_1'*T_{1,1}'*V_1 - V_2'*T_{2,2}'*V_2 + V_1'*T_{1,1}'*V_1*V_2'*T_{2,2}'*V_2 +* * Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |---| * |V_1| * |V_2| * |---| -* +* * So, our product is equivalent to the matrix product -* I - V'*T*V +* I - V'*T'*V * This means, we can compute T_{1,1} and T_{2,2}, then use this information * to compute T_{1,2} * @@ -394,27 +404,26 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{2,2} recursively * - CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, - $ TAU(L+1), T(L+1, L+1), LDT) + CALL SLARFT(DIRECT, 
STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) * * Compute T_{1,2} * T_{1,2} = V_{1,2} * - CALL SLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT) + CALL SLACPY('All', L, K - L, V(1,L+1), LDV, T(1, L+1), LDT) * * T_{1,2} = T_{1,2}*V_{2,2}' * CALL STRMM('Right', 'Upper', 'Transpose', 'Unit', L, K-L, - $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) * * T_{1,2} = V_{1,3}*V_{2,3}' + T_{1,2} * Note: We assume K <= N, and GEMM will do nothing if N=K * CALL SGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE, - $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, - $ T(1, L+1), LDT) + $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, T(1, L+1), LDT) * * At this point, we have that T_{1,2} = V_1*V_2' * All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} @@ -423,13 +432,106 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{1,2} = -T_{1,1}*T_{1,2} * CALL STRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) * * T_{1,2} = T_{1,2}*T_{2,2} * CALL STRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT) + ELSE IF(LQT) THEN +* +* Break V apart into 6 components +* +* V = |----------------------| +* |V_{1,1} V_{1,2} V{1,3}| +* |0 V_{2,2} V{2,3}| +* |----------------------| +* +* V_{1,1}\in\R^{l,l} unit upper triangular +* V_{1,2}\in\R^{l,k-l} rectangular +* V_{1,3}\in\R^{l,n-k} rectangular +* +* V_{2,2}\in\R^{k-l,k-l} unit upper triangular +* V_{2,3}\in\R^{k-l,n-k} rectangular +* +* Where l = floor(k/2) +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} 0 | +* |T_{2,1} T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\R^{l, l} lower triangular +* T_{2,2}\in\R^{k-l, k-l} lower triangular +* T_{2,1}\in\R^{k-l, l} rectangular +* +* Then, consider the product: +* +* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) +* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 +* +* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{2,1} +* +* Compute T_{1,1} recursively +* + CALL SLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) + +* +* Compute T_{2,1} +* T_{2,1} = V_{1,2}' +* + DO I = 1, K-L + DO J = 1, L + T(L+I,J) = V(J,L+I) + END DO + END DO +* +* T_{2,1} = V_{2,2}*T_{2,1} +* + CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit', K-L, L, + $ ONE, V(L+1,L+1), LDV, T(L+1,1), LDT) +* +* T_{2,1} = V_{2,3}*V_{1,3}' + T_{2,1} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL SGEMM('No Transpose', 'Transpose', K-L, L, N-K, ONE, + $ V(L+1,K+1), LDV, V(1, K+1), LDV, ONE, T(L+1,1), LDT) +* +* At this point, we have that T_{2,1} = V_2*V_1' +* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} +* respectively. 
+* +* T_{2,1} = -T_{2,2}*T_{2,1} +* + CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K-L, L, NEG_ONE, T(L+1,L+1), LDT, T(L+1,1), LDT) +* +* T_{2,1} = T_{2,1}*T_{1,1} +* + CALL STRMM('Right', 'Lower', 'No Transpose', 'Non-unit', + $ K-L, L, ONE, T, LDT, T(L+1,1), LDT) ELSE IF(QL) THEN * * Break V apart into 6 components @@ -442,18 +544,18 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * * V_{1,1}\in\R^{n-k,k-l} rectangular * V_{2,1}\in\R^{k-l,k-l} unit upper triangular -* +* * V_{1,2}\in\R^{n-k,l} rectangular * V_{2,2}\in\R^{k-l,l} rectangular * V_{3,2}\in\R^{l,l} unit upper triangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} 0 | * |T_{2,1} T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 * @@ -464,17 +566,17 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* +* * (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1') * = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1' -* +* * Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |-------| * |V_1 V_2| * |-------| -* +* * So, our product is equivalent to the matrix product * I - V*T*V' * This means, we can compute T_{1,1} and T_{2,2}, then use this information @@ -482,34 +584,34 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{1,1} recursively * - CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) + CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) * * Compute T_{2,2} recursively * CALL SLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV, - $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) + $ 
TAU(K-L+1), T(K-L+1,K-L+1), LDT) * * Compute T_{2,1} * T_{2,1} = V_{2,2}' * DO J = 1, K-L DO I = 1, L - T(K-L+I, J) = V(N-K+J, K-L+I) + T(K-L+I,J) = V(N-K+J, K-L+I) END DO END DO * * T_{2,1} = T_{2,1}*V_{2,1} * CALL STRMM('Right', 'Upper', 'No transpose', 'Unit', L, - $ K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT) + $ K-L, ONE, V(N-K+1,1), LDV, T(K-L+1,1), LDT) * * T_{2,1} = V_{2,2}'*V_{2,1} + T_{2,1} * Note: We assume K <= N, and GEMM will do nothing if N=K * CALL SGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE, - $ V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1), - $ LDT) + $ V(1,K-L+1), LDV, V, LDV, ONE, T(K-L+1,1), LDT) * * At this point, we have that T_{2,1} = V_2'*V_1 * All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} @@ -518,17 +620,13 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{2,1} = -T_{2,2}*T_{2,1} * CALL STRMM('Left', 'Lower', 'No transpose', 'Non-unit', L, - $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, - $ T(K-L+1, 1), LDT) + $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT) * * T_{2,1} = T_{2,1}*T_{1,1} * CALL STRMM('Right', 'Lower', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) - ELSE -* -* Else means RQ case -* + $ K-L, ONE, T, LDT, T(K-L+1,1), LDT) + ELSE IF(RQ) THEN * Break V apart into 6 components * * V = |-----------------------| @@ -543,13 +641,13 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{2,2}\in\R^{l,k-l} rectangular * V_{2,3}\in\R^{l,l} unit lower triangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} 0 | * |T_{2,1} T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 * @@ -560,51 +658,51 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* -* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1) -* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1 -* +* +* (I - V_2'*T_{2,2}'*V_2)*(I - V_1'*T_{1,1}'*V_1) +* = I - V_2'*T_{2,2}'*V_2 - V_1'*T_{1,1}'*V_1 + V_2'*T_{2,2}'*V_2*V_1'*T_{1,1}'*V_1 +* * Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |---| * |V_1| * |V_2| * |---| -* +* * So, our product is equivalent to the matrix product -* I - V'TV +* I - V'*T'*V * This means, we can compute T_{1,1} and T_{2,2}, then use this information * to compute T_{2,1} * * Compute T_{1,1} recursively * - CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) + CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) * * Compute T_{2,2} recursively * - CALL SLARFT(DIRECT, STOREV, N, L, V(K-L+1, 1), LDV, - $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) + CALL SLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV, + $ TAU(K-L+1), T(K-L+1,K-L+1), LDT) * * Compute T_{2,1} * T_{2,1} = V_{2,2} * - CALL SLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV, - $ T(K-L+1, 1), LDT) + CALL SLACPY('All', L, K-L, V(K-L+1,N-K+1), LDV, T(K-L+1,1), + $ LDT) * * T_{2,1} = T_{2,1}*V_{1,2}' * CALL STRMM('Right', 'Lower', 'Transpose', 'Unit', L, K-L, - $ ONE, V(1, N-K+1), LDV, T(K-L+1, 1), LDT) + $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT) * -* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1} +* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1} * Note: We assume K <= N, and GEMM will do nothing if N=K * - CALL SGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE, - $ V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1), - $ LDT) + CALL SGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE, + $ V(K-L+1,1), LDV, V, LDV, ONE, T(K-L+1,1), LDT) * * At this point, we have that T_{2,1} = V_2*V_1' @@ -614,13 
+712,103 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{2,1} = -T_{2,2}*T_{2,1} * CALL STRMM('Left', 'Lower', 'No tranpose', 'Non-unit', L, - $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, - $ T(K-L+1, 1), LDT) + $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT) * * T_{2,1} = T_{2,1}*T_{1,1} * CALL STRMM('Right', 'Lower', 'No tranpose', 'Non-unit', L, - $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) + $ K-L, ONE, T, LDT, T(K-L+1,1), LDT) + ELSE IF(RQT) THEN +* Break V apart into 6 components +* +* V = |-----------------------| +* |V_{1,1} V_{1,2} 0 | +* |V_{2,1} V_{2,2} V_{2,3}| +* |-----------------------| +* +* V_{1,1}\in\R^{k-l,n-k} rectangular +* V_{1,2}\in\R^{k-l,k-l} unit lower triangular +* +* V_{2,1}\in\R^{l,n-k} rectangular +* V_{2,2}\in\R^{l,k-l} rectangular +* V_{2,3}\in\R^{l,l} unit lower triangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* | 0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\R^{k-l, k-l} non-unit upper triangular +* T_{2,2}\in\R^{l, l} non-unit upper triangular +* T_{1,2}\in\R^{k-l, l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1) +* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1 +* +* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) +* +* Compute T_{2,2} recursively +* + CALL SLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV, + $ TAU(K-L+1), T(K-L+1,K-L+1), LDT) +* +* Compute T_{1,2} +* T_{1,2} = V_{2,2}' +* + DO I = 1, K-L + DO J = 1, L + T(I,K-L+J) = V(K-L+J, N-K+I) + END DO + END DO +* +* T_{1,2} = V_{1,2}T_{1,2} +* + CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit', K-L, L, + $ ONE, V(1,N-K+1), LDV, T(1,K-L+1), LDT) +* +* T_{1,2} = V_{1,1}V_{2,1}' + T_{1,2} +* + CALL SGEMM('No Tranpose', 'Transpose', K-L, L, N-K, ONE, V, + $ LDV, V(K-L+1,1), LDV, ONE, T(1, K-L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1*V_2' +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. 
+* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-Unit', + $ K-L, L, NEG_ONE, T, LDT, T(1, K-L+1), LDT) +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL STRMM('Right', 'Upper', 'No Transpose', 'Non-Unit', + $ K-L, L, ONE, T(K-L+1,K-L+1), LDT, T(1, K-L+1), LDT) END IF END SUBROUTINE diff --git a/SRC/slumm.f b/SRC/slumm.f new file mode 100644 index 0000000000..1712ce32d0 --- /dev/null +++ b/SRC/slumm.f @@ -0,0 +1,350 @@ +*> \brief \b SLUMM computes an in place triangular times triangluar matrix multiplication +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE SLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA, +* $ A, LDA) +* +* .. Scalar Arguments .. +* INTEGER N, LDA +* CHARACTER SIDEL, DIAGL, DIAGU +* REAL ALPHA +* +* .. Array Arguments .. +* REAL A(LDA,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLUMM performs one of the matrix-matrix operations +*> +*> C = \alpha L * U +*> or +*> C = \alpha U * L +*> +*> where \alpha is a scalar, L is a unit, or non-unit, lower triangular matrix, and U is a unit, or +*> non-unit, upper triangular matrix, and at most one of L and U are non-unit +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDEL +*> \verbatim +*> SIDEL is CHARACTER*1 +*> On entry, SIDE specifies whether L multiplies U from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' A = \alpha L * U +*> +*> SIDE = 'R' or 'r' A = \alpha U * L +*> \endverbatim +*> +*> \param[in] DIAGL +*> \verbatim +*> DIAGL is CHARACTER*1 +*> On entry, DIAGL specifies whether or not L is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' L is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' L is not assumed to be unit +*> triangular. 
+*> \endverbatim +*> +*> \param[in] DIAGU +*> \verbatim +*> DIAGU is CHARACTER*1 +*> On entry, DIAGU specifies whether or not U is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' U is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' U is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> M is INTEGER +*> On entry, N specifies the number of rows and columns of L and U. M must be at +*> least zero. +*> \endverbatim +*> +*> \param[in] ALPHA +*> \verbatim +*> ALPHA is REAL. +*> On entry, ALPHA specifies the scalar alpha. When alpha is +*> zero then A is not referenced, and A need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension ( LDA, N ) where +*> Before entry the leading n-by-n strictly upper triangular part of the array +*> A must contain the upper triangular matrix U and the strictly lower triangular part of +*> the leading n-by-n submatrix must contain the lower triangular matrix L. +*> If DIAGL != 'U', then the diagonal is assumed to be part of L, and if +*> DIAGU != 'U', then the diagonal is assumed to be part of U. +*> Note: At most one of DIAGL and DIAGU can be not equal to 'U'. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> On entry, LDA specifies the first dimension of A as declared +*> in the calling (sub) program. LDA must be at least max( 1, n ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== +c Cost: 2/3 * (n^3 - n) + RECURSIVE SUBROUTINE SLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA, + $ A, LDA) +* +* .. Scalar Arguments .. + INTEGER N, LDA + CHARACTER SIDEL, DIAGL, DIAGU + REAL ALPHA +* +* .. Array Arguments .. + REAL A(LDA,*) +* .. 
+* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, STRMM, SLASET, + $ XERBLA +* .. +* .. Local Scalars .. + INTEGER K + LOGICAL LLEFT, LUNIT, UUNIT +* .. +* .. Local Parameters .. + REAL ONE, ZERO + PARAMETER(ONE=1.0E+0, ZERO=0.0E+0) +* .. +* +* Determine if our flags are valid or not. We can have at +* most one of DIAGU, DIAGL not equal to 'U' +* + LUNIT = LSAME(DIAGL, 'U') + UUNIT = LSAME(DIAGU, 'U') +* +* If both of the above are false, then it is impossible to have the +* structure that we are exploiting in this routine +* Note: It is possible to allow the matrices to share a non-unit +* diagonal as long as the values are the exact same, but there is +* currently no use case for this that I am aware of. +* + IF ((.NOT.LUNIT).AND.(.NOT.UUNIT)) THEN +* +* We say the error is in the last set DIAG value as we cannot know +* what the user actually meant. 
+* + CALL XERBLA( 'SLUMM', 3 ) + RETURN + END IF +* +* Determine which side L is on +* + LLEFT = LSAME(SIDEL, 'L') +* +* Early exit if possible +* + IF (N.EQ.0) THEN + RETURN + END IF + IF (ALPHA.EQ.ZERO) THEN + CALL SLASET('All', N, N, ZERO, ZERO, A, LDA) + RETURN + END IF +* +* Terminating Case +* + IF (N.EQ.1) THEN +* +* Since at most one of L and U are non-unit triangular, whatever side L is on, we are still +* always computing one of +* +* 1) A(1,1) = ALPHA*A(1,1) +* 2) A(1,1) = ALPHA +* +* Where the first case happens when exactly one of L and U are unit triangular, while the +* second case happens when both L and U are unit triangular +* + IF (LUNIT.AND.UUNIT) THEN + A(1,1) = ALPHA + ELSE + A(1,1) = ALPHA*A(1,1) + END IF + RETURN + END IF +* +* Recursive Case +* + K = N/2 +* +* Regardless of us computing A = L*U or A = U*L, break break A apart as follows: +* +* |---| +* A = | U | +* | L | +* |---| +* +* Further break down L as +* |---------------| +* L = | L_{11} 0 | +* | L_{21} L_{22} | +* |---------------| +* +* Where: +* +* L_{11}\in\R^{k\times k} is lower triangular (assumed unit iff DIAGL == 'U') +* L_{21}\in\R^{n-k\times n} is rectangular +* L_{22}\in\R^{n-k\times n-k} is lower triangular (assumed unit iff DIAGL == 'U') +* +* Further break down U as +* |---------------| +* U = | U_{11} U_{21} | +* | 0 U_{22} | +* |---------------| +* +* Where: +* +* U_{11}\in\R^{k\times k} is upper triangular (assumed unit iff DIAGU == 'U') +* U_{12}\in\R^{n\times n-k} is rectangular +* U_{22}\in\R^{n-k\times n-k} is upper triangular (assumed unit iff DIAGU == 'U') + IF (LLEFT) THEN +* +* This means we are computing +* |---------------| |---------------| +* A = L*U = \alpha | L_{11} 0 | * | U_{11} U_{12} | +* | L_{21} L_{22} | | 0 U_{22} | +* |---------------| |---------------| +* +* |---------------------------------------------| +* = \alpha | L_{11}*U_{11} L_{11}*U_{12} | +* | L_{21}*U_{11} L_{21}*U_{12} + L_{22}*U_{22} | +* 
|---------------------------------------------| +* +* We compute these in the following order +* +* A_{22} = \alpha*L_{22}*U_{22} (This routine) +* A_{22} = \alpha*L_{21}*U_{12} + A_{22} (GEMM) +* +* A_{12} = \alpha*L_{11}*U_{12} (TRMM) +* A_{21} = \alpha*L_{21}*U_{11} (TRMM) +* +* A_{11} = \alpha*L_{11}*U_{11} (This routine) +* +* Compute A_{22} +* +* A_{22} = \alpha*L_{22}*U_{22} +* + CALL SLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA, + $ A(K+1, K+1), LDA) +* +* A_{22} = \alpha L_{21}*U_{12} + A_{22} +* + CALL SGEMM('No Transpose', 'No Transpose', N-K, N-K, K, + $ ALPHA, A(K+1,1), LDA, A(1,K+1), LDA, ONE, + $ A(K+1,K+1), LDA) +* +* Compute A_{12} +* +* A_{12} = \alpha*L_{11}*U_{12} +* + CALL STRMM('Left', 'Lower', 'No Transpose', DIAGL, K, + $ N-K, ALPHA, A, LDA, A(1,K+1), LDA) +* +* Compute A_{21} +* +* A_{21} = \alpha*L_{21}*U_{11} +* + CALL STRMM('Right', 'Upper', 'No Transpose', DIAGU, N-K, + $ K, ALPHA, A, LDA, A(K+1,1), LDA) +* +* Compute A_{11} +* +* A_{11} = \alpha*L_{11}*U_{11} +* + CALL SLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA) + ELSE +* +* This means we are computing +* |---------------| |---------------| +* A = U*L = \alpha | U_{11} U_{12} | * | L_{11} 0 | +* | 0 U_{22} | | L_{21} L_{22} | +* |---------------| |---------------| +* +* |---------------------------------------------| +* = \alpha | U_{11}*L_{11} + U_{12}*L_{21} U_{12}*L_{22} | +* | U_{22}*L_{21} U_{22}*L_{22} | +* |---------------------------------------------| +* +* We compute these in the following order +* +* A_{11} = \alpha*U_{11}*L_{11} (This routine) +* A_{11} = \alpha*U_{12}*L_{21} + A_{11} (GEMM) +* +* A_{12} = \alpha*U_{12}*L_{22} (TRMM) +* A_{21} = \alpha*U_{22}*L_{21} (TRMM) +* +* A_{22} = \alpha*U_{22}*L_{22} (This routine) +* +* Compute A_{11} +* +* A_{11} = \alpha*U_{11}*L_{11} +* + CALL SLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA) +* +* A_{11} = \alpha*U_{12}*L_{21} + A_{11} +* + CALL SGEMM('No Transpose', 'No Transpose', K, K, N-K, + $ ALPHA, A(1,K+1), LDA, A(K+1,1), LDA, ONE, 
A, LDA) +* +* Compute A_{12} +* +* A_{12} = \alpha*U_{12}*L_{22} +* + CALL STRMM('Right', 'Lower', 'No Transpose', DIAGL, K, + $ N-K, ALPHA, A(K+1,K+1), LDA, A(1,K+1), LDA) +* +* Compute A_{21} +* +* A_{21} = \alpha*U_{22}*L_{21} +* + CALL STRMM('Left', 'Upper', 'No Transpose', DIAGU, N-K, + $ K, ALPHA, A(K+1, K+1), LDA, A(K+1,1), LDA) +* +* Compute A_{22} +* +* A_{22} = \alpha*U_{22}*L_{22} +* + CALL SLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA, + $ A(K+1, K+1), LDA) + END IF + END SUBROUTINE diff --git a/SRC/sorgkl.f b/SRC/sorgkl.f new file mode 100644 index 0000000000..e90f1c9ee3 --- /dev/null +++ b/SRC/sorgkl.f @@ -0,0 +1,173 @@ +*> \brief \b SORGKL computes the explicit Q factor from SGEQLF and SLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SORGKL(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* REAL Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORGKL generates an m by n real matrix Q with orthonormal columns, +*> which is defined as the last n columns of the product of n +*> elementary reflectors +*> +*> Q = I - V*T*V**T = H(n) . . . H(2) H(1) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by SGEQLF and T is the n by n matrix returned by SLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V, and the order of T. +*> N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is REAL array, dimension (LDQ,N) +*> On entry, Q(1:m-n+i-1,i) contains the vector which defines the +*> elementary reflector H(i), for i=1,...,n as returned by SGEQLF. 
+*> In addition, the lower triangular portion of the submatrix given +*> by Q(m-n+1:m,1:n) will contain the arry T as returned by SLARFT. +*> See further details for more information. +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The storage of the V and T components inside Q is best illustrated by +*> the following example with m = 5, n = 3. +*> +*> Q = |----------| +*> | V1 V2 V3 | +*> | V1 V2 V3 | +*> | T1 V2 V3 | +*> | T1 T2 V3 | +*> | T1 T2 T3 | +*> |----------| +*> +*> \endverbatim +*> +* ===================================================================== + + SUBROUTINE SORGKL(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + REAL Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL NEG_ONE, ONE + PARAMETER(NEG_ONE=-1.0E+0, ONE=1.0E+0) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL STRMM, STRTRM, SLUMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Executable Statements .. 
+* +* Break Q apart as follows +* +* |---| +* Q = | V | +* | T | +* |---| +* +* Where T is an n-by-n lower triangular matrix, and V is as described +* in the Further Details section +* +* In turn, break apart V as follows +* +* |-----| +* V = | V_2 | +* | V_1 | +* |-----| +* +* Where: +* +* V_1 \in \R^{n\times n} assumed unit upper triangular +* V_2 \in \R^{m-n\times n} +* +* Compute T = T*V_1**T +* + CALL STRTRM('Right', 'Lower', 'Transpose', 'Non-Unit', 'Unit', + $ N, ONE, Q(M-N+1,1), LDQ, Q(M-N+1,1), LDQ) +* +* Compute Q = -VT. This means that we need to break apart +* Our computation in two parts +* +* |--------| +* Q = | -V_2*T | +* | -V_1*T | +* |--------| +* +* Q_2 = -V_2*T (TRMM) but only when necessary +* + IF (M.GT.N) THEN + CALL STRMM('Right', 'Lower', 'No Transpose', 'Non-Unit', + $ M-N, N, NEG_ONE, Q(M-N+1,1), LDQ, Q, LDQ) + END IF +* +* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication) +* + CALL SLUMM('Right', 'Non-Unit', 'Unit', N, NEG_ONE, + $ Q(M-N+1,1), LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(M-N+I,I) = Q(M-N+I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/sorgkr.f b/SRC/sorgkr.f new file mode 100644 index 0000000000..4bd870cdd8 --- /dev/null +++ b/SRC/sorgkr.f @@ -0,0 +1,153 @@ +*> \brief \b SORGKR computes the explicit Q factor from SGEQRF and SLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SORGKR(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* REAL Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORGKR generates an m by n real matrix Q with orthonormal columns, +*> which is defined as the first n columns of the product of n +*> elementary reflectors +*> +*> Q = I - V*T*V**T = H(1) H(2) . . . 
H(n) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by SGEQRF and T is the n by n matrix returned by SLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V, and the order of T. +*> N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is REAL array, dimension (LDQ,N) +*> On entry, the upper triangular part and diagonal contains +*> The array T as returned from SLARFT. In addition, the +*> strictly lower triangular portion of the i-th column contains +*> the vector which defines the elementary reflector H(i), +*> for i = 1,2,...,n, as returned by SGEQRF +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + SUBROUTINE SORGKR(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + REAL Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL NEG_ONE, ONE + PARAMETER(NEG_ONE=-1.0E+0, ONE=1.0E+0) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL STRMM, STRTRM, SLUMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Executable Statements .. 
+* +* Break Q apart as follows +* +* |---| +* Q = | T | +* | V | +* |---| +* +* Where T is an n-by-n upper triangular matrix, and V is an +* m-by-n assumed unit lower trapezoidal matrix +* +* In turn, break apart V as follows +* +* |-----| +* V = | V_1 | +* | V_2 | +* |-----| +* +* Where: +* +* V_1 \in \R^{n\times n} assumed unit lower triangular +* V_2 \in \R^{m-n\times n} +* +* Compute T = T*V_1**T +* + CALL STRTRM('Right', 'Upper', 'Transpose', 'Non-unit', 'Unit', + $ N, ONE, Q, LDQ, Q, LDQ) +* +* Compute Q = -VT. This means that we need to break apart +* Our computation in two parts +* +* |--------| +* Q = | -V_1*T | +* | -V_2*T | +* |--------| +* +* Q_2 = -V_2*T (TRMM) but only when necessary +* + IF (M.GT.N) THEN + CALL STRMM('Right', 'Upper', 'No Transpose', 'Non-unit', + $ M-N, N, NEG_ONE, Q, LDQ, Q(N+1,1), LDQ) + END IF +* +* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication) +* + CALL SLUMM('Left', 'Unit', 'Non-Unit', N, NEG_ONE, Q, LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,I) = Q(I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/sorglk.f b/SRC/sorglk.f new file mode 100644 index 0000000000..f5a486ff79 --- /dev/null +++ b/SRC/sorglk.f @@ -0,0 +1,149 @@ +*> \brief \b SORGLK computes the explicit Q factor from SGELQF and SLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SORGLK(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* REAL Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORGLK generates an m by n real matrix Q with orthonormal columns, +*> which is defined as the first n rows of the product of n +*> elementary reflectors +*> +*> Q = I - V'*T*V = H(1) H(2) . . . 
H(n) +*> +*> Where V is an m by n matrix whose rows are householder reflectors +*> as returned by SGELQF and T is the n by n matrix returned by SLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V, and the order of T. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V. N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is REAL array, dimension (LDQ,N) +*> On entry, the lower triangular part and diagonal contains +*> The array T as returned from SLARFT. In addition, the +*> strictly upper triangular portion of the i-th row contains +*> the vector which defines the elementary reflector H(i), +*> for i = 1,2,...,m, as returned by SGELQF +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + SUBROUTINE SORGLK(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + REAL Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL NEG_ONE, ONE + PARAMETER(NEG_ONE=-1.0E+0, ONE=1.0E+0) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL STRMM, STRTRM, SLUMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Executable Statements .. 
+* +* Break Q apart as follows +* +* |-----| +* Q = | T V | +* |-----| +* +* Where T is an m-by-m lower triangular matrix, and V is an +* m-by-n assumed unit upper trapezoidal matrix +* +* In turn, break apart V as follows +* +* |---------| +* V = | V_1 V_2 | +* |---------| +* +* Where: +* +* V_1 \in \R^{m\times m} assumed unit upper triangular +* V_2 \in \R^{m\times n-m} +* +* Compute T = V_1'*T +* + CALL STRTRM('Left', 'Lower', 'Transpose', 'Non-unit', 'Unit', + $ M, ONE, Q, LDQ, Q, LDQ) +* +* Compute Q = -TV. This means that we need to break apart +* Our computation in two parts +* +* |---------------| +* Q = | -T*V_1 -T*V_2 | +* |---------------| +* +* Q_2 = -T*V_2 (TRMM) but only when necessary +* + IF (N.GT.M) THEN + CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ M, N-M, NEG_ONE, Q, LDQ, Q(1,M+1), LDQ) + END IF +* +* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication) +* + CALL SLUMM('Left', 'Non-unit', 'Unit', M, NEG_ONE, Q, LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,I) = Q(I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/sorglq.f b/SRC/sorglq.f index fb5ef79cce..61e0c4c968 100644 --- a/SRC/sorglq.f +++ b/SRC/sorglq.f @@ -146,7 +146,8 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORGL2, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORGL2, + $ SORGLK, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -162,7 +163,7 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'SORGLQ', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, M )*NB + LWKOPT = MAX( 1, M ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -191,94 +192,92 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX( 0, ILAENV( 3, 'SORGLQ', ' ', M, N, K, -1 ) ) IWS = M - IF( NB.GT.1 .AND. 
NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'SORGLQ', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * - LDWORK = M - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked version * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'SORGLQ', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL SORGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + CALL SLARFT( 'Forward', 'Transpose', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), A( I, I ), LDA ) * -* Set A(kk+1:m,1:kk) to zero. +* Apply H to A(i+ib:m,i:n) from the right * - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL SLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I+IB,I), LDA) * -* Use unblocked code for the last or only block. 
+* Apply H to columns i:n of current block + + CALL SORGLK( IB, N-I+1, A( I, I ), LDA) * - IF( KK.LT.M ) - $ CALL SORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) +* Use blocked code * - IF( KK.GT.0 ) THEN + DO I = KI + 1, 1, -NB + IB = NB * -* Use blocked code +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL SLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I), + $ LDA, TAU( I ), A( I, I ), LDA ) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN +* Apply H to A(i+ib:m,i:n) from the right * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + CALL SLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I+IB,I), LDA) * - CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) +* Apply H to columns i:n of current block * -* Apply H**T to A(i+ib:m,i:n) from the right + CALL SORGLK( IB, N-I+1, A( I, I ), LDA) + END DO * - CALL SLARFB( 'Right', 'Transpose', 'Forward', - $ 'Rowwise', - $ M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK, - $ LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ), - $ LDWORK ) - END IF +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary * -* Apply H**T to columns i:n of current block + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * - CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . 
H(i+ib-1) * -* Set columns 1:i-1 of current block to zero + CALL SLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I), + $ LDA, TAU( I ), A( I, I ), LDA ) * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE +* Apply H to A(i+ib:m,i:n) from the right +* + CALL SLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I+IB,I), LDA) +* +* Apply H to columns i:n of current block +* + CALL SORGLK( IB, N-I+1, A( I, I ), LDA) + END IF END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/sorgql.f b/SRC/sorgql.f index 7257ff2d95..960b4857cc 100644 --- a/SRC/sorgql.f +++ b/SRC/sorgql.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -143,11 +141,12 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, J, KK, LWKOPT, NB, + $ NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORG2L, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORG2L, + $ SORGKL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -178,7 +177,8 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB + ! Only need a workspace for calls to dorg2l + LWKOPT = N END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * @@ -201,88 +201,74 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX(0, ILAENV(3, 'SORGQL', ' ', M, N, K, -1)) IWS = N - IF( NB.GT.1 .AND. 
NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'SORGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We want to use the blocking method as long as our matrix is big enough * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'SORGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL SORG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB * -* Set A(m-kk+1:m,1:n-kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * -* Use unblocked code for the first or only block. 
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL SORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows 1:m-k+i+ib-1 of current block * -* Use blocked code + CALL SORGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA) + +* Use blocked code on the remaining blocks if there are any. * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + DO I = NB+1, K, NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* The last block may be less than size NB * - CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + IB = MIN(NB, K-I+1) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL SLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) * -* Apply H to rows 1:m-k+i+ib-1 of current block +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, - $ TAU( I ), WORK, IINFO ) + CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * -* Set rows m-k+i+ib:m of current block to zero +* Apply H to rows 1:m-k+i+ib-1 of current block * - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL SORGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA) + END DO END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/sorgqr.f 
b/SRC/sorgqr.f index 47d1fd248c..c52acef089 100644 --- a/SRC/sorgqr.f +++ b/SRC/sorgqr.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -147,7 +145,8 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORG2R, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORG2R, + $ SORGKR, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -163,7 +162,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB + LWKOPT = MAX( 1, N ) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -192,94 +191,92 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX(0, ILAENV(3, 'SORGQR', ' ', M, N, K, -1)) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'SORGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Handle the first block assuming we are applying to the +* identity, then resume regular blocking method after * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. 
* - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'SORGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL SORG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN + I = KK + 1 + IB = NB * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Set A(1:kk,kk+1:n) to zero. +* Apply H to A(i:m,i+ib:n) from the left * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * -* Use unblocked code for the last or only block. +* Apply H to rows i:m of current block * - IF( KK.LT.N ) - $ CALL SORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL SORGKR(M-I+1, IB, A(I,I), LDA) + DO I = KI + 1, 1, -NB + IB = NB * - IF( KK.GT.0 ) THEN +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Use blocked code + CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Apply H to A(i:m,i+ib:n) from the left * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . 
H(i+ib-1) + CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * - CALL SLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Apply H to rows i:m of current block * -* Apply H to A(i:m,i+ib:n) from the left + CALL SORGKR(M-I+1, IB, A(I,I), LDA) + END DO * - CALL SLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary * -* Apply H to rows i:m of current block + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * - CALL SORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * -* Set rows 1:i-1 of current block to zero + CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE - END IF +* Apply H to A(i:m,i+ib:n) from the left * + CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + +* +* Apply H to rows i:m of current block +* + CALL SORGKR(M-I+1, IB, A(I,I), LDA) + END IF + END IF WORK( 1 ) = SROUNDUP_LWORK(IWS) RETURN * diff --git a/SRC/sorgrk.f b/SRC/sorgrk.f new file mode 100644 index 0000000000..c2429d93e4 --- /dev/null +++ b/SRC/sorgrk.f @@ -0,0 +1,168 @@ +*> \brief \b SORGRK computes the explicit Q factor from SGERQF and SLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SORGRK(M, N, Q, LDQ) +* +* .. Scalar Arguments .. 
+* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* REAL Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORGRK generates an m by n real matrix Q with orthonormal rows, +*> which is defined as the last m rows of the product of m +*> elementary reflectors +*> +*> Q = I - V'*T*V = H(m) . . . H(2) H(1) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by SGERQF and T is the n by n matrix returned by SLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V, and the order of T. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V. N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is REAL array, dimension (LDQ,N) +*> On entry, Q(i,1:n-m-1+i) contains the vector which defines the +*> elementary reflector H(i), for i=1,...,n as returned by SGERQF. +*> In addition, the upper triangular portion of the submatrix given +*> by Q(1:m,n-m:n) will contain the array T as returned by SLARFT. +*> See further details for more information. +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The storage of the V and T components inside Q is best illustrated by +*> the following example with m = 3, n = 5. +*> +*> Q = |----------------| +*> | V1 V1 T1 T1 T1 | +*> | V2 V2 V2 T2 T2 | +*> | V3 V3 V3 V3 T3 | +*> |----------------| +*> +*> \endverbatim +*> +* ===================================================================== + + SUBROUTINE SORGRK(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. 
of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + REAL Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL NEG_ONE, ONE + PARAMETER(NEG_ONE=-1.0E+0, ONE=1.0E+0) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL STRMM, STRTRM, SLUMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Executable Statements .. +* +* Break Q apart as follows +* +* |-----| +* Q = | V T | +* |-----| +* +* Where T is an m-by-m upper triangular matrix, and V is as described +* in the Further Details section +* +* In turn, break apart V as follows +* +* |---------| +* V = | V_2 V_1 | +* |---------| +* +* Where: +* +* V_1 \in \R^{m\times m} assumed unit lower triangular +* V_2 \in \R^{m\times n-m} +* +* Compute T = V_1'*T +* + CALL STRTRM('Left', 'Upper', 'Transpose', 'Non-Unit', 'Unit', + $ M, ONE, Q(1,N-M+1), LDQ, Q(1,N-M+1), LDQ) +* +* Compute Q = -TV. This means that we need to break apart +* Our computation in two parts +* +* |---------------| +* Q = | -T*V_2 -T*V_1 | +* |---------------| +* +* Q_2 = -T*V_2 (TRMM) but only when necessary +* + IF (N.GT.M) THEN + CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-Unit', + $ M, N-M, NEG_ONE, Q(1,N-M+1), LDQ, Q, LDQ) + END IF +* +* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication) +* + CALL SLUMM('Right', 'Unit', 'Non-Unit', M, NEG_ONE, + $ Q(1,N-M+1), LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,N-M+I) = Q(I,N-M+I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/sorgrq.f b/SRC/sorgrq.f index 5fe685c1d8..27fd47803c 100644 --- a/SRC/sorgrq.f +++ b/SRC/sorgrq.f @@ -147,7 +147,8 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL SLARFB, SLARFT, SORGR2, XERBLA + EXTERNAL SLARFB0C2, SLARFT, SORGR2, + $ SORGRK, XERBLA * .. * .. 
Intrinsic Functions .. INTRINSIC MAX, MIN @@ -178,7 +179,7 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'SORGRQ', ' ', M, N, K, -1 ) - LWKOPT = M*NB + LWKOPT = M END IF WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) * @@ -208,83 +209,72 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * Determine when to cross over from blocked to unblocked code. * NX = MAX( 0, ILAENV( 3, 'SORGRQ', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN -* -* Determine if workspace is large enough for blocked code. -* - LDWORK = M - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN -* -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. -* - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'SORGRQ', ' ', M, N, K, - $ -1 ) ) - END IF - END IF END IF * IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Use blocked code after the first block. -* The last kk rows are handled by the block method. -* - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) +* We want to use the blocking method as long as our matrix is big enough +* and it's deemed worthwhile * -* Set A(1:m-kk,n-kk+1:n) to zero. -* - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE + KK = K ELSE KK = 0 END IF * -* Use unblocked code for the first or only block. +* Potentially bail to the unblocked code * - CALL SORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + IF( KK.EQ.0 ) THEN + CALL SORGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) + END IF * IF( KK.GT.0 ) THEN * -* Use blocked code +* Factor the first block assuming that our first application +* will be on the Identity matrix * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + I = 1 + IB = NB + II = M - K + I +* +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . 
H(i+1) H(i) +* + CALL SLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA ) +* +* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) + CALL SLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, + $ A( II, N-K+I ), LDA, A, LDA) * - CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) +* Apply H to columns 1:n-k+i+ib-1 of current block +* + CALL SORGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA ) + + DO I = NB + 1, K, NB +* +* The last block may be less than size NB +* + IB = MIN(NB, K-I+1) + II = M - K + I * -* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - CALL SLARFB( 'Right', 'Transpose', 'Backward', - $ 'Rowwise', - $ II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK, - $ LDWORK, A, LDA, WORK( IB+1 ), LDWORK ) - END IF + CALL SLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA ) * -* Apply H**T to columns 1:n-k+i+ib-1 of current block +* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL SLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), + $ LDA, A( II, N-K+I ), LDA, A, LDA) * -* Set columns n-k+i+ib:n of current block to zero +* Apply H to columns 1:n-k+i+ib-1 of current block * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL SORGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA ) + END DO END IF * WORK( 1 ) = SROUNDUP_LWORK(IWS) diff --git a/SRC/strmmoop.f b/SRC/strmmoop.f new file mode 100644 index 0000000000..26776369b0 --- /dev/null +++ b/SRC/strmmoop.f @@ -0,0 +1,1625 @@ +*> \brief \b 
STRMMOOP computes an out of place triangular times general matrix multiplication +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, +* $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* .. Scalar Arguments .. +* REAL ALPHA, BETA +* INTEGER M, N, LDA, LDB, LDC +* CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG +* .. +* .. Array Arguments .. +* REAL A(LDA,*), B(LDB,*), C(LDC,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> STRMMOOP performs one of the matrix-matrix operations +*> +*> C = \alpha op(A) * op(B) + \beta C +*> or +*> C = \alpha op(B) * op(A) + \beta C +*> +*> where \alpha and \beta are scalars, C is an m-by-n matrix, A is +*> a unit, or non-unit, upper or lower triangular matrix, and op(A) is +*> is one of +*> +*> op(A) = A or op(A) = A**T +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> On entry, SIDE specifies whether op(A) multiplies op(B) from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' C = \alpha op(A) * op(B) + \beta C +*> +*> SIDE = 'R' or 'r' C = \alpha op(B) * op(A) + \beta C +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> On entry, UPLO specifies whether the matrix A is an upper or +*> lower triangular matrix as follows: +*> UPLO = 'U' or 'u' A is upper triangular +*> +*> UPLO = 'L' or 'l' A is lower triangular +*> \Endverbatim +*> +*> \param[in] TRANSA +*> \verbatim +*> TRANSA is CHARACTER*1 +*> On entry, TRANSA specifies the form of op(A) to be used in +*> the matrix multiplication as follows: +*> TRANSA = 'N' or 'n' op(A) = A +*> +*> TRANSA = 'T' or 't' op(A) = A**T +*> +*> TRANSA = 'C' or 'c' op(A) = A**T +*> \endverbatim +*> +*> \param[in] TRANSB +*> \verbatim +*> TRANSB is CHARACTER*1 +*> On entry, TRANSB specifies the form 
of op(B) to be used in +*> the matrix multiplication as follows: +*> TRANSB = 'N' or 'n' op(B) = B +*> +*> TRANSB = 'T' or 't' op(B) = B**T +*> +*> TRANSB = 'C' or 'c' op(B) = B**T +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> On entry, DIAG specifies whether or not A is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' A is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' A is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> On entry, M specifies the number of rows of C. M must be at +*> least zero. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> On entry, N specifies the number of columns of C. N must be +*> at least zero. +*> \endverbatim +*> +*> \param[in] ALPHA +*> \verbatim +*> ALPHA is REAL +*> On entry, ALPHA specifies the scalar alpha. When alpha is +*> zero then A and B are not referenced, and A and B need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension ( LDA, K ) where +*> K is M when SIDE = 'L' and K is N when SIDE='R' +*> Before entry with UPLO = 'U' or 'u', the leading k-by-k +*> upper triangular part of the array A must contain the upper +*> triangular matrix and the strictly lower triangular part of +*> A is not referenced. +*> Before entry with UPLO = 'L' or 'l', the leading k-by-k +*> lower triangular part of the array A must contain the lower +*> triangular matrix and the strictly upper triangular part of +*> A is not referenced. +*> Note that when DIAG = 'U' or 'u', the diagonal elements of +*> A are not referenced either, but are assumed to be unity. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> On entry, LDA specifies the first dimension of A as declared +*> in the calling (sub) program. 
When SIDE = 'L' or 'l' then +*> LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +*> then LDA must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is REAL array, dimension ( LDB, K ), where K is M +*> If SIDE='R' and TRANSA='N', or SIDE='L' and TRANSA='T' and N +*> otherwise. On entry, the leading k-by-k submatrix must contain +*> B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> On entry, LDB specifies the first dimension of B as declared +*> in the calling (sub) program. When SIDE = 'R' and TRANSB='N' +*> then LDB must be at least max( 1, m ), when SIDE = 'R' +*> and TRANSB = 'T' then LDB must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] BETA +*> \verbatim +*> BETA is REAL . +*> On entry, BETA specifies the scalar beta. When beta is +*> zero then C is not referenced on entry, and C need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension ( LDC, N ) +*> Before entry, the leading m-by-n part of the array C must +*> contain the matrix C, and on exit is overwritten by the +*> transformed matrix. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> On entry, LDC specifies the first dimension of C as declared +*> in the calling (sub) program. LDC must be at least +*> max( 1, m ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + RECURSIVE SUBROUTINE STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, + $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER M, N, LDA, LDB, LDC + CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG +* .. +* .. Array Arguments .. + REAL A(LDA,*), B(LDB,*), C(LDC,*) +* .. 
+* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + REAL SDOT + EXTERNAL LSAME, SDOT +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SAXPY, SLASET, SSCAL +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Local Scalars .. + INTEGER I, L, K, INCB + LOGICAL LSIDE, UPPER, UNIT, TRANST, TRANSG +* .. +* .. Local Parameters .. + REAL ONE, ZERO + PARAMETER(ONE=1.0E+0, ZERO=0.0E+0) +* .. +* +* Beginning of Executable Statements +* + LSIDE = LSAME(SIDE, 'L') + UPPER = LSAME(UPLO, 'U') + ! If we are transposing the triangular matrix (A) + TRANST= LSAME(TRANSA, 'T').OR.LSAME(TRANSA, 'C') + ! If we are transposing the general matrix (B) + TRANSG= LSAME(TRANSB, 'T').OR.LSAME(TRANSB, 'C') +* +* Terminating Case +* + UNIT = LSAME(DIAG, 'U') + IF (M.EQ.1.AND.N.EQ.1) THEN +* +* This case is the simplest as we are just computing C = \alpha A*B + +* \beta C where all components are 1-by-1 matrices +* + + IF (BETA.EQ.ZERO) THEN + C(1,1) = ZERO + ELSE + C(1,1) = C(1,1) * BETA + END IF + IF(ALPHA.NE.ZERO) THEN + IF(UNIT) THEN + C(1,1) = C(1,1) + ALPHA*B(1,1) + ELSE + C(1,1) = C(1,1) + ALPHA*A(1,1)*B(1,1) + END IF + END IF + RETURN + ELSE IF (M.EQ.1) THEN +* +* This means that C is a row vector. If BETA is 0, then we +* set it explicitly, otherwise we overwrite it with BETA*C +* + IF (BETA.EQ.ZERO) THEN + ! This ensures we don't reference C unless we need to + CALL SLASET('All', M, N, ZERO, ZERO, C, LDC) + ELSE + CALL SSCAL(N, BETA, C, LDC) + END IF + IF (ALPHA.NE.ZERO) THEN +* +* Recall that the number of columns of A is determined by SIDE +* + IF (LSIDE) THEN +* +* Determine if B is a row or column vector +* + IF (TRANSG) THEN + INCB = 1 ! This means that B is a column vector + ELSE + INCB = LDB ! 
This means that B is a row vector + END IF +* +* This means that A is a scalar, so it is either assumed to be +* ONE or explicitly stored in A(1,1) +* + IF (UNIT) THEN + CALL SAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + CALL SAXPY(N, ALPHA * A(1,1), B, INCB, C, LDC) + END IF + ELSE ! A is on the right +* +* Determine if B is a row or column vector +* + IF (TRANSG) THEN + INCB = 1 ! This means that B is a column vector + ELSE + INCB = LDB ! This means that B is a row vector + END IF +* +* This means that A is an n-by-n matrix +* + IF (UPPER) THEN + IF (TRANST) THEN + IF (TRANSG) THEN + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * SDOT(N-I, + $ B(I+1,1), 1, A(I,I+1), LDA) + + $ C(1,I) + END DO + CALL SAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * SDOT(N-I+1, + $ B(I,1), 1, A(I,I), LDA) + + $ C(1,I) + END DO + END IF + ELSE ! Not transposing B + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * SDOT(N-I, + $ B(1,I+1), LDB, A(I,I+1), LDA) + + $ C(1,I) + END DO + CALL SAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * SDOT(N-I+1, + $ B(1,I), LDB, A(I,I), LDA) + + $ C(1,I) + END DO + END IF + END IF + ELSE ! Not transposing A + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * SDOT(I-1, B, INCB, + $ A(1,I), 1) + C(1,I) + END DO + + CALL SAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * SDOT(I, B, INCB, + $ A(1,I), 1) + C(1,I) + END DO + END IF + END IF + ELSE ! A is lower + IF (TRANST) THEN + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * SDOT(I-1, B, INCB, + $ A(I,1), LDA) + C(1,I) + END DO + + CALL SAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * SDOT(I, B, INCB, + $ A(I,1), LDA) + C(1,I) + END DO + END IF + ELSE ! 
A is not transposed + IF (TRANSG) THEN + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * SDOT(N-I, + $ B(I+1,1), 1, A(I+1,I), 1) + + $ C(1,I) + END DO + CALL SAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * SDOT(N-I+1, + $ B(I,1), 1, A(I,I), 1) + + $ C(1,I) + END DO + END IF + ELSE! B is not transposed + IF (UNIT) THEN + DO I = 1, N + C(1,I) = ALPHA * SDOT(N-I, + $ B(1,I+1), LDB, A(I+1,I), 1) + + $ C(1,I) + END DO + CALL SAXPY(N, ALPHA, B, INCB, C, LDC) + ELSE + DO I = 1, N + C(1,I) = ALPHA * SDOT(N-I+1, + $ B(1,I), LDB, A(I,I), 1) + + $ C(1,I) + END DO + END IF + END IF + END IF + END IF + END IF + END IF + RETURN + ELSE IF (N.EQ.1) THEN +* +* This means that C is a column vector. If BETA is 0, then we +* set it explicitly, otherwise we overwrite it with BETA*C +* + IF (BETA.EQ.ZERO) THEN + ! This ensures we don't reference C unless we need to + CALL SLASET('All', M, N, ZERO, ZERO, C, LDC) + ELSE + CALL SSCAL(M, BETA, C, 1) + END IF + + IF (ALPHA.NE.ZERO) THEN + IF (TRANSG) THEN + INCB = LDB ! B is a row vector + ELSE + INCB = 1 ! B is a column vector + END IF + IF (LSIDE) THEN + IF (UPPER) THEN + IF (TRANST) THEN + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * SDOT(I-1, A(1, I), + $ 1, B, INCB) + C(I,1) + END DO + CALL SAXPY(M, ALPHA, B, INCB, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * SDOT(I, A(1, I), 1, + $ B, INCB) + C(I,1) + END DO + END IF + ELSE ! A is not transposed + IF (TRANSG) THEN + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * SDOT(M-I, + $ A(I,I+1), LDA, B(1, I+1), LDB) + + $ C(I,1) + END DO + + CALL SAXPY(M, ALPHA, B, LDB, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * SDOT(M-I+1, + $ A(I,I), LDA, B(1,I), LDB) + + $ C(I,1) + END DO + END IF + ELSE ! 
B is not transposed + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * SDOT(M-I, + $ A(I,I+1), LDA, B(I+1,1), 1) + + $ C(I,1) + END DO + + CALL SAXPY(M, ALPHA, B, 1, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * SDOT(M-I+1, + $ A(I,I), LDA, B(I,1), 1) + + $ C(I,1) + END DO + END IF + END IF + END IF + ELSE ! A is lower + IF (TRANST) THEN + IF (TRANSG) THEN + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * SDOT(M-I, + $ A(I+1,I), 1, B(1,I+1), LDB) + + $ C(I,1) + END DO + + CALL SAXPY(M, ALPHA, B, LDB, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * SDOT(M-I+1, + $ A(I,I), 1, B(1,I), LDB) + + $ C(I,1) + END DO + END IF + ELSE ! A is not transposed + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * SDOT(M-I, + $ A(I+1,I), 1, B(I+1,1), 1) + + $ C(I,1) + END DO + + CALL SAXPY(M, ALPHA, B, 1, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * SDOT(M-I+1, + $ A(I,I), 1, B(I,1), 1) + + $ C(I,1) + END DO + END IF + END IF + ELSE ! A is not transposed + IF (UNIT) THEN + DO I = 1, M + C(I,1) = ALPHA * SDOT(I-1, A(I,1), + $ LDA, B, INCB) + C(I,1) + END DO + CALL SAXPY(M, ALPHA, B, INCB, C, 1) + ELSE + DO I = 1, M + C(I,1) = ALPHA * SDOT(I, A(I,1), LDA, + $ B, INCB) + C(I,1) + END DO + END IF + END IF + END IF + ELSE ! A is on the right + ! Since the trailing dimension of op(A) must be 1, + ! we know that A must be a scalar + IF (UNIT) THEN + CALL SAXPY(M, ALPHA, B, INCB, C, 1) + ELSE + CALL SAXPY(M, ALPHA*A(1,1), B, INCB, C, 1) + END IF + END IF + END IF + RETURN + END IF +* +* Recursive Case +* + L = MIN(M,N)/2 + IF (LSIDE) THEN +* +* We are multiplying A from the left IE we are computing +* C = \alpha op(A)*op(B) + \beta C +* + IF (UPPER) THEN +* +* A is upper triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing +* +* So we are computing +* C = \alpha A**T * B**T + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11}**T + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{21}**T + \beta C_{12} +* C_{21} = \alpha A_{12}**T * B_{11}**T + \alpha A_{22}**T * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{12}**T * B_{21}**T + \alpha A_{22}**T * B_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{12}**T * B_{11}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22}**T * B_{12}**T + C_{21} (This routine) +* +* C_{22} = \alpha A_{12}**T * B_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22}**T * B_{22}**T + C_{22} (This routine) +* + ! C_{11} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, + $ LDC) + ! C_{12} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ BETA, C(1, L+1), LDC) + ! C_{21} + CALL SGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(1, L+1), LDA, B, LDB, BETA, C(L+1,1), + $ LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1,L+1), LDA, B(1,L+1), + $ LDB, ONE, C(L+1,1), LDC) + ! 
C_{22} + CALL SGEMM(TRANSA, TRANSB, M-L, N-L, L, ALPHA, + $ A(1, L+1), LDA, B(L+1,1), LDB, BETA, + $ C(L+1,L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1,L+1), LDA, + $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A**T * B + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11} + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{12} + \beta C_{12} +* C_{21} = \alpha A_{12}**T * B_{11} + \alpha A_{22}**T * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{12}**T * B_{12} + \alpha A_{22}**T * B_{22} + \beta C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{12}**T * B_{11} + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22}**T * B_{21} + C_{21} (This routine) +* +* C_{22} = \alpha A_{12}**T * B_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22}**T * B_{22} + C_{22} (This routine) +* + ! C_{11} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + ! 
C_{21} + CALL SGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(1, L+1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(L+1, 1), LDC) + ! C_{22} + CALL SGEMM(TRANSA, TRANSB, M-L, N-L, L, + $ ALPHA, A(1, L+1), LDA, B(1, L+1), LDB, BETA, + $ C(L+1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11}**T + \alpha A_{12} * B_{12}**T + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12} +* C_{21} = \alpha A_{22} * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{22} * B_{22}**T + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{12} * B_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11} * B_{11}**T + C_{11} (This routine) +* +* C_{12} = \alpha A_{12} * B_{22}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha 
A_{11} * B_{21}**T + C_{12} (This routine) +* + ! C_{11} + CALL SGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(1, L+1), LDA, B(1, L+1), LDB, BETA, C, LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL SGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA, + $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1,1), LDB, ONE, + $ C(1, L+1), LDC) + ! C_{21} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(L+1, 1), LDC) + ! C_{22} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A * B + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11} + \alpha A_{12} * B_{21} + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{12} + \alpha A_{12} * B_{22} + \beta C_{12} +* C_{21} = \alpha A_{22} * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{22} * B_{22} + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{12} * 
B_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11} * B_{11} + C_{11} (This routine) +* +* C_{12} = \alpha A_{12} * B_{22} + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11} * B_{12} + C_{12} (This routine) +* + ! C_{11} + CALL SGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(1, L+1), LDA, B(L+1, 1), LDB, BETA, C, LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL SGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA, + $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, + $ ONE, C(1, L+1), LDC) + ! C_{21} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(L+1, 1), LDC) + ! C_{22} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + END IF + ELSE +* +* A is lower triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A**T * B**T + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11}**T + \alpha A_{21}**T * B_{12}**T + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{21}**T + \alpha A_{21}**T * B_{22}**T + \beta C_{12} +* C_{21} = \alpha A_{22}**T * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{22}**T * B_{22}**T + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{21}**T * B_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11}**T * B_{11}**T + C_{11} (This routine) +* +* C_{12} = \alpha A_{21}**T * B_{22}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11}**T * B_{21}**T + C_{12} (This routine) +* + ! C_{11} + CALL SGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(1, L+1), LDB, BETA, C, LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL SGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE, + $ C(1, L+1), LDC) + ! 
C_{21} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(L+1, 1), LDC) + ! C_{22} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A**T * B + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11} + \alpha A_{21}**T * B_{21} + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{12} + \alpha A_{21}**T * B_{22} + \beta C_{12} +* C_{21} = \alpha A_{22}**T * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{22}**T * B_{22} + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{21}**T * B_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11}**T * B_{11} + C_{11} (This routine) +* +* C_{12} = \alpha A_{21}**T * B_{22} + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11}**T * B_{12} + C_{12} (This routine) +* + ! C_{11} + CALL SGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, C, LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! 
C_{12} + CALL SGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, ONE, + $ C(1, L+1), LDC) + ! C_{21} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(L+1, 1), LDC) + ! C_{22} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11}**T + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{21}**T + \beta C_{12} +* C_{21} = \alpha A_{21} * B_{11}**T + \alpha A_{22} * B_{12}**T + \beta * C_{21} +* C_{22} = \alpha A_{21} * B_{21}**T + \alpha A_{22} * B_{22}**T + \beta * C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{21} * B_{11}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22} * B_{12}**T + C_{21} (This routine) +* +* C_{22} = \alpha A_{21} * B_{21}**T + 
\beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22} * B_{22}**T + C_{22} (This routine) +* + ! C_{11} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ BETA, C(1, L+1), LDC) + ! C_{21} + CALL SGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, ONE, C(L+1, 1), LDC) + ! C_{22} + CALL SGEMM(TRANSA, TRANSB, M-L, N-L, L, + $ ALPHA, A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A * B + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11} + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{12} + \beta C_{12} +* C_{21} = \alpha A_{21} * B_{11} + \alpha A_{22} * B_{21} + \beta * C_{21} +* C_{22} = \alpha A_{21} * B_{12} + \alpha A_{22} * B_{22} + \beta * C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and 
C_{22} as follows +* +* C_{21} = \alpha A_{21} * B_{11} + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22} * B_{21} + C_{21} (This routine) +* +* C_{22} = \alpha A_{21} * B_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22} * B_{22} + C_{22} (This routine) +* + ! C_{11} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(1, L+1), LDC) + ! C_{21} + CALL SGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(L+1, 1), LDC) + ! C_{22} + CALL SGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, A(L+1, 1), LDA, B(1, L+1), LDB, BETA, + $ C(L+1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + END IF + END IF + ELSE +* +* We are multiplying A from the right IE we are computing +* C = \alpha op(B)*op(A) + \beta C +* + IF (UPPER) THEN +* +* A is upper triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A**T + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11}**T + \alpha B_{21}**T * A_{12}**T + \beta C_{11} +* C_{12} = \alpha B_{21}**T * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11}**T + \alpha B_{22}**T * A_{12}**T + \beta C_{21} +* C_{22} = \alpha B_{22}**T * A_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{21}**T * A_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11}**T * A_{11}**T + C_{11} (This routine) +* +* C_{21} = \alpha B_{22}**T * A_{12}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{12}**T * A_{11}**T + C_{21} (This routine) +* + ! C_{11} + CALL SGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(L+1, 1), LDB, A(1, L+1), LDA, BETA, C, LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(1, L+1), LDC) + ! 
C_{21} + CALL SGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, + $ ONE, C(L+1, 1), LDC) + ! C_{22} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11}**T + \alpha B_{12} * A_{12}**T + \beta C_{11} +* C_{12} = \alpha B_{12} * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11}**T + \alpha B_{22} * A_{12}**T + \beta C_{21} +* C_{22} = \alpha B_{22} * A_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{12} * A_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11} * A_{11}**T + C_{11} (This routine) +* +* C_{21} = \alpha B_{22} * A_{12}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{21} * A_{11}**T + C_{21} (This routine) +* + ! 
C_{11} + CALL SGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(1,L+1), LDB, A(1,L+1), LDA, BETA, C, LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(1, L+1), LDC) + ! C_{21} + CALL SGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ ONE, C(L+1, 1), LDC) + ! C_{22} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11}**T * A_{12} + \alpha B_{21}**T * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11} + \beta C_{21} +* C_{22} = \alpha B_{12}**T * A_{12} + \alpha B_{22}**T * A_{22} + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} 
and C_{22} as follows +* +* C_{12} = \alpha B_{11}**T * A_{12} + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{21}**T * A_{22} + C_{12} (This routine) +* +* C_{22} = \alpha B_{12}**T * A_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22}**T * A_{22} + C_{22} (This routine) +* + ! C_{11} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL SGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(1, L+1), LDC) + ! C_{21} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(L+1, 1), LDC) + ! C_{22} + CALL SGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, B(1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A + \beta C. 
We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11} * A_{12} + \alpha B_{12} * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11} + \beta C_{21} +* C_{22} = \alpha B_{21} * A_{12} + \alpha B_{22} * A_{22} + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11} * A_{12} + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{12} * A_{22} + C_{12} (This routine) +* +* C_{22} = \alpha B_{21} * A_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22} * A_{22} + C_{22} (This routine) +* + ! C_{11} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL SGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, ONE, C(1, L+1), LDC) + ! C_{21} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, 1), LDC) + ! 
C_{22} + CALL SGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, B(L+1, 1), LDB, A(1, L+1), LDA, + $ BETA, C(L+1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + END IF + ELSE +* +* A is lower triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11}**T + \beta C_{11} +* C_{12} = \alpha B_{11}**T * A_{21}**T + \alpha B_{21}**T * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11}**T + \beta C_{21} +* C_{22} = \alpha B_{12}**T * A_{21}**T + \alpha B_{22}**T * A_{22}**T + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11}**T * A_{21}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{21}**T * A_{22}**T + C_{12} (This routine) +* +* C_{22} = \alpha B_{12}**T * A_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22}**T * A_{22}**T + C_{22} (This routine) +* + ! 
C_{11} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL SGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(1, L+1), LDC) + ! C_{21} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(L+1, 1), LDC) + ! C_{22} + CALL SGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA, + $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11} * A_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11}**T + \beta C_{21} +* C_{22} = \alpha B_{21} * A_{21}**T + \alpha A_{22} * B_{22}**T + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11} * 
A_{21}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{12} * A_{22}**T + C_{12} (This routine) +* +* C_{22} = \alpha B_{21} * A_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22} * A_{22}**T + C_{22} (This routine) +* + ! C_{11} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) + ! C_{12} + CALL SGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, ONE, C(1, L+1), LDC) + ! C_{21} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, 1), LDC) + ! C_{22} + CALL SGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA, + $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A + \beta C. 
We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11} + \alpha B_{21}**T * A_{21} + \beta C_{11} +* C_{12} = \alpha B_{21}**T * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11} + \alpha B_{22}**T * A_{21} + \beta C_{21} +* C_{22} = \alpha B_{22}**T * A_{22} + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{21}**T * A_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11}**T * A_{11} + C_{11}(This routine) +* +* C_{21} = \alpha B_{22}**T * A_{21} + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{12}**T * A_{11} + C_{21} (This routine) +* + ! C_{11} + CALL SGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, C, LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(1, L+1), LDC) + ! C_{21} + CALL SGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, ONE, + $ C(L+1, 1), LDC) + ! 
C_{22} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \alpha B_{12} * A_{21} + \beta C_{11} +* C_{12} = \alpha B_{12} * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11} + \alpha B_{22} * A_{21} + \beta C_{21} +* C_{22} = \alpha B_{22} * A_{22} + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{12} * A_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11} * A_{11} + C_{11}(This routine) +* +* C_{21} = \alpha B_{22} * A_{21} + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{21} * A_{11} + C_{21} (This routine) +* + ! C_{11} + CALL SGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, C, LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) + ! C_{12} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(1, L+1), LDC) + ! 
C_{21} + CALL SGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE, + $ C(L+1, 1), LDC) + ! C_{22} + CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + END IF + END IF + END IF + END SUBROUTINE diff --git a/SRC/strtrm.f b/SRC/strtrm.f new file mode 100644 index 0000000000..ba8f4a0dba --- /dev/null +++ b/SRC/strtrm.f @@ -0,0 +1,577 @@ +*> \brief \b STRTRM computes an in place triangular-triangular matrix product +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE STRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, +* $ N, ALPHA, T, LDT, V, LDV) +* +* .. Scalar Arguments .. +* INTEGER N, LDT, LDV +* CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV +* REAL ALPHA +* .. +* .. Array Arguments .. +* REAL T(LDT,*), V(LDV,*) +* .. 
+* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> STRTRM performs one of the matrix-matrix operations +*> +*> T = \alpha op(V) * T +*> or +*> T = \alpha T * op(V) +*> where \alpha is a scalar, T and V are unit, or non-unit, upper or +*> lower triangular matrix, and op(V) is one of +*> +*> op(V) = V or op(V) = V**T +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> On entry, SIDE specifies whether op(V) multiplies T from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' T = \alpha op(V) * T +*> +*> SIDE = 'R' or 'r' T = \alpha T * op(V) +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> On entry, UPLO specifies whether the matrix T is an upper or +*> lower triangular matrix as follows: +*> UPLO = 'U' or 'u' T is upper triangular +*> +*> UPLO = 'L' or 'l' T is lower triangular +*> \endverbatim +*> +*> \param[in] TRANSV +*> \verbatim +*> TRANSV is CHARACTER*1 +*> On entry, TRANSV specifies the form of op(V) to be used in +*> the matrix multiplication as follows: +*> TRANSV = 'N' or 'n' op(V) = V +*> +*> TRANSV = 'T' or 't' op(V) = V**T +*> +*> TRANSV = 'C' or 'c' op(V) = V**T +*> \endverbatim +*> +*> \param[in] DIAGT +*> \verbatim +*> DIAGT is CHARACTER*1 +*> On entry, DIAGT specifies whether or not T is unit triangular +*> as follows: +*> +*> DIAGT = 'U' or 'u' T is assumed to be unit triangular. +*> +*> DIAGT = 'N' or 'n' T is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] DIAGV +*> \verbatim +*> DIAGV is CHARACTER*1 +*> On entry, DIAGV specifies whether or not V is unit triangular +*> as follows: +*> +*> DIAGV = 'U' or 'u' V is assumed to be unit triangular. +*> +*> DIAGV = 'N' or 'n' V is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> On entry, N specifies the number of rows and columns of T. +*> N must be at least zero.
+*> \endverbatim +*> +*> \param[in] ALPHA +*> \verbatim +*> ALPHA is REAL . +*> On entry, ALPHA specifies the scalar alpha. When alpha is +*> zero then T and V are not referenced, and T and V need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in,out] T +*> \verbatim +*> T is REAL array, dimension ( LDT, N ) +*> Before entry with UPLO = 'U' or 'u', the leading k-by-k +*> upper triangular part of the array T must contain the upper +*> triangular matrix and the strictly lower triangular part of +*> T is not referenced. +*> Before entry with UPLO = 'L' or 'l', the leading k-by-k +*> lower triangular part of the array T must contain the lower +*> triangular matrix and the strictly upper triangular part of +*> T is not referenced. +*> Note that when DIAGT = 'U' or 'u', the diagonal elements of +*> T are not referenced either, but are assumed to be unity. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> On entry, LDT specifies the first dimension of T as declared +*> in the calling (sub) program. LDT must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is REAL array, dimension ( LDV, N ) +*> Before entry with UPLO = 'U' or 'u', the leading k-by-k +*> upper triangular part of the array op(V) must contain the upper +*> triangular matrix and the strictly lower triangular part of +*> V is not referenced. +*> Before entry with UPLO = 'L' or 'l', the leading k-by-k +*> lower triangular part of the array op(V) must contain the lower +*> triangular matrix and the strictly upper triangular part of +*> V is not referenced. +*> Note that when DIAGV = 'U' or 'u', the diagonal elements of +*> V are not referenced either, but are assumed to be unity. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> On entry, LDV specifies the first dimension of V as declared +*> in the calling (sub) program. LDV must be at least max( 1, n ).
+*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + RECURSIVE SUBROUTINE STRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, + $ N, ALPHA, T, LDT, V, LDV) +* +* .. Scalar Arguments .. + INTEGER N, LDT, LDV + CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV + REAL ALPHA +* .. +* .. Array Arguments .. + REAL T(LDT,*), V(LDV,*) +* .. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL STRMM, STRMMOOP, SLASET +* .. +* .. Local Scalars .. + INTEGER K, INFO + LOGICAL TLEFT, TUPPER, VTRANS, VUNIT, TUNIT +* .. +* .. Local Parameters .. + REAL ONE, ZERO + PARAMETER(ONE=1.0E+0, ZERO=0.0E+0) +* .. +* +* Beginning of Executable Statements +* +* +* Early Termination Criteria +* + IF (ALPHA.EQ.ZERO) THEN +* +* If ALPHA is 0, then we are just setting T to be the 0 matrix +* + CALL SLASET(UPLO, N, N, ZERO, ZERO, T, LDT) + RETURN + END IF + TUNIT = LSAME(DIAGT, 'U') + VUNIT = LSAME(DIAGV, 'U') +* +* Terminating Case +* + IF (N.EQ.1) THEN + IF (VUNIT.AND.TUNIT) THEN + T(1,1) = ALPHA + ELSE IF (VUNIT) THEN + T(1,1) = ALPHA*T(1,1) + ELSE IF (TUNIT) THEN + T(1,1) = ALPHA*V(1,1) + ELSE + T(1,1) = ALPHA*T(1,1)*V(1,1) + END IF + RETURN + ELSE IF(N.LE.0) THEN + RETURN + END IF +* +* Recursive case +* + TUPPER = LSAME(UPLO,'U') + TLEFT = LSAME(SIDE,'R') + VTRANS = LSAME(TRANSV,'T').OR.LSAME(TRANSV,'C') + + K = N / 2 + IF(TUPPER) THEN +* +* T is upper triangular +* + IF(TLEFT) THEN +* +* Compute T = T*op(V) +* + IF(VTRANS) THEN +* +* We are computing T = T*V**T, which we break down as follows +* |--------------| |--------------| |--------------------| +* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11}**T V_{21}**T| +* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}**T| +* |--------------| 
|--------------| |--------------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11}**T +* T_{12} = \alpha T_{11}*V_{21}**T + \alpha T_{12}*V_{22}**T +* T_{22} = \alpha T_{22}*V_{22}**T +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha T_{12}*V_{22}**T (STRMM) +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} (STRMMOOP) +* +* T_{12} = \alpha T_{12}*V_{22}**T +* + CALL STRMM('Right', 'Lower', TRANSV, DIAGV, K, + $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} +* + CALL STRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT, + $ V(K+1, 1), LDV, ONE, T(1, K+1), LDT) + ELSE +* +* We are computing T = T*V, which we break down as follows +* |--------------| |--------------| |-------------| +* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11} V_{12}| +* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}| +* |--------------| |--------------| |-------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11} +* T_{12} = \alpha T_{11}*V_{12} + \alpha T_{12}*V_{22} +* T_{22} = \alpha T_{22}*V_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha T_{12}*V_{22} (STRMM) +* T_{12} = \alpha T_{11}*V_{12} + T_{12} (STRMMOOP) +* +* T_{12} = \alpha T_{12}*V_{22} +* + CALL STRMM('Right', 'Upper', TRANSV, DIAGV, K, + $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} +* + CALL STRMMOOP('Left', UPLO, 
'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT, + $ V(1, K+1), LDV, ONE, T(1, K+1), LDT) + END IF + ELSE +* +* Compute T = op(V)*T +* + IF(VTRANS) THEN +* +* We are computing T = V**T*T, which we break down as follows +* |--------------| |--------------------| |--------------| +* |T_{11} T_{12}| |V_{11}**T V_{21}**T| |T_{11} T_{12}| +* |0 T_{22}| = \alpha |0 V_{22}**T| * |0 T_{22}| +* |--------------| |--------------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}**T*T_{11} +* T_{12} = \alpha V_{11}**T*T_{12} + \alpha V_{21}**T*T_{22} +* T_{22} = \alpha V_{22}**T*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha V_{11}**T*T_{12} (STRMM) +* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} (STRMMOOP) +* +* T_{12} = \alpha V_{11}**T*T_{12} +* + CALL STRMM('Left', 'Lower', TRANSV, DIAGV, K, + $ N-K, ALPHA, V, LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} +* + CALL STRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1), + $ LDT, V(K+1, 1), LDV, ONE, T(1, K+1), LDT) + ELSE +* +* We are computing T = V*T, which we break down as follows +* |--------------| |--------------| |--------------| +* |T_{11} T_{12}| |V_{11} V_{12}| |T_{11} T_{12}| +* |0 T_{22}| = \alpha |0 V_{22}| * |0 T_{22}| +* |--------------| |--------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}*T_{11} +* T_{12} = \alpha V_{11}*T_{12} + \alpha V_{12}*T_{22} +* T_{22} = \alpha V_{22}*T_{22} +* +* Computing T_{11} and 
T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha V_{11}*T_{12} (STRMM) +* T_{12} = \alpha V_{12}*T_{22} + T_{12} (STRMMOOP) +* +* T_{12} = \alpha V_{11}*T_{12} +* + CALL STRMM('Left', 'Upper', TRANSV, DIAGV, K, + $ N-K, ALPHA, V, LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha V_{12}*T_{22} + T_{12} (STRMMOOP) +* + CALL STRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1), + $ LDT, V(1, K+1), LDV, ONE, T(1, K+1), LDT) + END IF + END IF + ELSE +* +* T is lower triangular +* + IF(TLEFT) THEN +* +* Compute T = T*op(V) +* + IF(VTRANS) THEN +* +* We are computing T = T*V**T, which we break down as follows +* |--------------| |--------------| |--------------------| +* |T_{11} 0 | |T_{11} 0 | |V_{11}**T 0 | +* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{12}**T V_{22}**T| +* |--------------| |--------------| |--------------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11}**T +* T_{21} = \alpha T_{21}*V_{11}**T + \alpha T_{22}*V_{12}**T +* T_{22} = \alpha T_{22}*V_{22}**T +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha T_{21}*V_{11}**T (STRMM) +* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} (STRMMOOP) +* +* T_{21} = \alpha T_{21}*V_{11}**T +* + CALL STRMM('Right', 'Upper', TRANSV, DIAGV, N-K, + $ K, ALPHA, V, LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} +* + CALL STRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1), + $ LDT, V(1, K+1), LDV, ONE, T(K+1, 1), LDT) + ELSE +* +* We are computing T = T*V, which we break down as follows +* |--------------| |--------------| |-------------| +* |T_{11} 0 | |T_{11} 0 | 
|V_{11} 0 | +* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{21} V_{22}| +* |--------------| |--------------| |-------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11} +* T_{21} = \alpha T_{21}*V_{11} + \alpha T_{22}*V_{21} +* T_{22} = \alpha T_{22}*V_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha T_{21}*V_{11} (STRMM) +* T_{21} = \alpha T_{22}*V_{21} + T_{21} (STRMMOOP) +* +* T_{21} = \alpha T_{21}*V_{11} +* + CALL STRMM('Right', 'Lower', TRANSV, DIAGV, N-K, + $ K, ALPHA, V, LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha T_{22}*V_{21} + T_{21} +* + CALL STRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1), + $ LDT, V(K+1, 1), LDV, ONE, T(K+1, 1), LDT) + END IF + ELSE +* +* Compute T = op(V)*T +* + IF(VTRANS) THEN +* +* We are computing T = V**T*T, which we break down as follows +* |--------------| |--------------------| |--------------| +* |T_{11} 0 | |V_{11}**T 0 | |T_{11} 0 | +* |T_{21} T_{22}| = \alpha |V_{12}**T V_{22}**T| * |T_{21} T_{22}| +* |--------------| |--------------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}**T*T_{11} +* T_{21} = \alpha V_{12}**T*T_{11} + \alpha V_{22}**T*T_{21} +* T_{22} = \alpha V_{22}**T*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha V_{22}**T*T_{21} (STRMM) +* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} (STRMMOOP) +* +* T_{21} = \alpha V_{22}**T*T_{21} +* + CALL
STRMM('Left', 'Upper', TRANSV, DIAGV, N-K, K, + $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} +* + CALL STRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT, + $ V(1, K+1), LDV, ONE, T(K+1, 1), LDT) + ELSE +* +* We are computing T = V*T, which we break down as follows +* |--------------| |-------------| |--------------| +* |T_{11} 0 | |V_{11} 0 | |T_{11} 0 | +* |T_{21} T_{22}| = \alpha |V_{21} V_{22}| * |T_{21} T_{22}| +* |--------------| |-------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}*T_{11} +* T_{21} = \alpha V_{21}*T_{11} + \alpha V_{22}*T_{21} +* T_{22} = \alpha V_{22}*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha V_{22}*T_{21} (STRMM) +* T_{21} = \alpha V_{21}*T_{11} + T_{21} (STRMMOOP) +* +* T_{21} = \alpha V_{22}*T_{21} +* + CALL STRMM('Left', 'Lower', TRANSV, DIAGV, N-K, K, + $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha V_{21}*T_{11} + T_{21} +* + CALL STRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT, + $ V(K+1, 1), LDV, ONE, T(K+1, 1), LDT) + END IF + END IF + END IF +* +* Since in all the above cases, we compute T_{11} and T_{22} +* the same, we pass in our flags and call this routine recursively +* +* Compute T_{11} recursively +* + CALL STRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, K, ALPHA, + $ T, LDT, V, LDV) +* +* Compute T_{22} recursively +* + CALL STRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, N-K, ALPHA, + $ T(K+1, K+1), LDT, V(K+1, K+1), LDV) + + END SUBROUTINE diff --git a/SRC/zacxpy.f b/SRC/zacxpy.f new file mode 100644 index 0000000000..14d07f77d2 --- /dev/null +++ b/SRC/zacxpy.f @@ -0,0 +1,133 @@
+*> \brief \b ZACXPY +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZACXPY(N,ZA,ZX,INCX,ZY,INCY) +* +* .. Scalar Arguments .. +* COMPLEX*16 ZA +* INTEGER INCX,INCY,N +* .. +* .. Array Arguments .. +* COMPLEX*16 ZX(*),ZY(*) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZACXPY constant times a conjugated vector plus a vector. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> number of elements in input vector(s) +*> \endverbatim +*> +*> \param[in] ZA +*> \verbatim +*> ZA is COMPLEX*16 +*> On entry, ZA specifies the scalar alpha. +*> \endverbatim +*> +*> \param[in] ZX +*> \verbatim +*> ZX is COMPLEX*16 array, dimension ( 1 + ( N - 1 )*abs( INCX ) ) +*> \endverbatim +*> +*> \param[in] INCX +*> \verbatim +*> INCX is INTEGER +*> storage spacing between elements of ZX +*> \endverbatim +*> +*> \param[in,out] ZY +*> \verbatim +*> ZY is COMPLEX*16 array, dimension ( 1 + ( N - 1 )*abs( INCY ) ) +*> \endverbatim +*> +*> \param[in] INCY +*> \verbatim +*> INCY is INTEGER +*> storage spacing between elements of ZY +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup axpy +*> +* ===================================================================== + SUBROUTINE ZACXPY(N,ZA,ZX,INCX,ZY,INCY) +* +* -- Reference BLAS level1 routine -- +* -- Reference BLAS is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + COMPLEX*16 ZA + INTEGER INCX,INCY,N +* .. +* .. Array Arguments .. + COMPLEX*16 ZX(*),ZY(*) +* .. +* +* ===================================================================== +* +* .. External Functions .. 
+ DOUBLE PRECISION DCABS1 + EXTERNAL DCABS1 +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG +* .. Local Scalars .. + INTEGER I,IX,IY +* .. + IF (N.LE.0) RETURN + IF (DCABS1(ZA).EQ.0.0d0) RETURN + IF (INCX.EQ.1 .AND. INCY.EQ.1) THEN +* +* code for both increments equal to 1 +* + DO I = 1,N + ZY(I) = ZY(I) + ZA*CONJG(ZX(I)) + END DO + ELSE +* +* code for unequal increments or equal increments +* not equal to 1 +* + IX = 1 + IY = 1 + IF (INCX.LT.0) IX = (-N+1)*INCX + 1 + IF (INCY.LT.0) IY = (-N+1)*INCY + 1 + DO I = 1,N + ZY(IY) = ZY(IY) + ZA*CONJG(ZX(IX)) + IX = IX + INCX + IY = IY + INCY + END DO + END IF +* + RETURN +* +* End of ZACXPY +* + END + diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f new file mode 100644 index 0000000000..a73b693901 --- /dev/null +++ b/SRC/zlarfb0c2.f @@ -0,0 +1,564 @@ +*> \brief \b ZLARFB0C2 applies a block reflector or its conjugate-transpose +* to a rectangular matrix with a 0 block while constructing the explicit Q +* factor +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* +* Definition: +* =========== +* +* SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, +* $ K, V, LDV, T, LDT, C, LDC) +* ! Scalar arguments +* INTEGER M, N, K, LDV, LDC, LDT +* CHARACTER SIDE, TRANS, DIRECT, STOREV +* ! True means that we are assuming C2 is the identity matrix +* ! and thus don't reference whatever is present in C2 +* ! at the beginning. +* LOGICAL C2I +* ! 
Array arguments +* COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLARFB0C2 applies a real block reflector H or its transpose H**H to a +*> complex m by n matrix C with a 0 block, while computing the explicit Q factor +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] C2I +*> \verbatim +*> C2I is LOGICAL +*> = .TRUE.: Assume the nonzero block of C is the identity matrix +*> = .FALSE.: Use existing data in the nonzero block of C +*> \endverbatim +*> +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> = 'L': apply H or H**H from the Left +*> = 'R': apply H or H**H from the Right +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': apply H (No transpose) +*> = 'C': apply H**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Indicates how H is formed from a product of elementary +*> reflectors +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Indicates how the vectors which define the elementary +*> reflectors are stored: +*> = 'C': Columnwise +*> = 'R': Rowwise +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix C. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix C. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the matrix T (= the number of elementary +*> reflectors whose product defines the block reflector). +*> If SIDE = 'L', M >= K >= 0; +*> if SIDE = 'R', N >= K >= 0. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX*16 array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,M) if STOREV = 'R' and SIDE = 'L' +*> (LDV,N) if STOREV = 'R' and SIDE = 'R' +*> See Further Details. 
+*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); +*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); +*> if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,K) +*> The triangular K-by-K matrix T in the representation of the +*> block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension (LDC,N) +*> On entry, the M-by-N matrix C. +*> On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larfb +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The triangular part of V (including its diagonal) is not +*> referenced. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N, + $ K, V, LDV, T, LDT, C, LDC) + ! 
Scalar arguments + INTEGER M, N, K, LDV, LDC, LDT + CHARACTER SIDE, TRANS, DIRECT, STOREV + ! True means that we are assuming C2 is the identity matrix + ! and thus don't reference whatever is present in C2 + ! at the beginning. + LOGICAL C2I + ! Array arguments + COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*) + ! Local scalars + LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER, + $ TRANST + INTEGER I, J + ! Intrinsic Functions + INTRINSIC CONJG + ! External functions + LOGICAL LSAME + EXTERNAL LSAME + ! External Subroutines + EXTERNAL ZGEMM, ZTRMM + ! Parameters + COMPLEX*16 ONE, ZERO, NEG_ONE + PARAMETER(ONE=(1.0D+0, 0.0D+0), + $ ZERO = (0.0D+0, 0.0D+0), + $ NEG_ONE = (-1.0D+0, 0.0D+0)) + + ! Beginning of executable statements + ! Convert our character flags to logical values + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') + SIDEL = LSAME(SIDE,'L') + SIDER = LSAME(SIDE,'R') + TRANST = LSAME(TRANS,'C') + + ! Determine which of the 4 modes are using. + ! QR is when we store the reflectors column by column and have the + ! 'first' reflector stored in the first column + QR = DIRF.AND.COLV + + ! LQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the first row + LQ = DIRF.AND.(.NOT.COLV) + + ! QL is when we store the reflectors column by column and have the + ! 'first' reflector stored in the last column + QL = (.NOT.DIRF).AND.COLV + + ! RQ is when we store the reflectors row by row and have the + ! 'first' reflector stored in the last row + ! RQ = (.NOT.DIRF).AND.(.NOT.COLV) + ! Since we have exactly one of these 4 modes, we don't need to actually + ! store the value of RQ, instead we assume this is the case if we fail + ! the above 3 checks. + + IF (QR) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V1 ] and C = [ C1 ] + ! [ V2 ] [ C2 ] + ! with the following dimensions: + ! V1\in\C^{K\times K} + ! V2\in\C^{M-K\times K} + ! C1=0\in\C^{K\times N} + ! C2\in\C^{M-K\times N} + ! 
Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ] + ! [ V2 ] [ C2 ] + ! = [ 0 ] - [ V1 ]T*V2'*C2 + ! [ C2 ] [ V2 ] + ! + ! = [ 0 ] - [ V1*T*V2'*C2 ] + ! [ C2 ] [ V2*T*V2'*C2 ] + ! + ! = [ V1*T*V2'*C2 ] + ! [ C2 - V2*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('ZLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('ZLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF (C2I) THEN + DO J = 1, N + DO I = 1, K + C(I,J) = CONJG(V(K+J,I)) + END DO + END DO + ELSE + CALL ZGEMM('Conjugate', 'No Transpose', K, N, M - K, + $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO, + $ C, LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C, LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF (C2I) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO, + $ C(K+1,1), LDC) + DO I = 1, N + C(K+I,I) = C(K+I,I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE, + $ C(K+1,1), LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V, LDV, C, LDC) + ELSE IF (LQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V1 V2 ] and C = [ C1 C2 ] + ! with the following dimensions: + ! V1\in\C^{K\times K} + ! V2\in\C^{K\times N-K} + ! C1=0\in\C^{M\times K} + ! C2\in\C^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! 
overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ]) + ! [ V2' ] + ! + ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ] + ! [ V2' ] + ! + ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ] + ! + ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('ZLARFB0C2', 2) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,J) = CONJG(V(J,K+I)) + END DO + END DO + ELSE + CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C, + $ LDC) + END IF + ! + ! C1 = C1*T' + ! + IF( TRANST ) THEN + CALL ZTRMM('Right', 'Upper', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C, LDC) + ELSE + CALL ZTRMM('Right', 'Lower', 'No Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C, LDC) + END IF + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1), + $ LDC) + DO I = 1, M + C(I,K+I) = C(I,K+I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1), + $ LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V, LDV, C, LDC) + ELSE IF (QL) THEN + ! We are computing C = HC = (I - VTV')C + ! Where: V = [ V2 ] and C = [ C2 ] + ! [ V1 ] [ C1 ] + ! with the following dimensions: + ! V1\in\C^{K\times K} + ! V2\in\C^{M-K\times K} + ! C1=0\in\C^{K\times N} + ! C2\in\C^{M-K\times N} + ! Since we are assuming that C1 is a zero matrix and it will be + ! 
overwritten on exit, we can use this spot as a temporary workspace + ! without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ] + ! [ V1 ] [ 0 ] + ! + ! = [ C2 ] - [ V2 ]T*V2'*C2 + ! [ 0 ] [ V1 ] + ! + ! = [ C2 ] - [ V2*T*V2'*C2 ] + ! [ 0 ] [ V1*T*V2'*C2 ] + ! + ! = [ C2 - V2*T*V2'*C2 ] + ! [ - V1*T*V2'*C2 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = V2'*C2 + ! C1 = T*C1 + ! C2 = C2 - V2*C1 + ! C1 = -V1*C1 + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDEL ) THEN + CALL XERBLA('ZLARFB0C2', 2) + RETURN + ELSE IF(TRANST) THEN + CALL XERBLA('ZLARFB0C2', 3) + RETURN + END IF + ! + ! C1 = V2'*C2 + ! + IF( C2I ) THEN + DO J = 1, N + DO I = 1, K + C(M-K+I,J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL ZGEMM('Conjugate', 'No Transpose', K, N, M-K, + $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC) + END IF + ! + ! C1 = T*C1 + ! + CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K, N, ONE, T, LDT, C(M-K+1,1), LDC) + ! + ! C2 = C2 - V2*C1 = -V2*C1 + C2 + ! + IF( C2I ) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC) + DO I = 1, N + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K, + $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC) + END IF + ! + ! C1 = -V1*C1 + ! + CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Unit', + $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC) + ELSE ! IF (RQ) THEN + ! We are computing C = CH' = C(I-V'T'V) + ! Where: V = [ V2 V1] and C = [ C2 C1 ] + ! with the following dimensions: + ! V1\in\C^{K\times K} + ! V2\in\C^{K\times N-K} + ! C1=0\in\C^{M\times K} + ! C2\in\C^{M\times N-K} + ! Since we are assuming that C1 is a zero matrix and it will be + ! overwritten on exit, we can use this spot as a temporary workspace + ! 
without having to allocate anything extra. + ! This lets us simplify our above equation to get + ! + ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ] + ! [ V1' ] + ! + ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ] + ! + ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ] + ! + ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ] + ! + ! So, we can order our computations as follows: + ! + ! C1 = C2*V2' + ! C1 = C1*T' + ! C2 = C2 - C1*V2 + ! C1 = -C1*V1 + ! + ! + ! To achieve the same end result + ! + ! Check to ensure side and trans are the expected values + ! + IF( .NOT.SIDER ) THEN + CALL XERBLA('ZLARFB0C2', 2) + RETURN + END IF + ! + ! C1 = C2*V2' + ! + IF( C2I ) THEN + DO J = 1, K + DO I = 1, M + C(I,N-K+J) = CONJG(V(J,I)) + END DO + END DO + ELSE + CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K, + $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC) + END IF + ! + ! C1 = C1*T' + ! + IF( TRANST ) THEN + CALL ZTRMM('Right', 'Lower', 'Conjugate', 'Non-unit', + $ M, K, ONE, T, LDT, C(1, N-K+1), LDC) + ELSE + CALL ZTRMM('Right', 'Upper', 'No Transpose', + $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC) + END IF + ! + ! C2 = C2 - C1*V2 = -C1*V2 + C2 + ! + IF( C2I ) THEN + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC) + DO I = 1, M + C(I,I) = C(I,I) + ONE + END DO + ELSE + CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K, + $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC) + END IF + ! + ! C1 = -C1*V1 + ! + CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Unit', + $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC) + END IF + END SUBROUTINE diff --git a/SRC/zlarft.f b/SRC/zlarft.f index 10e90d3346..47ed80fa60 100644 --- a/SRC/zlarft.f +++ b/SRC/zlarft.f @@ -6,11 +6,11 @@ * http://www.netlib.org/lapack/explore-html/ * *> Download ZLARFT + dependencies -*> +*> *> [TGZ] -*> +*> *> [ZIP] -*> +*> *> [TXT] * * Definition: @@ -23,7 +23,7 @@ * INTEGER K, LDT, LDV, N * .. * .. 
* Array Arguments .. -* COMPLEX*16 T( LDT, * ), TAU( * ), V( LDV, * ) +* COMPLEX*16 T( LDT, * ), TAU( * ), V( LDV, * ) * .. * * @@ -32,7 +32,7 @@ *> *> \verbatim *> -*> ZLARFT forms the triangular factor T of a complex block reflector H +*> ZLARFT forms the triangular factor T of a complex block reflector H *> of order n, which is defined as a product of k elementary reflectors. *> *> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; @@ -42,12 +42,12 @@ *> If STOREV = 'C', the vector which defines the elementary reflector *> H(i) is stored in the i-th column of the array V, and *> -*> H = I - V * T * V**H +*> H = I - V * T * V**H *> *> If STOREV = 'R', the vector which defines the elementary reflector *> H(i) is stored in the i-th row of the array V, and *> -*> H = I - V**H * T * V +*> H = I - V**H * T * V *> \endverbatim * * Arguments: @@ -166,23 +166,25 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * * .. Scalar Arguments * - CHARACTER DIRECT, STOREV - INTEGER K, LDT, LDV, N + CHARACTER DIRECT, STOREV + INTEGER K, LDT, LDV, N * .. * .. Array Arguments .. * - COMPLEX*16 T( LDT, * ), TAU( * ), V( LDV, * ) + COMPLEX*16 T( LDT, * ), TAU( * ), V( LDV, * ) * .. * * .. Parameters .. * - COMPLEX*16 ONE, NEG_ONE, ZERO - PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE=-1.0D+0) + COMPLEX*16 ONE, NEG_ONE, ZERO + PARAMETER(ONE=(1.0D+0,0.0D+0), + $ ZERO = (0.0D+0,0.0D+0), + $ NEG_ONE=(-1.0D+0,0.0D+0)) * * .. Local Scalars .. * INTEGER I,J,L - LOGICAL QR,LQ,QL,DIRF,COLV + LOGICAL QR,LQ,QL,RQ,LQT,RQT,DIRF,COLV,TDIRF,TCOLV * * .. External Subroutines .. * @@ -196,7 +198,7 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * .. Intrinsic Functions..
* INTRINSIC CONJG -* +* * The general scheme used is inspired by the approach inside DGEQRT3 * which was (at the time of writing this code): * Based on the algorithm of Elmroth and Gustavson, @@ -210,13 +212,6 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, RETURN END IF * -* Base case -* - IF(N.EQ.1.OR.K.EQ.1) THEN - T(1,1) = TAU(1) - RETURN - END IF -* * Beginning of executable statements * L = K / 2 @@ -227,26 +222,48 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * 'C' for STOREV, then they meant to provide 'R' * DIRF = LSAME(DIRECT,'F') + TDIRF = LSAME(DIRECT,'T') COLV = LSAME(STOREV,'C') + TCOLV = LSAME(STOREV,'T') * * QR happens when we have forward direction in column storage * QR = DIRF.AND.COLV * -* LQ happens when we have forward direction in row storage +* LQT happens when we have forward direction in row storage and want to compute the transpose of +* the T we would normally compute +* + LQT = DIRF.AND.TCOLV +* +* LQ happens when we have forward direction in row storage and want to compute the T we would +* normally compute * - LQ = DIRF.AND.(.NOT.COLV) + LQ = DIRF.AND.(.NOT.LQT) * * QL happens when we have backward direction in column storage * QL = (.NOT.DIRF).AND.COLV * -* The last case is RQ. 
Due to how we structured this, if the -* above 3 are false, then RQ must be true, so we never store -* this -* RQ happens when we have backward direction in row storage -* RQ = (.NOT.DIRF).AND.(.NOT.COLV) +* RQT happens when we have backward direction in row storage and want to compute the transpose +* of the T we would normally compute +* + RQT = TDIRF.AND.(.NOT.COLV) +* +* RQ happens when we have backward direction in row storage and want to compute the T that we +* would normally compute +* + RQ = (.NOT.RQT).AND.(.NOT.COLV) * +* Base case +* + IF(N.EQ.1.OR.K.EQ.1) THEN + IF( RQT.OR.LQT ) THEN + T(1,1) = CONJG(TAU(1)) + ELSE + T(1,1) = TAU(1) + END IF + RETURN + END IF IF(QR) THEN * * Break V apart into 6 components @@ -260,17 +277,17 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{1,1}\in\C^{l,l} unit lower triangular * V_{2,1}\in\C^{k-l,l} rectangular * V_{3,1}\in\C^{n-k,l} rectangular -* +* * V_{2,2}\in\C^{k-l,k-l} unit lower triangular * V_{3,2}\in\C^{n-k,k-l} rectangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} T_{1,2}| * |0 T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 * @@ -281,17 +298,17 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* +* * (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2') * = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2' -* +* * Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |-------| * |V_1 V_2| * |-------| -* +* * So, our product is equivalent to the matrix product * I - V*T*V' * This means, we can compute T_{1,1} and T_{2,2}, then use this information @@ -303,30 +320,29 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{2,2} recursively * - CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, - $ TAU(L+1), T(L+1, L+1), LDT) + CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) * -* Compute T_{1,2} +* Compute T_{1,2} * T_{1,2} = V_{2,1}' * DO J = 1, L DO I = 1, K-L - T(J, L+I) = CONJG(V(L+I, J)) + T(J,L+I) = CONJG(V(L+I,J)) END DO END DO * * T_{1,2} = T_{1,2}*V_{2,2} * CALL ZTRMM('Right', 'Lower', 'No transpose', 'Unit', L, - $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) * * T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2} * Note: We assume K <= N, and GEMM will do nothing if N=K * - CALL ZGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE, - $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE, - $ T(1, L+1), LDT) + CALL ZGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE, + $ V(K+1, 1), LDV, V(K+1,L+1), LDV, ONE, T(1, L+1), LDT) * * At this point, we have that T_{1,2} = V_1'*V_2 * All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} @@ -335,12 +351,12 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{1,2} = -T_{1,1}*T_{1,2} * CALL ZTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) + $ K-L, NEG_ONE, T, LDT, T(1, 
L+1), LDT) * * T_{1,2} = T_{1,2}*T_{2,2} * - CALL ZTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + CALL ZTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT) ELSE IF(LQ) THEN * @@ -354,41 +370,41 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{1,1}\in\C^{l,l} unit upper triangular * V_{1,2}\in\C^{l,k-l} rectangular * V_{1,3}\in\C^{l,n-k} rectangular -* +* * V_{2,2}\in\C^{k-l,k-l} unit upper triangular * V_{2,3}\in\C^{k-l,n-k} rectangular * * Where l = floor(k/2) * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} T_{1,2}| * |0 T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 * -* T_{1,1}\in\C^{l, l} upper triangular -* T_{2,2}\in\C^{k-l, k-l} upper triangular -* T_{1,2}\in\C^{l, k-l} rectangular +* T_{1,1}\in\C^{l, l} upper triangular +* T_{2,2}\in\C^{k-l, k-l} upper triangular +* T_{1,2}\in\C^{l, k-l} rectangular * * Then, consider the product: -* -* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) -* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 -* +* +* (I - V_1'*T_{1,1}'*V_1)*(I - V_2'*T_{2,2}'*V_2) +* = I - V_1'*T_{1,1}'*V_1 - V_2'*T_{2,2}'*V_2 + V_1'*T_{1,1}'*V_1*V_2'*T_{2,2}'*V_2 +* * Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |---| * |V_1| * |V_2| * |---| -* +* * So, our product is equivalent to the matrix product -* I - V'*T*V +* I - V'*T'*V * This means, we can compute T_{1,1} and T_{2,2}, then use this information * to compute T_{1,2} * @@ -398,27 +414,26 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * * 
Compute T_{2,2} recursively * - CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, - $ TAU(L+1), T(L+1, L+1), LDT) + CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) * * Compute T_{1,2} * T_{1,2} = V_{1,2} * - CALL ZLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT) + CALL ZLACPY('All', L, K - L, V(1,L+1), LDV, T(1, L+1), LDT) * * T_{1,2} = T_{1,2}*V_{2,2}' * CALL ZTRMM('Right', 'Upper', 'Conjugate', 'Unit', L, K-L, - $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) * * T_{1,2} = V_{1,3}*V_{2,3}' + T_{1,2} * Note: We assume K <= N, and GEMM will do nothing if N=K * CALL ZGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE, - $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, - $ T(1, L+1), LDT) + $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, T(1, L+1), LDT) * * At this point, we have that T_{1,2} = V_1*V_2' * All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} @@ -427,13 +442,106 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{1,2} = -T_{1,1}*T_{1,2} * CALL ZTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) * * T_{1,2} = T_{1,2}*T_{2,2} * CALL ZTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT) + ELSE IF(LQT) THEN +* +* Break V apart into 6 components +* +* V = |----------------------| +* |V_{1,1} V_{1,2} V{1,3}| +* |0 V_{2,2} V{2,3}| +* |----------------------| +* +* V_{1,1}\in\C^{l,l} unit upper triangular +* V_{1,2}\in\C^{l,k-l} rectangular +* V_{1,3}\in\C^{l,n-k} rectangular +* +* V_{2,2}\in\C^{k-l,k-l} unit upper triangular +* V_{2,3}\in\C^{k-l,n-k} rectangular +* +* Where l = floor(k/2) +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} 0 | +* |T_{2,1} T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. 
+* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\C^{l, l} lower triangular +* T_{2,2}\in\C^{k-l, k-l} lower triangular +* T_{2,1}\in\C^{k-l, l} rectangular +* +* Then, consider the product: +* +* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) +* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 +* +* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{2,1} +* +* Compute T_{1,1} recursively +* + CALL ZLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV, + $ TAU(L+1), T(L+1,L+1), LDT) + +* +* Compute T_{2,1} +* T_{2,1} = V_{1,2}' +* + DO I = 1, K-L + DO J = 1, L + T(L+I,J) = CONJG(V(J,L+I)) + END DO + END DO +* +* T_{2,1} = V_{2,2}*T_{2,1} +* + CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Unit', K-L, L, + $ ONE, V(L+1,L+1), LDV, T(L+1,1), LDT) +* +* T_{2,1} = V_{2,3}*V_{1,3}' + T_{2,1} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL ZGEMM('No Transpose', 'Conjugate', K-L, L, N-K, ONE, + $ V(L+1,K+1), LDV, V(1, K+1), LDV, ONE, T(L+1,1), LDT) +* +* At this point, we have that T_{2,1} = V_2*V_1' +* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} +* respectively. 
+* +* T_{2,1} = -T_{2,2}*T_{2,1} +* + CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ K-L, L, NEG_ONE, T(L+1,L+1), LDT, T(L+1,1), LDT) +* +* T_{2,1} = T_{2,1}*T_{1,1} +* + CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Non-unit', + $ K-L, L, ONE, T, LDT, T(L+1,1), LDT) ELSE IF(QL) THEN * * Break V apart into 6 components @@ -446,18 +554,18 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * * V_{1,1}\in\C^{n-k,k-l} rectangular * V_{2,1}\in\C^{k-l,k-l} unit upper triangular -* +* * V_{1,2}\in\C^{n-k,l} rectangular * V_{2,2}\in\C^{k-l,l} rectangular * V_{3,2}\in\C^{l,l} unit upper triangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} 0 | * |T_{2,1} T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 * @@ -468,17 +576,17 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* +* * (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1') * = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1' -* +* * Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |-------| * |V_1 V_2| * |-------| -* +* * So, our product is equivalent to the matrix product * I - V*T*V' * This means, we can compute T_{1,1} and T_{2,2}, then use this information @@ -486,34 +594,34 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * * Compute T_{1,1} recursively * - CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) + CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) * * Compute T_{2,2} recursively * CALL ZLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV, - $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) + $ 
TAU(K-L+1), T(K-L+1,K-L+1), LDT) * * Compute T_{2,1} * T_{2,1} = V_{2,2}' * DO J = 1, K-L DO I = 1, L - T(K-L+I, J) = CONJG(V(N-K+J, K-L+I)) + T(K-L+I,J) = CONJG(V(N-K+J, K-L+I)) END DO END DO * * T_{2,1} = T_{2,1}*V_{2,1} * CALL ZTRMM('Right', 'Upper', 'No transpose', 'Unit', L, - $ K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT) + $ K-L, ONE, V(N-K+1,1), LDV, T(K-L+1,1), LDT) * * T_{2,1} = V_{2,2}'*V_{2,1} + T_{2,1} * Note: We assume K <= N, and GEMM will do nothing if N=K * CALL ZGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE, - $ V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1), - $ LDT) + $ V(1,K-L+1), LDV, V, LDV, ONE, T(K-L+1,1), LDT) * * At this point, we have that T_{2,1} = V_2'*V_1 * All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} @@ -522,17 +630,13 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{2,1} = -T_{2,2}*T_{2,1} * CALL ZTRMM('Left', 'Lower', 'No transpose', 'Non-unit', L, - $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, - $ T(K-L+1, 1), LDT) + $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT) * * T_{2,1} = T_{2,1}*T_{1,1} * CALL ZTRMM('Right', 'Lower', 'No transpose', 'Non-unit', L, - $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) - ELSE -* -* Else means RQ case -* + $ K-L, ONE, T, LDT, T(K-L+1,1), LDT) + ELSE IF(RQ) THEN * Break V apart into 6 components * * V = |-----------------------| @@ -547,13 +651,13 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * V_{2,2}\in\C^{l,k-l} rectangular * V_{2,3}\in\C^{l,l} unit lower triangular * -* We will construct the T matrix +* We will construct the T matrix * T = |---------------| * |T_{1,1} 0 | * |T_{2,1} T_{2,2}| * |---------------| * -* T is the triangular factor obtained from block reflectors. +* T is the triangular factor obtained from block reflectors. * To motivate the structure, assume we have already computed T_{1,1} * and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 * @@ -564,51 +668,51 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * Where l = floor(k/2) * * Then, consider the product: -* -* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1) -* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1 -* +* +* (I - V_2'*T_{2,2}'*V_2)*(I - V_1'*T_{1,1}'*V_1) +* = I - V_2'*T_{2,2}'*V_2 - V_1'*T_{1,1}'*V_1 + V_2'*T_{2,2}'*V_2*V_1'*T_{1,1}'*V_1 +* * Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1} -* -* Then, we can define the matrix V as +* +* Then, we can define the matrix V as * V = |---| * |V_1| * |V_2| * |---| -* +* * So, our product is equivalent to the matrix product -* I - V'*T*V +* I - V'*T'*V * This means, we can compute T_{1,1} and T_{2,2}, then use this information * to compute T_{2,1} * * Compute T_{1,1} recursively * - CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) + CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) * * Compute T_{2,2} recursively * - CALL ZLARFT(DIRECT, STOREV, N, L, V(K-L+1, 1), LDV, - $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) + CALL ZLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV, + $ TAU(K-L+1), T(K-L+1,K-L+1), LDT) * * Compute T_{2,1} * T_{2,1} = V_{2,2} * - CALL ZLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV, - $ T(K-L+1, 1), LDT) + CALL ZLACPY('All', L, K-L, V(K-L+1,N-K+1), LDV, T(K-L+1,1), + $ LDT) * * T_{2,1} = T_{2,1}*V_{1,2}' * CALL ZTRMM('Right', 'Lower', 'Conjugate', 'Unit', L, K-L, - $ ONE, V(1, N-K+1), LDV, T(K-L+1, 1), LDT) + $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT) * -* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1} +* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1} * Note: We assume K <= N, and GEMM will do nothing if N=K * - CALL ZGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE, - $ V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1), - $ LDT) + CALL ZGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE, + $ V(K-L+1,1), LDV, V, LDV, ONE, T(K-L+1,1), LDT) * * At this point, we have that T_{2,1} = V_2*V_1' @@ -618,13 
+722,103 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, * T_{2,1} = -T_{2,2}*T_{2,1} * CALL ZTRMM('Left', 'Lower', 'No tranpose', 'Non-unit', L, - $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, - $ T(K-L+1, 1), LDT) + $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT) * * T_{2,1} = T_{2,1}*T_{1,1} * CALL ZTRMM('Right', 'Lower', 'No tranpose', 'Non-unit', L, - $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) + $ K-L, ONE, T, LDT, T(K-L+1,1), LDT) + ELSE IF(RQT) THEN +* Break V apart into 6 components +* +* V = |-----------------------| +* |V_{1,1} V_{1,2} 0 | +* |V_{2,1} V_{2,2} V_{2,3}| +* |-----------------------| +* +* V_{1,1}\in\C^{k-l,n-k} rectangular +* V_{1,2}\in\C^{k-l,k-l} unit lower triangular +* +* V_{2,1}\in\C^{l,n-k} rectangular +* V_{2,2}\in\C^{l,k-l} rectangular +* V_{2,3}\in\C^{l,l} unit lower triangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* | 0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\C^{k-l, k-l} non-unit upper triangular +* T_{2,2}\in\C^{l, l} non-unit upper triangular +* T_{1,2}\in\C^{k-l, l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1) +* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1 +* +* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, + $ LDT) +* +* Compute T_{2,2} recursively +* + CALL ZLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV, + $ TAU(K-L+1), T(K-L+1,K-L+1), LDT) +* +* Compute T_{1,2} +* T_{1,2} = V_{2,2}' +* + DO I = 1, K-L + DO J = 1, L + T(I,K-L+J) = CONJG(V(K-L+J, N-K+I)) + END DO + END DO +* +* T_{1,2} = V_{1,2}T_{1,2} +* + CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Unit', K-L, L, + $ ONE, V(1,N-K+1), LDV, T(1,K-L+1), LDT) +* +* T_{1,2} = V_{1,1}V_{2,1}' + T_{1,2} +* + CALL ZGEMM('No Tranpose', 'Conjugate', K-L, L, N-K, ONE, V, + $ LDV, V(K-L+1,1), LDV, ONE, T(1, K-L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1*V_2' +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. 
+* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit', + $ K-L, L, NEG_ONE, T, LDT, T(1, K-L+1), LDT) +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Non-Unit', + $ K-L, L, ONE, T(K-L+1,K-L+1), LDT, T(1, K-L+1), LDT) END IF END SUBROUTINE diff --git a/SRC/zlumm.f b/SRC/zlumm.f new file mode 100644 index 0000000000..f6d1fdb9e6 --- /dev/null +++ b/SRC/zlumm.f @@ -0,0 +1,349 @@ +*> \brief \b ZLUMM computes an in place triangular times triangluar matrix multiplication +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE ZLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA, +* $ A, LDA) +* +* .. Scalar Arguments .. +* INTEGER N, LDA +* CHARACTER SIDEL, DIAGL, DIAGU +* COMPLEX*16 ALPHA +* +* .. Array Arguments .. +* COMPLEX*16 A(LDA,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLUMM performs one of the matrix-matrix operations +*> +*> C = \alpha L * U +*> or +*> C = \alpha U * L +*> +*> where \alpha is a scalar, L is a unit, or non-unit, lower triangular matrix, and U is a unit, or +*> non-unit, upper triangular matrix, and at most one of L and U are non-unit +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDEL +*> \verbatim +*> SIDEL is CHARACTER*1 +*> On entry, SIDE specifies whether L multiplies U from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' A = \alpha L * U +*> +*> SIDE = 'R' or 'r' A = \alpha U * L +*> \endverbatim +*> +*> \param[in] DIAGL +*> \verbatim +*> DIAGL is CHARACTER*1 +*> On entry, DIAGL specifies whether or not L is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' L is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' L is not assumed to be unit +*> triangular. 
+*> \endverbatim
+*>
+*> \param[in] DIAGU
+*> \verbatim
+*>          DIAGU is CHARACTER*1
+*>          On entry, DIAGU specifies whether or not U is unit triangular
+*>          as follows:
+*>
+*>          DIAG = 'U' or 'u'   U is assumed to be unit triangular.
+*>
+*>          DIAG = 'N' or 'n'   U is not assumed to be unit
+*>          triangular.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          On entry, N specifies the number of rows and columns of L and U. N must be at
+*>          least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*>          ALPHA is COMPLEX*16 .
+*>          On entry, ALPHA specifies the scalar alpha. When alpha is
+*>          zero then the input contents of A are not referenced, and A
+*>          need not be set before entry; A is overwritten with zeros.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension ( LDA, N ) where
+*>          Before entry the leading n-by-n strictly upper triangular part of the array
+*>          A must contain the upper triangular matrix U and the strictly lower triangular part of
+*>          the leading n-by-n submatrix must contain the lower triangular matrix L.
+*>          If DIAGL != 'U', then the diagonal is assumed to be part of L, and if
+*>          DIAGU != 'U', then the diagonal is assumed to be part of U.
+*>          Note: At most one of DIAGL and DIAGU can be not equal to 'U'.
+*>          On exit, A is overwritten by the product \alpha*L*U (when
+*>          SIDEL = 'L') or \alpha*U*L (when SIDEL = 'R').
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          On entry, LDA specifies the first dimension of A as declared
+*>          in the calling (sub) program. LDA must be at least max( 1, n ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*  =====================================================================
+*  Cost: 2/3 * (n^3 - n)
+      RECURSIVE SUBROUTINE ZLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA,
+     $   A, LDA)
+*
+*     .. Scalar Arguments ..
+      INTEGER           N, LDA
+      CHARACTER         SIDEL, DIAGL, DIAGU
+      COMPLEX*16        ALPHA
+*
+*     .. Array Arguments ..
+      COMPLEX*16        A(LDA,*)
+*     ..
+* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZTRMM, ZLASET, XERBLA +* .. +* .. Local Scalars .. + INTEGER K + LOGICAL LLEFT, LUNIT, UUNIT +* .. +* .. Local Parameters .. + COMPLEX*16 ONE, ZERO + PARAMETER(ONE=(1.0D+0,0.0D+0), ZERO=(0.0D+0,0.0D+0)) +* .. +* +* Determine if our flags are valid or not. We can have at +* most one of DIAGU, DIAGL not equal to 'U' +* + LUNIT = LSAME(DIAGL, 'U') + UUNIT = LSAME(DIAGU, 'U') +* +* If both of the above are false, then it is impossible to have the +* structure that we are exploiting in this routine +* Note: It is possible to allow the matrices to share a non-unit +* diagonal as long as the values are the exact same, but there is +* currently no use case for this that I am aware of. +* + IF ((.NOT.LUNIT).AND.(.NOT.UUNIT)) THEN +* +* We say the error is in the last set DIAG value as we cannot know +* what the user actually meant. 
+*
+         CALL XERBLA( 'ZLUMM', 3 )
+         RETURN
+      END IF
+*
+*     Determine which side L is on
+*
+      LLEFT = LSAME(SIDEL, 'L')
+*
+*     Early exit if possible
+*
+      IF (N.EQ.0) THEN
+         RETURN
+      END IF
+      IF (ALPHA.EQ.ZERO) THEN
+         CALL ZLASET('All', N, N, ZERO, ZERO, A, LDA)
+         RETURN
+      END IF
+*
+*     Terminating Case
+*
+      IF (N.EQ.1) THEN
+*
+*        Since at most one of L and U are non-unit triangular, whatever side L is on, we are still
+*        always computing one of
+*
+*        1) A(1,1) = ALPHA*A(1,1)
+*        2) A(1,1) = ALPHA
+*
+*        Where the first case happens when exactly one of L and U are unit triangular, while the
+*        second case happens when both L and U are unit triangular
+*
+         IF (LUNIT.AND.UUNIT) THEN
+            A(1,1) = ALPHA
+         ELSE
+            A(1,1) = ALPHA*A(1,1)
+         END IF
+         RETURN
+      END IF
+*
+*     Recursive Case
+*
+      K = N/2
+*
+*     Regardless of us computing A = L*U or A = U*L, break A apart as follows:
+*
+*         |---|
+*     A = | U |
+*         | L |
+*         |---|
+*
+*     Further break down L as
+*         |---------------|
+*     L = | L_{11}   0    |
+*         | L_{21} L_{22} |
+*         |---------------|
+*
+*     Where:
+*
+*     L_{11}\in\C^{k\times k} is lower triangular (assumed unit iff DIAGL == 'U')
+*     L_{21}\in\C^{n-k\times k} is rectangular
+*     L_{22}\in\C^{n-k\times n-k} is lower triangular (assumed unit iff DIAGL == 'U')
+*
+*     Further break down U as
+*         |---------------|
+*     U = | U_{11} U_{12} |
+*         |   0    U_{22} |
+*         |---------------|
+*
+*     Where:
+*
+*     U_{11}\in\C^{k\times k} is upper triangular (assumed unit iff DIAGU == 'U')
+*     U_{12}\in\C^{k\times n-k} is rectangular
+*     U_{22}\in\C^{n-k\times n-k} is upper triangular (assumed unit iff DIAGU == 'U')
+      IF (LLEFT) THEN
+*
+*        This means we are computing
+*                        |---------------|   |---------------|
+*        A = L*U = \alpha | L_{11}   0    | * | U_{11} U_{12} |
+*                        | L_{21} L_{22} |   |   0    U_{22} |
+*                        |---------------|   |---------------|
+*
+*                 |---------------------------------------------|
+*        = \alpha | L_{11}*U_{11} L_{11}*U_{12}                 |
+*                 | L_{21}*U_{11} L_{21}*U_{12} + L_{22}*U_{22} |
+*
|---------------------------------------------| +* +* We compute these in the following order +* +* A_{22} = \alpha*L_{22}*U_{22} (This routine) +* A_{22} = \alpha*L_{21}*U_{12} + A_{22} (GEMM) +* +* A_{12} = \alpha*L_{11}*U_{12} (TRMM) +* A_{21} = \alpha*L_{21}*U_{11} (TRMM) +* +* A_{11} = \alpha*L_{11}*U_{11} (This routine) +* +* Compute A_{22} +* +* A_{22} = \alpha*L_{22}*U_{22} +* + CALL ZLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA, + $ A(K+1, K+1), LDA) +* +* A_{22} = \alpha L_{21}*U_{12} + A_{22} +* + CALL ZGEMM('No Transpose', 'No Transpose', N-K, N-K, K, + $ ALPHA, A(K+1,1), LDA, A(1,K+1), LDA, ONE, A(K+1,K+1), + $ LDA) +* +* Compute A_{12} +* +* A_{12} = \alpha*L_{11}*U_{12} +* + CALL ZTRMM('Left', 'Lower', 'No Transpose', DIAGL, K, N-K, + $ ALPHA, A, LDA, A(1,K+1), LDA) +* +* Compute A_{21} +* +* A_{21} = \alpha*L_{21}*U_{11} +* + CALL ZTRMM('Right', 'Upper', 'No Transpose', DIAGU, N-K, K, + $ ALPHA, A, LDA, A(K+1,1), LDA) +* +* Compute A_{11} +* +* A_{11} = \alpha*L_{11}*U_{11} +* + CALL ZLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA) + ELSE +* +* This means we are computing +* |---------------| |---------------| +* A = U*L = \alpha | U_{11} U_{12} | * | L_{11} 0 | +* | 0 U_{22} | | L_{21} L_{22} | +* |---------------| |---------------| +* +* |---------------------------------------------| +* = \alpha | U_{11}*L_{11} + U_{12}*L_{21} U_{12}*L_{22} | +* | U_{22}*L_{21} U_{22}*L_{22} | +* |---------------------------------------------| +* +* We compute these in the following order +* +* A_{11} = \alpha*U_{11}*L_{11} (This routine) +* A_{11} = \alpha*U_{12}*L_{21} + A_{11} (GEMM) +* +* A_{12} = \alpha*U_{12}*L_{22} (TRMM) +* A_{21} = \alpha*U_{22}*L_{21} (TRMM) +* +* A_{22} = \alpha*U_{22}*L_{22} (This routine) +* +* Compute A_{11} +* +* A_{11} = \alpha*U_{11}*L_{11} +* + CALL ZLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA) +* +* A_{11} = \alpha*U_{12}*L_{21} + A_{11} +* + CALL ZGEMM('No Transpose', 'No Transpose', K, K, N-K, + $ ALPHA, A(1,K+1), LDA, A(K+1,1), LDA, ONE, 
A, LDA) +* +* Compute A_{12} +* +* A_{12} = \alpha*U_{12}*L_{22} +* + CALL ZTRMM('Right', 'Lower', 'No Transpose', DIAGL, K, N-K, + $ ALPHA, A(K+1,K+1), LDA, A(1,K+1), LDA) +* +* Compute A_{21} +* +* A_{21} = \alpha*U_{22}*L_{21} +* + CALL ZTRMM('Left', 'Upper', 'No Transpose', DIAGU, N-K, K, + $ ALPHA, A(K+1, K+1), LDA, A(K+1,1), LDA) +* +* Compute A_{22} +* +* A_{22} = \alpha*U_{22}*L_{22} +* + CALL ZLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA, + $ A(K+1, K+1), LDA) + END IF + END SUBROUTINE diff --git a/SRC/ztrmmoop.f b/SRC/ztrmmoop.f new file mode 100644 index 0000000000..d55b3f8523 --- /dev/null +++ b/SRC/ztrmmoop.f @@ -0,0 +1,2572 @@ +*> \brief \b ZTRMMOOP computes an out of place triangular times general matrix multiplication +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, +* $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* .. Scalar Arguments .. +* COMPLEX*16 ALPHA, BETA +* INTEGER M, N, LDA, LDB, LDC +* CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG +* .. +* .. Array Arguments .. +* COMPLEX*16 A(LDA,*), B(LDB,*), C(LDC,*) +* .. 
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZTRMMOOP performs one of the matrix-matrix operations
+*>
+*>    C = \alpha op(A) * op(B) + \beta C
+*> or
+*>    C = \alpha op(B) * op(A) + \beta C
+*>
+*> where \alpha and \beta are scalars, C is an m-by-n matrix, A is
+*> a unit, or non-unit, upper or lower triangular matrix, and op(A)
+*> is one of
+*>
+*>    op(A) = A   or   op(A) = A**T   or   op(A) = A**H
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*>          SIDE is CHARACTER*1
+*>          On entry, SIDE specifies whether op(A) multiplies op(B) from
+*>          the left or right as follows:
+*>
+*>          SIDE = 'L' or 'l'   C = \alpha op(A) * op(B) + \beta C
+*>
+*>          SIDE = 'R' or 'r'   C = \alpha op(B) * op(A) + \beta C
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*>          UPLO is CHARACTER*1
+*>          On entry, UPLO specifies whether the matrix A is an upper or
+*>          lower triangular matrix as follows:
+*>          UPLO = 'U' or 'u'   A is upper triangular
+*>
+*>          UPLO = 'L' or 'l'   A is lower triangular
+*> \endverbatim
+*>
+*> \param[in] TRANSA
+*> \verbatim
+*>          TRANSA is CHARACTER*1
+*>          On entry, TRANSA specifies the form of op(A) to be used in
+*>          the matrix multiplication as follows:
+*>          TRANSA = 'N' or 'n'   op(A) = A
+*>
+*>          TRANSA = 'T' or 't'   op(A) = A**T
+*>
+*>          TRANSA = 'C' or 'c'   op(A) = A**H
+*> \endverbatim
+*>
+*> \param[in] TRANSB
+*> \verbatim
+*>          TRANSB is CHARACTER*1
+*>          On entry, TRANSB specifies the form of op(B) to be used in
+*>          the matrix multiplication as follows:
+*>          TRANSB = 'N' or 'n'   op(B) = B
+*>
+*>          TRANSB = 'T' or 't'   op(B) = B**T
+*>
+*>          TRANSB = 'C' or 'c'   op(B) = B**H
+*> \endverbatim
+*>
+*> \param[in] DIAG
+*> \verbatim
+*>          DIAG is CHARACTER*1
+*>          On entry, DIAG specifies whether or not A is unit triangular
+*>          as follows:
+*>
+*>          DIAG = 'U' or 'u'   A is assumed to be unit triangular.
+*>
+*>          DIAG = 'N' or 'n'   A is not assumed to be unit
+*>          triangular.
+*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> On entry, M specifies the number of rows of C. M must be at +*> least zero. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> On entry, N specifies the number of columns of C. N must be +*> at least zero. +*> \endverbatim +*> +*> \param[in] ALPHA +*> \verbatim +*> ALPHA is COMPLEX*16. +*> On entry, ALPHA specifies the scalar alpha. When alpha is +*> zero then A and B are not referenced, and A and B need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension ( LDA, K ) where +*> K is M when SIDE = 'L' and K is N when SIDE='R' +*> Before entry with UPLO = 'U' or 'u', the leading k-by-k +*> upper triangular part of the array A must contain the upper +*> triangular matrix and the strictly lower triangular part of +*> A is not referenced. +*> Before entry with UPLO = 'L' or 'l', the leading k-by-k +*> lower triangular part of the array A must contain the lower +*> triangular matrix and the strictly upper triangular part of +*> A is not referenced. +*> Note that when DIAG = 'U' or 'u', the diagonal elements of +*> A are not referenced either, but are assumed to be unity. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> On entry, LDA specifies the first dimension of A as declared +*> in the calling (sub) program. When SIDE = 'L' or 'l' then +*> LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +*> then LDA must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX*16 array, dimension ( LDB, K ), where K is M +*> If SIDE='R' and TRANSA='N', or SIDE='L' and TRANSA='T' and N +*> otherwise. On entry, the leading k-by-k submatrix must contain +*> B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> On entry, LDB specifies the first dimension of B as declared +*> in the calling (sub) program. 
When SIDE = 'R' and TRANSB='N' +*> then LDB must be at least max( 1, m ), when SIDE = 'R' +*> and TRANSB = 'T' then LDB must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] BETA +*> \verbatim +*> BETA is COMPLEX*16. +*> On entry, BETA specifies the scalar beta. When beta is +*> zero then C is not referenced on entry, and C need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension ( LDC, N ) +*> Before entry, the leading m-by-n part of the array C must +*> contain the matrix C, and on exit is overwritten by the +*> transformed matrix. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> On entry, LDC specifies the first dimension of C as declared +*> in the calling (sub) program. LDC must be at least +*> max( 1, m ). +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + RECURSIVE SUBROUTINE ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, + $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + INTEGER M, N, LDA, LDB, LDC + CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG +* .. +* .. Array Arguments .. + COMPLEX*16 A(LDA,*), B(LDB,*), C(LDC,*) +* .. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + COMPLEX*16 ZDOTC, ZDOTU + EXTERNAL LSAME, ZDOTC, ZDOTU +* .. +* .. External Subroutines .. + EXTERNAL ZGEMM, ZAXPY, ZACXPY, + $ ZSCAL, ZLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG, MIN +* .. +* .. Local Scalars .. + INTEGER I, J, L, K, INCB + LOGICAL LSIDE, UPPER, UNIT, TRANST, TRANSG, + $ CONJA, CONJB +* .. +* .. Local Parameters .. + COMPLEX*16 ONE, ZERO + PARAMETER(ONE=(1.0D+0,0.0D+0), ZERO=(0.0D+0,0.0D+0)) +* .. 
+* +* Beginning of Executable Statements +* + LSIDE = LSAME(SIDE, 'L') + UPPER = LSAME(UPLO, 'U') +* +* If we are transposing the triangular matrix (A) +* + CONJA = LSAME(TRANSA, 'C') + TRANST= LSAME(TRANSA, 'T').OR.CONJA +* +* If we are transposing the general matrix (B) +* + CONJB = LSAME(TRANSB, 'C') + TRANSG= LSAME(TRANSB, 'T').OR.CONJB +* +* Terminating Case +* + UNIT = LSAME(DIAG, 'U') + IF (M.EQ.1.AND.N.EQ.1) THEN +* +* This case is the simplest as we are just computing C = \alpha A*B + +* \beta C where all components are 1-by-1 matrices +* + + IF (BETA.EQ.ZERO) THEN + C(1,1) = ZERO + ELSE + C(1,1) = C(1,1) * BETA + END IF +* +* Now, we compute C = \alpha op(A)*op(B) +* + IF(ALPHA.NE.ZERO) THEN +* +* A = 1, so we do not care if A is conjugated or not +* + IF (UNIT) THEN + IF (CONJB) THEN + C(1,1) = C(1,1) + ALPHA*CONJG(B(1,1)) + ELSE + C(1,1) = C(1,1) + ALPHA*B(1,1) + END IF + ELSE +* +* A is not assumed unit, so we need to keep op(A) in mind +* + IF (CONJA) THEN + IF (CONJB) THEN + C(1,1) = C(1,1) + + $ ALPHA*CONJG(B(1,1))*CONJG(A(1,1)) + ELSE + C(1,1) = C(1,1) + ALPHA*B(1,1)*CONJG(A(1,1)) + END IF + ELSE + IF (CONJB) THEN + C(1,1) = C(1,1) + ALPHA*CONJG(B(1,1))*A(1,1) + ELSE + C(1,1) = C(1,1) + ALPHA*B(1,1)*A(1,1) + END IF + END IF + END IF + END IF + RETURN + ELSE IF (M.EQ.1) THEN +* +* This means that C is a row vector. 
If BETA is 0, then we +* set it explicitly, otherwise we overwrite it with BETA*C +* + IF (BETA.EQ.ZERO) THEN +* +* This ensures we don't reference C unless we need to +* + CALL ZLASET('All', M, N, ZERO, ZERO, C, LDC) + ELSE + CALL ZSCAL(N, BETA, C, LDC) + END IF + IF (ALPHA.NE.ZERO) THEN + IF (LSIDE) THEN +* +* We are computing C = \alpha op(A)*op(B) + \beta C +* Note: This means that A is a scalar +* + IF (CONJA) THEN +* +* op(A) = CONJG(A) +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL ZACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL ZACXPY(N, ALPHA*CONJG(A(1,1)), B, 1, + $ C, LDC) + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL ZAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL ZAXPY(N, ALPHA*CONJG(A(1,1)), B, 1, + $ C, LDC) + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL ZAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL ZAXPY(N, ALPHA*CONJG(A(1,1)), B, + $ LDB, C, LDC) + END IF + END IF + ELSE +* +* op(A) = A or op(A) = A**T = A +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL ZACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL ZACXPY(N, ALPHA*A(1,1), B, 1, + $ C, LDC) + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL ZAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL ZAXPY(N, ALPHA*A(1,1), B, 1, C, LDC) + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL ZAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + CALL ZAXPY(N, ALPHA*A(1,1), B, LDB, + $ C, LDC) + END IF + END IF + END IF + ELSE +* +* We are 
computing C = \alpha op(B)*op(A) + \beta C +* + IF (UPPER) THEN +* +* A is upper triangular +* + IF (CONJA) THEN +* +* op(A) = CONJG(A) +* This is lower triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CONJG(ZDOTU(N-J, + $ A(J,J+1), LDA, B(J+1,1), 1)) + + $ C(1,J) + END DO + CALL ZACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CONJG(ZDOTU(N-J+1, + $ A(J,J), LDA, B(J,1), 1)) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(N-J, + $ A(J,J+1), LDA, B(J+1,1), 1) + + $ C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(N-J+1, + $ A(J,J), LDA, B(J,1), 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(N-J, + $ A(J,J+1), LDA, B(1,J+1), LDB) + + $ C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(N-J+1, + $ A(J,J), LDA, B(1,J), LDB) + C(1,J) + END DO + END IF + END IF + ELSE IF (TRANST) THEN +* +* op(A) = A**T +* This is lower triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(N-J, + $ B(J+1,1), 1, A(J,J+1), LDA) + + $ C(1,J) + END DO + CALL ZACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(N-J+1, + $ B(J,1), 1, A(J,J), LDA) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(N-J, + $ A(J,J+1), LDA, B(J+1,1), 1) + + $ C(1,J) + END DO + CALL ZAXPY(N, 
ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(N-J+1, + $ A(J,J), LDA, B(J,1), 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(N-J, + $ A(J,J+1), LDA, B(1,J+1), LDB) + + $ C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(N-J+1, + $ A(J,J), LDA, B(1,J), LDB) + C(1,J) + END DO + END IF + END IF + ELSE +* +* op(A) = A +* This is upper triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(J-1, B, 1, + $ A(1,J), 1) + C(1,J) + END DO + CALL ZACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(J, B, 1, + $ A(1,J), 1) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(J-1, + $ A(1,J), 1, B, 1) + + $ C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(J, + $ A(1,J), 1, B, 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(J-1, + $ A(1,J), 1, B, LDB) + + $ C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(J, + $ A(1,J), 1, B, LDB) + C(1,J) + END DO + END IF + END IF + END IF + ELSE +* +* A is lower triangular +* + IF (CONJA) THEN +* +* op(A) = CONJG(A) +* This is upper triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CONJG(ZDOTU(J-1, + $ B, 1, A(J,1), LDA)) + C(1,J) + END DO + CALL 
ZACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * CONJG(ZDOTU(J, B, + $ 1, A(J,1), LDA)) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(J-1, + $ A(J,1), LDA, B, 1) + C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(J, + $ A(J,1), LDA, B, 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(J-1, + $ A(J,1), LDA, B, LDB) + C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(J, + $ A(J,1), LDA, B, LDB) + C(1,J) + END DO + END IF + END IF + ELSE IF (TRANST) THEN +* +* op(A) = A**T +* This is upper triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(J-1, + $ B, 1, A(J,1), LDA) + C(1,J) + END DO + CALL ZACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(J, B, + $ 1, A(J,1), LDA) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(J-1, + $ A(J,1), LDA, B, 1) + C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(J, + $ A(J,1), LDA, B, 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(J-1, + $ A(J,1), LDA, B, LDB) + C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(J, + $ 
A(J,1), LDA, B, LDB) + C(1,J) + END DO + END IF + END IF + ELSE +* +* op(A) = A +* This is lower triangular +* + IF (CONJB) THEN +* +* op(B) = CONJG(B) +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(N-J, + $ B(J+1,1), 1, A(J+1,J), 1) + C(1,J) + END DO + CALL ZACXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTC(N-J+1, + $ B(J,1), 1, A(J,J), 1) + C(1,J) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* op(B) = B**T +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(N-J, + $ B(J+1,1), 1, A(J+1,J), 1) + C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, 1, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(N-J+1, + $ B(J,1), 1, A(J,J), 1) + C(1,J) + END DO + END IF + ELSE +* +* op(B) = B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(N-J, + $ B(1,J+1), LDB, A(J+1,J), 1) + C(1,J) + END DO + CALL ZAXPY(N, ALPHA, B, LDB, C, LDC) + ELSE +* +* A is not assumed unit triangular +* + DO J = 1,N + C(1,J) = ALPHA * ZDOTU(N-J+1, + $ B(1,J), LDB, A(J,J), 1) + C(1,J) + END DO + END IF + END IF + END IF + END IF + END IF + END IF + RETURN + ELSE IF (N.EQ.1) THEN +* +* This means that C is a column vector. 
If BETA is 0, then we +* set it explicitly, otherwise we overwrite it with BETA*C +* + IF (BETA.EQ.ZERO) THEN +* +* This ensures we don't reference C unless we need to +* + CALL ZLASET('All', M, N, ZERO, ZERO, C, LDC) + ELSE + CALL ZSCAL(M, BETA, C, 1) + END IF + +* +* If alpha is 0, we are done +* + IF (ALPHA.NE.ZERO) THEN + IF (TRANSG) THEN + INCB = LDB + ELSE + INCB = 1 + END IF + IF (LSIDE) THEN +* +* This means we are computing +* C = \alpha op(A) * op(B) + \beta C +* + IF (UPPER) THEN +* +* This means A is upper triangular +* + IF (CONJA) THEN +* +* This means op(A) = CONJG(A) +* This is lower triangular +* + IF (CONJB) THEN +* +* This means that we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CONJG(ZDOTU(I-1, B, + $ INCB, A(1,I), 1)) + C(I,1) + END DO + CALL ZACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*CONJG(ZDOTU(I, B, + $ INCB, A(1,I), 1)) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTC(I-1, A(1,I), + $ 1, B, INCB) + C(I,1) + END DO + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTC(I, A(1,I), + $ 1, B, INCB) + C(I,1) + END DO + END IF + END IF + ELSE IF (TRANST) THEN +* +* This means op(A) = A**T +* This is lower triangular +* + IF (CONJB) THEN +* +* This means that we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTC(I-1, B, INCB, + $ A(1,I), 1) + C(I,1) + END DO + CALL ZACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTC(I, B, INCB, + $ A(1,I), 1) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTU(I-1, B, INCB, + 
$ A(1,I), 1) + C(I,1) + END DO + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTU(I, B, INCB, + $ A(1,I), 1) + C(I,1) + END DO + END IF + END IF + ELSE +* +* This means op(A) = A +* This is upper triangular +* + IF (CONJB) THEN +* +* This means that we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*ZDOTC(M-I, B(1,I+1), + $ INCB, A(I,I+1), LDA) + C(I,1) + END DO + CALL ZACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTC(M-I+1, B(1,I), + $ INCB, A(I,I), LDA) + C(I,1) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* This means that B is a row vector but not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*ZDOTU(M-I, B(1,I+1), + $ INCB, A(I,I+1), LDA) + C(I,1) + END DO + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTU(M-I+1, B(1,I), + $ INCB, A(I,I), LDA) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is a column vector and not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*ZDOTU(M-I, B(I+1,1), + $ INCB, A(I,I+1), LDA) + C(I,1) + END DO + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTU(M-I+1, B(I,1), + $ INCB, A(I,I), LDA) + C(I,1) + END DO + END IF + END IF + END IF +* +* This means A is lower triangular +* + ELSE + IF (CONJA) THEN +* +* This means op(A) = CONJG(A) +* This is upper triangular +* + IF (CONJB) THEN +* +* This means that we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*CONJG(ZDOTU(M-I, + $ B(1,I+1), INCB, A(I+1,I), 1)) + & + C(I,1) + END DO + CALL ZACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = 
ALPHA*CONJG(ZDOTU(M-I+1, + $ B(1,I), INCB, A(I,I), 1)) + & + C(I,1) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* This means that B is a row vector but not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*ZDOTC(M-I, A(I+1,I), + $ 1, B(1,I+1), INCB) + C(I,1) + END DO + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTC(M-I+1, A(I,I), + $ 1, B(1,I), INCB) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is a column vector and not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*ZDOTC(M-I, A(I+1,I), + $ 1, B(I+1,1), INCB) + C(I,1) + END DO + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTC(M-I+1, A(I,I), + $ 1, B(I,1), INCB) + C(I,1) + END DO + END IF + END IF + ELSE IF (TRANST) THEN +* +* This means op(A) = A**T +* This is upper triangular +* + IF (CONJB) THEN +* +* This means that we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*ZDOTC(M-I, B(1,I+1), + $ INCB, A(I+1,I), 1) + C(I,1) + END DO + CALL ZACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTC(M-I+1, B(1,I), + $ INCB, A(I,I), 1) + C(I,1) + END DO + END IF + ELSE IF (TRANSG) THEN +* +* This means that B is a row vector but not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*ZDOTU(M-I, B(1,I+1), + $ INCB, A(I+1,I), 1) + C(I,1) + END DO + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTU(M-I+1, B(1,I), + $ INCB, A(I,I), 1) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is a column vector and not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M-1 + C(I,1) = ALPHA*ZDOTU(M-I, B(I+1,1), + $ INCB, 
A(I+1,I), 1) + C(I,1) + END DO + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTU(M-I+1, B(I,1), + $ INCB, A(I,I), 1) + C(I,1) + END DO + END IF + END IF +* +* This means op(A) = A +* This is lower triangular[:w + +* + ELSE + IF (CONJB) THEN +* +* This means that B is conjugated and transposed +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTC(I-1, B, INCB, + $ A(I,1), LDA) + C(I,1) + END DO + CALL ZACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTC(I, B, INCB, + $ A(I,1), LDA) + C(I,1) + END DO + END IF + ELSE +* +* This means that B is not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTU(I-1, B, INCB, + $ A(I,1), LDA) + C(I,1) + END DO + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + DO I=1,M + C(I,1) = ALPHA*ZDOTU(I, B, INCB, + $ A(I,1), LDA) + C(I,1) + END DO + END IF + END IF + END IF + END IF + ELSE +* +* This means we are computing +* C = \alpha op(B) * op(A) + \beta C +* Note: This means A is a scalar +* + IF (CONJA) THEN +* +* This means op(A) = CONJG(A) +* + IF (CONJB) THEN +* +* This means we must conjugate B +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL ZACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + CALL ZACXPY(M, ALPHA*CONJG(A(1,1)), B, + $ INCB, C, 1) + END IF + ELSE +* +* This means B is not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + CALL ZAXPY(M, ALPHA*CONJG(A(1,1)), B, + $ INCB, C, 1) + END IF + END IF + ELSE +* +* This means op(A) = A or op(A) = A**T = A +* + IF (CONJB) THEN +* +* This means B is conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL ZACXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A 
is not assumed unit triangular +* + CALL ZACXPY(M, ALPHA*A(1,1), B, INCB, C, + $ 1) + END IF + ELSE +* +* This means B is not conjugated +* + IF (UNIT) THEN +* +* A is assumed unit triangular +* + CALL ZAXPY(M, ALPHA, B, INCB, C, 1) + ELSE +* +* A is not assumed unit triangular +* + CALL ZAXPY(M, ALPHA*A(1,1), B, INCB, C, + $ 1) + END IF + END IF + END IF + END IF + END IF + RETURN + END IF +* +* Recursive Case +* + L = MIN(M,N)/2 + IF (LSIDE) THEN +* +* We are multiplying A from the left IE we are computing +* C = \alpha op(A)*op(B) + \beta C +* + IF (UPPER) THEN +* +* A is upper triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing +* +* So we are computing +* C = \alpha A**T * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11}**T + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{21}**T + \beta C_{12} +* C_{21} = \alpha A_{12}**T * B_{11}**T + \alpha A_{22}**T * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{12}**T * B_{21}**T + \alpha A_{22}**T * B_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{12}**T * B_{11}**T + \beta C_{21} 
(GEMM call) +* C_{21} = \alpha A_{22}**T * B_{12}**T + C_{21} (This routine) +* +* C_{22} = \alpha A_{12}**T * B_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22}**T * B_{22}**T + C_{22} (This routine) +* +* C_{11} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, + $ LDC) +* +* C_{12} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(1, L+1), LDA, B, LDB, BETA, C(L+1,1), + $ LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1,L+1), LDA, B(1,L+1), + $ LDB, ONE, C(L+1,1), LDC) +* +* C_{22} +* + CALL ZGEMM(TRANSA, TRANSB, M-L, N-L, L, ALPHA, + $ A(1, L+1), LDA, B(L+1,1), LDB, BETA, + $ C(L+1,L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1,L+1), LDA, + $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A**T * B + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11} + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{12} + \beta C_{12} +* C_{21} = \alpha A_{12}**T * B_{11} + \alpha A_{22}**T * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{12}**T * B_{12} + \alpha A_{22}**T * B_{22} + \beta C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{12}**T * B_{11} + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22}**T * B_{21} + C_{21} (This routine) +* +* C_{22} = \alpha A_{12}**T * B_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22}**T * B_{22} + C_{22} (This routine) +* +* C_{11} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, BETA, + $ C(1, L+1), LDC) +* +* C_{21} +* + CALL ZGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(1, L+1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZGEMM(TRANSA, TRANSB, M-L, N-L, L, + $ ALPHA, A(1, L+1), LDA, B(1, L+1), LDB, BETA, + $ 
C(L+1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11}**T + \alpha A_{12} * B_{12}**T + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12} +* C_{21} = \alpha A_{22} * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{22} * B_{22}**T + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{12} * B_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11} * B_{11}**T + C_{11} (This routine) +* +* C_{12} = \alpha A_{12} * B_{22}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11} * B_{21}**T + C_{12} (This routine) +* +* C_{11} +* + CALL ZGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(1, L+1), LDA, B(1, L+1), LDB, BETA, C, LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL ZGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA, + $ A(1, L+1), LDA, 
B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1,1), LDB, ONE, + $ C(1, L+1), LDC) +* +* C_{21} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A * B + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11} + \alpha A_{12} * B_{21} + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{12} + \alpha A_{12} * B_{22} + \beta C_{12} +* C_{21} = \alpha A_{22} * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{22} * B_{22} + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{12} * B_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11} * B_{11} + C_{11} (This routine) +* +* C_{12} = \alpha A_{12} * B_{22} + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11} * B_{12} + C_{12} (This routine) +* +* C_{11} +* + CALL ZGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(1, L+1), LDA, B(L+1, 1), LDB, 
BETA, C, LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL ZGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA, + $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, + $ ONE, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + END IF + ELSE +* +* A is lower triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A**T * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11}**T + \alpha A_{21}**T * B_{12}**T + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{21}**T + \alpha A_{21}**T * B_{22}**T + \beta C_{12} +* C_{21} = \alpha A_{22}**T * B_{12}**T + \beta C_{21} +* C_{22} = \alpha A_{22}**T * B_{22}**T + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can 
break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{21}**T * B_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11}**T * B_{11}**T + C_{11} (This routine) +* +* C_{12} = \alpha A_{21}**T * B_{22}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11}**T * B_{21}**T + C_{12} (This routine) +* +* C_{11} +* + CALL ZGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(1, L+1), LDB, BETA, C, LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL ZGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE, + $ C(1, L+1), LDC) +* +* C_{21} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A**T * B + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11}**T * B_{11} + \alpha A_{21}**T * B_{21} + \beta C_{11} +* C_{12} = \alpha A_{11}**T * B_{12} + \alpha A_{21}**T * B_{22} + \beta C_{12} +* C_{21} = \alpha A_{22}**T * B_{21} + \beta C_{21} +* C_{22} = \alpha A_{22}**T * B_{22} + \beta C_{22} +* +* Computing C_{21} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{12} as follows +* +* C_{11} = \alpha A_{21}**T * B_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha A_{11}**T * B_{11} + C_{11} (This routine) +* +* C_{12} = \alpha A_{21}**T * B_{22} + \beta C_{12} (GEMM call) +* C_{12} = \alpha A_{11}**T * B_{12} + C_{12} (This routine) +* +* C_{11} +* + CALL ZGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, C, LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL ZGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA, + $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA, + $ C(1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, ONE, + $ C(1, L+1), LDC) +* +* C_{21} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(L+1, 1), LDC) 
+* +* C_{22} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha A * B**T + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11}**T + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{21}**T + \beta C_{12} +* C_{21} = \alpha A_{21} * B_{11}**T + \alpha A_{22} * B_{12}**T + \beta * C_{21} +* C_{22} = \alpha A_{21} * B_{21}**T + \alpha A_{22} * B_{22}**T + \beta * C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{21} * B_{11}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22} * B_{12}**T + C_{21} (This routine) +* +* C_{22} = \alpha A_{21} * B_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22} * B_{22}**T + C_{22} (This routine) +* +* C_{11} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZGEMM(TRANSA, 
TRANSB, M-L, L, L, ALPHA, + $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, ONE, C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZGEMM(TRANSA, TRANSB, M-L, N-L, L, + $ ALPHA, A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha A * B + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell} +* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha A_{11} * B_{11} + \beta C_{11} +* C_{12} = \alpha A_{11} * B_{12} + \beta C_{12} +* C_{21} = \alpha A_{21} * B_{11} + \alpha A_{22} * B_{21} + \beta * C_{21} +* C_{22} = \alpha A_{21} * B_{12} + \alpha A_{22} * B_{22} + \beta * C_{22} +* +* Computing C_{11} and C_{12} is just a recursive call to +* this routine but we can break down computing +* C_{21} and C_{22} as follows +* +* C_{21} = \alpha A_{21} * B_{11} + \beta C_{21} (GEMM call) +* C_{21} = \alpha A_{22} * B_{21} + C_{21} (This routine) +* +* C_{22} = \alpha A_{21} * B_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha A_{22} * B_{22} + C_{22} (This routine) +* +* C_{11} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, 
LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA, + $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, A(L+1, 1), LDA, B(1, L+1), LDB, BETA, + $ C(L+1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + END IF + END IF + ELSE +* +* We are multiplying A from the right IE we are computing +* C = \alpha op(B)*op(A) + \beta C +* + IF (UPPER) THEN +* +* A is upper triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A**T + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11}**T + \alpha B_{21}**T * A_{12}**T + \beta C_{11} +* C_{12} = \alpha B_{21}**T * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11}**T + \alpha B_{22}**T * A_{12}**T + \beta C_{21} +* C_{22} = \alpha B_{22}**T * A_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{21}**T * A_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11}**T * A_{11}**T + C_{11} (This routine) +* +* C_{21} = \alpha B_{22}**T * A_{12}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{12}**T * A_{11}**T + C_{21} (This routine) +* +* C_{11} +* + CALL ZGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(L+1, 1), LDB, A(1, L+1), LDA, BETA, C, LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, 
ALPHA, A, LDA, B(1, L+1), LDB, + $ ONE, C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 | +* |C_{21} C_{22}| |A_{12}**T A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11}**T + \alpha B_{12} * A_{12}**T + \beta C_{11} +* C_{12} = \alpha B_{12} * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11}**T + \alpha B_{22} * A_{12}**T + \beta C_{21} +* C_{22} = \alpha B_{22} * A_{22}**T + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{12} * A_{12}**T + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11} * A_{11}**T + C_{11} (This routine) +* +* C_{21} = \alpha B_{22} * A_{12}**T + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{21} * A_{11}**T + C_{21} (This routine) +* +* C_{11} +* + CALL ZGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(1,L+1), LDB, A(1,L+1), LDA, BETA, C, LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, 
A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, + $ ONE, C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11}**T * A_{12} + \alpha B_{21}**T * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11} + \beta C_{21} +* C_{22} = \alpha B_{12}**T * A_{12} + \alpha B_{22}**T * A_{22} + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11}**T * A_{12} + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{21}**T * A_{22} + C_{12} (This routine) +* +* C_{22} = \alpha B_{12}**T * A_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22}**T * A_{22} + C_{22} 
(This routine) +* +* C_{11} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL ZGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, B(1, L+1), LDB, A(1, L+1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} A_{12}| +* |C_{21} C_{22}| |0 A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11} * A_{12} + \alpha B_{12} * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11} + \beta C_{21} +* C_{22} = \alpha B_{21} * A_{12} + \alpha B_{22} * A_{22} + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11} * A_{12} + 
\beta C_{12} (GEMM call) +* C_{12} = \alpha B_{12} * A_{22} + C_{12} (This routine) +* +* C_{22} = \alpha B_{21} * A_{12} + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22} * A_{22} + C_{22} (This routine) +* +* C_{11} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL ZGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, ONE, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZGEMM(TRANSB, TRANSA, M-L, N-L, L, + $ ALPHA, B(L+1, 1), LDB, A(1, L+1), LDA, + $ BETA, C(L+1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + END IF + ELSE +* +* A is lower triangular +* + IF (TRANST) THEN +* +* We are transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A**T + \beta C. 
We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11}**T + \beta C_{11} +* C_{12} = \alpha B_{11}**T * A_{21}**T + \alpha B_{21}**T * A_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11}**T + \beta C_{21} +* C_{22} = \alpha B_{12}**T * A_{21}**T + \alpha B_{22}**T * A_{22}**T + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11}**T * A_{21}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{21}**T * A_{22}**T + C_{12} (This routine) +* +* C_{22} = \alpha B_{12}**T * A_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22}**T * A_{22}**T + C_{22} (This routine) +* +* C_{11} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL ZGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, ONE, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, + $ BETA, C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZGEMM(TRANSB, TRANSA, M-L, N-L, L, 
ALPHA, + $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A**T + \beta C. We break this down as follows +* +* |-------------| |-------------------| +* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T| +* |C_{21} C_{22}| |0 A_{22}**T| +* |-------------| |-------------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11} +* C_{12} = \alpha B_{11} * A_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11}**T + \beta C_{21} +* C_{22} = \alpha B_{21} * A_{21}**T + \alpha A_{22} * B_{22}**T + \beta C_{22} +* +* Computing C_{11} and C_{21} is just a recursive call to +* this routine but we can break down computing +* C_{12} and C_{22} as follows +* +* C_{12} = \alpha B_{11} * A_{21}**T + \beta C_{12} (GEMM call) +* C_{12} = \alpha B_{12} * A_{22}**T + C_{12} (This routine) +* +* C_{22} = \alpha B_{21} * A_{21}**T + \beta C_{22} (GEMM call) +* C_{22} = \alpha B_{22} * A_{22}**T + C_{22} (This routine) +* +* C_{11} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC) +* +* C_{12} +* + CALL ZGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA, + $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, 
L+1), LDA, B(1, L+1), + $ LDB, ONE, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA, + $ C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA, + $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, L+1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC) + ENDIF + ELSE +* +* We are not transposing A +* + IF (TRANSG) THEN +* +* We are transposing B. +* +* So we are computing +* C = \alpha B**T * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------------| +* B**T =|B_{11}**T B_{21}**T| +* |B_{12}**T B_{22}**T| +* |-------------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell} +* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11}**T * A_{11} + \alpha B_{21}**T * A_{21} + \beta C_{11} +* C_{12} = \alpha B_{21}**T * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{12}**T * A_{11} + \alpha B_{22}**T * A_{21} + \beta C_{21} +* C_{22} = \alpha B_{22}**T * A_{22} + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{21}**T * A_{21} + \beta C_{11} (GEMM call) +* C_{11} = \alpha B_{11}**T * A_{11} + C_{11}(This routine) +* +* C_{21} = \alpha B_{22}**T * A_{21} + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{12}**T * A_{11} + C_{21} (This 
routine) +* +* C_{11} +* + CALL ZGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, C, LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1), + $ LDB, BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, ONE, + $ C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ELSE +* +* We are not transposing B. +* +* So we are computing +* C = \alpha B * A + \beta C. We break this down as follows +* +* |-------------| |-------------| +* C = |C_{11} C_{12}| A = |A_{11} 0 | +* |C_{21} C_{22}| |A_{21} A_{22}| +* |-------------| |-------------| +* +* |-------------| +* B = |B_{11} B_{12}| +* |B_{21} B_{22}| +* |-------------| +* +* Where +* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell} +* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell} +* +* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell} +* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell} +* +* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell} +* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell} +* +* Which means that we get +* C_{11} = \alpha B_{11} * A_{11} + \alpha B_{12} * A_{21} + \beta C_{11} +* C_{12} = \alpha B_{12} * A_{22} + \beta C_{12} +* C_{21} = \alpha B_{21} * A_{11} + \alpha B_{22} * A_{21} + \beta C_{21} +* C_{22} = \alpha B_{22} * A_{22} + \beta C_{22} +* +* Computing C_{12} and C_{22} is just a recursive call to +* this routine but we can break down computing +* C_{11} and C_{21} as follows +* +* C_{11} = \alpha B_{12} * A_{21} + \beta 
C_{11} (GEMM call) +* C_{11} = \alpha B_{11} * A_{11} + C_{11}(This routine) +* +* C_{21} = \alpha B_{22} * A_{21} + \beta C_{21} (GEMM call) +* C_{21} = \alpha B_{21} * A_{11} + C_{21} (This routine) +* +* C_{11} +* + CALL ZGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA, + $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, C, LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC) +* +* C_{12} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1), + $ LDB, BETA, C(1, L+1), LDC) +* +* C_{21} +* + CALL ZGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA, + $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA, + $ C(L+1, 1), LDC) + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE, + $ C(L+1, 1), LDC) +* +* C_{22} +* + CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG, + $ M-L, N-L, ALPHA, A(L+1, L+1), LDA, + $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC) + ENDIF + END IF + END IF + END IF + END SUBROUTINE diff --git a/SRC/ztrtrm.f b/SRC/ztrtrm.f new file mode 100644 index 0000000000..5f839e42d6 --- /dev/null +++ b/SRC/ztrtrm.f @@ -0,0 +1,584 @@ +*> \brief \b ZTRTRM computes an in place triangular-triangular matrix product +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* RECURSIVE SUBROUTINE ZTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, +* $ N, ALPHA, T, LDT, V, LDV) +* +* .. Scalar Arguments .. +* INTEGER N, LDT, LDV +* CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV +* COMPLEX*16 ALPHA +* .. +* .. Array Arguments .. +* COMPLEX*16 T(LDT,*), V(LDV,*) +* .. 
+* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZTRTRM performs one of the matrix-matrix operations +*> +*> T = \alpha op(V) * T +*> or +*> T = \alpha T * op(V) +*> where \alpha is a scalar, T and V are unit, or non-unit, upper or +*> lower triangular matrix, and op(V) is one of +*> +*> op(V) = V or op(V) = V**T or op(V) = V**H +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] SIDE +*> \verbatim +*> SIDE is CHARACTER*1 +*> On entry, SIDE specifies whether op(V) multiplies T from +*> the left or right as follows: +*> +*> SIDE = 'L' or 'l' T = \alpha op(V) * T +*> +*> SIDE = 'R' or 'r' T = \alpha T * op(V) +*> \endverbatim +*> +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> On entry, UPLO specifies whether T and op(V) are upper or +*> lower triangular as follows: +*> UPLO = 'U' or 'u' T and op(V) are upper triangular +*> +*> UPLO = 'L' or 'l' T and op(V) are lower triangular +*> \endverbatim +*> +*> \param[in] TRANSV +*> \verbatim +*> TRANSV is CHARACTER*1 +*> On entry, TRANSV specifies the form of op(V) to be used in +*> the matrix multiplication as follows: +*> TRANSV = 'N' or 'n' op(V) = V +*> +*> TRANSV = 'T' or 't' op(V) = V**T +*> +*> TRANSV = 'C' or 'c' op(V) = V**H +*> \endverbatim +*> +*> \param[in] DIAGT +*> \verbatim +*> DIAGT is CHARACTER*1 +*> On entry, DIAGT specifies whether or not T is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' T is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' T is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] DIAGV +*> \verbatim +*> DIAGV is CHARACTER*1 +*> On entry, DIAGV specifies whether or not V is unit triangular +*> as follows: +*> +*> DIAG = 'U' or 'u' V is assumed to be unit triangular. +*> +*> DIAG = 'N' or 'n' V is not assumed to be unit +*> triangular. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> On entry, N specifies the number of rows and columns of T. +*> N must be at least zero.
+*> \endverbatim +*> +*> \param[in] ALPHA +*> \verbatim +*> ALPHA is COMPLEX*16. +*> On entry, ALPHA specifies the scalar alpha. When alpha is +*> zero then T and V are not referenced, and T and V need not +*> be set before entry. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, dimension ( LDT, N ) +*> Before entry with UPLO = 'U' or 'u', the leading k-by-k +*> upper triangular part of the array T must contain the upper +*> triangular matrix and the strictly lower triangular part of +*> T is not referenced. +*> Before entry with UPLO = 'L' or 'l', the leading k-by-k +*> lower triangular part of the array T must contain the lower +*> triangular matrix and the strictly upper triangular part of +*> T is not referenced. +*> Note that when DIAGT = 'U' or 'u', the diagonal elements of +*> T are not referenced either, but are assumed to be unity. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> On entry, LDT specifies the first dimension of T as declared +*> in the calling (sub) program. LDT must be at least max( 1, n ). +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX*16 array, dimension ( LDV, N ) +*> Before entry with UPLO = 'U' or 'u', the leading k-by-k +*> upper triangular part of the array op(V) must contain the upper +*> triangular matrix and the strictly lower triangular part of +*> V is not referenced. +*> Before entry with UPLO = 'L' or 'l', the leading k-by-k +*> lower triangular part of the array op(V) must contain the lower +*> triangular matrix and the strictly upper triangular part of +*> V is not referenced. +*> Note that when DIAGV = 'U' or 'u', the diagonal elements of +*> V are not referenced either, but are assumed to be unity. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> On entry, LDV specifies the first dimension of T as declared +*> in the calling (sub) program. LDV must be at least max( 1, n ). 
+*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + RECURSIVE SUBROUTINE ZTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, + $ N, ALPHA, T, LDT, V, LDV) +* +* .. Scalar Arguments .. + INTEGER N, LDT, LDV + CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV + COMPLEX*16 ALPHA +* .. +* .. Array Arguments .. + COMPLEX*16 T(LDT,*), V(LDV,*) +* .. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL ZTRMM, ZTRMMOOP, ZLASET +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG +* .. +* .. Local Scalars .. + INTEGER K, INFO + LOGICAL TLEFT, TUPPER, VTRANS, VUNIT, TUNIT, CONJV +* .. +* .. Local Parameters .. + COMPLEX*16 ONE, ZERO + PARAMETER(ONE=(1.0D+0,0.D+0), ZERO=(0.0D+0,0.0D+0)) +* .. 
+* +* Beginning of Executable Statements +* +* +* Early Termination Criteria +* + IF (ALPHA.EQ.ZERO) THEN +* +* If ALPHA is 0, then we are just setting T to be the 0 matrix +* + CALL ZLASET(UPLO, N, N, ZERO, ZERO, T, LDT) + RETURN + END IF + TUNIT = LSAME(DIAGT, 'U') + VUNIT = LSAME(DIAGV, 'U') +* +* Terminating Case +* + IF (N.EQ.1) THEN + IF (VUNIT.AND.TUNIT) THEN + T(1,1) = ALPHA + ELSE IF (VUNIT) THEN + T(1,1) = ALPHA*T(1,1) + ELSE IF (TUNIT) THEN +* +* Test TRANSV directly here: the local flag CONJV is not +* assigned until the recursive-case section below, so it +* must not be read on this early-return path. + IF (LSAME(TRANSV, 'C')) THEN + T(1,1) = ALPHA*CONJG(V(1,1)) + ELSE + T(1,1) = ALPHA*V(1,1) + END IF + ELSE + T(1,1) = ALPHA*T(1,1)*V(1,1) + END IF + RETURN + ELSE IF(N.LE.0) THEN + RETURN + END IF +* +* Recursive case +* + TLEFT = LSAME(SIDE, 'R') + TUPPER = LSAME(UPLO, 'U') + CONJV = LSAME(TRANSV, 'C') + VTRANS = CONJV.OR.LSAME(TRANSV, 'T') + + K = N / 2 + IF(TUPPER) THEN +* +* T is upper triangular +* + IF(TLEFT) THEN +* +* Compute T = T*op(V) +* + IF(VTRANS) THEN +* +* We are computing T = T*V**T, which we break down as follows +* |--------------| |--------------| |--------------------| +* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11}**T V_{21}**T| +* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}**T| +* |--------------| |--------------| |--------------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11}**T +* T_{12} = \alpha T_{11}*V_{21}**T + \alpha T_{12}*V_{22}**T +* T_{22} = \alpha T_{22}*V_{22}**T +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha T_{12}*V_{22}**T (ZTRMM) +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} (ZTRMMOOP) +* +* T_{12} = \alpha T_{12}*V_{22}**T +* + CALL ZTRMM('Right', 'Lower', TRANSV, DIAGV, K, + $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} +* + CALL
ZTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT, + $ V(K+1, 1), LDV, ONE, T(1, K+1), LDT) + ELSE +* +* We are computing T = T*V, which we break down as follows +* |--------------| |--------------| |-------------| +* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11} V_{12}| +* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}| +* |--------------| |--------------| |-------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11} +* T_{12} = \alpha T_{11}*V_{12} + \alpha T_{12}*V_{22} +* T_{22} = \alpha T_{22}*V_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha T_{12}*V_{22} (ZTRMM) +* T_{12} = \alpha T_{11}*V_{12} + T_{12} (ZTRMMOOP) +* +* T_{12} = \alpha T_{12}*V_{22} +* + CALL ZTRMM('Right', 'Upper', TRANSV, DIAGV, K, + $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} +* + CALL ZTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT, + $ V(1, K+1), LDV, ONE, T(1, K+1), LDT) + END IF + ELSE +* +* Compute T = op(V)*T +* + IF(VTRANS) THEN +* +* We are computing T = V**T*T, which we break down as follows +* |--------------| |--------------------| |--------------| +* |T_{11} T_{12}| |V_{11}**T V_{21}**T| |T_{11} T_{12}| +* |0 T_{22}| = \alpha |0 V_{22}**T| * |0 T_{22}| +* |--------------| |--------------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}**T*T_{11} +* T_{12} = \alpha V_{11}**T*T_{12} + \alpha V_{21}**T*T_{22} +* T_{22} = \alpha V_{22}**T*T_{22} +* +* Computing 
T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha V_{11}**T*T_{12} (ZTRMM) +* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} (ZTRMMOOP) +* +* T_{12} = \alpha V_{11}**T*T_{12} +* + CALL ZTRMM('Left', 'Lower', TRANSV, DIAGV, K, + $ N-K, ALPHA, V, LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} +* + CALL ZTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1), + $ LDT, V(K+1, 1), LDV, ONE, T(1, K+1), LDT) + ELSE +* +* We are computing T = V*T, which we break down as follows +* |--------------| |--------------| |--------------| +* |T_{11} T_{12}| |V_{11} V_{12}| |T_{11} T_{12}| +* |0 T_{22}| = \alpha |0 V_{22}| * |0 T_{22}| +* |--------------| |--------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k} +* T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}*T_{11} +* T_{12} = \alpha V_{11}*T_{12} + \alpha V_{12}*T_{22} +* T_{22} = \alpha V_{22}*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{12} = \alpha V_{11}*T_{12} (ZTRMM) +* T_{12} = \alpha V_{12}*T_{22} + T_{12} (ZTRMMOOP) +* +* T_{12} = \alpha V_{11}*T_{12} +* + CALL ZTRMM('Left', 'Upper', TRANSV, DIAGV, K, + $ N-K, ALPHA, V, LDV, T(1, K+1), LDT) +* +* T_{12} = \alpha V_{12}*T_{22} + T_{12} (ZTRMMOOP) +* + CALL ZTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1), + $ LDT, V(1, K+1), LDV, ONE, T(1, K+1), LDT) + END IF + END IF + ELSE +* +* T is lower triangular +* + IF(TLEFT) THEN +* +* Compute T = T*op(V) +* + IF(VTRANS) THEN +* +* We are computing T = T*V**T, which we break down as follows +* |--------------| |--------------| |--------------------| +* |T_{11} 0 | |T_{11} 0 | |V_{11}**T 0 | +* 
|T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{12}**T V_{22}**T| +* |--------------| |--------------| |--------------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11}**T +* T_{21} = \alpha T_{21}*V_{11}**T + \alpha T_{22}*V_{12}**T +* T_{22} = \alpha T_{22}*V_{22}**T +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha T_{21}*V_{11}**T (ZTRMM) +* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} (ZTRMMOOP) +* +* T_{21} = \alpha T_{21}*V_{11}**T +* + CALL ZTRMM('Right', 'Upper', TRANSV, DIAGV, N-K, + $ K, ALPHA, V, LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} +* + CALL ZTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1), + $ LDT, V(1, K+1), LDV, ONE, T(K+1, 1), LDT) + ELSE +* +* We are computing T = T*V, which we break down as follows +* |--------------| |--------------| |-------------| +* |T_{11} 0 | |T_{11} 0 | |V_{11} 0 | +* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{21} V_{22}| +* |--------------| |--------------| |-------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha T_{11}*V_{11} +* T_{21} = \alpha T_{21}*V_{11} + \alpha T_{22}*V_{21} +* T_{22} = \alpha T_{22}*V_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha T_{21}*V_{11} (ZTRMM) +* T_{21} = \alpha T_{22}*V_{21} + T_{21} (ZTRMMOOP) +* +* T_{21} = \alpha T_{21}*V_{11} +* + CALL ZTRMM('Right', 'Lower', TRANSV, DIAGV, N-K, + $ K, ALPHA, V, LDV, T(K+1, 1), LDT) +* 
+* T_{21} = \alpha T_{22}*V_{12} + T_{21} +* + CALL ZTRMMOOP('Left', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1), + $ LDT, V(K+1, 1), LDV, ONE, T(K+1, 1), LDT) + END IF + ELSE +* +* Compute T = op(V)*T +* + IF(VTRANS) THEN +* +* We are computing T = V**T*T, which we break down as follows +* |--------------| |--------------------| |--------------| +* |T_{11} 0 | |V_{11}**T 0 | |T_{11} 0 | +* |T_{21} T_{22}| = \alpha |V_{12}**T V_{22}**T| * |T_{21} T_{22}| +* |--------------| |--------------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k} +* V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}**T*T_{11} +* T_{21} = \alpha V_{12}**T*T_{11} + \alpha V_{22}**T*T_{21} +* T_{22} = \alpha V_{22}**T*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{21} as follows +* +* T_{21} = \alpha V_{22}**T*T_{21} (ZTRMM) +* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} (ZTRMMOOP) +* +* T_{21} = \alpha V_{22}**T*T_{21} +* + CALL ZTRMM('Left', 'Upper', TRANSV, DIAGV, N-K, K, + $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} +* + CALL ZTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT, + $ V(1, K+1), LDV, ONE, T(K+1, 1), LDT) + ELSE +* +* We are computing T = V*T, which we break down as follows +* |--------------| |-------------| |--------------| +* |T_{11} 0 | |V_{11} 0 | |T_{11} 0 | +* |T_{21} T_{22}| = \alpha |V_{21} V_{22}| * |T_{21} T_{22}| +* |--------------| |-------------| |--------------| +* +* Where +* T_{11}\in\R^{k\times k} +* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k} +* +* V_{11}\in\R^{k\times k} +* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k} +* +* Which means that we get +* +* T_{11} = \alpha V_{11}*T_{11} +* T_{21} = \alpha 
V_{21}*T_{11} + \alpha V_{22}*T_{21} +* T_{22} = \alpha V_{22}*T_{22} +* +* Computing T_{11} and T_{22} are just recursive calls to this +* routine, but we can break down computing T_{12} as follows +* +* T_{21} = \alpha V_{22}*T_{21} (ZTRMM) +* T_{21} = \alpha V_{12}*T_{11} + T_{21} (ZTRMMOOP) +* +* T_{21} = \alpha V_{22}*T_{12} +* + CALL ZTRMM('Left', 'Lower', TRANSV, DIAGV, N-K, K, + $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT) +* +* T_{21} = \alpha V_{12}*T_{11} + T_{21} +* + CALL ZTRMMOOP('Right', UPLO, 'No Transpose', + $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT, + $ V(K+1, 1), LDV, ONE, T(K+1, 1), LDT) + END IF + END IF + END IF +* +* Since in all the above cases, we compute T_{11} and T_{22} +* the same, we pass in our flags and call this routine recursively +* +* Compute T_{11} recursively +* + CALL ZTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, K, ALPHA, + $ T, LDT, V, LDV) +* +* Compute T_{22} recursively +* + CALL ZTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, N-K, ALPHA, + $ T(K+1, K+1), LDT, V(K+1, K+1), LDV) + END SUBROUTINE diff --git a/SRC/zungkl.f b/SRC/zungkl.f new file mode 100644 index 0000000000..bf3d61b301 --- /dev/null +++ b/SRC/zungkl.f @@ -0,0 +1,174 @@ +*> \brief \b ZUNGKL computes the explicit Q factor from ZGEQLF and ZLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZUNGKL(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* COMPLEX*16 Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNGKL generates an m by n complex matrix Q with orthonormal columns, +*> which is defined as the last n columns of the product of n +*> elementary reflectors +*> +*> Q = I - V*T*V**H = H(n) . . . 
H(2) H(1) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by ZGEQLF and T is the n by n matrix returned by ZLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V, and the order of T. +*> N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is COMPLEX*16 array, dimension (LDQ,N) +*> On entry, Q(1:m-n+i-1,i) contains the vector which defines the +*> elementary reflector H(i), for i=1,...,n as returned by ZGEQLF. +*> In addition, the lower triangular portion of the submatrix given +*> by Q(m-n+1:m,1:n) will contain the array T as returned by ZLARFT. +*> See further details for more information. +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The storage of the V and T components inside Q is best illustrated by +*> the following example with m = 5, n = 3. +*> +*> Q = |----------| +*> | V1 V2 V3 | +*> | V1 V2 V3 | +*> | T1 V2 V3 | +*> | T1 T2 V3 | +*> | T1 T2 T3 | +*> |----------| +*> +*> \endverbatim +*> +* ===================================================================== + + SUBROUTINE ZUNGKL(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + COMPLEX*16 Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL ZTRMM, ZTRTRM, ZLUMM +* .. +* ..
Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Parameters .. + COMPLEX*16 NEG_ONE, ONE + PARAMETER(NEG_ONE=(-1.0D+0,0.0D+0), ONE=(1.0D+0,0.0D+0)) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. Executable Statements .. +* +* Break Q apart as follows +* +* |---| +* Q = | V | +* | T | +* |---| +* +* Where T is an n-by-n lower triangular matrix, and V is as described +* in the Further Details section +* +* In turn, break apart V as follows +* +* |-----| +* V = | V_2 | +* | V_1 | +* |-----| +* +* Where: +* +* V_1 \in \C^{n\times n} assumed unit upper triangular +* V_2 \in \C^{m-n\times n} +* +* Compute T = T*V_1**T +* + CALL ZTRTRM('Right', 'Lower', 'Conjugate Transpose', + $ 'Non-Unit', 'Unit', N, ONE, Q(M-N+1,1), LDQ, Q(M-N+1,1), + $ LDQ) +* +* Compute Q = -VT. This means that we need to break apart +* Our computation in two parts +* +* |--------| +* Q = | -V_2*T | +* | -V_1*T | +* |--------| +* +* Q_2 = -V_2*T (TRMM) but only when necessary +* + IF (M.GT.N) THEN + CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Non-Unit', + $ M-N, N, NEG_ONE, Q(M-N+1,1), LDQ, Q, LDQ) + END IF +* +* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication) +* + CALL ZLUMM('Right', 'Non-Unit', 'Unit', N, NEG_ONE, + $ Q(M-N+1,1), LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(M-N+I,I) = Q(M-N+I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/zungkr.f b/SRC/zungkr.f new file mode 100644 index 0000000000..27c51d85e6 --- /dev/null +++ b/SRC/zungkr.f @@ -0,0 +1,154 @@ +*> \brief \b ZUNGKR computes the explicit Q factor from ZGEQRF and ZLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZUNGKR(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* COMPLEX*16 Q(LDQ,*) +* .. 
+* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNGKR generates an m by n complex matrix Q with orthonormal columns, +*> which is defined as the first n columns of the product of n +*> elementary reflectors +*> +*> Q = I - V*T*V**H = H(1) H(2) . . . H(n) +*> +*> Where V is an m by n matrix whose columns are householder reflectors +*> as returned by ZGEQRF and T is the n by n matrix returned by ZLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V, and the order of T. +*> N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is COMPLEX*16 array, dimension (LDQ,N) +*> On entry, the upper triangular part and diagonal contains +*> The array T as returned from ZLARFT. In addition, the +*> strictly lower triangular portion of the i-th column contains +*> the vector which defines the elementary reflector H(i), +*> for i = 1,2,...,n, as returned by ZGEQRF +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== +* Cost: (2mn**2 + n**2 - n)/2 + SUBROUTINE ZUNGKR(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + COMPLEX*16 Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL ZTRMM, ZTRTRM, ZLUMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Parameters .. 
+ COMPLEX*16 NEG_ONE, ONE + PARAMETER(NEG_ONE=(-1.0D+0,0.0D+0), ONE=(1.0D+0,0.0D+0)) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. Executable Statements .. +* +* Break Q apart as follows +* +* |---| +* Q = | T | +* | V | +* |---| +* +* Where T is an n-by-n upper triangular matrix, and V is an +* m-by-n assumed unit lower trapezoidal matrix +* +* In turn, break apart V as follows +* +* |-----| +* V = | V_1 | +* | V_2 | +* |-----| +* +* Where: +* +* V_1 \in \C^{n\times n} assumed unit lower triangular +* V_2 \in \C^{m-n\times n} +* +* Compute T = T*V_1**H +* + CALL ZTRTRM('Right', 'Upper', 'Conjugate Transpose', + $ 'Non-unit', 'Unit', N, ONE, Q, LDQ, Q, LDQ) +* +* Compute Q = -VT. This means that we need to break apart +* Our computation in two parts +* +* |--------| +* Q = | -V_1*T | +* | -V_2*T | +* |--------| +* +* Q_2 = -V_2*T (TRMM) but only when necessary +* + IF (M.GT.N) THEN + CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Non-unit', + $ M-N, N, NEG_ONE, Q, LDQ, Q(N+1,1), LDQ) + END IF +* +* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication) +* + CALL ZLUMM('Left', 'Unit', 'Non-Unit', N, NEG_ONE, Q, LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,I) = Q(I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/zunglk.f b/SRC/zunglk.f new file mode 100644 index 0000000000..b4f89ca2ec --- /dev/null +++ b/SRC/zunglk.f @@ -0,0 +1,149 @@ +*> \brief \b ZUNGLK computes the explicit Q factor from ZGELQF and ZLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZUNGLK(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. +* COMPLEX*16 Q(LDQ,*) +* .. 
+* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNGLK generates an m by n complex matrix Q with orthonormal rows, +*> which is defined as the first m rows of the product of m +*> elementary reflectors +*> +*> Q = I - V'*T*V = H(1) H(2) . . . H(m) +*> +*> Where V is an m by n matrix whose rows are Householder reflectors +*> as returned by ZGELQF and T is the m by m matrix returned by ZLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V, and the order of T. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V. N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is COMPLEX*16 array, dimension (LDQ,N) +*> On entry, the lower triangular part and diagonal contains +*> the array T as returned from ZLARFT. In addition, the +*> strictly upper triangular portion of the i-th row contains +*> the vector which defines the elementary reflector H(i), +*> for i = 1,2,...,m, as returned by ZGELQF +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +* ===================================================================== + SUBROUTINE ZUNGLK(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + COMPLEX*16 Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL ZTRMM, ZTRTRM, ZLUMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MIN +* .. +* .. Parameters ..
+ COMPLEX*16 NEG_ONE, ONE + PARAMETER(NEG_ONE=(-1.0D+0,0.0D+0), ONE=(1.0D+0,0.0D+0)) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. Executable Statements .. +* +* Break Q apart as follows +* +* |-----| +* Q = | T V | +* |-----| +* +* Where T is an m-by-m lower triangular matrix, and V is an +* m-by-n assumed unit upper trapezoidal matrix +* +* In turn, break apart V as follows +* +* |---------| +* V = | V_1 V_2 | +* |---------| +* +* Where: +* +* V_1 \in \R^{m\times m} assumed unit upper triangular +* V_2 \in \R^{m\times n-m} +* +* Compute T = V_1'*T +* + CALL ZTRTRM('Left', 'Lower', 'Conjugate Transpose', + $ 'Non-unit', 'Unit', M, ONE, Q, LDQ, Q, LDQ) +* +* Compute Q = -TV. This means that we need to break apart +* Our computation in two parts +* +* |---------------| +* Q = | -T*V_1 -T*V_2 | +* |---------------| +* +* Q_2 = -T*V_2 (TRMM) but only when necessary +* + IF (N.GT.M) THEN + CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit', + $ M, N-M, NEG_ONE, Q, LDQ, Q(1,M+1), LDQ) + END IF +* +* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication) +* + CALL ZLUMM('Left', 'Non-unit', 'Unit', M, NEG_ONE, Q, LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,I) = Q(I,I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/zunglq.f b/SRC/zunglq.f index 3cc107560d..4da17af295 100644 --- a/SRC/zunglq.f +++ b/SRC/zunglq.f @@ -94,8 +94,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,M). -*> For optimum performance LWORK >= M*NB, where NB is -*> the optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -136,20 +134,17 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. * .. Local Scalars .. 
LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK, + $ NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNGL2 + EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, + $ ZUNGL2, ZUNGLK * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX * .. * .. External Functions .. INTEGER ILAENV @@ -161,7 +156,7 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'ZUNGLQ', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, M )*NB + LWKOPT = MAX( 1, M ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -189,95 +184,97 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) RETURN END IF * - NBMIN = 2 - NX = 0 + NBMIN = MAX( 2, ILAENV( 2, 'ZUNGLQ', ' ', M, N, K, -1 ) ) + NX = MAX( 0, ILAENV( 3, 'ZUNGLQ', ' ', M, N, K, -1 ) ) IWS = M - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. * - NX = MAX( 0, ILAENV( 3, 'ZUNGLQ', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - LDWORK = M - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked version * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'ZUNGLQ', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL ZUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the last block. -* The first kk rows are handled by the block method. 
+* Factor the last block assuming that our first application +* will be on the Identity matrix * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + I = KK + 1 + IB = NB * -* Set A(kk+1:m,1:kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 20 J = 1, KK - DO 10 I = KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL ZLARFT( 'Forward', 'Transpose', N-I+1, IB, A( I, I ), + $ LDA, TAU( I ), A( I, I ), LDA ) * -* Use unblocked code for the last or only block. +* Apply H to A(i+ib:m,i:n) from the right * - IF( KK.LT.M ) - $ CALL ZUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL ZLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Forward', + $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I+IB,I), LDA) * - IF( KK.GT.0 ) THEN +* Apply H to columns i:n of current block + + CALL ZUNGLK( IB, N-I+1, A( I, I ), LDA) * * Use blocked code * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.M ) THEN + DO I = KI + 1, 1, -NB + IB = NB +* +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) +* + CALL ZLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I), + $ LDA, TAU( I ), A( I, I ), LDA ) * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . 
H(i+ib-1) +* Apply H to A(i+ib:m,i:n) from the right * - CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I, - $ I ), - $ LDA, TAU( I ), WORK, LDWORK ) + CALL ZLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I+IB,I), LDA) * -* Apply H**H to A(i+ib:m,i:n) from the right +* Apply H to columns i:n of current block * - CALL ZLARFB( 'Right', 'Conjugate transpose', - $ 'Forward', - $ 'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ), - $ LDA, WORK, LDWORK, A( I+IB, I ), LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL ZUNGLK( IB, N-I+1, A( I, I ), LDA) + END DO * -* Apply H**H to columns i:n of current block +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary * - CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * -* Set columns 1:i-1 of current block to zero +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 40 J = 1, I - 1 - DO 30 L = I, I + IB - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL ZLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I), + $ LDA, TAU( I ), A( I, I ), LDA ) +* +* Apply H to A(i+ib:m,i:n) from the right +* + CALL ZLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I+IB,I), LDA) +* +* Apply H to columns i:n of current block +* + CALL ZUNGLK( IB, N-I+1, A( I, I ), LDA) + END IF END IF * WORK( 1 ) = IWS diff --git a/SRC/zungql.f b/SRC/zungql.f index a64de501d3..7f8200a148 100644 --- a/SRC/zungql.f +++ b/SRC/zungql.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. 
*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -136,18 +134,15 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * .. * * ===================================================================== -* -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. +* * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT, - $ NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN, + $ NX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNG2L + EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, + $ ZUNG2L, ZUNGKL * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -177,7 +172,11 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) LWKOPT = 1 ELSE NB = ILAENV( 1, 'ZUNGQL', ' ', M, N, K, -1 ) - LWKOPT = N*NB +* +* Only need a workspace for zung2l in case of bailout +* and for the panel factorization +* + LWKOPT = N END IF WORK( 1 ) = LWKOPT * @@ -200,88 +199,75 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 + NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 )) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. * - NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* We use blocked code for the entire construction * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Possibly bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQL', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL ZUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. 
NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk columns are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB * -* Set A(m-kk+1:m,1:n-kk) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = 1, N - KK - DO 10 I = M - KK + 1, M - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA) * -* Use unblocked code for the first or only block. +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Exploit the fact that we are applying to an identity * - CALL ZUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward', + $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I), + $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows 1:m-k+i+ib-1 of current block * -* Use blocked code + CALL ZUNGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA) + +* Use blocked code on the remaining blocks if there are any. * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - IF( N-K+I.GT.1 ) THEN + DO I = NB+1, K, NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* The last block may be less than size NB * - CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, - $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK ) + IB = MIN(NB, K-I+1) * -* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . 
H(i+1) H(i) * - CALL ZLARFB( 'Left', 'No transpose', 'Backward', - $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, - $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA, - $ WORK( IB+1 ), LDWORK ) - END IF + CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB, + $ A( 1, N-K+I ), LDA, TAU( I ), + $ A( M-K+I, N-K+I ), LDA ) * -* Apply H to rows 1:m-k+i+ib-1 of current block +* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left * - CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA, - $ TAU( I ), WORK, IINFO ) + CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, + $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA) * -* Set rows m-k+i+ib:m of current block to zero +* Apply H to rows 1:m-k+i+ib-1 of current block * - DO 40 J = N - K + I, N - K + I + IB - 1 - DO 30 L = M - K + I + IB, M - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL ZUNGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA) + END DO END IF * WORK( 1 ) = IWS diff --git a/SRC/zungqr.f b/SRC/zungqr.f index 6c9f2e3ff5..56022eeefa 100644 --- a/SRC/zungqr.f +++ b/SRC/zungqr.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,N). -*> For optimum performance LWORK >= N*NB, where NB is the -*> optimal blocksize. *> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -137,20 +135,17 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK, - $ LWKOPT, NB, NBMIN, NX + INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, NB, + $ NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNG2R + EXTERNAL XERBLA, ZLARFB0C2, ZLARFT, + $ ZUNG2R, ZUNGKR * .. * .. Intrinsic Functions .. 
- INTRINSIC MAX, MIN + INTRINSIC MAX * .. * .. External Functions .. INTEGER ILAENV @@ -162,7 +157,11 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * INFO = 0 NB = ILAENV( 1, 'ZUNGQR', ' ', M, N, K, -1 ) - LWKOPT = MAX( 1, N )*NB +* +* Only need a workspace for zung2r in case of bailout +* and for the panel factorization +* + LWKOPT = MAX( 1, N ) WORK( 1 ) = LWKOPT LQUERY = ( LWORK.EQ.-1 ) IF( M.LT.0 ) THEN @@ -191,92 +190,100 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * NBMIN = 2 - NX = 0 +* Determine when to cross over from unblocked to blocked + NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) ) IWS = N - IF( NB.GT.1 .AND. NB.LT.K ) THEN * -* Determine when to cross over from blocked to unblocked code. -* - NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Treat the last NB block starting at KK+1 specially then use our blocking +* method from the block starting at KI+1 to 1 * - LDWORK = N - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KI = K - 2 * NB + KK = K - NB + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. +* Potentially bail to the unblocked code. * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQR', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL ZUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the last block. -* The first kk columns are handled by the block method. +* Factor the last block assuming that our first application +* will be on the Identity matrix * - KI = ( ( K-NX-1 ) / NB )*NB - KK = MIN( K, KI+NB ) + I = KK + 1 + IB = NB * -* Set A(1:kk,kk+1:n) to zero. +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . 
H(i+ib-1) * - DO 20 J = KK + 1, N - DO 10 I = 1, KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * -* Use unblocked code for the last or only block. +* Apply H to A(i:m,i+ib:n) from the left +* Exploit the fact that we are applying to an identity * - IF( KK.LT.N ) - $ CALL ZUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA, - $ TAU( KK+1 ), WORK, IINFO ) + CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward', + $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I), + $ LDA, A(I,I+IB), LDA) * - IF( KK.GT.0 ) THEN +* Apply H to rows i:m of current block * -* Use blocked code + CALL ZUNGKR(M-I+1, IB, A(I,I), LDA) * - DO 50 I = KI + 1, 1, -NB - IB = MIN( NB, K-I+1 ) - IF( I+IB.LE.N ) THEN +* Use our standard blocking method after the last block * -* Form the triangular factor of the block reflector -* H = H(i) H(i+1) . . . H(i+ib-1) + DO I = KI + 1, 1, -NB + IB = NB * - CALL ZLARFT( 'Forward', 'Columnwise', M-I+1, IB, - $ A( I, I ), LDA, TAU( I ), WORK, LDWORK ) +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . 
H(i+ib-1) * -* Apply H to A(i:m,i+ib:n) from the left + CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) * - CALL ZLARFB( 'Left', 'No transpose', 'Forward', - $ 'Columnwise', M-I+1, N-I-IB+1, IB, - $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ), - $ LDA, WORK( IB+1 ), LDWORK ) - END IF +* Apply H to A(i:m,i+ib:n) from the left +* + CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + * * Apply H to rows i:m of current block * - CALL ZUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ), - $ WORK, - $ IINFO ) + CALL ZUNGKR(M-I+1, IB, A(I,I), LDA) + END DO +* +* This checks for if K was a perfect multiple of NB +* so that we only have a special case for the last block when +* necessary +* + IF(I.LT.1) THEN + IB = I + NB - 1 + I = 1 * -* Set rows 1:i-1 of current block to zero +* Form the triangular factor of the block reflector +* H = H(i) H(i+1) . . . H(i+ib-1) * - DO 40 J = I, I + IB - 1 - DO 30 L = 1, I - 1 - A( L, J ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I), + $ LDA, TAU(I), A(I,I), LDA) +* +* Apply H to A(i:m,i+ib:n) from the left +* + CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose', + $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), + $ LDA, A(I,I), LDA, A(I,I+IB), LDA) + +* +* Apply H to rows i:m of current block +* + CALL ZUNGKR(M-I+1, IB, A(I,I), LDA) + END IF END IF * WORK( 1 ) = IWS diff --git a/SRC/zungrk.f b/SRC/zungrk.f new file mode 100644 index 0000000000..f73b8bb095 --- /dev/null +++ b/SRC/zungrk.f @@ -0,0 +1,169 @@ +*> \brief \b ZUNGRK computes the explicit Q factor from DGERQF and ZLARFT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZUNGRK(M, N, Q, LDQ) +* +* .. Scalar Arguments .. +* INTEGER M, N, LDQ +* .. +* .. Array Arguments .. 
+* COMPLEX*16 Q(LDQ,*) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNGRK generates an m by n complex matrix Q with orthonormal rows, +*> which is defined as the last m rows of the product of m +*> elementary reflectors +*> +*> Q = I - V'*T*V = H(m) . . . H(2) H(1) +*> +*> Where V is an m by n matrix whose rows are Householder reflectors +*> as returned by ZGERQF and T is the m by m matrix returned by ZLARFT +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix V, and the order of T. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix V. N >= 0. +*> \endverbatim +*> +*> \param[in,out] Q +*> \verbatim +*> Q is COMPLEX*16 array, dimension (LDQ,N) +*> On entry, Q(i,1:n-m-1+i) contains the vector which defines the +*> elementary reflector H(i), for i=1,...,m as returned by ZGERQF. +*> In addition, the upper triangular portion of the submatrix given +*> by Q(1:m,n-m+1:n) will contain the array T as returned by ZLARFT. +*> See further details for more information. +*> On exit, the m-by-n matrix Q. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The storage of the V and T components inside Q is best illustrated by +*> the following example with m = 3, n = 5. +*> +*> Q = |----------------| +*> | V1 V1 T1 T1 T1 | +*> | V2 V2 V2 T2 T2 | +*> | V3 V3 V3 V3 T3 | +*> |----------------| +*> +*> \endverbatim +*> +* ===================================================================== + + SUBROUTINE ZUNGRK(M, N, Q, LDQ) +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ.
of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, LDQ +* .. +* .. Array Arguments .. + COMPLEX*16 Q(LDQ,*) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 NEG_ONE, ONE + PARAMETER(NEG_ONE=(-1.0D+0,0.0D+0), ONE=(1.0D+0,0.0D+0)) +* .. +* .. Local Scalars .. + INTEGER I, J +* .. +* .. External Subroutines .. + EXTERNAL ZTRMM, ZTRTRM, ZLUMM +* .. +* .. Intrinsic Functions.. + INTRINSIC MIN +* .. +* .. Executable Statements .. +* +* Break Q apart as follows +* +* |-----| +* Q = | V T | +* |-----| +* +* Where T is an m-by-m upper triangular matrix, and V is as described +* in the Further Details section +* +* In turn, break apart V as follows +* +* |---------| +* V = | V_2 V_1 | +* |---------| +* +* Where: +* +* V_1 \in \R^{m\times m} assumed unit lower triangular +* V_2 \in \R^{m\times n-m} +* +* Compute T = V_1'*T +* + CALL ZTRTRM('Left', 'Upper', 'Conjugate Transpose', + $ 'Non-Unit', 'Unit', M, ONE, Q(1,N-M+1), LDQ, Q(1,N-M+1), + & LDQ) +* +* Compute Q = -TV. This means that we need to break apart +* Our computation in two parts +* +* |---------------| +* Q = | -T*V_2 -T*V_1 | +* |---------------| +* +* Q_2 = -T*V_2 (TRMM) but only when necessary +* + IF (N.GT.M) THEN + CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit', + $ M, N-M, NEG_ONE, Q(1,N-M+1), LDQ, Q, LDQ) + END IF +* +* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication) +* + CALL ZLUMM('Right', 'Unit', 'Non-Unit', M, NEG_ONE, + $ Q(1,N-M+1), LDQ) +* +* Q = "I" + Q +* + J = MIN(M,N) + DO I = 1, J + Q(I,N-M+I) = Q(I,N-M+I) + ONE + END DO + END SUBROUTINE diff --git a/SRC/zungrq.f b/SRC/zungrq.f index dcc2772d67..46ccf3c4aa 100644 --- a/SRC/zungrq.f +++ b/SRC/zungrq.f @@ -95,8 +95,6 @@ *> \verbatim *> LWORK is INTEGER *> The dimension of the array WORK. LWORK >= max(1,M). -*> For optimum performance LWORK >= M*NB, where NB is the -*> optimal blocksize. 
*> *> If LWORK = -1, then a workspace query is assumed; the routine *> only calculates the optimal size of the WORK array, returns @@ -137,17 +135,14 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) * * ===================================================================== * -* .. Parameters .. - COMPLEX*16 ZERO - PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) -* .. * .. Local Scalars .. LOGICAL LQUERY - INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK, + INTEGER I, IB, II, IINFO, IWS, KK, LDWORK, $ LWKOPT, NB, NBMIN, NX * .. * .. External Subroutines .. - EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNGR2 + EXTERNAL XERBLA, ZLARFB, ZLARFT, + $ ZUNGR2, ZUNGRK * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -173,12 +168,8 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) END IF * IF( INFO.EQ.0 ) THEN - IF( M.LE.0 ) THEN - LWKOPT = 1 - ELSE - NB = ILAENV( 1, 'ZUNGRQ', ' ', M, N, K, -1 ) - LWKOPT = M*NB - END IF + LWKOPT = MAX(1, M) + NB = ILAENV( 1, 'ZUNGRQ', ' ', M, N, K, -1 ) WORK( 1 ) = LWKOPT * IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN @@ -199,92 +190,74 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO ) RETURN END IF * - NBMIN = 2 - NX = 0 + NBMIN = MAX( 2, ILAENV( 2, 'ZUNGRQ', ' ', M, N, K, -1 ) ) + NX = MAX( 0, ILAENV( 3, 'ZUNGRQ', ' ', M, N, K, -1 ) ) IWS = M - IF( NB.GT.1 .AND. NB.LT.K ) THEN -* -* Determine when to cross over from blocked to unblocked code. * - NX = MAX( 0, ILAENV( 3, 'ZUNGRQ', ' ', M, N, K, -1 ) ) - IF( NX.LT.K ) THEN + IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN * -* Determine if workspace is large enough for blocked code. +* Use blocked code after the first block. +* The last kk rows are handled by the block method. * - LDWORK = M - IWS = LDWORK*NB - IF( LWORK.LT.IWS ) THEN + KK = K + ELSE + KK = 0 + END IF * -* Not enough workspace to use optimal NB: reduce NB and -* determine the minimum value of NB. 
+* Potentially bail to the unblocked code * - NB = LWORK / LDWORK - NBMIN = MAX( 2, ILAENV( 2, 'ZUNGRQ', ' ', M, N, K, - $ -1 ) ) - END IF - END IF + IF( KK.EQ.0 ) THEN + CALL ZUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO ) END IF * - IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN + IF( KK.GT.0 ) THEN * -* Use blocked code after the first block. -* The last kk rows are handled by the block method. +* Factor the first block assuming that our first application +* will be on the Identity matrix * - KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB ) + I = 1 + IB = NB + II = M - K + I * -* Set A(1:m-kk,n-kk+1:n) to zero. +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . H(i+1) H(i) * - DO 20 J = N - KK + 1, N - DO 10 I = 1, M - KK - A( I, J ) = ZERO - 10 CONTINUE - 20 CONTINUE - ELSE - KK = 0 - END IF + CALL ZLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA ) * -* Use unblocked code for the first or only block. +* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL ZUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO ) + CALL ZLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Backward', + $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA, + $ A( II, N-K+I ), LDA, A, LDA) * - IF( KK.GT.0 ) THEN +* Apply H to columns 1:n-k+i+ib-1 of current block * -* Use blocked code + CALL ZUNGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA ) * - DO 50 I = K - KK + 1, K, NB - IB = MIN( NB, K-I+1 ) - II = M - K + I - IF( II.GT.1 ) THEN + DO I = NB+1, K, NB * -* Form the triangular factor of the block reflector -* H = H(i+ib-1) . . . H(i+1) H(i) +* The last block may be less than size NB * - CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB, - $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK ) + IB = MIN(NB, K-I+1) + II = M - K + I * -* Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right +* Form the triangular factor of the block reflector +* H = H(i+ib-1) . . . 
H(i+1) H(i) * - CALL ZLARFB( 'Right', 'Conjugate transpose', - $ 'Backward', - $ 'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ), - $ LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ), - $ LDWORK ) - END IF + CALL ZLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB, + $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA ) * -* Apply H**H to columns 1:n-k+i+ib-1 of current block +* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right * - CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA, - $ TAU( I ), - $ WORK, IINFO ) + CALL ZLARFB0C2(.FALSE., 'Right', 'No Transpose', + $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), + $ LDA, A( II, N-K+I ), LDA, A, LDA) * -* Set columns n-k+i+ib:n of current block to zero +* Apply H to columns 1:n-k+i+ib-1 of current block * - DO 40 L = N - K + I + IB, N - DO 30 J = II, II + IB - 1 - A( J, L ) = ZERO - 30 CONTINUE - 40 CONTINUE - 50 CONTINUE + CALL ZUNGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA ) + END DO END IF * WORK( 1 ) = IWS