diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 676397e768..86ee3dc4f2 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -108,7 +108,7 @@ set(SLASRC
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
- slarf.f slarf1f.f slarf1l.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f
+ slarf.f slarf1f.f slarf1l.f slarfb.f slarfb0c2.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f
slargv.f slarmm.f slarrv.f slartv.f
slarz.f slarzb.f slarzt.f slasy2.f
slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
@@ -116,6 +116,8 @@ set(SLASRC
slauu2.f slauum.f sopgtr.f sopmtr.f sorg2l.f sorg2r.f
sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f
sorgrq.f sorgtr.f sorgtsqr.f sorgtsqr_row.f sorm2l.f sorm2r.f sorm22.f
+ sorgkr.f sorgrk.f sorgkl.f sorglk.f
+ slumm.f strtrm.f strmmoop.f
sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f
sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f
spbstf.f spbsv.f spbsvx.f
@@ -220,7 +222,7 @@ set(CLASRC
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
- clarf.f clarf1f.f clarf1l.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
+ clarf.f clarf1f.f clarf1l.f clarfb.f clarfb0c2.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90
claswp.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
@@ -248,6 +250,8 @@ set(CLASRC
ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
ctrsyl.f ctrsyl3.f ctrti2.f ctrtri.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f
cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f
+ cungkr.f cungrk.f cungkl.f cunglk.f
+ clumm.f ctrtrm.f ctrmmoop.f cacxpy.f
cungrq.f cungtr.f cungtsqr.f cungtsqr_row.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f
cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f
cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f
@@ -309,13 +313,15 @@ set(DLASRC
dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqp2rk.f dlaqp3rk.f dlaqsb.f dlaqsp.f dlaqsy.f
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
- dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f
+ dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlarf1f.f dlarf1l.f dlarfb0c2.f
dlargv.f dlarmm.f dlarrv.f dlartv.f
dlarz.f dlarzb.f dlarzt.f dlaswp.f dlasy2.f
dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrs3.f dlatrz.f dlauu2.f
dlauum.f dopgtr.f dopmtr.f dorg2l.f dorg2r.f
dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f
+ dorgkr.f dorgrk.f dorgkl.f dorglk.f
+ dlumm.f dtrtrm.f dtrmmoop.f
dorgrq.f dorgtr.f dorgtsqr.f dorgtsqr_row.f dorm2l.f dorm2r.f dorm22.f
dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f
dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f
@@ -420,7 +426,7 @@ set(ZLASRC
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqp2rk.f zlaqp3rk.f zlaqsb.f
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
- zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f zlarf1f.f zlarf1l.f
+ zlarcm.f zlarf.f zlarfb.f zlarfb0c2.f zlarfb_gett.f zlarf1f.f zlarf1l.f
zlarfg.f zlarfgp.f zlarft.f
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
@@ -450,6 +456,8 @@ set(ZLASRC
ztrsyl.f ztrsyl3.f ztrti2.f ztrtri.f ztrtrs.f ztzrzf.f zung2l.f
zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f
zungrq.f zungtr.f zungtsqr.f zungtsqr_row.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f
+ zungkr.f zungrk.f zungkl.f zunglk.f
+ zlumm.f ztrtrm.f ztrmmoop.f zacxpy.f
zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f
zunmtr.f zupgtr.f
zupmtr.f izmax1.f dzsum1.f zstemr.f
diff --git a/SRC/Makefile b/SRC/Makefile
index 0191626f0e..ec1e8451bf 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -137,7 +137,7 @@ SLASRC = \
slaqgb.o slaqge.o slaqp2.o slaqps.o slaqp2rk.o slaqp3rk.o slaqsb.o slaqsp.o slaqsy.o \
slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
- slarf.o slarf1f.o slarf1l.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \
+ slarf.o slarf1f.o slarf1l.o slarfb.o slarfb0c2.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o \
slargv.o slarmm.o slarrv.o slartv.o \
slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \
slasyf_rk.o \
@@ -145,6 +145,8 @@ SLASRC = \
slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \
sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \
sorgrq.o sorgtr.o sorgtsqr.o sorgtsqr_row.o sorm2l.o sorm2r.o sorm22.o \
+ sorgkr.o sorgrk.o sorgkl.o sorglk.o \
+ slumm.o strtrm.o strmmoop.o \
sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \
sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \
spbstf.o spbsv.o spbsvx.o \
@@ -249,7 +251,7 @@ CLASRC = \
claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqp2rk.o claqp3rk.o claqsb.o \
claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
- clarf.o clarf1f.o clarf1l.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
+ clarf.o clarf1f.o clarf1l.o clarfb.o clarfb0c2.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \
claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \
@@ -277,6 +279,8 @@ CLASRC = \
ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \
ctrsyl.o ctrsyl3.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \
cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \
+ cungkr.o cungrk.o cungkl.o cunglk.o \
+ clumm.o ctrtrm.o ctrmmoop.o cacxpy.o \
cungrq.o cungtr.o cungtsqr.o cungtsqr_row.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \
cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \
cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \
@@ -339,12 +343,14 @@ DLASRC = \
dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqp2rk.o dlaqp3rk.o dlaqsb.o dlaqsp.o dlaqsy.o \
dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \
dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \
- dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o\
+ dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o dlarf1f.o dlarf1l.o dlarfb0c2.o\
dlargv.o dlarmm.o dlarrv.o dlartv.o \
dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \
dlasyf.o dlasyf_rook.o dlasyf_rk.o \
dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrs3.o dlatrz.o dlauu2.o \
dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \
+ dorgkr.o dorgrk.o dorgkl.o dorglk.o \
+ dlumm.o dtrtrm.o dtrmmoop.o \
dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \
dorgrq.o dorgtr.o dorgtsqr.o dorgtsqr_row.o dorm2l.o dorm2r.o dorm22.o \
dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \
@@ -453,7 +459,7 @@ ZLASRC = \
zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqp2rk.o zlaqp3rk.o zlaqsb.o \
zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \
zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \
- zlarcm.o zlarf.o zlarfb.o zlarfb_gett.o zlarf1f.o zlarf1l.o \
+ zlarcm.o zlarf.o zlarfb.o zlarfb0c2.o zlarfb_gett.o zlarf1f.o zlarf1l.o \
zlarfg.o zlarft.o zlarfgp.o \
zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \
zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \
@@ -483,6 +489,8 @@ ZLASRC = \
ztrsyl.o ztrsyl3.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \
zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \
zungrq.o zungtr.o zungtsqr.o zungtsqr_row.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \
+ zungkr.o zungrk.o zungkl.o zunglk.o \
+ zlumm.o ztrtrm.o ztrmmoop.o zacxpy.o \
zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \
zunmtr.o zupgtr.o \
zupmtr.o izmax1.o dzsum1.o zstemr.o \
diff --git a/SRC/cacxpy.f b/SRC/cacxpy.f
new file mode 100644
index 0000000000..b27bd5b149
--- /dev/null
+++ b/SRC/cacxpy.f
@@ -0,0 +1,134 @@
+*> \brief \b CACXPY
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CACXPY(N,CA,CX,INCX,CY,INCY)
+*
+* .. Scalar Arguments ..
+* COMPLEX CA
+* INTEGER INCX,INCY,N
+* ..
+* .. Array Arguments ..
+* COMPLEX CX(*),CY(*)
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CACXPY constant times a conjugated vector plus a vector.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> number of elements in input vector(s)
+*> \endverbatim
+*>
+*> \param[in] CA
+*> \verbatim
+*> CA is COMPLEX
+*> On entry, CA specifies the scalar alpha.
+*> \endverbatim
+*>
+*> \param[in] CX
+*> \verbatim
+*> CX is COMPLEX array, dimension ( 1 + ( N - 1 )*abs( INCX ) )
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*> INCX is INTEGER
+*> storage spacing between elements of CX
+*> \endverbatim
+*>
+*> \param[in,out] CY
+*> \verbatim
+*> CY is COMPLEX array, dimension ( 1 + ( N - 1 )*abs( INCY ) )
+*> \endverbatim
+*>
+*> \param[in] INCY
+*> \verbatim
+*> INCY is INTEGER
+*> storage spacing between elements of CY
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup axpy
+*>
+* =====================================================================
+ SUBROUTINE CACXPY(N,CA,CX,INCX,CY,INCY)
+*
+* -- Reference BLAS level1 routine --
+* -- Reference BLAS is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ COMPLEX CA
+ INTEGER INCX,INCY,N
+* ..
+* .. Array Arguments ..
+ COMPLEX CX(*),CY(*)
+* ..
+*
+* =====================================================================
+*
+* .. Local Scalars ..
+ INTEGER I,IX,IY
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC CONJG
+* ..
+* .. External Functions ..
+ REAL SCABS1
+ EXTERNAL SCABS1
+* ..
+ IF (N.LE.0) RETURN
+ IF (SCABS1(CA).EQ.0.0E+0) RETURN
+ IF (INCX.EQ.1 .AND. INCY.EQ.1) THEN
+*
+* code for both increments equal to 1
+*
+ DO I = 1,N
+ CY(I) = CY(I) + CA*CONJG(CX(I))
+ END DO
+ ELSE
+*
+* code for unequal increments or equal increments
+* not equal to 1
+*
+ IX = 1
+ IY = 1
+ IF (INCX.LT.0) IX = (-N+1)*INCX + 1
+ IF (INCY.LT.0) IY = (-N+1)*INCY + 1
+ DO I = 1,N
+ CY(IY) = CY(IY) + CA*CONJG(CX(IX))
+ IX = IX + INCX
+ IY = IY + INCY
+ END DO
+ END IF
+*
+ RETURN
+*
+* End of CACXPY
+*
+ END
+
diff --git a/SRC/clarfb0c2.f b/SRC/clarfb0c2.f
new file mode 100644
index 0000000000..28fd002eee
--- /dev/null
+++ b/SRC/clarfb0c2.f
@@ -0,0 +1,564 @@
+*> \brief \b CLARFB0C2 applies a block reflector or its conjugate-transpose
+* to a rectangular matrix with a 0 block while constructing the explicit Q
+* factor
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+* $ K, V, LDV, T, LDT, C, LDC)
+* ! Scalar arguments
+* INTEGER M, N, K, LDV, LDC, LDT
+* CHARACTER SIDE, TRANS, DIRECT, STOREV
+* ! True means that we are assuming C2 is the identity matrix
+* ! and thus don't reference whatever is present in C2
+* ! at the beginning.
+* LOGICAL C2I
+* ! Array arguments
+* COMPLEX V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CLARFB0C2 applies a complex block reflector H or its conjugate transpose H**H to a
+*> complex m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*> C2I is LOGICAL
+*> = .TRUE.: Assume the nonzero block of C is the identity matrix
+*> = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> = 'L': apply H or H**H from the Left
+*> = 'R': apply H or H**H from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*> TRANS is CHARACTER*1
+*> = 'N': apply H (No transpose)
+*> = 'C': apply H**H (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*> DIRECT is CHARACTER*1
+*> Indicates how H is formed from a product of elementary
+*> reflectors
+*> = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*> = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*> STOREV is CHARACTER*1
+*> Indicates how the vectors which define the elementary
+*> reflectors are stored:
+*> = 'C': Columnwise
+*> = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*> K is INTEGER
+*> The order of the matrix T (= the number of elementary
+*> reflectors whose product defines the block reflector).
+*> If SIDE = 'L', M >= K >= 0;
+*> if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is COMPLEX array, dimension
+*> (LDV,K) if STOREV = 'C'
+*> (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*> (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*> See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> The leading dimension of the array V.
+*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*> if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is COMPLEX array, dimension (LDT,K)
+*> The triangular K-by-K matrix T in the representation of the
+*> block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is COMPLEX array, dimension (LDC,N)
+*> On entry, the M-by-N matrix C.
+*> On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The shape of the matrix V and the storage of the vectors which define
+*> the H(i) is best illustrated by the following example with n = 5 and
+*> k = 3. The triangular part of V (including its diagonal) is not
+*> referenced.
+*>
+*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+*>
+*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+*> ( v1 1 ) ( 1 v2 v2 v2 )
+*> ( v1 v2 1 ) ( 1 v3 v3 )
+*> ( v1 v2 v3 )
+*> ( v1 v2 v3 )
+*>
+*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+*>
+*> V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+*> ( v1 v2 v3 ) ( v2 v2 v2 1 )
+*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+*> ( 1 v3 )
+*> ( 1 )
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE CLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+ $ K, V, LDV, T, LDT, C, LDC)
+ ! Scalar arguments
+ INTEGER M, N, K, LDV, LDC, LDT
+ CHARACTER SIDE, TRANS, DIRECT, STOREV
+ ! True means that we are assuming C2 is the identity matrix
+ ! and thus don't reference whatever is present in C2
+ ! at the beginning.
+ LOGICAL C2I
+ ! Array arguments
+ COMPLEX V(LDV,*), C(LDC,*), T(LDT,*)
+ ! Local scalars
+ LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+ $ TRANST
+ INTEGER I, J
+ ! Intrinsic Functions
+ INTRINSIC CONJG
+ ! External functions
+ LOGICAL LSAME
+ EXTERNAL LSAME
+ ! External Subroutines
+ EXTERNAL CGEMM, CTRMM
+ ! Parameters
+ COMPLEX ONE, ZERO, NEG_ONE
+ PARAMETER(ONE=(1.0E+0, 0.0E+0),
+ $ ZERO = (0.0E+0, 0.0E+0),
+ $ NEG_ONE = (-1.0E+0, 0.0E+0))
+
+ ! Beginning of executable statements
+ ! Convert our character flags to logical values
+ DIRF = LSAME(DIRECT,'F')
+ COLV = LSAME(STOREV,'C')
+ SIDEL = LSAME(SIDE,'L')
+ SIDER = LSAME(SIDE,'R')
+ TRANST = LSAME(TRANS,'C')
+
+ ! Determine which of the 4 modes we are using.
+ ! QR is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the first column
+ QR = DIRF.AND.COLV
+
+ ! LQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the first row
+ LQ = DIRF.AND.(.NOT.COLV)
+
+ ! QL is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the last column
+ QL = (.NOT.DIRF).AND.COLV
+
+ ! RQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the last row
+ ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+ ! Since we have exactly one of these 4 modes, we don't need to actually
+ ! store the value of RQ, instead we assume this is the case if we fail
+ ! the above 3 checks.
+
+ IF (QR) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V1 ] and C = [ C1 ]
+ ! [ V2 ] [ C2 ]
+ ! with the following dimensions:
+ ! V1\in\C^{K\times K}
+ ! V2\in\C^{M-K\times K}
+ ! C1=0\in\C^{K\times N}
+ ! C2\in\C^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ]
+ ! [ V2 ] [ C2 ]
+ ! = [ 0 ] - [ V1 ]T*V2'*C2
+ ! [ C2 ] [ V2 ]
+ !
+ ! = [ 0 ] - [ V1*T*V2'*C2 ]
+ ! [ C2 ] [ V2*T*V2'*C2 ]
+ !
+ ! = [ V1*T*V2'*C2 ]
+ ! [ C2 - V2*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('CLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('CLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF (C2I) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(I,J) = CONJG(V(K+J,I))
+ END DO
+ END DO
+ ELSE
+ CALL CGEMM('Conjugate', 'No Transpose', K, N, M - K,
+ $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+ $ C, LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF (C2I) THEN
+ CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+ $ C(K+1,1), LDC)
+ DO I = 1, N
+ C(K+I,I) = C(K+I,I) + ONE
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+ $ C(K+1,1), LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL CTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (LQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+ ! with the following dimensions:
+ ! V1\in\C^{K\times K}
+ ! V2\in\C^{K\times N-K}
+ ! C1=0\in\C^{M\times K}
+ ! C2\in\C^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+ !
+ ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('CLARFB0C2', 2)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,J) = CONJG(V(J,K+I))
+ END DO
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K,
+ $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+ $ LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ IF( TRANST ) THEN
+ CALL CTRMM('Right', 'Upper', 'Conjugate', 'Non-unit',
+ $ M, K, ONE, T, LDT, C, LDC)
+ ELSE
+ CALL CTRMM('Right', 'Lower', 'No Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C, LDC)
+ END IF
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+ $ LDC)
+ DO I = 1, M
+ C(I,K+I) = C(I,K+I) + ONE
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+ $ LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL CTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (QL) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V2 ] and C = [ C2 ]
+ ! [ V1 ] [ C1 ]
+ ! with the following dimensions:
+ ! V1\in\C^{K\times K}
+ ! V2\in\C^{M-K\times K}
+ ! C1=0\in\C^{K\times N}
+ ! C2\in\C^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+ ! [ V1 ] [ 0 ]
+ !
+ ! = [ C2 ] - [ V2 ]T*V2'*C2
+ ! [ 0 ] [ V1 ]
+ !
+ ! = [ C2 ] - [ V2*T*V2'*C2 ]
+ ! [ 0 ] [ V1*T*V2'*C2 ]
+ !
+ ! = [ C2 - V2*T*V2'*C2 ]
+ ! [ - V1*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('CLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('CLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF( C2I ) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(M-K+I,J) = CONJG(V(J,I))
+ END DO
+ END DO
+ ELSE
+ CALL CGEMM('Conjugate', 'No Transpose', K, N, M-K,
+ $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF( C2I ) THEN
+ CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+ DO I = 1, N
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL CTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+ ELSE ! IF (RQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+ ! with the following dimensions:
+ ! V1\in\C^{K\times K}
+ ! V2\in\C^{K\times N-K}
+ ! C1=0\in\C^{M\times K}
+ ! C2\in\C^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+ !
+ ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+ !
+ ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('CLARFB0C2', 2)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,N-K+J) = CONJG(V(J,I))
+ END DO
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'Conjugate', M, K, N-K,
+ $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ IF( TRANST ) THEN
+ CALL CTRMM('Right', 'Lower', 'Conjugate', 'Non-unit',
+ $ M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ ELSE
+ CALL CTRMM('Right', 'Upper', 'No Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+ DO I = 1, M
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL CGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL CTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/clarft.f b/SRC/clarft.f
index c757f3c172..d9d5b99114 100644
--- a/SRC/clarft.f
+++ b/SRC/clarft.f
@@ -6,11 +6,11 @@
* http://www.netlib.org/lapack/explore-html/
*
*> Download CLARFT + dependencies
-*>
+*>
*> [TGZ]
-*>
+*>
*> [ZIP]
-*>
+*>
*> [TXT]
*
* Definition:
@@ -23,7 +23,7 @@
* INTEGER K, LDT, LDV, N
* ..
* .. Array Arguments ..
-* COMPLEX T( LDT, * ), TAU( * ), V( LDV, * )
+* COMPLEX T( LDT, * ), TAU( * ), V( LDV, * )
* ..
*
*
@@ -32,7 +32,7 @@
*>
*> \verbatim
*>
-*> CLARFT forms the triangular factor T of a complex block reflector H
+*> CLARFT forms the triangular factor T of a complex block reflector H
*> of order n, which is defined as a product of k elementary reflectors.
*>
*> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
@@ -42,12 +42,12 @@
*> If STOREV = 'C', the vector which defines the elementary reflector
*> H(i) is stored in the i-th column of the array V, and
*>
-*> H = I - V * T * V**H
+*> H = I - V * T * V**H
*>
*> If STOREV = 'R', the vector which defines the elementary reflector
*> H(i) is stored in the i-th row of the array V, and
*>
-*> H = I - V**H * T * V
+*> H = I - V**H * T * V
*> \endverbatim
*
* Arguments:
@@ -166,23 +166,25 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* .. Scalar Arguments
*
- CHARACTER DIRECT, STOREV
- INTEGER K, LDT, LDV, N
+ CHARACTER DIRECT, STOREV
+ INTEGER K, LDT, LDV, N
* ..
* .. Array Arguments ..
*
- COMPLEX T( LDT, * ), TAU( * ), V( LDV, * )
+ COMPLEX T( LDT, * ), TAU( * ), V( LDV, * )
* ..
*
* .. Parameters ..
*
- COMPLEX ONE, NEG_ONE, ZERO
- PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE=-1.0E+0)
+ COMPLEX ONE, NEG_ONE, ZERO
+ PARAMETER(ONE=(1.0E+0,0.0E+0),
+ $ ZERO = (0.0E+0,0.0E+0),
+ $ NEG_ONE=(-1.0E+0,0.0E+0))
*
* .. Local Scalars ..
*
INTEGER I,J,L
- LOGICAL QR,LQ,QL,DIRF,COLV
+ LOGICAL QR,LQ,QL,RQ,LQT,RQT,DIRF,COLV,TDIRF,TCOLV
*
* .. External Subroutines ..
*
@@ -193,10 +195,10 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
LOGICAL LSAME
EXTERNAL LSAME
*
-* .. Intrinsic Functions..
+* .. Intrinsic Functions..
*
INTRINSIC CONJG
-*
+*
* The general scheme used is inspired by the approach inside DGEQRT3
* which was (at the time of writing this code):
* Based on the algorithm of Elmroth and Gustavson,
@@ -210,13 +212,6 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
RETURN
END IF
*
-* Base case
-*
- IF(N.EQ.1.OR.K.EQ.1) THEN
- T(1,1) = TAU(1)
- RETURN
- END IF
-*
* Beginning of executable statements
*
L = K / 2
@@ -227,26 +222,48 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* 'C' for STOREV, then they meant to provide 'R'
*
DIRF = LSAME(DIRECT,'F')
+ TDIRF = LSAME(DIRECT,'T')
COLV = LSAME(STOREV,'C')
+ TCOLV = LSAME(STOREV,'T')
*
* QR happens when we have forward direction in column storage
*
QR = DIRF.AND.COLV
*
-* LQ happens when we have forward direction in row storage
+* LQT happens when we have forward direction in row storage and want to compute the transpose of
+* the T we would normally compute
+*
+ LQT = DIRF.AND.TCOLV
+*
+* LQ happens when we have forward direction in row storage and want to compute the T we would
+* normally compute
*
- LQ = DIRF.AND.(.NOT.COLV)
+ LQ = DIRF.AND.(.NOT.LQT)
*
* QL happens when we have backward direction in column storage
*
QL = (.NOT.DIRF).AND.COLV
*
-* The last case is RQ. Due to how we structured this, if the
-* above 3 are false, then RQ must be true, so we never store
-* this
-* RQ happens when we have backward direction in row storage
-* RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+* RQT happens when we have backward direction in row storage and want to compute the transpose
+* of the T we would normally compute
+*
+ RQT = TDIRF.AND.(.NOT.COLV)
+*
+* RQ happens when we have backward direction in row storage and want to compute the T that we
+* would normally compute
+*
+ RQ = (.NOT.RQT).AND.(.NOT.COLV)
*
+* Base case
+*
+ IF(N.EQ.1.OR.K.EQ.1) THEN
+ IF( RQT.OR.LQT ) THEN
+ T(1,1) = CONJG(TAU(1))
+ ELSE
+ T(1,1) = TAU(1)
+ END IF
+ RETURN
+ END IF
IF(QR) THEN
*
* Break V apart into 6 components
@@ -260,17 +277,17 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{1,1}\in\C^{l,l} unit lower triangular
* V_{2,1}\in\C^{k-l,l} rectangular
* V_{3,1}\in\C^{n-k,l} rectangular
-*
+*
* V_{2,2}\in\C^{k-l,k-l} unit lower triangular
* V_{3,2}\in\C^{n-k,k-l} rectangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} T_{1,2}|
* |0 T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -281,17 +298,17 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
+*
* (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2')
* = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2'
-*
-* Define T{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2}
-*
-* Then, we can define the matrix V as
+*
+* Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2}
+*
+* Then, we can define the matrix V as
* V = |-------|
* |V_1 V_2|
* |-------|
-*
+*
* So, our product is equivalent to the matrix product
* I - V*T*V'
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
@@ -303,15 +320,15 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{2,2} recursively
*
- CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV,
- $ TAU(L+1), T(L+1, L+1), LDT)
+ CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
*
-* Compute T_{1,2}
+* Compute T_{1,2}
* T_{1,2} = V_{2,1}'
*
DO J = 1, L
DO I = 1, K-L
- T(J, L+I) = CONJG(V(L+I, J))
+ T(J,L+I) = CONJG(V(L+I,J))
END DO
END DO
*
@@ -324,9 +341,8 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
- CALL CGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE,
- $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE, T(1, L+1),
- $ LDT)
+ CALL CGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE,
+ $ V(K+1, 1), LDV, V(K+1,L+1), LDV, ONE, T(1, L+1), LDT)
*
* At this point, we have that T_{1,2} = V_1'*V_2
* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
@@ -339,8 +355,8 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* T_{1,2} = T_{1,2}*T_{2,2}
*
- CALL CTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT)
+ CALL CTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
+ $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT)
ELSE IF(LQ) THEN
*
@@ -354,19 +370,19 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{1,1}\in\C^{l,l} unit upper triangular
* V_{1,2}\in\C^{l,k-l} rectangular
* V_{1,3}\in\C^{l,n-k} rectangular
-*
+*
* V_{2,2}\in\C^{k-l,k-l} unit upper triangular
* V_{2,3}\in\C^{k-l,n-k} rectangular
*
* Where l = floor(k/2)
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} T_{1,2}|
* |0 T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -375,20 +391,20 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{1,2}\in\C^{l, k-l} rectangular
*
* Then, consider the product:
-*
-* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2)
-* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2
-*
+*
+* (I - V_1'*T_{1,1}'*V_1)*(I - V_2'*T_{2,2}'*V_2)
+* = I - V_1'*T_{1,1}'*V_1 - V_2'*T_{2,2}'*V_2 + V_1'*T_{1,1}'*V_1*V_2'*T_{2,2}'*V_2
+*
* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |---|
* |V_1|
* |V_2|
* |---|
-*
+*
* So, our product is equivalent to the matrix product
-* I - V'*T*V
+* I - V'*T'*V
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
* to compute T_{1,2}
*
@@ -398,14 +414,14 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{2,2} recursively
*
- CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV,
- $ TAU(L+1), T(L+1, L+1), LDT)
+ CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
*
* Compute T_{1,2}
* T_{1,2} = V_{1,2}
*
- CALL CLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT)
+ CALL CLACPY('All', L, K - L, V(1,L+1), LDV, T(1, L+1), LDT)
*
* T_{1,2} = T_{1,2}*V_{2,2}'
*
@@ -433,6 +449,99 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
*
CALL CTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
$ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT)
+ ELSE IF(LQT) THEN
+*
+* Break V apart into 6 components
+*
+*          V = |-----------------------|
+*              |V_{1,1} V_{1,2} V_{1,3}|
+*              |0       V_{2,2} V_{2,3}|
+*              |-----------------------|
+*
+* V_{1,1}\in\C^{l,l} unit upper triangular
+* V_{1,2}\in\C^{l,k-l} rectangular
+* V_{1,3}\in\C^{l,n-k} rectangular
+*
+* V_{2,2}\in\C^{k-l,k-l} unit upper triangular
+* V_{2,3}\in\C^{k-l,n-k} rectangular
+*
+* Where l = floor(k/2)
+*
+* We will construct the T matrix
+* T = |---------------|
+* |T_{1,1} 0 |
+* |T_{2,1} T_{2,2}|
+* |---------------|
+*
+* T is the triangular factor obtained from block reflectors.
+* To motivate the structure, assume we have already computed T_{1,1}
+* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
+*
+* T_{1,1}\in\C^{l, l} lower triangular
+* T_{2,2}\in\C^{k-l, k-l} lower triangular
+* T_{2,1}\in\C^{k-l, l} rectangular
+*
+* Then, consider the product:
+*
+* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2)
+* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2
+*
+* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1}
+*
+* Then, we can define the matrix V as
+* V = |---|
+* |V_1|
+* |V_2|
+* |---|
+*
+* So, our product is equivalent to the matrix product
+* I - V'*T*V
+* This means, we can compute T_{1,1} and T_{2,2}, then use this information
+* to compute T_{2,1}
+*
+* Compute T_{1,1} recursively
+*
+ CALL CLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT)
+*
+* Compute T_{2,2} recursively
+*
+ CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
+
+*
+* Compute T_{2,1}
+* T_{2,1} = V_{1,2}'
+*
+ DO I = 1, K-L
+ DO J = 1, L
+ T(L+I,J) = CONJG(V(J,L+I))
+ END DO
+ END DO
+*
+* T_{2,1} = V_{2,2}*T_{2,1}
+*
+ CALL CTRMM('Left', 'Upper', 'No Transpose', 'Unit', K-L, L,
+ $ ONE, V(L+1,L+1), LDV, T(L+1,1), LDT)
+*
+* T_{2,1} = V_{2,3}*V_{1,3}' + T_{2,1}
+* Note: We assume K <= N, and GEMM will do nothing if N=K
+*
+ CALL CGEMM('No Transpose', 'Conjugate', K-L, L, N-K, ONE,
+ $ V(L+1,K+1), LDV, V(1, K+1), LDV, ONE, T(L+1,1), LDT)
+*
+* At this point, we have that T_{2,1} = V_2*V_1'
+* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
+* respectively.
+*
+* T_{2,1} = -T_{2,2}*T_{2,1}
+*
+ CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K-L, L, NEG_ONE, T(L+1,L+1), LDT, T(L+1,1), LDT)
+*
+* T_{2,1} = T_{2,1}*T_{1,1}
+*
+ CALL CTRMM('Right', 'Lower', 'No Transpose', 'Non-unit',
+ $ K-L, L, ONE, T, LDT, T(L+1,1), LDT)
ELSE IF(QL) THEN
*
* Break V apart into 6 components
@@ -445,18 +554,18 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* V_{1,1}\in\C^{n-k,k-l} rectangular
* V_{2,1}\in\C^{k-l,k-l} unit upper triangular
-*
+*
* V_{1,2}\in\C^{n-k,l} rectangular
* V_{2,2}\in\C^{k-l,l} rectangular
* V_{3,2}\in\C^{l,l} unit upper triangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} 0 |
* |T_{2,1} T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -467,17 +576,17 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
+*
* (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1')
* = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1'
-*
+*
* Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |-------|
* |V_1 V_2|
* |-------|
-*
+*
* So, our product is equivalent to the matrix product
* I - V*T*V'
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
@@ -485,34 +594,34 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{1,1} recursively
*
- CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+ CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
*
* Compute T_{2,2} recursively
*
CALL CLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV,
- $ TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
*
* Compute T_{2,1}
* T_{2,1} = V_{2,2}'
*
DO J = 1, K-L
DO I = 1, L
- T(K-L+I, J) = CONJG(V(N-K+J, K-L+I))
+ T(K-L+I,J) = CONJG(V(N-K+J, K-L+I))
END DO
END DO
*
* T_{2,1} = T_{2,1}*V_{2,1}
*
CALL CTRMM('Right', 'Upper', 'No transpose', 'Unit', L,
- $ K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT)
+ $ K-L, ONE, V(N-K+1,1), LDV, T(K-L+1,1), LDT)
*
* T_{2,1} = V_{2,2}'*V_{2,1} + T_{2,1}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
CALL CGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE,
- $ V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1),
- $ LDT)
+ $ V(1,K-L+1), LDV, V, LDV, ONE, T(K-L+1,1), LDT)
*
* At this point, we have that T_{2,1} = V_2'*V_1
* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
@@ -521,17 +630,13 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{2,1} = -T_{2,2}*T_{2,1}
*
CALL CTRMM('Left', 'Lower', 'No transpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
- $ T(K-L+1, 1), LDT)
+ $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT)
*
* T_{2,1} = T_{2,1}*T_{1,1}
*
CALL CTRMM('Right', 'Lower', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
- ELSE
-*
-* Else means RQ case
-*
+ $ K-L, ONE, T, LDT, T(K-L+1,1), LDT)
+ ELSE IF(RQ) THEN
* Break V apart into 6 components
*
* V = |-----------------------|
@@ -546,13 +651,13 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{2,2}\in\C^{l,k-l} rectangular
* V_{2,3}\in\C^{l,l} unit lower triangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} 0 |
* |T_{2,1} T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -563,51 +668,51 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
-* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1)
-* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1
-*
+*
+* (I - V_2'*T_{2,2}'*V_2)*(I - V_1'*T_{1,1}'*V_1)
+* = I - V_2'*T_{2,2}'*V_2 - V_1'*T_{1,1}'*V_1 + V_2'*T_{2,2}'*V_2*V_1'*T_{1,1}'*V_1
+*
* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |---|
* |V_1|
* |V_2|
* |---|
-*
+*
* So, our product is equivalent to the matrix product
-* I - V'*T*V
+* I - V'*T'*V
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
* to compute T_{2,1}
*
* Compute T_{1,1} recursively
*
- CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+ CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
*
* Compute T_{2,2} recursively
*
CALL CLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV,
- $ TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
*
* Compute T_{2,1}
* T_{2,1} = V_{2,2}
*
- CALL CLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV,
- $ T(K-L+1, 1), LDT)
+ CALL CLACPY('All', L, K-L, V(K-L+1,N-K+1), LDV, T(K-L+1,1),
+ $ LDT)
*
* T_{2,1} = T_{2,1}*V_{1,2}'
*
CALL CTRMM('Right', 'Lower', 'Conjugate', 'Unit', L, K-L,
- $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT)
+ $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT)
*
* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
- CALL CGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE,
- $ V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1),
- $ LDT)
+ CALL CGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE,
+ $ V(K-L+1,1), LDV, V, LDV, ONE, T(K-L+1,1), LDT)
*
* At this point, we have that T_{2,1} = V_2*V_1'
@@ -617,13 +722,103 @@ RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{2,1} = -T_{2,2}*T_{2,1}
*
CALL CTRMM('Left', 'Lower', 'No tranpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
- $ T(K-L+1, 1), LDT)
+ $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT)
*
* T_{2,1} = T_{2,1}*T_{1,1}
*
CALL CTRMM('Right', 'Lower', 'No tranpose', 'Non-unit', L,
- $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
+ $ K-L, ONE, T, LDT, T(K-L+1,1), LDT)
+ ELSE IF(RQT) THEN
+* Break V apart into 6 components
+*
+* V = |-----------------------|
+* |V_{1,1} V_{1,2} 0 |
+* |V_{2,1} V_{2,2} V_{2,3}|
+* |-----------------------|
+*
+* V_{1,1}\in\C^{k-l,n-k} rectangular
+* V_{1,2}\in\C^{k-l,k-l} unit lower triangular
+*
+* V_{2,1}\in\C^{l,n-k} rectangular
+* V_{2,2}\in\C^{l,k-l} rectangular
+* V_{2,3}\in\C^{l,l} unit lower triangular
+*
+* We will construct the T matrix
+* T = |---------------|
+* |T_{1,1} T_{1,2}|
+* | 0 T_{2,2}|
+* |---------------|
+*
+* T is the triangular factor obtained from block reflectors.
+* To motivate the structure, assume we have already computed T_{1,1}
+* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
+*
+* T_{1,1}\in\C^{k-l, k-l} non-unit upper triangular
+* T_{2,2}\in\C^{l, l} non-unit upper triangular
+* T_{1,2}\in\C^{k-l, l} rectangular
+*
+* Where l = floor(k/2)
+*
+* Then, consider the product:
+*
+* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1)
+* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1
+*
+* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2}
+*
+* Then, we can define the matrix V as
+* V = |---|
+* |V_1|
+* |V_2|
+* |---|
+*
+* So, our product is equivalent to the matrix product
+* I - V'*T*V
+* This means, we can compute T_{1,1} and T_{2,2}, then use this information
+* to compute T_{1,2}
+*
+* Compute T_{1,1} recursively
+*
+ CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
+*
+* Compute T_{2,2} recursively
+*
+ CALL CLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV,
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
+*
+* Compute T_{1,2}
+* T_{1,2} = V_{2,2}'
+*
+ DO I = 1, K-L
+ DO J = 1, L
+ T(I,K-L+J) = CONJG(V(K-L+J, N-K+I))
+ END DO
+ END DO
+*
+*              T_{1,2} = V_{1,2}*T_{1,2}
+*
+ CALL CTRMM('Left', 'Lower', 'No Transpose', 'Unit', K-L, L,
+ $ ONE, V(1,N-K+1), LDV, T(1,K-L+1), LDT)
+*
+* T_{1,2} = V_{1,1}V_{2,1}' + T_{1,2}
+*
+ CALL CGEMM('No Tranpose', 'Conjugate', K-L, L, N-K, ONE, V,
+ $ LDV, V(K-L+1,1), LDV, ONE, T(1, K-L+1), LDT)
+*
+* At this point, we have that T_{1,2} = V_1*V_2'
+* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
+* respectively.
+*
+* T_{1,2} = -T_{1,1}*T_{1,2}
+*
+ CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit',
+ $ K-L, L, NEG_ONE, T, LDT, T(1, K-L+1), LDT)
+*
+* T_{1,2} = T_{1,2}*T_{2,2}
+*
+ CALL CTRMM('Right', 'Upper', 'No Transpose', 'Non-Unit',
+ $ K-L, L, ONE, T(K-L+1,K-L+1), LDT, T(1, K-L+1), LDT)
END IF
END SUBROUTINE
diff --git a/SRC/clumm.f b/SRC/clumm.f
new file mode 100644
index 0000000000..671263ac9b
--- /dev/null
+++ b/SRC/clumm.f
@@ -0,0 +1,350 @@
+*> \brief \b CLUMM computes an in-place triangular times triangular matrix multiplication
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE CLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA,
+* $ A, LDA)
+*
+* .. Scalar Arguments ..
+* INTEGER N, LDA
+* CHARACTER SIDEL, DIAGL, DIAGU
+* COMPLEX ALPHA
+*
+* .. Array Arguments ..
+* COMPLEX A(LDA,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CLUMM performs one of the matrix-matrix operations
+*>
+*> C = \alpha L * U
+*> or
+*> C = \alpha U * L
+*>
+*> where \alpha is a scalar, L is a unit, or non-unit, lower triangular matrix, and U is a unit, or
+*> non-unit, upper triangular matrix, and at most one of L and U are non-unit
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDEL
+*> \verbatim
+*> SIDEL is CHARACTER*1
+*>          On entry, SIDEL specifies whether L multiplies U from
+*>           the left or right as follows:
+*>
+*>              SIDEL = 'L' or 'l'   A = \alpha L * U
+*>
+*>              SIDEL = 'R' or 'r'   A = \alpha U * L
+*> \endverbatim
+*>
+*> \param[in] DIAGL
+*> \verbatim
+*> DIAGL is CHARACTER*1
+*> On entry, DIAGL specifies whether or not L is unit triangular
+*> as follows:
+*>
+*>              DIAGL = 'U' or 'u'   L is assumed to be unit triangular.
+*>
+*>              DIAGL = 'N' or 'n'   L is not assumed to be unit
+*>                                   triangular.
+*> \endverbatim
+*>
+*> \param[in] DIAGU
+*> \verbatim
+*> DIAGU is CHARACTER*1
+*> On entry, DIAGU specifies whether or not U is unit triangular
+*> as follows:
+*>
+*>              DIAGU = 'U' or 'u'   U is assumed to be unit triangular.
+*>
+*>              DIAGU = 'N' or 'n'   U is not assumed to be unit
+*>                                   triangular.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>           On entry, N specifies the number of rows and columns of L and U. N must be at
+*>           least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is COMPLEX .
+*>           On entry, ALPHA specifies the scalar alpha. When alpha is
+*>           zero then the input contents of A are not referenced (A need
+*>           not be set before entry) and A is set to zero on exit.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX array, dimension ( LDA, N ) where
+*> Before entry the leading n-by-n strictly upper triangular part of the array
+*> A must contain the upper triangular matrix U and the strictly lower triangular part of
+*> the leading n-by-n submatrix must contain the lower triangular matrix L.
+*> If DIAGL != 'U', then the diagonal is assumed to be part of L, and if
+*> DIAGU != 'U', then the diagonal is assumed to be part of U.
+*> Note: At most one of DIAGL and DIAGU can be not equal to 'U'.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> On entry, LDA specifies the first dimension of A as declared
+*> in the calling (sub) program. LDA must be at least max( 1, n ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+* Cost: 2/3 * (n^3 - n)
+ RECURSIVE SUBROUTINE CLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA,
+ $ A, LDA)
+*
+* .. Scalar Arguments ..
+ INTEGER N, LDA
+ CHARACTER SIDEL, DIAGL, DIAGU
+ COMPLEX ALPHA
+*
+* .. Array Arguments ..
+ COMPLEX A(LDA,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* .. External Subroutines ..
+ EXTERNAL CGEMM, CTRMM, CLASET,
+ $ XERBLA
+* ..
+* .. Local Scalars ..
+ INTEGER K
+ LOGICAL LLEFT, LUNIT, UUNIT
+* ..
+* .. Local Parameters ..
+ COMPLEX ONE, ZERO
+ PARAMETER(ONE=(1.0E+0,0.0E+0), ZERO=(0.0E+0,0.0E+0))
+* ..
+*
+* Determine if our flags are valid or not. We can have at
+* most one of DIAGU, DIAGL not equal to 'U'
+*
+ LUNIT = LSAME(DIAGL, 'U')
+ UUNIT = LSAME(DIAGU, 'U')
+*
+* If both of the above are false, then it is impossible to have the
+* structure that we are exploiting in this routine
+* Note: It is possible to allow the matrices to share a non-unit
+* diagonal as long as the values are the exact same, but there is
+* currently no use case for this that I am aware of.
+*
+ IF ((.NOT.LUNIT).AND.(.NOT.UUNIT)) THEN
+*
+* We say the error is in the last set DIAG value as we cannot know
+* what the user actually meant.
+*
+ CALL XERBLA( 'CLUMM', 3 )
+ RETURN
+ END IF
+*
+* Determine which side L is on
+*
+ LLEFT = LSAME(SIDEL, 'L')
+*
+* Early exit if possible
+*
+ IF (N.EQ.0) THEN
+ RETURN
+ END IF
+ IF (ALPHA.EQ.ZERO) THEN
+ CALL CLASET('All', N, N, ZERO, ZERO, A, LDA)
+ RETURN
+ END IF
+*
+* Terminating Case
+*
+ IF (N.EQ.1) THEN
+*
+* Since at most one of L and U are non-unit triangular, whatever side L is on, we are still
+* always computing one of
+*
+* 1) A(1,1) = ALPHA*A(1,1)
+* 2) A(1,1) = ALPHA
+*
+* Where the first case happens when exactly one of L and U are unit triangular, while the
+* second case happens when both L and U are unit triangular
+*
+ IF (LUNIT.AND.UUNIT) THEN
+ A(1,1) = ALPHA
+ ELSE
+ A(1,1) = ALPHA*A(1,1)
+ END IF
+ RETURN
+ END IF
+*
+* Recursive Case
+*
+ K = N/2
+*
+* Regardless of us computing A = L*U or A = U*L, break A apart as follows:
+*
+* |---|
+* A = | U |
+* | L |
+* |---|
+*
+* Further break down L as
+* |---------------|
+* L = | L_{11} 0 |
+* | L_{21} L_{22} |
+* |---------------|
+*
+* Where:
+*
+* L_{11}\in\C^{k\times k} is lower triangular (assumed unit iff DIAGL == 'U')
+*  L_{21}\in\C^{n-k\times k} is rectangular
+* L_{22}\in\C^{n-k\times n-k} is lower triangular (assumed unit iff DIAGL == 'U')
+*
+* Further break down U as
+* |---------------|
+*      U = | U_{11} U_{12} |
+* | 0 U_{22} |
+* |---------------|
+*
+* Where:
+*
+* U_{11}\in\C^{k\times k} is upper triangular (assumed unit iff DIAGU == 'U')
+*  U_{12}\in\C^{k\times n-k} is rectangular
+* U_{22}\in\C^{n-k\times n-k} is upper triangular (assumed unit iff DIAGU == 'U')
+ IF (LLEFT) THEN
+*
+* This means we are computing
+* |---------------| |---------------|
+* A = L*U = \alpha | L_{11} 0 | * | U_{11} U_{12} |
+* | L_{21} L_{22} | | 0 U_{22} |
+* |---------------| |---------------|
+*
+* |---------------------------------------------|
+* = \alpha | L_{11}*U_{11} L_{11}*U_{12} |
+* | L_{21}*U_{11} L_{21}*U_{12} + L_{22}*U_{22} |
+* |---------------------------------------------|
+*
+* We compute these in the following order
+*
+* A_{22} = \alpha*L_{22}*U_{22} (This routine)
+* A_{22} = \alpha*L_{21}*U_{12} + A_{22} (GEMM)
+*
+* A_{12} = \alpha*L_{11}*U_{12} (TRMM)
+* A_{21} = \alpha*L_{21}*U_{11} (TRMM)
+*
+* A_{11} = \alpha*L_{11}*U_{11} (This routine)
+*
+* Compute A_{22}
+*
+* A_{22} = \alpha*L_{22}*U_{22}
+*
+ CALL CLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA,
+ $ A(K+1, K+1), LDA)
+*
+* A_{22} = \alpha L_{21}*U_{12} + A_{22}
+*
+ CALL CGEMM('No Transpose', 'No Transpose', N-K, N-K, K,
+ $ ALPHA, A(K+1,1), LDA, A(1,K+1), LDA, ONE, A(K+1,K+1),
+ $ LDA)
+*
+* Compute A_{12}
+*
+* A_{12} = \alpha*L_{11}*U_{12}
+*
+ CALL CTRMM('Left', 'Lower', 'No Transpose', DIAGL, K, N-K,
+ $ ALPHA, A, LDA, A(1,K+1), LDA)
+*
+* Compute A_{21}
+*
+* A_{21} = \alpha*L_{21}*U_{11}
+*
+ CALL CTRMM('Right', 'Upper', 'No Transpose', DIAGU, N-K, K,
+ $ ALPHA, A, LDA, A(K+1,1), LDA)
+*
+* Compute A_{11}
+*
+* A_{11} = \alpha*L_{11}*U_{11}
+*
+ CALL CLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA)
+ ELSE
+*
+* This means we are computing
+* |---------------| |---------------|
+* A = U*L = \alpha | U_{11} U_{12} | * | L_{11} 0 |
+* | 0 U_{22} | | L_{21} L_{22} |
+* |---------------| |---------------|
+*
+* |---------------------------------------------|
+* = \alpha | U_{11}*L_{11} + U_{12}*L_{21} U_{12}*L_{22} |
+* | U_{22}*L_{21} U_{22}*L_{22} |
+* |---------------------------------------------|
+*
+* We compute these in the following order
+*
+* A_{11} = \alpha*U_{11}*L_{11} (This routine)
+* A_{11} = \alpha*U_{12}*L_{21} + A_{11} (GEMM)
+*
+* A_{12} = \alpha*U_{12}*L_{22} (TRMM)
+* A_{21} = \alpha*U_{22}*L_{21} (TRMM)
+*
+* A_{22} = \alpha*U_{22}*L_{22} (This routine)
+*
+* Compute A_{11}
+*
+* A_{11} = \alpha*U_{11}*L_{11}
+*
+ CALL CLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA)
+*
+* A_{11} = \alpha*U_{12}*L_{21} + A_{11}
+*
+ CALL CGEMM('No Transpose', 'No Transpose', K, K, N-K,
+ $ ALPHA, A(1,K+1), LDA, A(K+1,1), LDA, ONE, A, LDA)
+*
+* Compute A_{12}
+*
+* A_{12} = \alpha*U_{12}*L_{22}
+*
+ CALL CTRMM('Right', 'Lower', 'No Transpose', DIAGL, K, N-K,
+ $ ALPHA, A(K+1,K+1), LDA, A(1,K+1), LDA)
+*
+* Compute A_{21}
+*
+* A_{21} = \alpha*U_{22}*L_{21}
+*
+ CALL CTRMM('Left', 'Upper', 'No Transpose', DIAGU, N-K, K,
+ $ ALPHA, A(K+1, K+1), LDA, A(K+1,1), LDA)
+*
+* Compute A_{22}
+*
+* A_{22} = \alpha*U_{22}*L_{22}
+*
+ CALL CLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA,
+ $ A(K+1, K+1), LDA)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/ctrmmoop.f b/SRC/ctrmmoop.f
new file mode 100644
index 0000000000..a0ec301121
--- /dev/null
+++ b/SRC/ctrmmoop.f
@@ -0,0 +1,2572 @@
+*> \brief \b CTRMMOOP computes an out of place triangular times general matrix multiplication
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB,
+* $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* .. Scalar Arguments ..
+* COMPLEX ALPHA, BETA
+* INTEGER M, N, LDA, LDB, LDC
+* CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG
+* ..
+* .. Array Arguments ..
+* COMPLEX A(LDA,*), B(LDB,*), C(LDC,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CTRMMOOP performs one of the matrix-matrix operations
+*>
+*> C = \alpha op(A) * op(B) + \beta C
+*> or
+*> C = \alpha op(B) * op(A) + \beta C
+*>
+*> where \alpha and \beta are scalars, C is an m-by-n matrix, A is
+*> a unit, or non-unit, upper or lower triangular matrix, and op(A)
+*> is one of
+*>
+*>    op(A) = A   or   op(A) = A**T   or   op(A) = A**H
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> On entry, SIDE specifies whether op(A) multiplies op(B) from
+*> the left or right as follows:
+*>
+*> SIDE = 'L' or 'l' C = \alpha op(A) * op(B) + \beta C
+*>
+*> SIDE = 'R' or 'r' C = \alpha op(B) * op(A) + \beta C
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> On entry, UPLO specifies whether the matrix A is an upper or
+*> lower triangular matrix as follows:
+*> UPLO = 'U' or 'u' A is upper triangular
+*>
+*> UPLO = 'L' or 'l' A is lower triangular
+*> \endverbatim
+*>
+*> \param[in] TRANSA
+*> \verbatim
+*> TRANSA is CHARACTER*1
+*> On entry, TRANSA specifies the form of op(A) to be used in
+*> the matrix multiplication as follows:
+*> TRANSA = 'N' or 'n' op(A) = A
+*>
+*> TRANSA = 'T' or 't' op(A) = A**T
+*>
+*> TRANSA = 'C' or 'c' op(A) = A**H
+*> \endverbatim
+*>
+*> \param[in] TRANSB
+*> \verbatim
+*> TRANSB is CHARACTER*1
+*> On entry, TRANSB specifies the form of op(B) to be used in
+*> the matrix multiplication as follows:
+*> TRANSB = 'N' or 'n' op(B) = B
+*>
+*> TRANSB = 'T' or 't' op(B) = B**T
+*>
+*> TRANSB = 'C' or 'c' op(B) = B**H
+*> \endverbatim
+*>
+*> \param[in] DIAG
+*> \verbatim
+*> DIAG is CHARACTER*1
+*> On entry, DIAG specifies whether or not A is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' A is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' A is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> On entry, M specifies the number of rows of C. M must be at
+*> least zero.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> On entry, N specifies the number of columns of C. N must be
+*> at least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is COMPLEX .
+*> On entry, ALPHA specifies the scalar alpha. When alpha is
+*> zero then A and B are not referenced, and A and B need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*> A is COMPLEX array, dimension ( LDA, K ) where
+*> K is M when SIDE = 'L' and K is N when SIDE='R'
+*> Before entry with UPLO = 'U' or 'u', the leading k-by-k
+*> upper triangular part of the array A must contain the upper
+*> triangular matrix and the strictly lower triangular part of
+*> A is not referenced.
+*> Before entry with UPLO = 'L' or 'l', the leading k-by-k
+*> lower triangular part of the array A must contain the lower
+*> triangular matrix and the strictly upper triangular part of
+*> A is not referenced.
+*> Note that when DIAG = 'U' or 'u', the diagonal elements of
+*> A are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> On entry, LDA specifies the first dimension of A as declared
+*> in the calling (sub) program. When SIDE = 'L' or 'l' then
+*> LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+*> then LDA must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*>          B is COMPLEX array, dimension ( LDB, K ), where K is N
+*>           when TRANSB = 'N' or 'n', and K is M otherwise. On entry,
+*>           the leading part of the array must contain the matrix B
+*>           (m-by-n when TRANSB = 'N' or 'n', n-by-m otherwise).
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*> LDB is INTEGER
+*> On entry, LDB specifies the first dimension of B as declared
+*>           in the calling (sub) program. When TRANSB = 'N' or 'n'
+*>           then LDB must be at least max( 1, m ), otherwise LDB
+*>           must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] BETA
+*> \verbatim
+*> BETA is COMPLEX .
+*> On entry, BETA specifies the scalar beta. When beta is
+*> zero then C is not referenced on entry, and C need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is COMPLEX array, dimension ( LDC, N )
+*> Before entry, the leading m-by-n part of the array C must
+*> contain the matrix C, and on exit is overwritten by the
+*> transformed matrix.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> On entry, LDC specifies the first dimension of C as declared
+*> in the calling (sub) program. LDC must be at least
+*> max( 1, m ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ RECURSIVE SUBROUTINE CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB,
+ $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* .. Scalar Arguments ..
+ COMPLEX ALPHA, BETA
+ INTEGER M, N, LDA, LDB, LDC
+ CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG
+* ..
+* .. Array Arguments ..
+ COMPLEX A(LDA,*), B(LDB,*), C(LDC,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ COMPLEX CDOTC, CDOTU
+ EXTERNAL LSAME, CDOTC, CDOTU
+* ..
+* .. External Subroutines ..
+ EXTERNAL CGEMM, CAXPY, CACXPY,
+ $ CSCAL, CLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC CONJG, MIN
+* ..
+* .. Local Scalars ..
+ INTEGER I, J, L, K, INCB
+ LOGICAL LSIDE, UPPER, UNIT, TRANST, TRANSG,
+ $ CONJA, CONJB
+* ..
+* .. Local Parameters ..
+ COMPLEX ONE, ZERO
+ PARAMETER(ONE=(1.0E+0,0.0E+0), ZERO=(0.0E+0,0.0E+0))
+* ..
+*
+* Beginning of Executable Statements
+*
+ LSIDE = LSAME(SIDE, 'L')
+ UPPER = LSAME(UPLO, 'U')
+*
+* If we are transposing the triangular matrix (A)
+*
+ CONJA = LSAME(TRANSA, 'C')
+ TRANST= LSAME(TRANSA, 'T').OR.CONJA
+*
+* If we are transposing the general matrix (B)
+*
+ CONJB = LSAME(TRANSB, 'C')
+ TRANSG= LSAME(TRANSB, 'T').OR.CONJB
+*
+* Terminating Case
+*
+ UNIT = LSAME(DIAG, 'U')
+ IF (M.EQ.1.AND.N.EQ.1) THEN
+*
+* This case is the simplest as we are just computing C = \alpha A*B +
+* \beta C where all components are 1-by-1 matrices
+*
+
+ IF (BETA.EQ.ZERO) THEN
+ C(1,1) = ZERO
+ ELSE
+ C(1,1) = C(1,1) * BETA
+ END IF
+*
+* Now, we compute C = \alpha op(A)*op(B)
+*
+ IF(ALPHA.NE.ZERO) THEN
+*
+* A = 1, so we do not care if A is conjugated or not
+*
+ IF (UNIT) THEN
+ IF (CONJB) THEN
+ C(1,1) = C(1,1) + ALPHA*CONJG(B(1,1))
+ ELSE
+ C(1,1) = C(1,1) + ALPHA*B(1,1)
+ END IF
+ ELSE
+*
+* A is not assumed unit, so we need to keep op(A) in mind
+*
+ IF (CONJA) THEN
+ IF (CONJB) THEN
+ C(1,1) = C(1,1) +
+ $ ALPHA*CONJG(B(1,1))*CONJG(A(1,1))
+ ELSE
+ C(1,1) = C(1,1) + ALPHA*B(1,1)*CONJG(A(1,1))
+ END IF
+ ELSE
+ IF (CONJB) THEN
+ C(1,1) = C(1,1) + ALPHA*CONJG(B(1,1))*A(1,1)
+ ELSE
+ C(1,1) = C(1,1) + ALPHA*B(1,1)*A(1,1)
+ END IF
+ END IF
+ END IF
+ END IF
+ RETURN
+ ELSE IF (M.EQ.1) THEN
+*
+* This means that C is a row vector. If BETA is 0, then we
+* set it explicitly, otherwise we overwrite it with BETA*C
+*
+ IF (BETA.EQ.ZERO) THEN
+*
+* This ensures we don't reference C unless we need to
+*
+ CALL CLASET('All', M, N, ZERO, ZERO, C, LDC)
+ ELSE
+ CALL CSCAL(N, BETA, C, LDC)
+ END IF
+ IF (ALPHA.NE.ZERO) THEN
+ IF (LSIDE) THEN
+*
+* We are computing C = \alpha op(A)*op(B) + \beta C
+* Note: This means that A is a scalar
+*
+ IF (CONJA) THEN
+*
+* op(A) = CONJG(A)
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL CACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL CACXPY(N, ALPHA*CONJG(A(1,1)), B, 1,
+ $ C, LDC)
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL CAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL CAXPY(N, ALPHA*CONJG(A(1,1)), B, 1,
+ $ C, LDC)
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL CAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL CAXPY(N, ALPHA*CONJG(A(1,1)), B,
+ $ LDB, C, LDC)
+ END IF
+ END IF
+ ELSE
+*
+* op(A) = A or op(A) = A**T = A
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL CACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL CACXPY(N, ALPHA*A(1,1), B, 1,
+ $ C, LDC)
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL CAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL CAXPY(N, ALPHA*A(1,1), B, 1, C, LDC)
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL CAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL CAXPY(N, ALPHA*A(1,1), B, LDB,
+ $ C, LDC)
+ END IF
+ END IF
+ END IF
+ ELSE
+*
+* We are computing C = \alpha op(B)*op(A) + \beta C
+*
+ IF (UPPER) THEN
+*
+* A is upper triangular
+*
+ IF (CONJA) THEN
+*
+* op(A) = CONJG(A)
+* This is lower triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CONJG(CDOTU(N-J,
+ $ A(J,J+1), LDA, B(J+1,1), 1)) +
+ $ C(1,J)
+ END DO
+ CALL CACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CONJG(CDOTU(N-J+1,
+ $ A(J,J), LDA, B(J,1), 1)) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(N-J,
+ $ A(J,J+1), LDA, B(J+1,1), 1) +
+ $ C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(N-J+1,
+ $ A(J,J), LDA, B(J,1), 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(N-J,
+ $ A(J,J+1), LDA, B(1,J+1), LDB) +
+ $ C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(N-J+1,
+ $ A(J,J), LDA, B(1,J), LDB) + C(1,J)
+ END DO
+ END IF
+ END IF
+ ELSE IF (TRANST) THEN
+*
+* op(A) = A**T
+* This is lower triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(N-J,
+ $ B(J+1,1), 1, A(J,J+1), LDA) +
+ $ C(1,J)
+ END DO
+ CALL CACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(N-J+1,
+ $ B(J,1), 1, A(J,J), LDA) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(N-J,
+ $ A(J,J+1), LDA, B(J+1,1), 1) +
+ $ C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(N-J+1,
+ $ A(J,J), LDA, B(J,1), 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(N-J,
+ $ A(J,J+1), LDA, B(1,J+1), LDB) +
+ $ C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(N-J+1,
+ $ A(J,J), LDA, B(1,J), LDB) + C(1,J)
+ END DO
+ END IF
+ END IF
+ ELSE
+*
+* op(A) = A
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(J-1, B, 1,
+ $ A(1,J), 1) + C(1,J)
+ END DO
+ CALL CACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(J, B, 1,
+ $ A(1,J), 1) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(J-1,
+ $ A(1,J), 1, B, 1) +
+ $ C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(J,
+ $ A(1,J), 1, B, 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(J-1,
+ $ A(1,J), 1, B, LDB) +
+ $ C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(J,
+ $ A(1,J), 1, B, LDB) + C(1,J)
+ END DO
+ END IF
+ END IF
+ END IF
+ ELSE
+*
+* A is lower triangular
+*
+ IF (CONJA) THEN
+*
+* op(A) = CONJG(A)
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CONJG(CDOTU(J-1,
+ $ B, 1, A(J,1), LDA)) + C(1,J)
+ END DO
+ CALL CACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CONJG(CDOTU(J, B,
+ $ 1, A(J,1), LDA)) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(J-1,
+ $ A(J,1), LDA, B, 1) + C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(J,
+ $ A(J,1), LDA, B, 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(J-1,
+ $ A(J,1), LDA, B, LDB) + C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(J,
+ $ A(J,1), LDA, B, LDB) + C(1,J)
+ END DO
+ END IF
+ END IF
+ ELSE IF (TRANST) THEN
+*
+* op(A) = A**T
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(J-1,
+ $ B, 1, A(J,1), LDA) + C(1,J)
+ END DO
+ CALL CACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(J, B,
+ $ 1, A(J,1), LDA) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(J-1,
+ $ A(J,1), LDA, B, 1) + C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(J,
+ $ A(J,1), LDA, B, 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(J-1,
+ $ A(J,1), LDA, B, LDB) + C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(J,
+ $ A(J,1), LDA, B, LDB) + C(1,J)
+ END DO
+ END IF
+ END IF
+ ELSE
+*
+* op(A) = A
+* This is lower triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(N-J,
+ $ B(J+1,1), 1, A(J+1,J), 1) + C(1,J)
+ END DO
+ CALL CACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTC(N-J+1,
+ $ B(J,1), 1, A(J,J), 1) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(N-J,
+ $ B(J+1,1), 1, A(J+1,J), 1) + C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(N-J+1,
+ $ B(J,1), 1, A(J,J), 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(N-J,
+ $ B(1,J+1), LDB, A(J+1,J), 1) + C(1,J)
+ END DO
+ CALL CAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CDOTU(N-J+1,
+ $ B(1,J), LDB, A(J,J), 1) + C(1,J)
+ END DO
+ END IF
+ END IF
+ END IF
+ END IF
+ END IF
+ END IF
+ RETURN
+ ELSE IF (N.EQ.1) THEN
+*
+* This means that C is a column vector. If BETA is 0, then we
+* set it explicitly, otherwise we overwrite it with BETA*C
+*
+ IF (BETA.EQ.ZERO) THEN
+*
+* This ensures we don't reference C unless we need to
+*
+ CALL CLASET('All', M, N, ZERO, ZERO, C, LDC)
+ ELSE
+ CALL CSCAL(M, BETA, C, 1)
+ END IF
+
+*
+* If alpha is 0, we are done
+*
+ IF (ALPHA.NE.ZERO) THEN
+ IF (TRANSG) THEN
+ INCB = LDB
+ ELSE
+ INCB = 1
+ END IF
+ IF (LSIDE) THEN
+*
+* This means we are computing
+* C = \alpha op(A) * op(B) + \beta C
+*
+ IF (UPPER) THEN
+*
+* This means A is upper triangular
+*
+ IF (CONJA) THEN
+*
+* This means op(A) = CONJG(A)
+* This is lower triangular
+*
+ IF (CONJB) THEN
+*
+* This means that we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CONJG(CDOTU(I-1, B,
+ $ INCB, A(1,I), 1)) + C(I,1)
+ END DO
+ CALL CACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CONJG(CDOTU(I, B,
+ $ INCB, A(1,I), 1)) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTC(I-1, A(1,I),
+ $ 1, B, INCB) + C(I,1)
+ END DO
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTC(I, A(1,I),
+ $ 1, B, INCB) + C(I,1)
+ END DO
+ END IF
+ END IF
+ ELSE IF (TRANST) THEN
+*
+* This means op(A) = A**T
+* This is lower triangular
+*
+ IF (CONJB) THEN
+*
+* This means that we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTC(I-1, B, INCB,
+ $ A(1,I), 1) + C(I,1)
+ END DO
+ CALL CACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTC(I, B, INCB,
+ $ A(1,I), 1) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTU(I-1, B, INCB,
+ $ A(1,I), 1) + C(I,1)
+ END DO
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTU(I, B, INCB,
+ $ A(1,I), 1) + C(I,1)
+ END DO
+ END IF
+ END IF
+ ELSE
+*
+* This means op(A) = A
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* This means that we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*CDOTC(M-I, B(1,I+1),
+ $ INCB, A(I,I+1), LDA) + C(I,1)
+ END DO
+ CALL CACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTC(M-I+1, B(1,I),
+ $ INCB, A(I,I), LDA) + C(I,1)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* This means that B is a row vector but not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*CDOTU(M-I, B(1,I+1),
+ $ INCB, A(I,I+1), LDA) + C(I,1)
+ END DO
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTU(M-I+1, B(1,I),
+ $ INCB, A(I,I), LDA) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is a column vector and not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*CDOTU(M-I, B(I+1,1),
+ $ INCB, A(I,I+1), LDA) + C(I,1)
+ END DO
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTU(M-I+1, B(I,1),
+ $ INCB, A(I,I), LDA) + C(I,1)
+ END DO
+ END IF
+ END IF
+ END IF
+*
+* This means A is lower triangular
+*
+ ELSE
+ IF (CONJA) THEN
+*
+* This means op(A) = CONJG(A)
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* This means that we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*CONJG(CDOTU(M-I,
+ $ B(1,I+1), INCB, A(I+1,I), 1))
+ & + C(I,1)
+ END DO
+ CALL CACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CONJG(CDOTU(M-I+1,
+ $ B(1,I), INCB, A(I,I), 1))
+ & + C(I,1)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* This means that B is a row vector but not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*CDOTC(M-I, A(I+1,I),
+ $ 1, B(1,I+1), INCB) + C(I,1)
+ END DO
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTC(M-I+1, A(I,I),
+ $ 1, B(1,I), INCB) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is a column vector and not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*CDOTC(M-I, A(I+1,I),
+ $ 1, B(I+1,1), INCB) + C(I,1)
+ END DO
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTC(M-I+1, A(I,I),
+ $ 1, B(I,1), INCB) + C(I,1)
+ END DO
+ END IF
+ END IF
+ ELSE IF (TRANST) THEN
+*
+* This means op(A) = A**T
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* This means that we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*CDOTC(M-I, B(1,I+1),
+ $ INCB, A(I+1,I), 1) + C(I,1)
+ END DO
+ CALL CACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTC(M-I+1, B(1,I),
+ $ INCB, A(I,I), 1) + C(I,1)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* This means that B is a row vector but not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*CDOTU(M-I, B(1,I+1),
+ $ INCB, A(I+1,I), 1) + C(I,1)
+ END DO
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTU(M-I+1, B(1,I),
+ $ INCB, A(I,I), 1) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is a column vector and not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*CDOTU(M-I, B(I+1,1),
+ $ INCB, A(I+1,I), 1) + C(I,1)
+ END DO
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTU(M-I+1, B(I,1),
+ $ INCB, A(I,I), 1) + C(I,1)
+ END DO
+ END IF
+ END IF
+*
+* This means op(A) = A
+* This is lower triangular
+*
+*
+ ELSE
+ IF (CONJB) THEN
+*
+* This means that B is conjugated and transposed
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTC(I-1, B, INCB,
+ $ A(I,1), LDA) + C(I,1)
+ END DO
+ CALL CACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTC(I, B, INCB,
+ $ A(I,1), LDA) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTU(I-1, B, INCB,
+ $ A(I,1), LDA) + C(I,1)
+ END DO
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CDOTU(I, B, INCB,
+ $ A(I,1), LDA) + C(I,1)
+ END DO
+ END IF
+ END IF
+ END IF
+ END IF
+ ELSE
+*
+* This means we are computing
+* C = \alpha op(B) * op(A) + \beta C
+* Note: This means A is a scalar
+*
+ IF (CONJA) THEN
+*
+* This means op(A) = CONJG(A)
+*
+ IF (CONJB) THEN
+*
+* This means we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL CACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL CACXPY(M, ALPHA*CONJG(A(1,1)), B,
+ $ INCB, C, 1)
+ END IF
+ ELSE
+*
+* This means B is not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL CAXPY(M, ALPHA*CONJG(A(1,1)), B,
+ $ INCB, C, 1)
+ END IF
+ END IF
+ ELSE
+*
+* This means op(A) = A or op(A) = A**T = A
+*
+ IF (CONJB) THEN
+*
+* This means B is conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL CACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL CACXPY(M, ALPHA*A(1,1), B, INCB, C,
+ $ 1)
+ END IF
+ ELSE
+*
+* This means B is not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL CAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL CAXPY(M, ALPHA*A(1,1), B, INCB, C,
+ $ 1)
+ END IF
+ END IF
+ END IF
+ END IF
+ END IF
+ RETURN
+ END IF
+*
+* Recursive Case
+*
+ L = MIN(M,N)/2
+ IF (LSIDE) THEN
+*
+* We are multiplying A from the left IE we are computing
+* C = \alpha op(A)*op(B) + \beta C
+*
+ IF (UPPER) THEN
+*
+* A is upper triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing
+*
+* So we are computing
+* C = \alpha A**T * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11}**T + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{21}**T + \beta C_{12}
+* C_{21} = \alpha A_{12}**T * B_{11}**T + \alpha A_{22}**T * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{12}**T * B_{21}**T + \alpha A_{22}**T * B_{22}**T + \beta C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{12}**T * B_{11}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22}**T * B_{12}**T + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{12}**T * B_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22}**T * B_{22}**T + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C,
+ $ LDC)
+*
+* C_{12}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(1, L+1), LDA, B, LDB, BETA, C(L+1,1),
+ $ LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1,L+1), LDA, B(1,L+1),
+ $ LDB, ONE, C(L+1,1), LDC)
+*
+* C_{22}
+*
+ CALL CGEMM(TRANSA, TRANSB, M-L, N-L, L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1,1), LDB, BETA,
+ $ C(L+1,L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1,L+1), LDA,
+ $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11} + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{12} + \beta C_{12}
+* C_{21} = \alpha A_{12}**T * B_{11} + \alpha A_{22}**T * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{12}**T * B_{12} + \alpha A_{22}**T * B_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{12}**T * B_{11} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22}**T * B_{21} + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{12}**T * B_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22}**T * B_{22} + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(1, L+1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CGEMM(TRANSA, TRANSB, M-L, N-L, L,
+ $ ALPHA, A(1, L+1), LDA, B(1, L+1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11}**T + \alpha A_{12} * B_{12}**T + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12}
+* C_{21} = \alpha A_{22} * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{22} * B_{22}**T + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{12} * B_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11} * B_{11}**T + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{12} * B_{22}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11} * B_{21}**T + C_{12} (This routine)
+*
+* C_{11}
+*
+ CALL CGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(1, L+1), LDB, BETA, C, LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL CGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1,1), LDB, ONE,
+ $ C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11} + \alpha A_{12} * B_{21} + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{12} + \alpha A_{12} * B_{22} + \beta C_{12}
+* C_{21} = \alpha A_{22} * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{22} * B_{22} + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{12} * B_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11} * B_{11} + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{12} * B_{22} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11} * B_{12} + C_{12} (This routine)
+*
+* C_{11}
+*
+ CALL CGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, 1), LDB, BETA, C, LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL CGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ ONE, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ ELSE
+*
+* A is lower triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11}**T + \alpha A_{21}**T * B_{12}**T + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{21}**T + \alpha A_{21}**T * B_{22}**T + \beta C_{12}
+* C_{21} = \alpha A_{22}**T * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{22}**T * B_{22}**T + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{21}**T * B_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11}**T * B_{11}**T + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{21}**T * B_{22}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11}**T * B_{21}**T + C_{12} (This routine)
+*
+* C_{11}
+*
+ CALL CGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(1, L+1), LDB, BETA, C, LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL CGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE,
+ $ C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11} + \alpha A_{21}**T * B_{21} + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{12} + \alpha A_{21}**T * B_{22} + \beta C_{12}
+* C_{21} = \alpha A_{22}**T * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{22}**T * B_{22} + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{21}**T * B_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11}**T * B_{11} + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{21}**T * B_{22} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11}**T * B_{12} + C_{12} (This routine)
+*
+* C_{11}
+*
+ CALL CGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, C, LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL CGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, ONE,
+ $ C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11}**T + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{21}**T + \beta C_{12}
+* C_{21} = \alpha A_{21} * B_{11}**T + \alpha A_{22} * B_{12}**T + \beta * C_{21}
+* C_{22} = \alpha A_{21} * B_{21}**T + \alpha A_{22} * B_{22}**T + \beta * C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{21} * B_{11}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22} * B_{12}**T + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{21} * B_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22} * B_{22}**T + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CGEMM(TRANSA, TRANSB, M-L, N-L, L,
+ $ ALPHA, A(L+1, 1), LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11} + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{12} + \beta C_{12}
+* C_{21} = \alpha A_{21} * B_{11} + \alpha A_{22} * B_{21} + \beta * C_{21}
+* C_{22} = \alpha A_{21} * B_{12} + \alpha A_{22} * B_{22} + \beta * C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{21} * B_{11} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22} * B_{21} + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{21} * B_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22} * B_{22} + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, A(L+1, 1), LDA, B(1, L+1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ END IF
+ ELSE
+*
+* We are multiplying A from the right IE we are computing
+* C = \alpha op(B)*op(A) + \beta C
+*
+ IF (UPPER) THEN
+*
+* A is upper triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11}**T + \alpha B_{21}**T * A_{12}**T + \beta C_{11}
+* C_{12} = \alpha B_{21}**T * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11}**T + \alpha B_{22}**T * A_{12}**T + \beta C_{21}
+* C_{22} = \alpha B_{22}**T * A_{22}**T + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{21}**T * A_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11}**T * A_{11}**T + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22}**T * A_{12}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{12}**T * A_{11}**T + C_{21} (This routine)
+*
+* C_{11}
+*
+ CALL CGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(L+1, 1), LDB, A(1, L+1), LDA, BETA, C, LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ ONE, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11}**T + \alpha B_{12} * A_{12}**T + \beta C_{11}
+* C_{12} = \alpha B_{12} * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11}**T + \alpha B_{22} * A_{12}**T + \beta C_{21}
+* C_{22} = \alpha B_{22} * A_{22}**T + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{12} * A_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11} * A_{11}**T + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22} * A_{12}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{21} * A_{11}**T + C_{21} (This routine)
+*
+* C_{11}
+*
+ CALL CGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(1,L+1), LDB, A(1,L+1), LDA, BETA, C, LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ ONE, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11} + \beta C_{11}
+* C_{12} = \alpha B_{11}**T * A_{12} + \alpha B_{21}**T * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11} + \beta C_{21}
+* C_{22} = \alpha B_{12}**T * A_{12} + \alpha B_{22}**T * A_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11}**T * A_{12} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{21}**T * A_{22} + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{12}**T * A_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22}**T * A_{22} + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL CGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, B(1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11}
+* C_{12} = \alpha B_{11} * A_{12} + \alpha B_{12} * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11} + \beta C_{21}
+* C_{22} = \alpha B_{21} * A_{12} + \alpha B_{22} * A_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11} * A_{12} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{12} * A_{22} + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{21} * A_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22} * A_{22} + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL CGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, B(L+1, 1), LDB, A(1, L+1), LDA,
+ $ BETA, C(L+1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ ELSE
+*
+* A is lower triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11}**T + \beta C_{11}
+* C_{12} = \alpha B_{11}**T * A_{21}**T + \alpha B_{21}**T * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11}**T + \beta C_{21}
+* C_{22} = \alpha B_{12}**T * A_{21}**T + \alpha B_{22}**T * A_{22}**T + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11}**T * A_{21}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{21}**T * A_{22}**T + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{12}**T * A_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22}**T * A_{22}**T + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL CGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA,
+ $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+*                 C_{11} = \alpha B_{11} * A_{11}**T + \beta C_{11}
+*                 C_{12} = \alpha B_{11} * A_{21}**T + \alpha B_{12} * A_{22}**T + \beta C_{12}
+*                 C_{21} = \alpha B_{21} * A_{11}**T + \beta C_{21}
+*                 C_{22} = \alpha B_{21} * A_{21}**T + \alpha B_{22} * A_{22}**T + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11} * A_{21}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{12} * A_{22}**T + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{21} * A_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22} * A_{22}**T + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL CGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA,
+ $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11} + \alpha B_{21}**T * A_{21} + \beta C_{11}
+* C_{12} = \alpha B_{21}**T * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11} + \alpha B_{22}**T * A_{21} + \beta C_{21}
+* C_{22} = \alpha B_{22}**T * A_{22} + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{21}**T * A_{21} + \beta C_{11} (GEMM call)
+*                C_{11} = \alpha B_{11}**T * A_{11} + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22}**T * A_{21} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{12}**T * A_{11} + C_{21} (This routine)
+*
+* C_{11}
+*
+ CALL CGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, C, LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, ONE,
+ $ C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11} + \alpha B_{12} * A_{21} + \beta C_{11}
+* C_{12} = \alpha B_{12} * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11} + \alpha B_{22} * A_{21} + \beta C_{21}
+* C_{22} = \alpha B_{22} * A_{22} + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{12} * A_{21} + \beta C_{11} (GEMM call)
+*                C_{11} = \alpha B_{11} * A_{11} + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22} * A_{21} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{21} * A_{11} + C_{21} (This routine)
+*
+* C_{11}
+*
+ CALL CGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, C, LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL CGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE,
+ $ C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL CTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ END IF
+ END IF
+ END SUBROUTINE
diff --git a/SRC/ctrtrm.f b/SRC/ctrtrm.f
new file mode 100644
index 0000000000..659ab49e7d
--- /dev/null
+++ b/SRC/ctrtrm.f
@@ -0,0 +1,584 @@
+*> \brief \b CTRTRM computes an in place triangular-triangular matrix product
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE CTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV,
+* $ N, ALPHA, T, LDT, V, LDV)
+*
+* .. Scalar Arguments ..
+* INTEGER N, LDT, LDV
+* CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV
+* COMPLEX ALPHA
+* ..
+* .. Array Arguments ..
+* COMPLEX T(LDT,*), V(LDV,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CTRTRM performs one of the matrix-matrix operations
+*>
+*> T = \alpha op(V) * T
+*> or
+*> T = \alpha T * op(V)
+*> where \alpha is a scalar, T and V are unit, or non-unit, upper or
+*> lower triangular matrix, and op(V) is one of
+*>
+*> op(V) = V or op(V) = V**T or op(V) = V**H
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> On entry, SIDE specifies whether op(V) multiplies T from
+*> the left or right as follows:
+*>
+*> SIDE = 'L' or 'l' T = \alpha op(V) * T
+*>
+*> SIDE = 'R' or 'r' T = \alpha T * op(V)
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*>          On entry, UPLO specifies whether T and op(V) are upper or
+*> lower triangular as follows:
+*> UPLO = 'U' or 'u' T and op(V) are upper triangular
+*>
+*> UPLO = 'L' or 'l' T and op(V) are lower triangular
+*> \endverbatim
+*>
+*> \param[in] TRANSV
+*> \verbatim
+*> TRANSV is CHARACTER*1
+*> On entry, TRANSV specifies the form of op(V) to be used in
+*> the matrix multiplication as follows:
+*> TRANSV = 'N' or 'n' op(V) = V
+*>
+*> TRANSV = 'T' or 't' op(V) = V**T
+*>
+*> TRANSV = 'C' or 'c' op(V) = V**H
+*> \endverbatim
+*>
+*> \param[in] DIAGT
+*> \verbatim
+*> DIAGT is CHARACTER*1
+*> On entry, DIAGT specifies whether or not T is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' T is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' T is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] DIAGV
+*> \verbatim
+*> DIAGV is CHARACTER*1
+*> On entry, DIAGV specifies whether or not V is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' V is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' V is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> On entry, N specifies the number of rows and columns of T.
+*> N must be at least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is COMPLEX.
+*> On entry, ALPHA specifies the scalar alpha. When alpha is
+*> zero then T and V are not referenced, and T and V need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is COMPLEX array, dimension ( LDT, N )
+*>          Before entry with UPLO = 'U' or 'u', the leading n-by-n
+*>          upper triangular part of the array T must contain the upper
+*>          triangular matrix and the strictly lower triangular part of
+*>          T is not referenced.
+*>          Before entry with UPLO = 'L' or 'l', the leading n-by-n
+*>          lower triangular part of the array T must contain the lower
+*>          triangular matrix and the strictly upper triangular part of
+*>          T is not referenced.
+*> Note that when DIAGT = 'U' or 'u', the diagonal elements of
+*> T are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> On entry, LDT specifies the first dimension of T as declared
+*> in the calling (sub) program. LDT must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is COMPLEX array, dimension ( LDV, N )
+*>          Before entry with UPLO = 'U' or 'u', the leading n-by-n
+*>          upper triangular part of the array op(V) must contain the upper
+*>          triangular matrix and the strictly lower triangular part of
+*>          V is not referenced.
+*>          Before entry with UPLO = 'L' or 'l', the leading n-by-n
+*>          lower triangular part of the array op(V) must contain the lower
+*>          triangular matrix and the strictly upper triangular part of
+*>          V is not referenced.
+*> Note that when DIAGV = 'U' or 'u', the diagonal elements of
+*> V are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*>          On entry, LDV specifies the first dimension of V as declared
+*> in the calling (sub) program. LDV must be at least max( 1, n ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ RECURSIVE SUBROUTINE CTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV,
+ $ N, ALPHA, T, LDT, V, LDV)
+*
+* .. Scalar Arguments ..
+ INTEGER N, LDT, LDV
+ CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV
+ COMPLEX ALPHA
+* ..
+* .. Array Arguments ..
+ COMPLEX T(LDT,*), V(LDV,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* .. External Subroutines ..
+ EXTERNAL CTRMM, CTRMMOOP, CLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC CONJG
+* ..
+* .. Local Scalars ..
+ INTEGER K, INFO
+ LOGICAL TLEFT, TUPPER, VTRANS, VUNIT, TUNIT, CONJV
+* ..
+* .. Local Parameters ..
+ COMPLEX ONE, ZERO
+ PARAMETER(ONE=(1.0E+0,0.E+0), ZERO=(0.0E+0,0.0E+0))
+* ..
+*
+* Beginning of Executable Statements
+*
+*
+* Early Termination Criteria
+*
+ IF (ALPHA.EQ.ZERO) THEN
+*
+* If ALPHA is 0, then we are just setting T to be the 0 matrix
+*
+ CALL CLASET(UPLO, N, N, ZERO, ZERO, T, LDT)
+ RETURN
+ END IF
+ TUNIT = LSAME(DIAGT, 'U')
+ VUNIT = LSAME(DIAGV, 'U')
+*
+* Terminating Case
+*
+ IF (N.EQ.1) THEN
+ IF (VUNIT.AND.TUNIT) THEN
+ T(1,1) = ALPHA
+ ELSE IF (VUNIT) THEN
+ T(1,1) = ALPHA*T(1,1)
+ ELSE IF (TUNIT) THEN
+ IF (CONJV) THEN
+ T(1,1) = ALPHA*CONJG(V(1,1))
+ ELSE
+ T(1,1) = ALPHA*V(1,1)
+ END IF
+ ELSE
+ T(1,1) = ALPHA*T(1,1)*V(1,1)
+ END IF
+ RETURN
+ ELSE IF(N.LE.0) THEN
+ RETURN
+ END IF
+*
+* Recursive case
+*
+ TLEFT = LSAME(SIDE, 'R')
+ TUPPER = LSAME(UPLO, 'U')
+ CONJV = LSAME(TRANSV, 'C')
+ VTRANS = CONJV.OR.LSAME(TRANSV, 'T')
+
+ K = N / 2
+ IF(TUPPER) THEN
+*
+* T is upper triangular
+*
+ IF(TLEFT) THEN
+*
+* Compute T = T*op(V)
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = T*V**T, which we break down as follows
+* |--------------| |--------------| |--------------------|
+* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11}**T V_{21}**T|
+* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}**T|
+* |--------------| |--------------| |--------------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}**T
+* T_{12} = \alpha T_{11}*V_{21}**T + \alpha T_{12}*V_{22}**T
+* T_{22} = \alpha T_{22}*V_{22}**T
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha T_{12}*V_{22}**T (CTRMM)
+* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} (CTRMMOOP)
+*
+* T_{12} = \alpha T_{12}*V_{22}**T
+*
+ CALL CTRMM('Right', 'Lower', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT)
+*
+* T_{12} = \alpha T_{11}*V_{21}**T + T_{12}
+*
+ CALL CTRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT,
+ $ V(K+1, 1), LDV, ONE, T(1, K+1), LDT)
+ ELSE
+*
+* We are computing T = T*V, which we break down as follows
+* |--------------| |--------------| |-------------|
+* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11} V_{12}|
+* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}|
+* |--------------| |--------------| |-------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}
+* T_{12} = \alpha T_{11}*V_{12} + \alpha T_{12}*V_{22}
+* T_{22} = \alpha T_{22}*V_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha T_{12}*V_{22} (CTRMM)
+* T_{12} = \alpha T_{11}*V_{12} + T_{12} (CTRMMOOP)
+*
+* T_{12} = \alpha T_{12}*V_{22}
+*
+ CALL CTRMM('Right', 'Upper', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT)
+*
+*          T_{12} = \alpha T_{11}*V_{12} + T_{12}
+*
+ CALL CTRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT,
+ $ V(1, K+1), LDV, ONE, T(1, K+1), LDT)
+ END IF
+ ELSE
+*
+* Compute T = op(V)*T
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = V**T*T, which we break down as follows
+* |--------------| |--------------------| |--------------|
+* |T_{11} T_{12}| |V_{11}**T V_{21}**T| |T_{11} T_{12}|
+* |0 T_{22}| = \alpha |0 V_{22}**T| * |0 T_{22}|
+* |--------------| |--------------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}**T*T_{11}
+* T_{12} = \alpha V_{11}**T*T_{12} + \alpha V_{21}**T*T_{22}
+* T_{22} = \alpha V_{22}**T*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha V_{11}**T*T_{12} (CTRMM)
+* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} (CTRMMOOP)
+*
+* T_{12} = \alpha V_{11}**T*T_{12}
+*
+ CALL CTRMM('Left', 'Lower', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V, LDV, T(1, K+1), LDT)
+*
+* T_{12} = \alpha V_{21}**T*T_{22} + T_{12}
+*
+ CALL CTRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1),
+ $ LDT, V(K+1, 1), LDV, ONE, T(1, K+1), LDT)
+ ELSE
+*
+* We are computing T = V*T, which we break down as follows
+* |--------------| |--------------| |--------------|
+* |T_{11} T_{12}| |V_{11} V_{12}| |T_{11} T_{12}|
+* |0 T_{22}| = \alpha |0 V_{22}| * |0 T_{22}|
+* |--------------| |--------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}*T_{11}
+* T_{12} = \alpha V_{11}*T_{12} + \alpha V_{12}*T_{22}
+* T_{22} = \alpha V_{22}*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha V_{11}*T_{12} (CTRMM)
+* T_{12} = \alpha V_{12}*T_{22} + T_{12} (CTRMMOOP)
+*
+* T_{12} = \alpha V_{11}*T_{12}
+*
+ CALL CTRMM('Left', 'Upper', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V, LDV, T(1, K+1), LDT)
+*
+* T_{12} = \alpha V_{12}*T_{22} + T_{12} (CTRMMOOP)
+*
+ CALL CTRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1),
+ $ LDT, V(1, K+1), LDV, ONE, T(1, K+1), LDT)
+ END IF
+ END IF
+ ELSE
+*
+* T is lower triangular
+*
+ IF(TLEFT) THEN
+*
+* Compute T = T*op(V)
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = T*V**T, which we break down as follows
+* |--------------| |--------------| |--------------------|
+* |T_{11} 0 | |T_{11} 0 | |V_{11}**T 0 |
+* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{12}**T V_{22}**T|
+* |--------------| |--------------| |--------------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}**T
+* T_{21} = \alpha T_{21}*V_{11}**T + \alpha T_{22}*V_{12}**T
+* T_{22} = \alpha T_{22}*V_{22}**T
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{21} as follows
+*
+* T_{21} = \alpha T_{21}*V_{11}**T (CTRMM)
+* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} (CTRMMOOP)
+*
+* T_{21} = \alpha T_{21}*V_{11}**T
+*
+ CALL CTRMM('Right', 'Upper', TRANSV, DIAGV, N-K,
+ $ K, ALPHA, V, LDV, T(K+1, 1), LDT)
+*
+* T_{21} = \alpha T_{22}*V_{12}**T + T_{21}
+*
+ CALL CTRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1),
+ $ LDT, V(1, K+1), LDV, ONE, T(K+1, 1), LDT)
+ ELSE
+*
+* We are computing T = T*V, which we break down as follows
+* |--------------| |--------------| |-------------|
+* |T_{11} 0 | |T_{11} 0 | |V_{11} 0 |
+* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{21} V_{22}|
+* |--------------| |--------------| |-------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}
+* T_{21} = \alpha T_{21}*V_{11} + \alpha T_{22}*V_{21}
+* T_{22} = \alpha T_{22}*V_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{21} as follows
+*
+* T_{21} = \alpha T_{21}*V_{11} (CTRMM)
+* T_{21} = \alpha T_{22}*V_{21} + T_{21} (CTRMMOOP)
+*
+* T_{21} = \alpha T_{21}*V_{11}
+*
+ CALL CTRMM('Right', 'Lower', TRANSV, DIAGV, N-K,
+ $ K, ALPHA, V, LDV, T(K+1, 1), LDT)
+*
+* T_{21} = \alpha T_{22}*V_{12} + T_{21}
+*
+ CALL CTRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1),
+ $ LDT, V(K+1, 1), LDV, ONE, T(K+1, 1), LDT)
+ END IF
+ ELSE
+*
+* Compute T = op(V)*T
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = V**T*T, which we break down as follows
+* |--------------| |--------------------| |--------------|
+* |T_{11} 0 | |V_{11}**T 0 | |T_{11} 0 |
+* |T_{21} T_{22}| = \alpha |V_{12}**T V_{22}**T| * |T_{21} T_{22}|
+* |--------------| |--------------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}**T*T_{11}
+* T_{21} = \alpha V_{12}**T*T_{11} + \alpha V_{22}**T*T_{21}
+* T_{22} = \alpha V_{22}**T*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{21} as follows
+*
+* T_{21} = \alpha V_{22}**T*T_{21} (CTRMM)
+* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} (CTRMMOOP)
+*
+* T_{21} = \alpha V_{22}**T*T_{21}
+*
+ CALL CTRMM('Left', 'Upper', TRANSV, DIAGV, N-K, K,
+ $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT)
+*
+* T_{21} = \alpha V_{12}**T*T_{11} + T_{21}
+*
+ CALL CTRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT,
+ $ V(1, K+1), LDV, ONE, T(K+1, 1), LDT)
+ ELSE
+*
+* We are computing T = V*T, which we break down as follows
+* |--------------| |-------------| |--------------|
+* |T_{11} 0 | |V_{11} 0 | |T_{11} 0 |
+* |T_{21} T_{22}| = \alpha |V_{21} V_{22}| * |T_{21} T_{22}|
+* |--------------| |-------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}*T_{11}
+* T_{21} = \alpha V_{21}*T_{11} + \alpha V_{22}*T_{21}
+* T_{22} = \alpha V_{22}*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{21} = \alpha V_{22}*T_{21} (CTRMM)
+* T_{21} = \alpha V_{12}*T_{11} + T_{21} (CTRMMOOP)
+*
+* T_{21} = \alpha V_{22}*T_{12}
+*
+ CALL CTRMM('Left', 'Lower', TRANSV, DIAGV, N-K, K,
+ $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT)
+*
+* T_{21} = \alpha V_{12}*T_{11} + T_{21}
+*
+ CALL CTRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT,
+ $ V(K+1, 1), LDV, ONE, T(K+1, 1), LDT)
+ END IF
+ END IF
+ END IF
+*
+* Since in all the above cases, we compute T_{11} and T_{22}
+* the same, we pass in our flags and call this routine recursively
+*
+* Compute T_{11} recursively
+*
+ CALL CTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, K, ALPHA,
+ $ T, LDT, V, LDV)
+*
+* Compute T_{22} recursively
+*
+ CALL CTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, N-K, ALPHA,
+ $ T(K+1, K+1), LDT, V(K+1, K+1), LDV)
+ END SUBROUTINE
diff --git a/SRC/cungkl.f b/SRC/cungkl.f
new file mode 100644
index 0000000000..ffc4feec10
--- /dev/null
+++ b/SRC/cungkl.f
@@ -0,0 +1,174 @@
+*> \brief \b CUNGKL computes the explicit Q factor from CGEQLF and CLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CUNGKL(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* COMPLEX Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CUNGKL generates an m by n complex matrix Q with orthonormal columns,
+*> which is defined as the last n columns of the product of n
+*> elementary reflectors
+*>
+*> Q = I - V*T*V**H = H(n) . . . H(2) H(1)
+*>
+*> Where V is an m by n matrix whose columns are householder reflectors
+*> as returned by CGEQLF and T is the n by n matrix returned by CLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V, and the order of T.
+*> N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is COMPLEX array, dimension (LDQ,N)
+*> On entry, Q(1:m-n+i-1,i) contains the vector which defines the
+*> elementary reflector H(i), for i=1,...,n as returned by CGEQLF.
+*> In addition, the lower triangular portion of the submatrix given
+*> by Q(m-n+1:m,1:n) will contain the array T as returned by CLARFT.
+*> See further details for more information.
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The storage of the V and T components inside Q is best illustrated by
+*> the following example with m = 5, n = 3.
+*>
+*> Q = |----------|
+*> | V1 V2 V3 |
+*> | V1 V2 V3 |
+*> | T1 V2 V3 |
+*> | T1 T2 V3 |
+*> | T1 T2 T3 |
+*> |----------|
+*>
+*> \endverbatim
+*>
+* =====================================================================
+
+ SUBROUTINE CUNGKL(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ COMPLEX Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX NEG_ONE, ONE
+ PARAMETER(NEG_ONE=(-1.0E+0,0.0E+0), ONE=(1.0E+0,0.0E+0))
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL CTRMM, CTRTRM, CLUMM
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |---|
+* Q = | V |
+* | T |
+* |---|
+*
+* Where T is an n-by-n lower triangular matrix, and V is as described
+* in the Further Details section
+*
+* In turn, break apart V as follows
+*
+* |-----|
+* V = | V_2 |
+* | V_1 |
+* |-----|
+*
+* Where:
+*
+* V_1 \in \C^{n\times n} assumed unit upper triangular
+* V_2 \in \C^{m-n\times n}
+*
+* Compute T = T*V_1**H (CTRTRM is called with 'Conjugate Transpose')
+*
+ CALL CTRTRM('Right', 'Lower', 'Conjugate Transpose',
+ $ 'Non-Unit', 'Unit', N, ONE, Q(M-N+1,1), LDQ, Q(M-N+1,1),
+ $ LDQ)
+*
+* Compute Q = -VT. This means that we need to break apart
+* our computation in two parts
+*
+* |--------|
+* Q = | -V_2*T |
+* | -V_1*T |
+* |--------|
+*
+* Q_2 = -V_2*T (TRMM) but only when necessary
+*
+ IF (M.GT.N) THEN
+ CALL CTRMM('Right', 'Lower', 'No Transpose', 'Non-Unit',
+ $ M-N, N, NEG_ONE, Q(M-N+1,1), LDQ, Q, LDQ)
+ END IF
+*
+* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL CLUMM('Right', 'Non-Unit', 'Unit', N, NEG_ONE,
+ $ Q(M-N+1,1), LDQ)
+*
+* Q = "I" + Q (add one along the diagonal of the bottom block)
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(M-N+I,I) = Q(M-N+I,I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/cungkr.f b/SRC/cungkr.f
new file mode 100644
index 0000000000..cb7c1d74a0
--- /dev/null
+++ b/SRC/cungkr.f
@@ -0,0 +1,154 @@
+*> \brief \b CUNGKR computes the explicit Q factor from CGEQRF and CLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CUNGKR(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* COMPLEX Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CUNGKR generates an m by n complex matrix Q with orthonormal columns,
+*> which is defined as the first n columns of the product of n
+*> elementary reflectors
+*>
+*> Q = I - V*T*V**H = H(1) H(2) . . . H(n)
+*>
+*> Where V is an m by n matrix whose columns are householder reflectors
+*> as returned by CGEQRF and T is the n by n matrix returned by CLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V, and the order of T.
+*> N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is COMPLEX array, dimension (LDQ,N)
+*> On entry, the upper triangular part and diagonal contains
+*> The array T as returned from CLARFT. In addition, the
+*> strictly lower triangular portion of the i-th column contains
+*> the vector which defines the elementary reflector H(i),
+*> for i = 1,2,...,n, as returned by CGEQRF
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+* Cost: (2mn**2 + n**2 - n)/2
+ SUBROUTINE CUNGKR(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ COMPLEX Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX NEG_ONE, ONE
+ PARAMETER(NEG_ONE=(-1.0E+0,0.0E+0), ONE=(1.0E+0,0.0E+0))
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL CTRMM, CTRTRM, CLUMM
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |---|
+* Q = | T |
+* | V |
+* |---|
+*
+* Where T is an n-by-n upper triangular matrix, and V is an
+* m-by-n assumed unit lower trapezoidal matrix
+*
+* In turn, break apart V as follows
+*
+* |-----|
+* V = | V_1 |
+* | V_2 |
+* |-----|
+*
+* Where:
+*
+* V_1 \in \C^{n\times n} assumed unit lower triangular
+* V_2 \in \C^{m-n\times n}
+*
+* Compute T = T*V_1**H
+*
+ CALL CTRTRM('Right', 'Upper', 'Conjugate Transpose',
+ $ 'Non-unit', 'Unit', N, ONE, Q, LDQ, Q, LDQ)
+*
+* Compute Q = -VT. This means that we need to break apart
+* our computation in two parts
+*
+* |--------|
+* Q = | -V_1*T |
+* | -V_2*T |
+* |--------|
+*
+* Q_2 = -V_2*T (TRMM) but only when necessary
+*
+ IF (M.GT.N) THEN
+ CALL CTRMM('Right', 'Upper', 'No Transpose', 'Non-unit',
+ $ M-N, N, NEG_ONE, Q, LDQ, Q(N+1,1), LDQ)
+ END IF
+*
+* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL CLUMM('Left', 'Unit', 'Non-Unit', N, NEG_ONE, Q, LDQ)
+*
+* Q = "I" + Q (add one along the main diagonal)
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(I,I) = Q(I,I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/cunglk.f b/SRC/cunglk.f
new file mode 100644
index 0000000000..3a1a5dfc89
--- /dev/null
+++ b/SRC/cunglk.f
@@ -0,0 +1,149 @@
+*> \brief \b CUNGLK computes the explicit Q factor from CGELQF and CLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CUNGLK(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* COMPLEX Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CUNGLK generates an m by n complex matrix Q with orthonormal rows,
+*> which is defined as the first m rows of the product of m
+*> elementary reflectors
+*>
+*> Q = I - V**H*T*V = H(1) H(2) . . . H(m)
+*>
+*> Where V is an m by n matrix whose rows are householder reflectors
+*> as returned by CGELQF and T is the n by n matrix returned by CLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V, and the order of T. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is COMPLEX array, dimension (LDQ,N)
+*> On entry, the lower triangular part and diagonal contains
+*> The array T as returned from CLARFT. In addition, the
+*> strictly upper triangular portion of the i-th row contains
+*> the vector which defines the elementary reflector H(i),
+*> for i = 1,2,...,m, as returned by CGELQF
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ SUBROUTINE CUNGLK(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ COMPLEX Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX NEG_ONE, ONE
+ PARAMETER(NEG_ONE=(-1.0E+0,0.0E+0), ONE=(1.0E+0,0.0E+0))
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL CTRMM, CTRTRM, CLUMM
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |-----|
+* Q = | T V |
+* |-----|
+*
+* Where T is an m-by-m lower triangular matrix, and V is an
+* m-by-n assumed unit upper trapezoidal matrix
+*
+* In turn, break apart V as follows
+*
+* |---------|
+* V = | V_1 V_2 |
+* |---------|
+*
+* Where:
+*
+* V_1 \in \C^{m\times m} assumed unit upper triangular
+* V_2 \in \C^{m\times n-m}
+*
+* Compute T = V_1**H*T
+*
+ CALL CTRTRM('Left', 'Lower', 'Conjugate Transpose',
+ $ 'Non-unit', 'Unit', M, ONE, Q, LDQ, Q, LDQ)
+*
+* Compute Q = -TV. This means that we need to break apart
+* our computation in two parts
+*
+* |---------------|
+* Q = | -T*V_1 -T*V_2 |
+* |---------------|
+*
+* Q_2 = -T*V_2 (TRMM) but only when necessary
+*
+ IF (N.GT.M) THEN
+ CALL CTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ M, N-M, NEG_ONE, Q, LDQ, Q(1,M+1), LDQ)
+ END IF
+*
+* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL CLUMM('Left', 'Non-unit', 'Unit', M, NEG_ONE, Q, LDQ)
+*
+* Q = "I" + Q (add one along the main diagonal)
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(I,I) = Q(I,I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/cunglq.f b/SRC/cunglq.f
index 7f42c3dcb0..6361766d1c 100644
--- a/SRC/cunglq.f
+++ b/SRC/cunglq.f
@@ -136,17 +136,14 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX ZERO
- PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK,
+ $ NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL CLARFB, CLARFT, CUNGL2, XERBLA
+ EXTERNAL CLARFB0C2, CLARFT, CUNGL2,
+ $ CUNGLK, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -190,95 +187,97 @@ SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
RETURN
END IF
*
- NBMIN = 2
- NX = 0
+ NBMIN = MAX( 2, ILAENV( 2, 'CUNGLQ', ' ', M, N, K, -1 ) )
+ NX = MAX( 0, ILAENV( 3, 'CUNGLQ', ' ', M, N, K, -1 ) )
IWS = M
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-* Determine when to cross over from blocked to unblocked code.
*
- NX = MAX( 0, ILAENV( 3, 'CUNGLQ', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Treat the last NB block starting at KK+1 specially then use our blocking
+* method from the block starting at KI+1 to 1
*
- LDWORK = M
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked version
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'CUNGLQ', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL CUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the last block.
-* The first kk rows are handled by the block method.
+* Factor the last block assuming that our first application
+* will be on the Identity matrix
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ I = KK + 1
+ IB = NB
*
-* Set A(kk+1:m,1:kk) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 20 J = 1, KK
- DO 10 I = KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL CLARFT( 'Forward', 'Transpose', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), A( I, I ), LDA )
*
-* Use unblocked code for the last or only block.
+* Apply H to A(i+ib:m,i:n) from the right
*
- IF( KK.LT.M )
- $ CALL CUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ CALL CLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I+IB,I), LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to columns i:n of current block
+
+ CALL CUNGLK( IB, N-I+1, A( I, I ), LDA)
*
* Use blocked code
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.M ) THEN
+ DO I = KI + 1, 1, -NB
+ IB = NB
+*
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
+*
+ CALL CLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I),
+ $ LDA, TAU( I ), A( I, I ), LDA )
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+* Apply H to A(i+ib:m,i:n) from the right
*
- CALL CLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
- $ I ),
- $ LDA, TAU( I ), WORK, LDWORK )
+ CALL CLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I+IB,I), LDA)
*
-* Apply H**H to A(i+ib:m,i:n) from the right
+* Apply H to columns i:n of current block
*
- CALL CLARFB( 'Right', 'Conjugate transpose',
- $ 'Forward',
- $ 'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ),
- $ LDA, WORK, LDWORK, A( I+IB, I ), LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL CUNGLK( IB, N-I+1, A( I, I ), LDA)
+ END DO
*
-* Apply H**H to columns i:n of current block
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
*
- CALL CUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
-* Set columns 1:i-1 of current block to zero
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 40 J = 1, I - 1
- DO 30 L = I, I + IB - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL CLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I),
+ $ LDA, TAU( I ), A( I, I ), LDA )
+*
+* Apply H to A(i+ib:m,i:n) from the right
+*
+ CALL CLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I+IB,I), LDA)
+*
+* Apply H to columns i:n of current block
+*
+ CALL CUNGLK( IB, N-I+1, A( I, I ), LDA)
+ END IF
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/cungql.f b/SRC/cungql.f
index 3da2702d8e..c4d228495c 100644
--- a/SRC/cungql.f
+++ b/SRC/cungql.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -137,17 +135,14 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX ZERO
- PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
- $ NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN,
+ $ NX
* ..
* .. External Subroutines ..
- EXTERNAL CLARFB, CLARFT, CUNG2L, XERBLA
+ EXTERNAL CLARFB0C2, CLARFT, CUNG2L,
+ $ CUNGKL, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -178,7 +173,11 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
LWKOPT = 1
ELSE
NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 )
- LWKOPT = N*NB
+*
+* Only need a workspace for cung2l in case of bailout
+* and for the panel factorization
+*
+ LWKOPT = N
END IF
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
*
@@ -201,88 +200,75 @@ SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+ NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ))
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'CUNGQL', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* We use blocked code for the entire construction
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KK = K
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Possibly bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'CUNGQL', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL CUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the first block.
-* The last kk columns are handled by the block method.
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+ I = 1
+ IB = NB
*
-* Set A(m-kk+1:m,1:n-kk) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- DO 20 J = 1, N - KK
- DO 10 I = M - KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA)
*
-* Use unblocked code for the first or only block.
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Exploit the fact that we are applying to an identity
*
- CALL CUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward',
+ $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I),
+ $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
-* Use blocked code
+ CALL CUNGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA)
+
+* Use blocked code on the remaining blocks if there are any.
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- IF( N-K+I.GT.1 ) THEN
+ DO I = NB+1, K, NB
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* The last block may be less than size NB
*
- CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
- $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+ IB = MIN(NB, K-I+1)
*
-* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL CLARFB( 'Left', 'No transpose', 'Backward',
- $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
- $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL CLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA )
*
-* Apply H to rows 1:m-k+i+ib-1 of current block
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
*
- CALL CUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
- $ TAU( I ), WORK, IINFO )
+ CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
+ $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
-* Set rows m-k+i+ib:m of current block to zero
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
- DO 40 J = N - K + I, N - K + I + IB - 1
- DO 30 L = M - K + I + IB, M
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL CUNGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA)
+ END DO
END IF
*
WORK( 1 ) = CMPLX( IWS )
diff --git a/SRC/cungqr.f b/SRC/cungqr.f
index eb49d2fed5..655810939a 100644
--- a/SRC/cungqr.f
+++ b/SRC/cungqr.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -137,20 +135,17 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX ZERO
- PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, NB,
+ $ NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL CLARFB, CLARFT, CUNG2R, XERBLA
+ EXTERNAL CLARFB0C2, CLARFT, CUNG2R,
+ $ CUNGKR, XERBLA
* ..
* .. Intrinsic Functions ..
- INTRINSIC MAX, MIN
+ INTRINSIC MAX
* ..
* .. External Functions ..
INTEGER ILAENV
@@ -163,7 +158,11 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, N )*NB
+*
+* Only need a workspace for cung2r in case of bailout
+* and for the panel factorization
+*
+ LWKOPT = MAX( 1, N )
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -192,92 +191,99 @@ SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+* Determine when to cross over from unblocked to blocked
+ NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) )
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'CUNGQR', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Treat the last NB block starting at KK+1 specially then use our blocking
+* method from the block starting at KI+1 to 1
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'CUNGQR', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL CUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the last block.
-* The first kk columns are handled by the block method.
+* Factor the last block assuming that our first application
+* will be on the Identity matrix
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ I = KK + 1
+ IB = NB
*
-* Set A(1:kk,kk+1:n) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 20 J = KK + 1, N
- DO 10 I = 1, KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
-* Use unblocked code for the last or only block.
+* Apply H to A(i:m,i+ib:n) from the left
*
- IF( KK.LT.N )
- $ CALL CUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ CALL CLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+ $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I,I+IB), LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows i:m of current block
*
-* Use blocked code
+ CALL CUNGKR(M-I+1, IB, A(I,I), LDA)
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.N ) THEN
+* Use our standard blocking method after the last block
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
- CALL CLARFT( 'Forward', 'Columnwise', M-I+1, IB,
- $ A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Apply H to A(i:m,i+ib:n) from the left
+ CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- CALL CLARFB( 'Left', 'No transpose', 'Forward',
- $ 'Columnwise', M-I+1, N-I-IB+1, IB,
- $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
- $ LDA, WORK( IB+1 ), LDWORK )
- END IF
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
*
* Apply H to rows i:m of current block
*
- CALL CUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ CALL CUNGKR(M-I+1, IB, A(I,I), LDA)
+ END DO
+*
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
+*
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
-* Set rows 1:i-1 of current block to zero
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 40 J = I, I + IB - 1
- DO 30 L = 1, I - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL CLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
+*
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL CLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+* Apply H to rows i:m of current block
+*
+ CALL CUNGKR(M-I+1, IB, A(I,I), LDA)
+ END IF
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/cungrk.f b/SRC/cungrk.f
new file mode 100644
index 0000000000..72e304176e
--- /dev/null
+++ b/SRC/cungrk.f
@@ -0,0 +1,169 @@
+*> \brief \b CUNGRK computes the explicit Q factor from CGERQF and CLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE CUNGRK(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* COMPLEX Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> CUNGRK generates an m by n complex matrix Q with orthonormal rows,
+*> which is defined as the last m rows of the product of m
+*> elementary reflectors
+*>
+*> Q = I - V'*T*V = H(m) . . . H(2) H(1)
+*>
+*> Where V is an m by n matrix whose rows are Householder reflectors
+*> as returned by CGERQF and T is the m by m matrix returned by CLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V, and the order of T.
+*> M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is COMPLEX array, dimension (LDQ,N)
+*> On entry, Q(i,1:n-m-1+i) contains the vector which defines the
+*> elementary reflector H(i), for i=1,...,m as returned by CGERQF.
+*> In addition, the upper triangular portion of the submatrix given
+*> by Q(1:m,n-m+1:n) will contain the array T as returned by CLARFT.
+*> See further details for more information.
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The storage of the V and T components inside Q is best illustrated by
+*> the following example with m = 3, n = 5.
+*>
+*> Q = |----------------|
+*> | V1 V1 T1 T1 T1 |
+*> | V2 V2 V2 T2 T2 |
+*> | V3 V3 V3 V3 T3 |
+*> |----------------|
+*>
+*> \endverbatim
+*>
+* =====================================================================
+
+ SUBROUTINE CUNGRK(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ COMPLEX Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX NEG_ONE, ONE
+ PARAMETER(NEG_ONE=(-1.0E+0,0.0E+0), ONE=(1.0E+0,0.0E+0))
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL CTRMM, CTRTRM, CLUMM
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |-----|
+* Q = | V T |
+* |-----|
+*
+* Where T is an m-by-m upper triangular matrix, and V is as described
+* in the Further Details section
+*
+* In turn, break apart V as follows
+*
+* |---------|
+* V = | V_2 V_1 |
+* |---------|
+*
+* Where:
+*
+* V_1 \in \R^{m\times m} assumed unit lower triangular
+* V_2 \in \R^{m\times n-m}
+*
+* Compute T = V_1'*T
+*
+ CALL CTRTRM('Left', 'Upper', 'Conjugate Transpose',
+ $ 'Non-Unit', 'Unit', M, ONE, Q(1,N-M+1), LDQ, Q(1,N-M+1),
+ & LDQ)
+*
+* Compute Q = -TV. This means that we need to break apart
+* Our computation in two parts
+*
+* |---------------|
+* Q = | -T*V_2 -T*V_1 |
+* |---------------|
+*
+* Q_2 = -T*V_2 (TRMM) but only when necessary
+*
+ IF (N.GT.M) THEN
+ CALL CTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit',
+ $ M, N-M, NEG_ONE, Q(1,N-M+1), LDQ, Q, LDQ)
+ END IF
+*
+* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL CLUMM('Right', 'Unit', 'Non-Unit', M, NEG_ONE,
+ $ Q(1,N-M+1), LDQ)
+*
+* Q = "I" + Q
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(I,N-M+I) = Q(I,N-M+I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/cungrq.f b/SRC/cungrq.f
index 7e812b839f..5448e55228 100644
--- a/SRC/cungrq.f
+++ b/SRC/cungrq.f
@@ -137,17 +137,14 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX ZERO
- PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+ INTEGER I, IB, II, IINFO, IWS, KK, LDWORK,
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL CLARFB, CLARFT, CUNGR2, XERBLA
+ EXTERNAL CLARFB0C2, CLARFT, CUNGR2,
+ $ CUNGRK, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -200,92 +197,74 @@ SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
RETURN
END IF
*
- NBMIN = 2
- NX = 0
+ NBMIN = MAX( 2, ILAENV( 2, 'CUNGRQ', ' ', M, N, K, -1 ) )
+ NX = MAX( 0, ILAENV( 3, 'CUNGRQ', ' ', M, N, K, -1 ) )
IWS = M
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'CUNGRQ', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
-*
-* Determine if workspace is large enough for blocked code.
-*
- LDWORK = M
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
-*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
-*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'CUNGRQ', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
- END IF
*
IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
* Use blocked code after the first block.
* The last kk rows are handled by the block method.
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
-*
-* Set A(1:m-kk,n-kk+1:n) to zero.
-*
- DO 20 J = N - KK + 1, N
- DO 10 I = 1, M - KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KK = K
ELSE
KK = 0
END IF
*
-* Use unblocked code for the first or only block.
+* Potentially bail to the unblocked code
*
- CALL CUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL CUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
*
-* Use blocked code
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- II = M - K + I
- IF( II.GT.1 ) THEN
+ I = 1
+ IB = NB
+ II = M - K + I
+*
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
+*
+ CALL CLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA )
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- CALL CLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
- $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+ CALL CLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Backward',
+ $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA,
+ $ A( II, N-K+I ), LDA, A, LDA)
+*
+* Apply H to columns 1:n-k+i+ib-1 of current block
+*
+ CALL CUNGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA )
+
+ DO I = NB+1, K, NB
+*
+* The last block may be less than size NB
+*
+ IB = MIN(NB, K-I+1)
+ II = M - K + I
*
-* Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL CLARFB( 'Right', 'Conjugate transpose',
- $ 'Backward',
- $ 'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ),
- $ LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ),
- $ LDWORK )
- END IF
+ CALL CLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA )
*
-* Apply H**H to columns 1:n-k+i+ib-1 of current block
+* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- CALL CUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
- $ TAU( I ),
- $ WORK, IINFO )
+ CALL CLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1),
+ $ LDA, A( II, N-K+I ), LDA, A, LDA)
*
-* Set columns n-k+i+ib:n of current block to zero
+* Apply H to columns 1:n-k+i+ib-1 of current block
*
- DO 40 L = N - K + I + IB, N
- DO 30 J = II, II + IB - 1
- A( J, L ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL CUNGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA )
+ END DO
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/dlarfb0c2.f b/SRC/dlarfb0c2.f
new file mode 100644
index 0000000000..4609263849
--- /dev/null
+++ b/SRC/dlarfb0c2.f
@@ -0,0 +1,556 @@
+*> \brief \b DLARFB0C2 applies a block reflector or its transpose to a
+*> rectangular matrix with a 0 block while constructing the explicit Q factor
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+* $ K, V, LDV, T, LDT, C, LDC)
+* ! Scalar arguments
+* INTEGER M, N, K, LDV, LDC, LDT
+* CHARACTER SIDE, TRANS, DIRECT, STOREV
+* ! True means that we are assuming C2 is the identity matrix
+* ! and thus don't reference whatever is present in C2
+* ! at the beginning.
+* LOGICAL C2I
+* ! Array arguments
+* DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DLARFB0C2 applies a real block reflector H or its transpose H**T to a
+*> real m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*> C2I is LOGICAL
+*> = .TRUE.: Assume the nonzero block of C is the identity matrix
+*> = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> = 'L': apply H or H**T from the Left
+*> = 'R': apply H or H**T from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*> TRANS is CHARACTER*1
+*> = 'N': apply H (No transpose)
+*> = 'T': apply H**T (Transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*> DIRECT is CHARACTER*1
+*> Indicates how H is formed from a product of elementary
+*> reflectors
+*> = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*> = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*> STOREV is CHARACTER*1
+*> Indicates how the vectors which define the elementary
+*> reflectors are stored:
+*> = 'C': Columnwise
+*> = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*> K is INTEGER
+*> The order of the matrix T (= the number of elementary
+*> reflectors whose product defines the block reflector).
+*> If SIDE = 'L', M >= K >= 0;
+*> if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is DOUBLE PRECISION array, dimension
+*> (LDV,K) if STOREV = 'C'
+*> (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*> (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*> The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> The leading dimension of the array V.
+*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*> if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is DOUBLE PRECISION array, dimension (LDT,K)
+*> The triangular k by k matrix T in the representation of the
+*> block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is DOUBLE PRECISION array, dimension (LDC,N)
+*> On entry, the m by n matrix C.
+*> On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The shape of the matrix V and the storage of the vectors which define
+*> the H(i) is best illustrated by the following example with n = 5 and
+*> k = 3. The triangular part of V (including its diagonal) is not
+*> referenced.
+*>
+*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+*>
+*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+*> ( v1 1 ) ( 1 v2 v2 v2 )
+*> ( v1 v2 1 ) ( 1 v3 v3 )
+*> ( v1 v2 v3 )
+*> ( v1 v2 v3 )
+*>
+*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+*>
+*> V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+*> ( v1 v2 v3 ) ( v2 v2 v2 1 )
+*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+*> ( 1 v3 )
+*> ( 1 )
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE DLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+ $ K, V, LDV, T, LDT, C, LDC)
+ ! Scalar arguments
+ INTEGER M, N, K, LDV, LDC, LDT
+ CHARACTER SIDE, TRANS, DIRECT, STOREV
+ ! True means that we are assuming C2 is the identity matrix
+ ! and thus don't reference whatever is present in C2
+ ! at the beginning.
+ LOGICAL C2I
+ ! Array arguments
+ DOUBLE PRECISION V(LDV,*), C(LDC,*), T(LDT,*)
+ ! Local scalars
+ LOGICAL QR, LQ, QL, RQ, DIRF, COLV, SIDEL, SIDER,
+ $ TRANST
+ INTEGER I, J
+ ! External functions
+ LOGICAL LSAME
+ EXTERNAL LSAME
+ ! External subroutines
+ EXTERNAL DGEMM, DTRMM, XERBLA
+ ! Parameters
+ DOUBLE PRECISION ONE, ZERO, NEG_ONE
+ PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE = -1.0D+0)
+
+ ! Beginning of executable statements
+ ! Convert our character flags to logical values
+ DIRF = LSAME(DIRECT,'F')
+ COLV = LSAME(STOREV,'C')
+ SIDEL = LSAME(SIDE,'L')
+ SIDER = LSAME(SIDE,'R')
+ TRANST = LSAME(TRANS,'T')
+
+ ! Determine which of the 4 modes are using.
+ ! QR is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the first column
+ QR = DIRF.AND.COLV
+
+ ! LQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the first row
+ LQ = DIRF.AND.(.NOT.COLV)
+
+ ! QL is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the last column
+ QL = (.NOT.DIRF).AND.COLV
+
+ ! RQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the last row
+ RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+
+ IF (QR) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V1 ] and C = [ C1 ]
+ ! [ V2 ] [ C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ]
+ ! [ V2 ] [ C2 ]
+ ! = [ 0 ] - [ V1 ]T*V2'*C2
+ ! [ C2 ] [ V2 ]
+ !
+ ! = [ 0 ] - [ V1*T*V2'*C2 ]
+ ! [ C2 ] [ V2*T*V2'*C2 ]
+ !
+ ! = [ V1*T*V2'*C2 ]
+ ! [ C2 - V2*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('DLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('DLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF (C2I) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(I,J) = V(K+J,I)
+ END DO
+ END DO
+ ELSE
+ CALL DGEMM('Transpose', 'No Transpose', K, N, M - K,
+ $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+ $ C, LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF (C2I) THEN
+ CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+ $ C(K+1,1), LDC)
+ DO I = 1, N
+ C(K+I,I) = C(K+I,I) + ONE
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+ $ C(K+1,1), LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (LQ) THEN
+ ! We are computing C = C op(H) = C(I-V' op(T) V)
+ ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = C op(H) = [ 0, C2 ](I - [ V1' ]op(T)[ V1, V2 ])
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]op(T)[ V1, V2 ]
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - C2*V2'*op(T)[ V1, V2 ]
+ !
+ ! = [ -C2*V2'*op(T)*V1, C2 - C2*V2'*op(T)*V2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*op(T)
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('DLARFB0C2', 2)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,J) = V(J,K+I)
+ END DO
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'Transpose', M, K, N-K,
+ $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+ $ LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ IF (TRANST) THEN
+ CALL DTRMM('Right', 'Upper', 'Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C, LDC)
+ ELSE
+ CALL DTRMM('Right', 'Lower', 'No Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C, LDC)
+ END IF
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+ $ LDC)
+ DO I = 1, M
+ C(I,K+I) = C(I,K+I) + ONE
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+ $ LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL DTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (QL) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V2 ] and C = [ C2 ]
+ ! [ V1 ] [ C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+ ! [ V1 ] [ 0 ]
+ !
+ ! = [ C2 ] - [ V2 ]T*V2'*C2
+ ! [ 0 ] [ V1 ]
+ !
+ ! = [ C2 ] - [ V2*T*V2'*C2 ]
+ ! [ 0 ] [ V1*T*V2'*C2 ]
+ !
+ ! = [ C2 - V2*T*V2'*C2 ]
+ ! [ - V1*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('DLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('DLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF( C2I ) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(M-K+I,J) = V(J,I)
+ END DO
+ END DO
+ ELSE
+ CALL DGEMM('Transpose', 'No Transpose', K, N, M-K,
+ $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF( C2I ) THEN
+ CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+ DO I = 1, N
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+ ELSE IF (RQ) THEN
+ ! We are computing C = C op(H) = C(I-V' op(T) V)
+ ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = C op(H) = [ C2, 0 ] (I - [ V2' ]op(T)[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]op(T)[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - C2*V2'*op(T)[ V2, V1 ]
+ !
+ ! = [ C2, 0 ] - [ C2*V2'*op(T)*V2, C2*V2'*op(T)*V1 ]
+ !
+ ! = [ C2 - C2*V2'*op(T)*V2, -C2*V2'*op(T)*V1 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*op(T)
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side has the expected value
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('DLARFB0C2', 2)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,N-K+J) = V(J,I)
+ END DO
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'Transpose', M, K, N-K,
+ $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C1 = C1*op(T)
+ !
+ IF( TRANST ) THEN
+ CALL DTRMM('Right', 'Lower', 'Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ ELSE
+ CALL DTRMM('Right', 'Upper', 'No Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+ DO I = 1, M
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL DGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL DTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/dlarft.f b/SRC/dlarft.f
index e6149ded7b..bf9e3d5334 100644
--- a/SRC/dlarft.f
+++ b/SRC/dlarft.f
@@ -182,7 +182,7 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* .. Local Scalars ..
*
INTEGER I,J,L
- LOGICAL QR,LQ,QL,DIRF,COLV
+ LOGICAL QR,LQ,QL,RQ,LQT,RQT,DIRF,COLV,TDIRF,TCOLV
*
* .. External Subroutines ..
*
@@ -192,7 +192,7 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
*
LOGICAL LSAME
EXTERNAL LSAME
-*
+*
* The general scheme used is inspired by the approach inside DGEQRT3
* which was (at the time of writing this code):
* Based on the algorithm of Elmroth and Gustavson,
@@ -223,26 +223,37 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* 'C' for STOREV, then they meant to provide 'R'
*
DIRF = LSAME(DIRECT,'F')
+ TDIRF = LSAME(DIRECT,'T')
COLV = LSAME(STOREV,'C')
+ TCOLV = LSAME(STOREV,'T')
*
* QR happens when we have forward direction in column storage
*
QR = DIRF.AND.COLV
*
-* LQ happens when we have forward direction in row storage
+* LQT happens when we have forward direction with STOREV = 'T',
+* meaning we compute the transpose of the T we normally compute
+*
+ LQT = DIRF.AND.TCOLV
+*
+* LQ happens when we have forward direction in row storage and
+* we want the T we would normally compute
*
- LQ = DIRF.AND.(.NOT.COLV)
+ LQ = DIRF.AND.(.NOT.LQT)
*
* QL happens when we have backward direction in column storage
*
QL = (.NOT.DIRF).AND.COLV
*
-* The last case is RQ. Due to how we structured this, if the
-* above 3 are false, then RQ must be true, so we never store
-* this
-* RQ happens when we have backward direction in row storage
-* RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+* RQT happens when DIRECT = 'T' with row storage, meaning we
+* compute the transpose of the T we would normally compute
*
+ RQT = TDIRF.AND.(.NOT.COLV)
+*
+* RQ happens when we have backward direction in row storage
+* and we want the T that we would normally compute
+*
+ RQ = (.NOT.RQT).AND.(.NOT.COLV)
IF(QR) THEN
*
* Break V apart into 6 components
@@ -256,17 +267,17 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{1,1}\in\R^{l,l} unit lower triangular
* V_{2,1}\in\R^{k-l,l} rectangular
* V_{3,1}\in\R^{n-k,l} rectangular
-*
+*
* V_{2,2}\in\R^{k-l,k-l} unit lower triangular
* V_{3,2}\in\R^{n-k,k-l} rectangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} T_{1,2}|
* |0 T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -277,17 +288,17 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
+*
* (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2')
* = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2'
-*
+*
* Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |-------|
* |V_1 V_2|
* |-------|
-*
+*
* So, our product is equivalent to the matrix product
* I - V*T*V'
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
@@ -299,30 +310,29 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{2,2} recursively
*
- CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV,
- $ TAU(L+1), T(L+1, L+1), LDT)
+ CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
*
-* Compute T_{1,2}
+* Compute T_{1,2}
* T_{1,2} = V_{2,1}'
*
DO J = 1, L
DO I = 1, K-L
- T(J, L+I) = V(L+I, J)
+ T(J,L+I) = V(L+I,J)
END DO
END DO
*
* T_{1,2} = T_{1,2}*V_{2,2}
*
CALL DTRMM('Right', 'Lower', 'No transpose', 'Unit', L,
- $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
+ $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
*
* T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
- CALL DGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE,
- $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE,
- $ T(1, L+1), LDT)
+ CALL DGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE,
+ $ V(K+1, 1), LDV, V(K+1,L+1), LDV, ONE, T(1, L+1), LDT)
*
* At this point, we have that T_{1,2} = V_1'*V_2
* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
@@ -331,12 +341,12 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{1,2} = -T_{1,1}*T_{1,2}
*
CALL DTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
+ $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
*
* T_{1,2} = T_{1,2}*T_{2,2}
*
- CALL DTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT)
+ CALL DTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
+ $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT)
ELSE IF(LQ) THEN
*
@@ -350,19 +360,19 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{1,1}\in\R^{l,l} unit upper triangular
* V_{1,2}\in\R^{l,k-l} rectangular
* V_{1,3}\in\R^{l,n-k} rectangular
-*
+*
* V_{2,2}\in\R^{k-l,k-l} unit upper triangular
* V_{2,3}\in\R^{k-l,n-k} rectangular
*
* Where l = floor(k/2)
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} T_{1,2}|
* |0 T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -371,20 +381,20 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{1,2}\in\R^{l, k-l} rectangular
*
* Then, consider the product:
-*
-* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2)
-* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2
-*
+*
+* (I - V_1'*T_{1,1}'*V_1)*(I - V_2'*T_{2,2}'*V_2)
+* = I - V_1'*T_{1,1}'*V_1 - V_2'*T_{2,2}'*V_2 + V_1'*T_{1,1}'*V_1*V_2'*T_{2,2}'*V_2
+*
* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |---|
* |V_1|
* |V_2|
* |---|
-*
+*
* So, our product is equivalent to the matrix product
-* I - V'*T*V
+* I - V'*T'*V
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
* to compute T_{1,2}
*
@@ -394,27 +404,26 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{2,2} recursively
*
- CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV,
- $ TAU(L+1), T(L+1, L+1), LDT)
+ CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
*
* Compute T_{1,2}
* T_{1,2} = V_{1,2}
*
- CALL DLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT)
+ CALL DLACPY('All', L, K - L, V(1,L+1), LDV, T(1, L+1), LDT)
*
* T_{1,2} = T_{1,2}*V_{2,2}'
*
CALL DTRMM('Right', 'Upper', 'Transpose', 'Unit', L, K-L,
- $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
+ $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
*
* T_{1,2} = V_{1,3}*V_{2,3}' + T_{1,2}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
CALL DGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE,
- $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE,
- $ T(1, L+1), LDT)
+ $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, T(1, L+1), LDT)
*
* At this point, we have that T_{1,2} = V_1*V_2'
* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
@@ -423,13 +432,106 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{1,2} = -T_{1,1}*T_{1,2}
*
CALL DTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
+ $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
*
* T_{1,2} = T_{1,2}*T_{2,2}
*
CALL DTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT)
+ $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT)
+ ELSE IF(LQT) THEN
+*
+* Break V apart into 6 components
+*
+* V = |----------------------|
+* |V_{1,1} V_{1,2} V{1,3}|
+* |0 V_{2,2} V{2,3}|
+* |----------------------|
+*
+* V_{1,1}\in\R^{l,l} unit upper triangular
+* V_{1,2}\in\R^{l,k-l} rectangular
+* V_{1,3}\in\R^{l,n-k} rectangular
+*
+* V_{2,2}\in\R^{k-l,k-l} unit upper triangular
+* V_{2,3}\in\R^{k-l,n-k} rectangular
+*
+* Where l = floor(k/2)
+*
+* We will construct the T matrix
+* T = |---------------|
+* |T_{1,1} 0 |
+* |T_{2,1} T_{2,2}|
+* |---------------|
+*
+* T is the triangular factor obtained from block reflectors.
+* To motivate the structure, assume we have already computed T_{1,1}
+* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
+*
+* T_{1,1}\in\R^{l, l} lower triangular
+* T_{2,2}\in\R^{k-l, k-l} lower triangular
+* T_{2,1}\in\R^{k-l, l} rectangular
+*
+* Then, consider the product:
+*
+* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2)
+* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2
+*
+* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1}
+*
+* Then, we can define the matrix V as
+* V = |---|
+* |V_1|
+* |V_2|
+* |---|
+*
+* So, our product is equivalent to the matrix product
+* I - V'*T*V
+* This means, we can compute T_{1,1} and T_{2,2}, then use this information
+* to compute T_{2,1}
+*
+* Compute T_{1,1} recursively
+*
+ CALL DLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT)
+*
+* Compute T_{2,2} recursively
+*
+ CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
+
+*
+* Compute T_{2,1}
+* T_{2,1} = V_{1,2}'
+*
+ DO I = 1, K-L
+ DO J = 1, L
+ T(L+I,J) = V(J,L+I)
+ END DO
+ END DO
+*
+* T_{2,1} = V_{2,2}*T_{2,1}
+*
+ CALL DTRMM('Left', 'Upper', 'No Transpose', 'Unit', K-L, L,
+ $ ONE, V(L+1,L+1), LDV, T(L+1,1), LDT)
+*
+* T_{2,1} = V_{2,3}*V_{1,3}' + T_{2,1}
+* Note: We assume K <= N, and GEMM will do nothing if N=K
+*
+ CALL DGEMM('No Transpose', 'Transpose', K-L, L, N-K, ONE,
+ $ V(L+1,K+1), LDV, V(1, K+1), LDV, ONE, T(L+1,1), LDT)
+*
+* At this point, we have that T_{2,1} = V_2*V_1'
+* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
+* respectively.
+*
+* T_{2,1} = -T_{2,2}*T_{2,1}
+*
+ CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K-L, L, NEG_ONE, T(L+1,L+1), LDT, T(L+1,1), LDT)
+*
+* T_{2,1} = T_{2,1}*T_{1,1}
+*
+ CALL DTRMM('Right', 'Lower', 'No Transpose', 'Non-unit',
+ $ K-L, L, ONE, T, LDT, T(L+1,1), LDT)
ELSE IF(QL) THEN
*
* Break V apart into 6 components
@@ -442,18 +544,18 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* V_{1,1}\in\R^{n-k,k-l} rectangular
* V_{2,1}\in\R^{k-l,k-l} unit upper triangular
-*
+*
* V_{1,2}\in\R^{n-k,l} rectangular
* V_{2,2}\in\R^{k-l,l} rectangular
* V_{3,2}\in\R^{l,l} unit upper triangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} 0 |
* |T_{2,1} T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -464,17 +566,17 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
+*
* (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1')
* = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1'
-*
+*
* Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |-------|
* |V_1 V_2|
* |-------|
-*
+*
* So, our product is equivalent to the matrix product
* I - V*T*V'
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
@@ -482,34 +584,34 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{1,1} recursively
*
- CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+ CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
*
* Compute T_{2,2} recursively
*
CALL DLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV,
- $ TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
*
* Compute T_{2,1}
* T_{2,1} = V_{2,2}'
*
DO J = 1, K-L
DO I = 1, L
- T(K-L+I, J) = V(N-K+J, K-L+I)
+ T(K-L+I,J) = V(N-K+J, K-L+I)
END DO
END DO
*
* T_{2,1} = T_{2,1}*V_{2,1}
*
CALL DTRMM('Right', 'Upper', 'No transpose', 'Unit', L,
- $ K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT)
+ $ K-L, ONE, V(N-K+1,1), LDV, T(K-L+1,1), LDT)
*
* T_{2,1} = V_{2,2}'*V_{2,1} + T_{2,1}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
CALL DGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE,
- $ V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1),
- $ LDT)
+ $ V(1,K-L+1), LDV, V, LDV, ONE, T(K-L+1,1), LDT)
*
* At this point, we have that T_{2,1} = V_2'*V_1
* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
@@ -518,17 +620,13 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{2,1} = -T_{2,2}*T_{2,1}
*
CALL DTRMM('Left', 'Lower', 'No transpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
- $ T(K-L+1, 1), LDT)
+ $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT)
*
* T_{2,1} = T_{2,1}*T_{1,1}
*
CALL DTRMM('Right', 'Lower', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
- ELSE
-*
-* Else means RQ case
-*
+ $ K-L, ONE, T, LDT, T(K-L+1,1), LDT)
+ ELSE IF(RQ) THEN
* Break V apart into 6 components
*
* V = |-----------------------|
@@ -543,13 +641,13 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{2,2}\in\R^{l,k-l} rectangular
* V_{2,3}\in\R^{l,l} unit lower triangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} 0 |
* |T_{2,1} T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -560,51 +658,51 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
-* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1)
-* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1
-*
+*
+* (I - V_2'*T_{2,2}'*V_2)*(I - V_1'*T_{1,1}'*V_1)
+* = I - V_2'*T_{2,2}'*V_2 - V_1'*T_{1,1}'*V_1 + V_2'*T_{2,2}'*V_2*V_1'*T_{1,1}'*V_1
+*
* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |---|
* |V_1|
* |V_2|
* |---|
-*
+*
* So, our product is equivalent to the matrix product
-* I - V'*T*V
+* I - V'*T'*V
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
* to compute T_{2,1}
*
* Compute T_{1,1} recursively
*
- CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+ CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
*
* Compute T_{2,2} recursively
*
- CALL DLARFT(DIRECT, STOREV, N, L, V(K-L+1, 1), LDV,
- $ TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+ CALL DLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV,
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
*
* Compute T_{2,1}
* T_{2,1} = V_{2,2}
*
- CALL DLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV,
- $ T(K-L+1, 1), LDT)
+ CALL DLACPY('All', L, K-L, V(K-L+1,N-K+1), LDV, T(K-L+1,1),
+ $ LDT)
*
* T_{2,1} = T_{2,1}*V_{1,2}'
*
CALL DTRMM('Right', 'Lower', 'Transpose', 'Unit', L, K-L,
- $ ONE, V(1, N-K+1), LDV, T(K-L+1, 1), LDT)
+ $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT)
*
-* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1}
+* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
- CALL DGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE,
- $ V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1),
- $ LDT)
+ CALL DGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE,
+ $ V(K-L+1,1), LDV, V, LDV, ONE, T(K-L+1,1), LDT)
*
* At this point, we have that T_{2,1} = V_2*V_1'
@@ -614,13 +712,103 @@ RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{2,1} = -T_{2,2}*T_{2,1}
*
CALL DTRMM('Left', 'Lower', 'No tranpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
- $ T(K-L+1, 1), LDT)
+ $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT)
*
* T_{2,1} = T_{2,1}*T_{1,1}
*
CALL DTRMM('Right', 'Lower', 'No tranpose', 'Non-unit', L,
- $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
+ $ K-L, ONE, T, LDT, T(K-L+1,1), LDT)
+ ELSE IF(RQT) THEN
+* Break V apart into 6 components
+*
+* V = |-----------------------|
+* |V_{1,1} V_{1,2} 0 |
+* |V_{2,1} V_{2,2} V_{2,3}|
+* |-----------------------|
+*
+* V_{1,1}\in\R^{k-l,n-k} rectangular
+* V_{1,2}\in\R^{k-l,k-l} unit lower triangular
+*
+* V_{2,1}\in\R^{l,n-k} rectangular
+* V_{2,2}\in\R^{l,k-l} rectangular
+* V_{2,3}\in\R^{l,l} unit lower triangular
+*
+* We will construct the T matrix
+* T = |---------------|
+* |T_{1,1} T_{1,2}|
+* | 0 T_{2,2}|
+* |---------------|
+*
+* T is the triangular factor obtained from block reflectors.
+* To motivate the structure, assume we have already computed T_{1,1}
+* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
+*
+* T_{1,1}\in\R^{k-l, k-l} non-unit upper triangular
+* T_{2,2}\in\R^{l, l} non-unit upper triangular
+* T_{1,2}\in\R^{k-l, l} rectangular
+*
+* Where l = floor(k/2)
+*
+* Then, consider the product:
+*
+* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1)
+* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1
+*
+* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2}
+*
+* Then, we can define the matrix V as
+* V = |---|
+* |V_1|
+* |V_2|
+* |---|
+*
+* So, our product is equivalent to the matrix product
+* I - V'*T*V
+* This means, we can compute T_{1,1} and T_{2,2}, then use this information
+* to compute T_{1,2}
+*
+* Compute T_{1,1} recursively
+*
+ CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
+*
+* Compute T_{2,2} recursively
+*
+ CALL DLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV,
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
+*
+* Compute T_{1,2}
+* T_{1,2} = V_{2,2}'
+*
+ DO I = 1, K-L
+ DO J = 1, L
+ T(I,K-L+J) = V(K-L+J, N-K+I)
+ END DO
+ END DO
+*
+* T_{1,2} = V_{1,2}T_{1,2}
+*
+ CALL DTRMM('Left', 'Lower', 'No Transpose', 'Unit', K-L, L,
+ $ ONE, V(1,N-K+1), LDV, T(1,K-L+1), LDT)
+*
+* T_{1,2} = V_{1,1}V_{2,1}' + T_{1,2}
+*
+ CALL DGEMM('No Tranpose', 'Transpose', K-L, L, N-K, ONE, V,
+ $ LDV, V(K-L+1,1), LDV, ONE, T(1, K-L+1), LDT)
+*
+* At this point, we have that T_{1,2} = V_1*V_2'
+* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
+* respectively.
+*
+* T_{1,2} = -T_{1,1}*T_{1,2}
+*
+ CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit',
+ $ K-L, L, NEG_ONE, T, LDT, T(1, K-L+1), LDT)
+*
+* T_{1,2} = T_{1,2}*T_{2,2}
+*
+ CALL DTRMM('Right', 'Upper', 'No Transpose', 'Non-Unit',
+ $ K-L, L, ONE, T(K-L+1,K-L+1), LDT, T(1, K-L+1), LDT)
END IF
END SUBROUTINE
diff --git a/SRC/dlumm.f b/SRC/dlumm.f
new file mode 100644
index 0000000000..ca572f6505
--- /dev/null
+++ b/SRC/dlumm.f
@@ -0,0 +1,349 @@
+*> \brief \b DLUMM computes an in-place triangular times triangular matrix multiplication
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE DLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA,
+* $ A, LDA)
+*
+* .. Scalar Arguments ..
+* INTEGER N, LDA
+* CHARACTER SIDEL, DIAGL, DIAGU
+* DOUBLE PRECISION ALPHA
+*
+* .. Array Arguments ..
+* DOUBLE PRECISION A(LDA,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DLUMM performs one of the matrix-matrix operations
+*>
+*> C = \alpha L * U
+*> or
+*> C = \alpha U * L
+*>
+*> where \alpha is a scalar, L is a unit, or non-unit, lower triangular matrix, and U is a unit, or
+*> non-unit, upper triangular matrix, and at most one of L and U are non-unit
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDEL
+*> \verbatim
+*> SIDEL is CHARACTER*1
+*> On entry, SIDEL specifies whether L multiplies U from
+*> the left or right as follows:
+*>
+*> SIDEL = 'L' or 'l' A = \alpha L * U
+*>
+*> SIDEL = 'R' or 'r' A = \alpha U * L
+*> \endverbatim
+*>
+*> \param[in] DIAGL
+*> \verbatim
+*> DIAGL is CHARACTER*1
+*> On entry, DIAGL specifies whether or not L is unit triangular
+*> as follows:
+*>
+*> DIAGL = 'U' or 'u' L is assumed to be unit triangular.
+*>
+*> DIAGL = 'N' or 'n' L is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] DIAGU
+*> \verbatim
+*> DIAGU is CHARACTER*1
+*> On entry, DIAGU specifies whether or not U is unit triangular
+*> as follows:
+*>
+*> DIAGU = 'U' or 'u' U is assumed to be unit triangular.
+*>
+*> DIAGU = 'N' or 'n' U is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> On entry, N specifies the number of rows and columns of L and U. N must be at
+*> least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is DOUBLE PRECISION.
+*> On entry, ALPHA specifies the scalar alpha. When alpha is
+*> zero then the entries of A need not be set before
+*> entry; A is overwritten with zeros.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is DOUBLE PRECISION array, dimension ( LDA, N ) where
+*> Before entry the leading n-by-n strictly upper triangular part of the array
+*> A must contain the upper triangular matrix U and the strictly lower triangular part of
+*> the leading n-by-n submatrix must contain the lower triangular matrix L.
+*> If DIAGL != 'U', then the diagonal is assumed to be part of L, and if
+*> DIAGU != 'U', then the diagonal is assumed to be part of U.
+*> Note: At most one of DIAGL and DIAGU can be not equal to 'U'.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> On entry, LDA specifies the first dimension of A as declared
+*> in the calling (sub) program. LDA must be at least max( 1, n ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+c Cost: 2/3 * (n^3 - n)
+ RECURSIVE SUBROUTINE DLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA,
+ $ A, LDA)
+*
+* .. Scalar Arguments ..
+ INTEGER N, LDA
+ CHARACTER SIDEL, DIAGL, DIAGU
+ DOUBLE PRECISION ALPHA
+*
+* .. Array Arguments ..
+ DOUBLE PRECISION A(LDA,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* .. External Subroutines ..
+ EXTERNAL DGEMM, DTRMM, DLASET, XERBLA
+* ..
+* .. Local Scalars ..
+ INTEGER K
+ LOGICAL LLEFT, LUNIT, UUNIT
+* ..
+* .. Local Parameters ..
+ DOUBLE PRECISION ONE, ZERO
+ PARAMETER(ONE=1.0D+0, ZERO=0.0D+0)
+* ..
+*
+* Determine if our flags are valid or not. We can have at
+* most one of DIAGU, DIAGL not equal to 'U'
+*
+ LUNIT = LSAME(DIAGL, 'U')
+ UUNIT = LSAME(DIAGU, 'U')
+*
+* If both of the above are false, then it is impossible to have the
+* structure that we are exploiting in this routine
+* Note: It is possible to allow the matrices to share a non-unit
+* diagonal as long as the values are the exact same, but there is
+* currently no use case for this that I am aware of.
+*
+ IF ((.NOT.LUNIT).AND.(.NOT.UUNIT)) THEN
+*
+* We say the error is in the last set DIAG value as we cannot know
+* what the user actually meant.
+*
+ CALL XERBLA( 'DLUMM', 3 )
+ RETURN
+ END IF
+*
+* Determine which side L is on
+*
+ LLEFT = LSAME(SIDEL, 'L')
+*
+* Early exit if possible
+*
+ IF (N.EQ.0) THEN
+ RETURN
+ END IF
+ IF (ALPHA.EQ.ZERO) THEN
+ CALL DLASET('All', N, N, ZERO, ZERO, A, LDA)
+ RETURN
+ END IF
+*
+* Terminating Case
+*
+ IF (N.EQ.1) THEN
+*
+* Since at most one of L and U are non-unit triangular, whatever side L is on, we are still
+* always computing one of
+*
+* 1) A(1,1) = ALPHA*A(1,1)
+* 2) A(1,1) = ALPHA
+*
+* Where the first case happens when exactly one of L and U are unit triangular, while the
+* second case happens when both L and U are unit triangular
+*
+ IF (LUNIT.AND.UUNIT) THEN
+ A(1,1) = ALPHA
+ ELSE
+ A(1,1) = ALPHA*A(1,1)
+ END IF
+ RETURN
+ END IF
+*
+* Recursive Case
+*
+ K = N/2
+*
+* Regardless of us computing A = L*U or A = U*L, break A apart as follows:
+*
+* |---|
+* A = | U |
+* | L |
+* |---|
+*
+* Further break down L as
+* |---------------|
+* L = | L_{11} 0 |
+* | L_{21} L_{22} |
+* |---------------|
+*
+* Where:
+*
+* L_{11}\in\R^{k\times k} is lower triangular (assumed unit iff DIAGL == 'U')
+* L_{21}\in\R^{n-k\times k} is rectangular
+* L_{22}\in\R^{n-k\times n-k} is lower triangular (assumed unit iff DIAGL == 'U')
+*
+* Further break down U as
+* |---------------|
+* U = | U_{11} U_{12} |
+* | 0 U_{22} |
+* |---------------|
+*
+* Where:
+*
+* U_{11}\in\R^{k\times k} is upper triangular (assumed unit iff DIAGU == 'U')
+* U_{12}\in\R^{k\times n-k} is rectangular
+* U_{22}\in\R^{n-k\times n-k} is upper triangular (assumed unit iff DIAGU == 'U')
+ IF (LLEFT) THEN
+*
+* This means we are computing
+* |---------------| |---------------|
+* A = L*U = \alpha | L_{11} 0 | * | U_{11} U_{12} |
+* | L_{21} L_{22} | | 0 U_{22} |
+* |---------------| |---------------|
+*
+* |---------------------------------------------|
+* = \alpha | L_{11}*U_{11} L_{11}*U_{12} |
+* | L_{21}*U_{11} L_{21}*U_{12} + L_{22}*U_{22} |
+* |---------------------------------------------|
+*
+* We compute these in the following order
+*
+* A_{22} = \alpha*L_{22}*U_{22} (This routine)
+* A_{22} = \alpha*L_{21}*U_{12} + A_{22} (GEMM)
+*
+* A_{12} = \alpha*L_{11}*U_{12} (TRMM)
+* A_{21} = \alpha*L_{21}*U_{11} (TRMM)
+*
+* A_{11} = \alpha*L_{11}*U_{11} (This routine)
+*
+* Compute A_{22}
+*
+* A_{22} = \alpha*L_{22}*U_{22}
+*
+ CALL DLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA,
+ $ A(K+1, K+1), LDA)
+*
+* A_{22} = \alpha L_{21}*U_{12} + A_{22}
+*
+ CALL DGEMM('No Transpose', 'No Transpose', N-K, N-K, K,
+ $ ALPHA, A(K+1,1), LDA, A(1,K+1), LDA, ONE,
+ $ A(K+1,K+1), LDA)
+*
+* Compute A_{12}
+*
+* A_{12} = \alpha*L_{11}*U_{12}
+*
+ CALL DTRMM('Left', 'Lower', 'No Transpose', DIAGL, K,
+ $ N-K, ALPHA, A, LDA, A(1,K+1), LDA)
+*
+* Compute A_{21}
+*
+* A_{21} = \alpha*L_{21}*U_{11}
+*
+ CALL DTRMM('Right', 'Upper', 'No Transpose', DIAGU, N-K,
+ $ K, ALPHA, A, LDA, A(K+1,1), LDA)
+*
+* Compute A_{11}
+*
+* A_{11} = \alpha*L_{11}*U_{11}
+*
+ CALL DLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA)
+ ELSE
+*
+* This means we are computing
+* |---------------| |---------------|
+* A = U*L = \alpha | U_{11} U_{12} | * | L_{11} 0 |
+* | 0 U_{22} | | L_{21} L_{22} |
+* |---------------| |---------------|
+*
+* |---------------------------------------------|
+* = \alpha | U_{11}*L_{11} + U_{12}*L_{21} U_{12}*L_{22} |
+* | U_{22}*L_{21} U_{22}*L_{22} |
+* |---------------------------------------------|
+*
+* We compute these in the following order
+*
+* A_{11} = \alpha*U_{11}*L_{11} (This routine)
+* A_{11} = \alpha*U_{12}*L_{21} + A_{11} (GEMM)
+*
+* A_{12} = \alpha*U_{12}*L_{22} (TRMM)
+* A_{21} = \alpha*U_{22}*L_{21} (TRMM)
+*
+* A_{22} = \alpha*U_{22}*L_{22} (This routine)
+*
+* Compute A_{11}
+*
+* A_{11} = \alpha*U_{11}*L_{11}
+*
+ CALL DLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA)
+*
+* A_{11} = \alpha*U_{12}*L_{21} + A_{11}
+*
+ CALL DGEMM('No Transpose', 'No Transpose', K, K, N-K,
+ $ ALPHA, A(1,K+1), LDA, A(K+1,1), LDA, ONE, A, LDA)
+*
+* Compute A_{12}
+*
+* A_{12} = \alpha*U_{12}*L_{22}
+*
+ CALL DTRMM('Right', 'Lower', 'No Transpose', DIAGL, K,
+ $ N-K, ALPHA, A(K+1,K+1), LDA, A(1,K+1), LDA)
+*
+* Compute A_{21}
+*
+* A_{21} = \alpha*U_{22}*L_{21}
+*
+ CALL DTRMM('Left', 'Upper', 'No Transpose', DIAGU, N-K,
+ $ K, ALPHA, A(K+1, K+1), LDA, A(K+1,1), LDA)
+*
+* Compute A_{22}
+*
+* A_{22} = \alpha*U_{22}*L_{22}
+*
+ CALL DLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA,
+ $ A(K+1, K+1), LDA)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/dorgkl.f b/SRC/dorgkl.f
new file mode 100644
index 0000000000..5223dc74b8
--- /dev/null
+++ b/SRC/dorgkl.f
@@ -0,0 +1,173 @@
+*> \brief \b DORGKL computes the explicit Q factor from DGEQLF and DLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DORGKL(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DORGKL generates an m by n real matrix Q with orthonormal columns,
+*> which is defined as the last n columns of the product of n
+*> elementary reflectors
+*>
+*> Q = I - V*T*V**T = H(n) . . . H(2) H(1)
+*>
+*> Where V is an m by n matrix whose columns are householder reflectors
+*> as returned by DGEQLF and T is the n by n matrix returned by DLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V, and the order of T.
+*> N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is DOUBLE PRECISION array, dimension (LDQ,N)
+*> On entry, Q(1:m-n+i-1,i) contains the vector which defines the
+*> elementary reflector H(i), for i=1,...,n as returned by DGEQLF.
+*> In addition, the lower triangular portion of the submatrix given
+*> by Q(m-n+1:m,1:n) will contain the array T as returned by DLARFT.
+*> See further details for more information.
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The storage of the V and T components inside Q is best illustrated by
+*> the following example with m = 5, n = 3.
+*>
+*> Q = |----------|
+*> | V1 V2 V3 |
+*> | V1 V2 V3 |
+*> | T1 V2 V3 |
+*> | T1 T2 V3 |
+*> | T1 T2 T3 |
+*> |----------|
+*>
+*> \endverbatim
+*>
+* =====================================================================
+
+ SUBROUTINE DORGKL(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION NEG_ONE, ONE
+ PARAMETER(NEG_ONE=-1.0D+0, ONE=1.0D+0)
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL DTRMM, DTRTRM, DLUMM
+* ..
+* .. Intrinsic Functions..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |---|
+* Q = | V |
+* | T |
+* |---|
+*
+* Where T is an n-by-n lower triangular matrix, and V is as described
+* in the Further Details section
+*
+* In turn, break apart V as follows
+*
+* |-----|
+* V = | V_2 |
+* | V_1 |
+* |-----|
+*
+* Where:
+*
+* V_1 \in \R^{n\times n} assumed unit upper triangular
+* V_2 \in \R^{m-n\times n}
+*
+* Compute T = T*V_1**T
+*
+ CALL DTRTRM('Right', 'Lower', 'Transpose', 'Non-Unit', 'Unit',
+ $ N, ONE, Q(M-N+1,1), LDQ, Q(M-N+1,1), LDQ)
+*
+* Compute Q = -VT. This means that we need to break apart
+* Our computation in two parts
+*
+* |--------|
+* Q = | -V_2*T |
+* | -V_1*T |
+* |--------|
+*
+* Q_2 = -V_2*T (TRMM) but only when necessary
+*
+ IF (M.GT.N) THEN
+ CALL DTRMM('Right', 'Lower', 'No Transpose', 'Non-Unit',
+ $ M-N, N, NEG_ONE, Q(M-N+1,1), LDQ, Q, LDQ)
+ END IF
+*
+* Q_1 = -V_1*T (Upper-Lower Matrix-Matrix multiplication)
+*
+ CALL DLUMM('Right', 'Non-Unit', 'Unit', N, NEG_ONE,
+ $ Q(M-N+1,1), LDQ)
+*
+* Q = "I" + Q
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(M-N+I,I) = Q(M-N+I,I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/dorgkr.f b/SRC/dorgkr.f
new file mode 100644
index 0000000000..51b79c3772
--- /dev/null
+++ b/SRC/dorgkr.f
@@ -0,0 +1,153 @@
+*> \brief \b DORGKR computes the explicit Q factor from DGEQRF and DLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DORGKR(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DORGKR generates an m by n real matrix Q with orthonormal columns,
+*> which is defined as the first n columns of the product of n
+*> elementary reflectors
+*>
+*> Q = I - V*T*V**T = H(1) H(2) . . . H(n)
+*>
+*> Where V is an m by n matrix whose columns are householder reflectors
+*> as returned by DGEQRF and T is the n by n matrix returned by DLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V, and the order of T.
+*> N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is DOUBLE PRECISION array, dimension (LDQ,N)
+*> On entry, the upper triangular part and diagonal contains
+*> the array T as returned from DLARFT. In addition, the
+*> strictly lower triangular portion of the i-th column contains
+*> the vector which defines the elementary reflector H(i),
+*> for i = 1,2,...,n, as returned by DGEQRF
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ SUBROUTINE DORGKR(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION NEG_ONE, ONE
+ PARAMETER(NEG_ONE=-1.0D+0, ONE=1.0D+0)
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL DTRMM, DTRTRM, DLUMM
+* ..
+* .. Intrinsic Functions..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |---|
+* Q = | T |
+* | V |
+* |---|
+*
+* Where T is an n-by-n upper triangular matrix, and V is an
+* m-by-n assumed unit lower trapezoidal matrix
+*
+* In turn, break apart V as follows
+*
+* |-----|
+* V = | V_1 |
+* | V_2 |
+* |-----|
+*
+* Where:
+*
+* V_1 \in \R^{n\times n} assumed unit lower triangular
+* V_2 \in \R^{m-n\times n}
+*
+* Compute T = T*V_1**T
+*
+ CALL DTRTRM('Right', 'Upper', 'Transpose', 'Non-unit', 'Unit',
+ $ N, ONE, Q, LDQ, Q, LDQ)
+*
+* Compute Q = -VT. This means that we need to break apart
+* Our computation in two parts
+*
+* |--------|
+* Q = | -V_1*T |
+* | -V_2*T |
+* |--------|
+*
+* Q_2 = -V_2*T (TRMM) but only when necessary
+*
+ IF (M.GT.N) THEN
+ CALL DTRMM('Right', 'Upper', 'No Transpose', 'Non-unit',
+ $ M-N, N, NEG_ONE, Q, LDQ, Q(N+1,1), LDQ)
+ END IF
+*
+* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL DLUMM('Left', 'Unit', 'Non-Unit', N, NEG_ONE, Q, LDQ)
+*
+* Q = "I" + Q
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(I,I) = Q(I,I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/dorglk.f b/SRC/dorglk.f
new file mode 100644
index 0000000000..e68939aa8c
--- /dev/null
+++ b/SRC/dorglk.f
@@ -0,0 +1,149 @@
+*> \brief \b DORGLK computes the explicit Q factor from DGELQF and DLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DORGLK(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DORGLK generates an m by n real matrix Q with orthonormal rows,
+*> which is defined as the first m rows of the product of m
+*> elementary reflectors
+*>
+*> Q = I - V'*T*V = H(1) H(2) . . . H(m)
+*>
+*> Where V is an m by n matrix whose rows are householder reflectors
+*> as returned by DGELQF and T is the m by m matrix returned by DLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V, and the order of T. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is DOUBLE PRECISION array, dimension (LDQ,N)
+*> On entry, the lower triangular part and diagonal contains
+*> the array T as returned from DLARFT. In addition, the
+*> strictly upper triangular portion of the i-th row contains
+*> the vector which defines the elementary reflector H(i),
+*> for i = 1,2,...,m, as returned by DGELQF
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ SUBROUTINE DORGLK(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION NEG_ONE, ONE
+ PARAMETER(NEG_ONE=-1.0D+0, ONE=1.0D+0)
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL DTRMM, DTRTRM, DLUMM
+* ..
+* .. Intrinsic Functions..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |-----|
+* Q = | T V |
+* |-----|
+*
+* Where T is an m-by-m lower triangular matrix, and V is an
+* m-by-n assumed unit upper trapezoidal matrix
+*
+* In turn, break apart V as follows
+*
+* |---------|
+* V = | V_1 V_2 |
+* |---------|
+*
+* Where:
+*
+* V_1 \in \R^{m\times m} assumed unit upper triangular
+* V_2 \in \R^{m\times n-m}
+*
+* Compute T = V_1'*T
+*
+ CALL DTRTRM('Left', 'Lower', 'Transpose', 'Non-unit', 'Unit',
+ $ M, ONE, Q, LDQ, Q, LDQ)
+*
+* Compute Q = -TV. This means that we need to break apart
+* Our computation in two parts
+*
+* |---------------|
+* Q = | -T*V_1 -T*V_2 |
+* |---------------|
+*
+* Q_2 = -T*V_2 (TRMM) but only when necessary
+*
+ IF (N.GT.M) THEN
+ CALL DTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ M, N-M, NEG_ONE, Q, LDQ, Q(1,M+1), LDQ)
+ END IF
+*
+* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL DLUMM('Left', 'Non-unit', 'Unit', M, NEG_ONE, Q, LDQ)
+*
+* Q = "I" + Q
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(I,I) = Q(I,I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/dorglq.f b/SRC/dorglq.f
index 71ff93fa66..152aa11c6a 100644
--- a/SRC/dorglq.f
+++ b/SRC/dorglq.f
@@ -5,6 +5,7 @@
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
+*> \htmlonly
*> Download DORGLQ + dependencies
*>
*> [TGZ]
@@ -12,6 +13,7 @@
*> [ZIP]
*>
*> [TXT]
+*> \endhtmlonly
*
* Definition:
* ===========
@@ -136,17 +138,14 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- DOUBLE PRECISION ZERO
- PARAMETER ( ZERO = 0.0D+0 )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LDWORK, LWKOPT,
+ $ NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL DLARFB, DLARFT, DORGL2, XERBLA
+ EXTERNAL DLARFB0C2, DLARFT, DORGL2,
+ $ DORGLK, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -161,7 +160,8 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'DORGLQ', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, M )*NB
+ ! Only need a workspace for calls to dorgl2
+ LWKOPT = MAX( 1, M )
WORK( 1 ) = LWKOPT
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -190,94 +190,92 @@ SUBROUTINE DORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+ NX = MAX( 0, ILAENV( 3, 'DORGLQ', ' ', M, N, K, -1 ) )
IWS = M
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-* Determine when to cross over from blocked to unblocked code.
*
- NX = MAX( 0, ILAENV( 3, 'DORGLQ', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Handle the first block assuming we are applying to the
+* identity, then resume regular blocking method after
*
- LDWORK = M
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked version
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'DORGLQ', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL DORGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
+ I = KK + 1
+ IB = NB
*
-* Use blocked code after the last block.
-* The first kk rows are handled by the block method.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ CALL DLARFT( 'Forward', 'Transpose', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), A( I, I ), LDA )
*
-* Set A(kk+1:m,1:kk) to zero.
+* Apply H to A(i+ib:m,i:n) from the right
*
- DO 20 J = 1, KK
- DO 10 I = KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL DLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I+IB,I), LDA)
*
-* Use unblocked code for the last or only block.
+* Apply H to columns i:n of current block
+
+ CALL DORGLK( IB, N-I+1, A( I, I ), LDA)
*
- IF( KK.LT.M )
- $ CALL DORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+* Use blocked code
*
- IF( KK.GT.0 ) THEN
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
-* Use blocked code
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.M ) THEN
+ CALL DLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I),
+ $ LDA, TAU( I ), A( I, I ), LDA )
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+* Apply H to A(i+ib:m,i:n) from the right
*
- CALL DLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
- $ I ),
- $ LDA, TAU( I ), WORK, LDWORK )
+ CALL DLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I+IB,I), LDA)
*
-* Apply H**T to A(i+ib:m,i:n) from the right
+* Apply H to columns i:n of current block
*
- CALL DLARFB( 'Right', 'Transpose', 'Forward',
- $ 'Rowwise',
- $ M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK,
- $ LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ),
- $ LDWORK )
- END IF
+ CALL DORGLK( IB, N-I+1, A( I, I ), LDA)
+ END DO
*
-* Apply H**T to columns i:n of current block
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
*
- CALL DORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
-* Set columns 1:i-1 of current block to zero
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 40 J = 1, I - 1
- DO 30 L = I, I + IB - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL DLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I),
+ $ LDA, TAU( I ), A( I, I ), LDA )
+*
+* Apply H to A(i+ib:m,i:n) from the right
+*
+ CALL DLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I+IB,I), LDA)
+*
+* Apply H to columns i:n of current block
+*
+ CALL DORGLK( IB, N-I+1, A( I, I ), LDA)
+ END IF
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/dorgql.f b/SRC/dorgql.f
index c7bba63c22..7eb8cd6468 100644
--- a/SRC/dorgql.f
+++ b/SRC/dorgql.f
@@ -5,6 +5,7 @@
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
+*> \htmlonly
*> Download DORGQL + dependencies
*>
*> [TGZ]
@@ -12,6 +13,7 @@
*> [ZIP]
*>
*> [TXT]
+*> \endhtmlonly
*
* Definition:
* ===========
@@ -95,8 +97,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -137,17 +137,13 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- DOUBLE PRECISION ZERO
- PARAMETER ( ZERO = 0.0D+0 )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
- $ NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN
* ..
* .. External Subroutines ..
- EXTERNAL DLARFB, DLARFT, DORG2L, XERBLA
+ EXTERNAL DLARFB0C2, DLARFT, DORG2L,
+ $ DORGKL, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -177,7 +173,8 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
LWKOPT = 1
ELSE
NB = ILAENV( 1, 'DORGQL', ' ', M, N, K, -1 )
- LWKOPT = N*NB
+ ! Only need a workspace for calls to dorg2l
+ LWKOPT = N
END IF
WORK( 1 ) = LWKOPT
*
@@ -200,88 +197,75 @@ SUBROUTINE DORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-* Determine when to cross over from blocked to unblocked code.
*
- NX = MAX( 0, ILAENV( 3, 'DORGQL', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* We want to use the blocking method as long as our matrix is big enough
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KK = K
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Possibly bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'DORGQL', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL DORG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+* Use our blocked code for everything
+*
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the first block.
-* The last kk columns are handled by the block method.
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+ I = 1
+ IB = NB
*
-* Set A(m-kk+1:m,1:n-kk) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- DO 20 J = 1, N - KK
- DO 10 I = M - KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA)
*
-* Use unblocked code for the first or only block.
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
*
- CALL DORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward',
+ $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I),
+ $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
-* Use blocked code
+ CALL DORGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA)
+
+* Use blocked code on the remaining blocks if there are any.
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- IF( N-K+I.GT.1 ) THEN
+ DO I = NB+1, K, NB
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* The last block may be less than size NB
*
- CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
- $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+ IB = MIN(NB, K-I+1)
*
-* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL DLARFB( 'Left', 'No transpose', 'Backward',
- $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
- $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL DLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA )
*
-* Apply H to rows 1:m-k+i+ib-1 of current block
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
*
- CALL DORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
- $ TAU( I ), WORK, IINFO )
+ CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
+ $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
-* Set rows m-k+i+ib:m of current block to zero
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
- DO 40 J = N - K + I, N - K + I + IB - 1
- DO 30 L = M - K + I + IB, M
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL DORGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA)
+ END DO
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/dorgqr.f b/SRC/dorgqr.f
index 83e26588e3..c09b48990f 100644
--- a/SRC/dorgqr.f
+++ b/SRC/dorgqr.f
@@ -5,6 +5,7 @@
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
+*> \htmlonly
*> Download DORGQR + dependencies
*>
*> [TGZ]
@@ -12,6 +13,7 @@
*> [ZIP]
*>
*> [TXT]
+*> \endhtmlonly
*
* Definition:
* ===========
@@ -95,8 +97,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -119,7 +119,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
-*> \ingroup ungqr
+*> \ingroup doubleOTHERcomputational
*
* =====================================================================
SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -137,20 +137,17 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- DOUBLE PRECISION ZERO
- PARAMETER ( ZERO = 0.0D+0 )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, KI, KK, LWKOPT,
+ $ NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL DLARFB, DLARFT, DORG2R, XERBLA
+ EXTERNAL DLARFB0C2, DLARFT, DORG2R,
+ $ DORGKR, XERBLA
* ..
* .. Intrinsic Functions ..
- INTRINSIC MAX, MIN
+ INTRINSIC MAX
* ..
* .. External Functions ..
INTEGER ILAENV
@@ -162,7 +159,10 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'DORGQR', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, N )*NB
+*
+* Only need a workspace for dorg2r in case of bail out
+*
+ LWKOPT = MAX( 1, N )
WORK( 1 ) = LWKOPT
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -191,95 +191,93 @@ SUBROUTINE DORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
- IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
+ NX = MAX(0, ILAENV(3, 'DORGQR', ' ', M, N, K, -1))
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'DORGQR', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Handle the first block assuming we are applying to the
+* identity, then resume regular blocking method after
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'DORGQR', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL DORG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
+ I = KK + 1
+ IB = NB
*
-* Use blocked code after the last block.
-* The first kk columns are handled by the block method.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
-* Set A(1:kk,kk+1:n) to zero.
+* Apply H to A(i:m,i+ib:n) from the left
*
- DO 20 J = KK + 1, N
- DO 10 I = 1, KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL DLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+ $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I,I+IB), LDA)
*
-* Use unblocked code for the last or only block.
+* Apply H to rows i:m of current block
*
- IF( KK.LT.N )
- $ CALL DORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ CALL DORGKR(M-I+1, IB, A(I,I), LDA)
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
- IF( KK.GT.0 ) THEN
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Use blocked code
+ CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.N ) THEN
+* Apply H to A(i:m,i+ib:n) from the left
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+ CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
*
- CALL DLARFT( 'Forward', 'Columnwise', M-I+1, IB,
- $ A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+* Apply H to rows i:m of current block
*
-* Apply H to A(i:m,i+ib:n) from the left
+ CALL DORGKR(M-I+1, IB, A(I,I), LDA)
+ END DO
*
- CALL DLARFB( 'Left', 'No transpose', 'Forward',
- $ 'Columnwise', M-I+1, N-I-IB+1, IB,
- $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
- $ LDA, WORK( IB+1 ), LDWORK )
- END IF
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
*
-* Apply H to rows i:m of current block
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
- CALL DORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Set rows 1:i-1 of current block to zero
+ CALL DLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- DO 40 J = I, I + IB - 1
- DO 30 L = 1, I - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL DLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+* Apply H to rows i:m of current block
+*
+ CALL DORGKR(M-I+1, IB, A(I,I), LDA)
+ END IF
END IF
*
- WORK( 1 ) = IWS
+ WORK( 1 ) = N
RETURN
*
* End of DORGQR
diff --git a/SRC/dorgrk.f b/SRC/dorgrk.f
new file mode 100644
index 0000000000..4017352d3f
--- /dev/null
+++ b/SRC/dorgrk.f
@@ -0,0 +1,168 @@
+*> \brief \b DORGRK computes the explicit Q factor from DGERQF and DLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE DORGRK(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DORGRK generates an m by n real matrix Q with orthonormal rows,
+*> which is defined as the last m rows of the product of m
+*> elementary reflectors
+*>
+*> Q = I - V'*T*V = H(m) . . . H(2) H(1)
+*>
+*> Where V is an m by n matrix whose columns are householder reflectors
+*> as returned by DGERQF and T is the n by n matrix returned by DLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V, and the order of T.
+*> M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is DOUBLE PRECISION array, dimension (LDQ,N)
+*> On entry, Q(i,1:n-m-1+i) contains the vector which defines the
+*> elementary reflector H(i), for i=1,...,m as returned by DGERQF.
+*> In addition, the upper triangular portion of the submatrix given
+*> by Q(1:m,n-m+1:n) will contain the array T as returned by DLARFT.
+*> See further details for more information.
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The storage of the V and T components inside Q is best illustrated by
+*> the following example with m = 3, n = 5.
+*>
+*> Q = |----------------|
+*> | V1 V1 T1 T1 T1 |
+*> | V2 V2 V2 T2 T2 |
+*> | V3 V3 V3 V3 T3 |
+*> |----------------|
+*>
+*> \endverbatim
+*>
+* =====================================================================
+
+ SUBROUTINE DORGRK(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ DOUBLE PRECISION NEG_ONE, ONE
+ PARAMETER(NEG_ONE=-1.0D+0, ONE=1.0D+0)
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL DTRMM, DTRTRM, DLUMM
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |-----|
+* Q = | V T |
+* |-----|
+*
+* Where T is an m-by-m upper triangular matrix, and V is as described
+* in the Further Details section
+*
+* In turn, break apart V as follows
+*
+* |---------|
+* V = | V_2 V_1 |
+* |---------|
+*
+* Where:
+*
+* V_1 \in \R^{m\times m} assumed unit lower triangular
+* V_2 \in \R^{m\times n-m}
+*
+* Compute T = V_1'*T (in place in the T block, Q(1:m,n-m+1:n))
+*
+ CALL DTRTRM('Left', 'Upper', 'Transpose', 'Non-Unit', 'Unit',
+ $ M, ONE, Q(1,N-M+1), LDQ, Q(1,N-M+1), LDQ)
+*
+* Compute Q = -T*V. This means that we need to break apart
+* our computation in two parts
+*
+* |---------------|
+* Q = | -T*V_2 -T*V_1 |
+* |---------------|
+*
+* Q_2 = -T*V_2 (TRMM) but only when necessary (N > M)
+*
+ IF (N.GT.M) THEN
+ CALL DTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit',
+ $ M, N-M, NEG_ONE, Q(1,N-M+1), LDQ, Q, LDQ)
+ END IF
+*
+* Q_1 = -T*V_1 (Upper-Lower Matrix-Matrix multiplication)
+*
+ CALL DLUMM('Right', 'Unit', 'Non-Unit', M, NEG_ONE,
+ $ Q(1,N-M+1), LDQ)
+*
+* Q = "I" + Q (add one to the diagonal of the trailing m-by-m block)
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(I,N-M+I) = Q(I,N-M+I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/dorgrq.f b/SRC/dorgrq.f
index 126489bbe9..9b945df4ea 100644
--- a/SRC/dorgrq.f
+++ b/SRC/dorgrq.f
@@ -5,6 +5,7 @@
* Online html documentation available at
* http://www.netlib.org/lapack/explore-html/
*
+*> \htmlonly
*> Download DORGRQ + dependencies
*>
*> [TGZ]
@@ -12,6 +13,7 @@
*> [ZIP]
*>
*> [TXT]
+*> \endhtmlonly
*
* Definition:
* ===========
@@ -137,17 +139,14 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- DOUBLE PRECISION ZERO
- PARAMETER ( ZERO = 0.0D+0 )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+ INTEGER I, IB, II, IINFO, IWS, KK, LDWORK,
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL DLARFB, DLARFT, DORGR2, XERBLA
+ EXTERNAL DLARFB0C2, DLARFT, DORGR2,
+ $ DORGRK, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -177,7 +176,8 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
LWKOPT = 1
ELSE
NB = ILAENV( 1, 'DORGRQ', ' ', M, N, K, -1 )
- LWKOPT = M*NB
+ ! Only need a workspace for calls to dorgr2
+ LWKOPT = M
END IF
WORK( 1 ) = LWKOPT
*
@@ -207,83 +207,72 @@ SUBROUTINE DORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
* Determine when to cross over from blocked to unblocked code.
*
NX = MAX( 0, ILAENV( 3, 'DORGRQ', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
-*
-* Determine if workspace is large enough for blocked code.
-*
- LDWORK = M
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
-*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
-*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'DORGRQ', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
END IF
*
IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Use blocked code after the first block.
-* The last kk rows are handled by the block method.
-*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+* We want to use the blocking method as long as our matrix is big enough
+* and it's deemed worthwhile
*
-* Set A(1:m-kk,n-kk+1:n) to zero.
-*
- DO 20 J = N - KK + 1, N
- DO 10 I = 1, M - KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KK = K
ELSE
KK = 0
END IF
*
-* Use unblocked code for the first or only block.
+* Potentially bail to the unblocked code
*
- CALL DORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL DORGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
*
-* Use blocked code
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- II = M - K + I
- IF( II.GT.1 ) THEN
+ I = 1
+ IB = NB
+ II = M - K + I
+*
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
+*
+ CALL DLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA )
+*
+* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+ CALL DLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Backward',
+ $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA,
+ $ A( II, N-K+I ), LDA, A, LDA)
*
- CALL DLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
- $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+* Apply H to columns 1:n-k+i+ib-1 of current block
+*
+ CALL DORGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA )
+
+ DO I = NB + 1, K, NB
+*
+* The last block may be less than size NB
+*
+ IB = MIN(NB, K-I+1)
+ II = M - K + I
*
-* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL DLARFB( 'Right', 'Transpose', 'Backward',
- $ 'Rowwise',
- $ II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK,
- $ LDWORK, A, LDA, WORK( IB+1 ), LDWORK )
- END IF
+ CALL DLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA )
*
-* Apply H**T to columns 1:n-k+i+ib-1 of current block
+* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- CALL DORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
- $ TAU( I ),
- $ WORK, IINFO )
+ CALL DLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1),
+ $ LDA, A( II, N-K+I ), LDA, A, LDA)
*
-* Set columns n-k+i+ib:n of current block to zero
+* Apply H to columns 1:n-k+i+ib-1 of current block
*
- DO 40 L = N - K + I + IB, N
- DO 30 J = II, II + IB - 1
- A( J, L ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL DORGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA )
+ END DO
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/dtrmmoop.f b/SRC/dtrmmoop.f
new file mode 100644
index 0000000000..3052d2eef3
--- /dev/null
+++ b/SRC/dtrmmoop.f
@@ -0,0 +1,1625 @@
+*> \brief \b DTRMMOOP computes an out of place triangular times general matrix multiplication
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB,
+* $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* .. Scalar Arguments ..
+* DOUBLE PRECISION ALPHA, BETA
+* INTEGER M, N, LDA, LDB, LDC
+* CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION A(LDA,*), B(LDB,*), C(LDC,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DTRMMOOP performs one of the matrix-matrix operations
+*>
+*> C = \alpha op(A) * op(B) + \beta C
+*> or
+*> C = \alpha op(B) * op(A) + \beta C
+*>
+*> where \alpha and \beta are scalars, C is an m-by-n matrix, A is
+*> a unit, or non-unit, upper or lower triangular matrix, and op(A) is
+*> one of
+*>
+*> op(A) = A or op(A) = A**T
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> On entry, SIDE specifies whether op(A) multiplies op(B) from
+*> the left or right as follows:
+*>
+*> SIDE = 'L' or 'l' C = \alpha op(A) * op(B) + \beta C
+*>
+*> SIDE = 'R' or 'r' C = \alpha op(B) * op(A) + \beta C
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> On entry, UPLO specifies whether the matrix A is an upper or
+*> lower triangular matrix as follows:
+*> UPLO = 'U' or 'u' A is upper triangular
+*>
+*> UPLO = 'L' or 'l' A is lower triangular
+*> \endverbatim
+*>
+*> \param[in] TRANSA
+*> \verbatim
+*> TRANSA is CHARACTER*1
+*> On entry, TRANSA specifies the form of op(A) to be used in
+*> the matrix multiplication as follows:
+*> TRANSA = 'N' or 'n' op(A) = A
+*>
+*> TRANSA = 'T' or 't' op(A) = A**T
+*>
+*> TRANSA = 'C' or 'c' op(A) = A**T
+*> \endverbatim
+*>
+*> \param[in] TRANSB
+*> \verbatim
+*> TRANSB is CHARACTER*1
+*> On entry, TRANSB specifies the form of op(B) to be used in
+*> the matrix multiplication as follows:
+*> TRANSB = 'N' or 'n' op(B) = B
+*>
+*> TRANSB = 'T' or 't' op(B) = B**T
+*>
+*> TRANSB = 'C' or 'c' op(B) = B**T
+*> \endverbatim
+*>
+*> \param[in] DIAG
+*> \verbatim
+*> DIAG is CHARACTER*1
+*> On entry, DIAG specifies whether or not A is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' A is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' A is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> On entry, M specifies the number of rows of C. M must be at
+*> least zero.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> On entry, N specifies the number of columns of C. N must be
+*> at least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is DOUBLE PRECISION.
+*> On entry, ALPHA specifies the scalar alpha. When alpha is
+*> zero then A and B are not referenced, and A and B need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*> A is DOUBLE PRECISION array, dimension ( LDA, K ) where
+*> K is M when SIDE = 'L' and K is N when SIDE='R'
+*> Before entry with UPLO = 'U' or 'u', the leading k-by-k
+*> upper triangular part of the array A must contain the upper
+*> triangular matrix and the strictly lower triangular part of
+*> A is not referenced.
+*> Before entry with UPLO = 'L' or 'l', the leading k-by-k
+*> lower triangular part of the array A must contain the lower
+*> triangular matrix and the strictly upper triangular part of
+*> A is not referenced.
+*> Note that when DIAG = 'U' or 'u', the diagonal elements of
+*> A are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> On entry, LDA specifies the first dimension of A as declared
+*> in the calling (sub) program. When SIDE = 'L' or 'l' then
+*> LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+*> then LDA must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*> B is DOUBLE PRECISION array, dimension ( LDB, K ), where K is M
+*> if TRANSB = 'T' or 'C', and K is N otherwise (op(B) is always
+*> m-by-n). On entry, the leading submatrix must contain
+*> B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*> LDB is INTEGER
+*> On entry, LDB specifies the first dimension of B as declared
+*> in the calling (sub) program. When TRANSB = 'N'
+*> then LDB must be at least max( 1, m ), and when
+*> TRANSB = 'T' or 'C' then LDB must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] BETA
+*> \verbatim
+*> BETA is DOUBLE PRECISION.
+*> On entry, BETA specifies the scalar beta. When beta is
+*> zero then C is not referenced on entry, and C need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is DOUBLE PRECISION array, dimension ( LDC, N )
+*> Before entry, the leading m-by-n part of the array C must
+*> contain the matrix C, and on exit is overwritten by the
+*> transformed matrix.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> On entry, LDC specifies the first dimension of C as declared
+*> in the calling (sub) program. LDC must be at least
+*> max( 1, m ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ RECURSIVE SUBROUTINE DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB,
+ $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* .. Scalar Arguments ..
+ DOUBLE PRECISION ALPHA, BETA
+ INTEGER M, N, LDA, LDB, LDC
+ CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION A(LDA,*), B(LDB,*), C(LDC,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ DOUBLE PRECISION DDOT
+ EXTERNAL LSAME, DDOT
+* ..
+* .. External Subroutines ..
+ EXTERNAL DGEMM, DAXPY, DLASET, DSCAL
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Local Scalars ..
+ INTEGER I, L, K, INCB
+ LOGICAL LSIDE, UPPER, UNIT, TRANST, TRANSG
+* ..
+* .. Local Parameters ..
+ DOUBLE PRECISION ONE, ZERO
+ PARAMETER(ONE=1.0D+0, ZERO=0.0D+0)
+* ..
+*
+* Beginning of Executable Statements
+*
+ LSIDE = LSAME(SIDE, 'L')
+ UPPER = LSAME(UPLO, 'U')
+ ! If we are transposing the triangular matrix (A)
+ TRANST= LSAME(TRANSA, 'T').OR.LSAME(TRANSA, 'C')
+ ! If we are transposing the general matrix (B)
+ TRANSG= LSAME(TRANSB, 'T').OR.LSAME(TRANSB, 'C')
+*
+* Terminating Case
+*
+ UNIT = LSAME(DIAG, 'U')
+ IF (M.EQ.1.AND.N.EQ.1) THEN
+*
+* This case is the simplest as we are just computing C = \alpha A*B +
+* \beta C where all components are 1-by-1 matrices
+*
+
+ IF (BETA.EQ.ZERO) THEN
+ C(1,1) = ZERO
+ ELSE
+ C(1,1) = C(1,1) * BETA
+ END IF
+ IF(ALPHA.NE.ZERO) THEN
+ IF(UNIT) THEN
+ C(1,1) = C(1,1) + ALPHA*B(1,1)
+ ELSE
+ C(1,1) = C(1,1) + ALPHA*A(1,1)*B(1,1)
+ END IF
+ END IF
+ RETURN
+ ELSE IF (M.EQ.1) THEN
+*
+* This means that C is a row vector. If BETA is 0, then we
+* set it explicitly, otherwise we overwrite it with BETA*C
+*
+ IF (BETA.EQ.ZERO) THEN
+ ! This ensures we don't reference C unless we need to
+ CALL DLASET('All', M, N, ZERO, ZERO, C, LDC)
+ ELSE
+ CALL DSCAL(N, BETA, C, LDC)
+ END IF
+ IF (ALPHA.NE.ZERO) THEN
+*
+* Recall that the number of columns of A is determined by SIDE
+*
+ IF (LSIDE) THEN
+*
+* Determine if B is a row or column vector
+*
+ IF (TRANSG) THEN
+ INCB = 1 ! This means that B is a column vector
+ ELSE
+ INCB = LDB ! This means that B is a row vector
+ END IF
+*
+* This means that A is a scalar, so it is either assumed to be
+* ONE or explicitly stored in A(1,1)
+*
+ IF (UNIT) THEN
+ CALL DAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ CALL DAXPY(N, ALPHA * A(1,1), B, INCB, C, LDC)
+ END IF
+ ELSE ! A is on the right
+*
+* Determine if B is a row or column vector
+*
+ IF (TRANSG) THEN
+ INCB = 1 ! This means that B is a column vector
+ ELSE
+ INCB = LDB ! This means that B is a row vector
+ END IF
+*
+* This means that A is an n-by-n matrix
+*
+ IF (UPPER) THEN
+ IF (TRANST) THEN
+ IF (TRANSG) THEN
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(N-I,
+ $ B(I+1,1), 1, A(I,I+1), LDA) +
+ $ C(1,I)
+ END DO
+ CALL DAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(N-I+1,
+ $ B(I,1), 1, A(I,I), LDA) +
+ $ C(1,I)
+ END DO
+ END IF
+ ELSE ! Not transposing B
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(N-I,
+ $ B(1,I+1), LDB, A(I,I+1), LDA) +
+ $ C(1,I)
+ END DO
+ CALL DAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(N-I+1,
+ $ B(1,I), LDB, A(I,I), LDA) +
+ $ C(1,I)
+ END DO
+ END IF
+ END IF
+ ELSE ! Not transposing A
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(I-1, B, INCB,
+ $ A(1,I), 1) + C(1,I)
+ END DO
+
+ CALL DAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(I, B, INCB,
+ $ A(1,I), 1) + C(1,I)
+ END DO
+ END IF
+ END IF
+ ELSE ! A is lower
+ IF (TRANST) THEN
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(I-1, B, INCB,
+ $ A(I,1), LDA) + C(1,I)
+ END DO
+
+ CALL DAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(I, B, INCB,
+ $ A(I,1), LDA) + C(1,I)
+ END DO
+ END IF
+ ELSE ! A is not transposed
+ IF (TRANSG) THEN
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(N-I,
+ $ B(I+1,1), 1, A(I+1,I), 1) +
+ $ C(1,I)
+ END DO
+ CALL DAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(N-I+1,
+ $ B(I,1), 1, A(I,I), 1) +
+ $ C(1,I)
+ END DO
+ END IF
+ ELSE! B is not transposed
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(N-I,
+ $ B(1,I+1), LDB, A(I+1,I), 1) +
+ $ C(1,I)
+ END DO
+ CALL DAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * DDOT(N-I+1,
+ $ B(1,I), LDB, A(I,I), 1) +
+ $ C(1,I)
+ END DO
+ END IF
+ END IF
+ END IF
+ END IF
+ END IF
+ END IF
+ RETURN
+ ELSE IF (N.EQ.1) THEN
+*
+* This means that C is a column vector. If BETA is 0, then we
+* set it explicitly, otherwise we overwrite it with BETA*C
+*
+ IF (BETA.EQ.ZERO) THEN
+ ! This ensures we don't reference C unless we need to
+ CALL DLASET('All', M, N, ZERO, ZERO, C, LDC)
+ ELSE
+ CALL DSCAL(M, BETA, C, 1)
+ END IF
+
+ IF (ALPHA.NE.ZERO) THEN
+ IF (TRANSG) THEN
+ INCB = LDB ! B is a row vector
+ ELSE
+ INCB = 1 ! B is a column vector
+ END IF
+ IF (LSIDE) THEN
+ IF (UPPER) THEN
+ IF (TRANST) THEN
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(I-1, A(1, I),
+ $ 1, B, INCB) + C(I,1)
+ END DO
+ CALL DAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(I, A(1, I), 1,
+ $ B, INCB) + C(I,1)
+ END DO
+ END IF
+ ELSE ! A is not transposed
+ IF (TRANSG) THEN
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(M-I,
+ $ A(I,I+1), LDA, B(1, I+1), LDB) +
+ $ C(I,1)
+ END DO
+
+ CALL DAXPY(M, ALPHA, B, LDB, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(M-I+1,
+ $ A(I,I), LDA, B(1,I), LDB) +
+ $ C(I,1)
+ END DO
+ END IF
+ ELSE ! B is not transposed
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(M-I,
+ $ A(I,I+1), LDA, B(I+1,1), 1) +
+ $ C(I,1)
+ END DO
+
+ CALL DAXPY(M, ALPHA, B, 1, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(M-I+1,
+ $ A(I,I), LDA, B(I,1), 1) +
+ $ C(I,1)
+ END DO
+ END IF
+ END IF
+ END IF
+ ELSE ! A is lower
+ IF (TRANST) THEN
+ IF (TRANSG) THEN
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(M-I,
+ $ A(I+1,I), 1, B(1,I+1), LDB) +
+ $ C(I,1)
+ END DO
+
+ CALL DAXPY(M, ALPHA, B, LDB, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(M-I+1,
+ $ A(I,I), 1, B(1,I), LDB) +
+ $ C(I,1)
+ END DO
+ END IF
+ ELSE ! A is not transposed
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(M-I,
+ $ A(I+1,I), 1, B(I+1,1), 1) +
+ $ C(I,1)
+ END DO
+
+ CALL DAXPY(M, ALPHA, B, 1, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(M-I+1,
+ $ A(I,I), 1, B(I,1), 1) +
+ $ C(I,1)
+ END DO
+ END IF
+ END IF
+ ELSE ! A is not transposed
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(I-1, A(I,1),
+ $ LDA, B, INCB) + C(I,1)
+ END DO
+ CALL DAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * DDOT(I, A(I,1), LDA,
+ $ B, INCB) + C(I,1)
+ END DO
+ END IF
+ END IF
+ END IF
+ ELSE ! A is on the right
+ ! Since the trailing dimension of op(A) must be 1,
+ ! we know that A must be a scalar
+ IF (UNIT) THEN
+ CALL DAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+ CALL DAXPY(M, ALPHA*A(1,1), B, INCB, C, 1)
+ END IF
+ END IF
+ END IF
+ RETURN
+ END IF
+*
+* Recursive Case
+*
+ L = MIN(M,N)/2
+ IF (LSIDE) THEN
+*
+* We are multiplying A from the left IE we are computing
+* C = \alpha op(A)*op(B) + \beta C
+*
+ IF (UPPER) THEN
+*
+* A is upper triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+*              We are transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11}**T + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{21}**T + \beta C_{12}
+* C_{21} = \alpha A_{12}**T * B_{11}**T + \alpha A_{22}**T * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{12}**T * B_{21}**T + \alpha A_{22}**T * B_{22}**T + \beta C_{22}
+*
+*              Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{12}**T * B_{11}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22}**T * B_{12}**T + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{12}**T * B_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22}**T * B_{22}**T + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C,
+ $ LDC)
+ ! C_{12}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(1, L+1), LDA, B, LDB, BETA, C(L+1,1),
+ $ LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1,L+1), LDA, B(1,L+1),
+ $ LDB, ONE, C(L+1,1), LDC)
+ ! C_{22}
+ CALL DGEMM(TRANSA, TRANSB, M-L, N-L, L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1,1), LDB, BETA,
+ $ C(L+1,L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1,L+1), LDA,
+ $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11} + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{12} + \beta C_{12}
+* C_{21} = \alpha A_{12}**T * B_{11} + \alpha A_{22}**T * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{12}**T * B_{12} + \alpha A_{22}**T * B_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{12}**T * B_{11} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22}**T * B_{21} + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{12}**T * B_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22}**T * B_{22} + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ ! C_{21}
+ CALL DGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(1, L+1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DGEMM(TRANSA, TRANSB, M-L, N-L, L,
+ $ ALPHA, A(1, L+1), LDA, B(1, L+1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11}**T + \alpha A_{12} * B_{12}**T + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12}
+* C_{21} = \alpha A_{22} * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{22} * B_{22}**T + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{12} * B_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11} * B_{11}**T + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{12} * B_{22}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11} * B_{21}**T + C_{12} (This routine)
+*
+ ! C_{11}
+ CALL DGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(1, L+1), LDB, BETA, C, LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL DGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1,1), LDB, ONE,
+ $ C(1, L+1), LDC)
+ ! C_{21}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11} + \alpha A_{12} * B_{21} + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{12} + \alpha A_{12} * B_{22} + \beta C_{12}
+* C_{21} = \alpha A_{22} * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{22} * B_{22} + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{12} * B_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11} * B_{11} + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{12} * B_{22} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11} * B_{12} + C_{12} (This routine)
+*
+ ! C_{11}
+ CALL DGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, 1), LDB, BETA, C, LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL DGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ ONE, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ ELSE
+*
+* A is lower triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11}**T + \alpha A_{21}**T * B_{12}**T + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{21}**T + \alpha A_{21}**T * B_{22}**T + \beta C_{12}
+* C_{21} = \alpha A_{22}**T * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{22}**T * B_{22}**T + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{21}**T * B_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11}**T * B_{11}**T + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{21}**T * B_{22}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11}**T * B_{21}**T + C_{12} (This routine)
+*
+ ! C_{11}
+ CALL DGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(1, L+1), LDB, BETA, C, LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL DGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE,
+ $ C(1, L+1), LDC)
+ ! C_{21}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11} + \alpha A_{21}**T * B_{21} + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{12} + \alpha A_{21}**T * B_{22} + \beta C_{12}
+* C_{21} = \alpha A_{22}**T * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{22}**T * B_{22} + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{21}**T * B_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11}**T * B_{11} + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{21}**T * B_{22} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11}**T * B_{12} + C_{12} (This routine)
+*
+ ! C_{11}
+ CALL DGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, C, LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL DGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, ONE,
+ $ C(1, L+1), LDC)
+ ! C_{21}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11}**T + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{21}**T + \beta C_{12}
+*              C_{21} = \alpha A_{21} * B_{11}**T + \alpha A_{22} * B_{12}**T + \beta C_{21}
+*              C_{22} = \alpha A_{21} * B_{21}**T + \alpha A_{22} * B_{22}**T + \beta C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{21} * B_{11}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22} * B_{12}**T + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{21} * B_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22} * B_{22}**T + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DGEMM(TRANSA, TRANSB, M-L, N-L, L,
+ $ ALPHA, A(L+1, 1), LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11} + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{12} + \beta C_{12}
+*              C_{21} = \alpha A_{21} * B_{11} + \alpha A_{22} * B_{21} + \beta C_{21}
+*              C_{22} = \alpha A_{21} * B_{12} + \alpha A_{22} * B_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{21} * B_{11} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22} * B_{21} + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{21} * B_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22} * B_{22} + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, A(L+1, 1), LDA, B(1, L+1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ END IF
+ ELSE
+*
+* We are multiplying A from the right IE we are computing
+* C = \alpha op(B)*op(A) + \beta C
+*
+ IF (UPPER) THEN
+*
+* A is upper triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11}**T + \alpha B_{21}**T * A_{12}**T + \beta C_{11}
+* C_{12} = \alpha B_{21}**T * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11}**T + \alpha B_{22}**T * A_{12}**T + \beta C_{21}
+* C_{22} = \alpha B_{22}**T * A_{22}**T + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{21}**T * A_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11}**T * A_{11}**T + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22}**T * A_{12}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{12}**T * A_{11}**T + C_{21} (This routine)
+*
+ ! C_{11}
+ CALL DGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(L+1, 1), LDB, A(1, L+1), LDA, BETA, C, LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ ONE, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11}**T + \alpha B_{12} * A_{12}**T + \beta C_{11}
+* C_{12} = \alpha B_{12} * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11}**T + \alpha B_{22} * A_{12}**T + \beta C_{21}
+* C_{22} = \alpha B_{22} * A_{22}**T + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{12} * A_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11} * A_{11}**T + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22} * A_{12}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{21} * A_{11}**T + C_{21} (This routine)
+*
+ ! C_{11}
+ CALL DGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(1,L+1), LDB, A(1,L+1), LDA, BETA, C, LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ ONE, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11} + \beta C_{11}
+* C_{12} = \alpha B_{11}**T * A_{12} + \alpha B_{21}**T * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11} + \beta C_{21}
+* C_{22} = \alpha B_{12}**T * A_{12} + \alpha B_{22}**T * A_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11}**T * A_{12} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{21}**T * A_{22} + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{12}**T * A_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22}**T * A_{22} + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL DGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, B(1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11}
+* C_{12} = \alpha B_{11} * A_{12} + \alpha B_{12} * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11} + \beta C_{21}
+* C_{22} = \alpha B_{21} * A_{12} + \alpha B_{22} * A_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11} * A_{12} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{12} * A_{22} + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{21} * A_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22} * A_{22} + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL DGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, B(L+1, 1), LDB, A(1, L+1), LDA,
+ $ BETA, C(L+1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ ELSE
+*
+* A is lower triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11}**T + \beta C_{11}
+* C_{12} = \alpha B_{11}**T * A_{21}**T + \alpha B_{21}**T * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11}**T + \beta C_{21}
+* C_{22} = \alpha B_{12}**T * A_{21}**T + \alpha B_{22}**T * A_{22}**T + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11}**T * A_{21}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{21}**T * A_{22}**T + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{12}**T * A_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22}**T * A_{22}**T + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL DGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA,
+ $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+*       C_{11} = \alpha B_{11} * A_{11}**T + \beta C_{11}
+*       C_{12} = \alpha B_{11} * A_{21}**T + \alpha B_{12} * A_{22}**T + \beta C_{12}
+*       C_{21} = \alpha B_{21} * A_{11}**T + \beta C_{21}
+*       C_{22} = \alpha B_{21} * A_{21}**T + \alpha B_{22} * A_{22}**T + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11} * A_{21}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{12} * A_{22}**T + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{21} * A_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22} * A_{22}**T + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL DGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA,
+ $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11} + \alpha B_{21}**T * A_{21} + \beta C_{11}
+* C_{12} = \alpha B_{21}**T * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11} + \alpha B_{22}**T * A_{21} + \beta C_{21}
+* C_{22} = \alpha B_{22}**T * A_{22} + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{21}**T * A_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11}**T * A_{11} + C_{11}(This routine)
+*
+* C_{21} = \alpha B_{22}**T * A_{21} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{12}**T * A_{11} + C_{21} (This routine)
+*
+ ! C_{11}
+ CALL DGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, C, LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, ONE,
+ $ C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11} + \alpha B_{12} * A_{21} + \beta C_{11}
+* C_{12} = \alpha B_{12} * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11} + \alpha B_{22} * A_{21} + \beta C_{21}
+* C_{22} = \alpha B_{22} * A_{22} + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{12} * A_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11} * A_{11} + C_{11}(This routine)
+*
+* C_{21} = \alpha B_{22} * A_{21} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{21} * A_{11} + C_{21} (This routine)
+*
+ ! C_{11}
+ CALL DGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, C, LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL DGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE,
+ $ C(L+1, 1), LDC)
+ ! C_{22}
+ CALL DTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ END IF
+ END IF
+ END SUBROUTINE
diff --git a/SRC/dtrtrm.f b/SRC/dtrtrm.f
new file mode 100644
index 0000000000..6a5aabb3b7
--- /dev/null
+++ b/SRC/dtrtrm.f
@@ -0,0 +1,577 @@
+*> \brief \b DTRTRM computes an in place triangular-triangular matrix product
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE DTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV,
+* $ N, ALPHA, T, LDT, V, LDV)
+*
+* .. Scalar Arguments ..
+* INTEGER N, LDT, LDV
+* CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV
+* DOUBLE PRECISION ALPHA
+* ..
+* .. Array Arguments ..
+* DOUBLE PRECISION T(LDT,*), V(LDV,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> DTRTRM performs one of the matrix-matrix operations
+*>
+*> T = \alpha op(V) * T
+*> or
+*> T = \alpha T * op(V)
+*> where \alpha is a scalar, T and V are unit, or non-unit, upper or
+*> lower triangular matrix, and op(V) is one of
+*>
+*> op(V) = V or op(V) = V**T
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> On entry, SIDE specifies whether op(V) multiplies T from
+*> the left or right as follows:
+*>
+*> SIDE = 'L' or 'l' T = \alpha op(V) * T
+*>
+*> SIDE = 'R' or 'r' T = \alpha T * op(V)
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> On entry, UPLO specifies whether the matrix T is an upper or
+*> lower triangular matrix as follows:
+*> UPLO = 'U' or 'u' T is upper triangular
+*>
+*> UPLO = 'L' or 'l' T is lower triangular
+*> \endverbatim
+*>
+*> \param[in] TRANSV
+*> \verbatim
+*> TRANSV is CHARACTER*1
+*> On entry, TRANSV specifies the form of op(V) to be used in
+*> the matrix multiplication as follows:
+*> TRANSV = 'N' or 'n' op(V) = V
+*>
+*> TRANSV = 'T' or 't' op(V) = V**T
+*>
+*> TRANSV = 'C' or 'c' op(V) = V**T
+*> \endverbatim
+*>
+*> \param[in] DIAGT
+*> \verbatim
+*> DIAGT is CHARACTER*1
+*> On entry, DIAGT specifies whether or not T is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' T is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' T is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] DIAGV
+*> \verbatim
+*> DIAGV is CHARACTER*1
+*> On entry, DIAGV specifies whether or not V is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' V is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' V is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> On entry, N specifies the number of rows and columns of T.
+*> N must be at least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is DOUBLE PRECISION.
+*> On entry, ALPHA specifies the scalar alpha. When alpha is
+*> zero then T and V are not referenced, and T and V need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is DOUBLE PRECISION array, dimension ( LDT, N )
+*>           Before entry with UPLO = 'U' or 'u', the leading n-by-n
+*>           upper triangular part of the array T must contain the upper
+*>           triangular matrix and the strictly lower triangular part of
+*>           T is not referenced.
+*>           Before entry with UPLO = 'L' or 'l', the leading n-by-n
+*> lower triangular part of the array T must contain the lower
+*> triangular matrix and the strictly upper triangular part of
+*> T is not referenced.
+*> Note that when DIAGT = 'U' or 'u', the diagonal elements of
+*> T are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> On entry, LDT specifies the first dimension of T as declared
+*> in the calling (sub) program. LDT must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is DOUBLE PRECISION array, dimension ( LDV, N )
+*>           Before entry with UPLO = 'U' or 'u', the leading n-by-n
+*>           upper triangular part of the array op(V) must contain the upper
+*>           triangular matrix and the strictly lower triangular part of
+*>           V is not referenced.
+*>           Before entry with UPLO = 'L' or 'l', the leading n-by-n
+*> lower triangular part of the array op(V) must contain the lower
+*> triangular matrix and the strictly upper triangular part of
+*> V is not referenced.
+*> Note that when DIAGV = 'U' or 'u', the diagonal elements of
+*> V are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*>           On entry, LDV specifies the first dimension of V as declared
+*> in the calling (sub) program. LDV must be at least max( 1, n ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ RECURSIVE SUBROUTINE DTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV,
+ $ N, ALPHA, T, LDT, V, LDV)
+*
+* .. Scalar Arguments ..
+ INTEGER N, LDT, LDV
+ CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV
+ DOUBLE PRECISION ALPHA
+* ..
+* .. Array Arguments ..
+ DOUBLE PRECISION T(LDT,*), V(LDV,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* .. External Subroutines ..
+ EXTERNAL DTRMM, DTRMMOOP, DLASET
+* ..
+* .. Local Scalars ..
+ INTEGER K, INFO
+ LOGICAL TLEFT, TUPPER, VTRANS, VUNIT, TUNIT
+* ..
+* .. Local Parameters ..
+ DOUBLE PRECISION ONE, ZERO
+ PARAMETER(ONE=1.0D+0, ZERO=0.0D+0)
+* ..
+*
+* Beginning of Executable Statements
+*
+*
+* Early Termination Criteria
+*
+ IF (ALPHA.EQ.ZERO) THEN
+*
+* If ALPHA is 0, then we are just setting T to be the 0 matrix
+*
+ CALL DLASET(UPLO, N, N, ZERO, ZERO, T, LDT)
+ RETURN
+ END IF
+ TUNIT = LSAME(DIAGT, 'U')
+ VUNIT = LSAME(DIAGV, 'U')
+*
+* Terminating Case
+*
+ IF (N.EQ.1) THEN
+ IF (VUNIT.AND.TUNIT) THEN
+ T(1,1) = ALPHA
+ ELSE IF (VUNIT) THEN
+ T(1,1) = ALPHA*T(1,1)
+ ELSE IF (TUNIT) THEN
+ T(1,1) = ALPHA*V(1,1)
+ ELSE
+ T(1,1) = ALPHA*T(1,1)*V(1,1)
+ END IF
+ RETURN
+ ELSE IF(N.LE.0) THEN
+ RETURN
+ END IF
+*
+* Recursive case
+*
+ TUPPER = LSAME(UPLO,'U')
+ TLEFT = LSAME(SIDE,'R')
+ VTRANS = LSAME(TRANSV,'T').OR.LSAME(TRANSV,'C')
+
+ K = N / 2
+ IF(TUPPER) THEN
+*
+* T is upper triangular
+*
+ IF(TLEFT) THEN
+*
+* Compute T = T*op(V)
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = T*V**T, which we break down as follows
+* |--------------| |--------------| |--------------------|
+* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11}**T V_{21}**T|
+* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}**T|
+* |--------------| |--------------| |--------------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}**T
+* T_{12} = \alpha T_{11}*V_{21}**T + \alpha T_{12}*V_{22}**T
+* T_{22} = \alpha T_{22}*V_{22}**T
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha T_{12}*V_{22}**T (DTRMM)
+* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} (DTRMMOOP)
+*
+* T_{12} = \alpha T_{12}*V_{22}**T
+*
+ CALL DTRMM('Right', 'Lower', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT)
+*
+* T_{12} = \alpha T_{11}*V_{21}**T + T_{12}
+*
+ CALL DTRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT,
+ $ V(K+1, 1), LDV, ONE, T(1, K+1), LDT)
+ ELSE
+*
+* We are computing T = T*V, which we break down as follows
+* |--------------| |--------------| |-------------|
+* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11} V_{12}|
+* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}|
+* |--------------| |--------------| |-------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}
+* T_{12} = \alpha T_{11}*V_{12} + \alpha T_{12}*V_{22}
+* T_{22} = \alpha T_{22}*V_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha T_{12}*V_{22} (DTRMM)
+* T_{12} = \alpha T_{11}*V_{12} + T_{12} (DTRMMOOP)
+*
+* T_{12} = \alpha T_{12}*V_{22}
+*
+ CALL DTRMM('Right', 'Upper', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT)
+*
+*              T_{12} = \alpha T_{11}*V_{12} + T_{12}
+*
+ CALL DTRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT,
+ $ V(1, K+1), LDV, ONE, T(1, K+1), LDT)
+ END IF
+ ELSE
+*
+* Compute T = op(V)*T
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = V**T*T, which we break down as follows
+* |--------------| |--------------------| |--------------|
+* |T_{11} T_{12}| |V_{11}**T V_{21}**T| |T_{11} T_{12}|
+* |0 T_{22}| = \alpha |0 V_{22}**T| * |0 T_{22}|
+* |--------------| |--------------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}**T*T_{11}
+* T_{12} = \alpha V_{11}**T*T_{12} + \alpha V_{21}**T*T_{22}
+* T_{22} = \alpha V_{22}**T*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha V_{11}**T*T_{12} (DTRMM)
+* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} (DTRMMOOP)
+*
+* T_{12} = \alpha V_{11}**T*T_{12}
+*
+ CALL DTRMM('Left', 'Lower', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V, LDV, T(1, K+1), LDT)
+*
+* T_{12} = \alpha V_{21}**T*T_{22} + T_{12}
+*
+ CALL DTRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1),
+ $ LDT, V(K+1, 1), LDV, ONE, T(1, K+1), LDT)
+ ELSE
+*
+* We are computing T = V*T, which we break down as follows
+* |--------------| |--------------| |--------------|
+* |T_{11} T_{12}| |V_{11} V_{12}| |T_{11} T_{12}|
+* |0 T_{22}| = \alpha |0 V_{22}| * |0 T_{22}|
+* |--------------| |--------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}*T_{11}
+* T_{12} = \alpha V_{11}*T_{12} + \alpha V_{12}*T_{22}
+* T_{22} = \alpha V_{22}*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha V_{11}*T_{12} (DTRMM)
+* T_{12} = \alpha V_{12}*T_{22} + T_{12} (DTRMMOOP)
+*
+* T_{12} = \alpha V_{11}*T_{12}
+*
+ CALL DTRMM('Left', 'Upper', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V, LDV, T(1, K+1), LDT)
+*
+* T_{12} = \alpha V_{12}*T_{22} + T_{12} (DTRMMOOP)
+*
+ CALL DTRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1),
+ $ LDT, V(1, K+1), LDV, ONE, T(1, K+1), LDT)
+ END IF
+ END IF
+ ELSE
+*
+* T is lower triangular
+*
+ IF(TLEFT) THEN
+*
+* Compute T = T*op(V)
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = T*V**T, which we break down as follows
+* |--------------| |--------------| |--------------------|
+* |T_{11} 0 | |T_{11} 0 | |V_{11}**T 0 |
+* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{12}**T V_{22}**T|
+* |--------------| |--------------| |--------------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}**T
+* T_{21} = \alpha T_{21}*V_{11}**T + \alpha T_{22}*V_{12}**T
+* T_{22} = \alpha T_{22}*V_{22}**T
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{21} as follows
+*
+* T_{21} = \alpha T_{21}*V_{11}**T (DTRMM)
+* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} (DTRMMOOP)
+*
+* T_{21} = \alpha T_{21}*V_{11}**T
+*
+ CALL DTRMM('Right', 'Upper', TRANSV, DIAGV, N-K,
+ $ K, ALPHA, V, LDV, T(K+1, 1), LDT)
+*
+* T_{21} = \alpha T_{22}*V_{12}**T + T_{21}
+*
+ CALL DTRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1),
+ $ LDT, V(1, K+1), LDV, ONE, T(K+1, 1), LDT)
+ ELSE
+*
+* We are computing T = T*V, which we break down as follows
+* |--------------| |--------------| |-------------|
+* |T_{11} 0 | |T_{11} 0 | |V_{11} 0 |
+* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{21} V_{22}|
+* |--------------| |--------------| |-------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}
+* T_{21} = \alpha T_{21}*V_{11} + \alpha T_{22}*V_{21}
+* T_{22} = \alpha T_{22}*V_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{21} as follows
+*
+* T_{21} = \alpha T_{21}*V_{11} (DTRMM)
+* T_{21} = \alpha T_{22}*V_{21} + T_{21} (DTRMMOOP)
+*
+* T_{21} = \alpha T_{21}*V_{11}
+*
+ CALL DTRMM('Right', 'Lower', TRANSV, DIAGV, N-K,
+ $ K, ALPHA, V, LDV, T(K+1, 1), LDT)
+*
+*              T_{21} = \alpha T_{22}*V_{21} + T_{21}
+*
+ CALL DTRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1),
+ $ LDT, V(K+1, 1), LDV, ONE, T(K+1, 1), LDT)
+ END IF
+ ELSE
+*
+* Compute T = op(V)*T
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = V**T*T, which we break down as follows
+* |--------------| |--------------------| |--------------|
+* |T_{11} 0 | |V_{11}**T 0 | |T_{11} 0 |
+* |T_{21} T_{22}| = \alpha |V_{12}**T V_{22}**T| * |T_{21} T_{22}|
+* |--------------| |--------------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}**T*T_{11}
+* T_{21} = \alpha V_{12}**T*T_{11} + \alpha V_{22}**T*T_{21}
+* T_{22} = \alpha V_{22}**T*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{21} as follows
+*
+* T_{21} = \alpha V_{22}**T*T_{21} (DTRMM)
+* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} (DTRMMOOP)
+*
+* T_{21} = \alpha V_{22}**T*T_{21}
+*
+ CALL DTRMM('Left', 'Upper', TRANSV, DIAGV, N-K, K,
+ $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT)
+*
+* T_{21} = \alpha V_{12}**T*T_{11} + T_{21}
+*
+ CALL DTRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT,
+ $ V(1, K+1), LDV, ONE, T(K+1, 1), LDT)
+ ELSE
+*
+* We are computing T = V*T, which we break down as follows
+* |--------------| |-------------| |--------------|
+* |T_{11} 0 | |V_{11} 0 | |T_{11} 0 |
+* |T_{21} T_{22}| = \alpha |V_{21} V_{22}| * |T_{21} T_{22}|
+* |--------------| |-------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}*T_{11}
+* T_{21} = \alpha V_{21}*T_{11} + \alpha V_{22}*T_{21}
+* T_{22} = \alpha V_{22}*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+*           routine, but we can break down computing T_{21} as follows
+*
+*           T_{21} = \alpha V_{22}*T_{21} (DTRMM)
+*           T_{21} = \alpha V_{21}*T_{11} + T_{21} (DTRMMOOP)
+*
+*              T_{21} = \alpha V_{22}*T_{21}
+*
+ CALL DTRMM('Left', 'Lower', TRANSV, DIAGV, N-K, K,
+ $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT)
+*
+*              T_{21} = \alpha V_{21}*T_{11} + T_{21}
+*
+ CALL DTRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT,
+ $ V(K+1, 1), LDV, ONE, T(K+1, 1), LDT)
+ END IF
+ END IF
+ END IF
+*
+* Since in all the above cases, we compute T_{11} and T_{22}
+* the same, we pass in our flags and call this routine recursively
+*
+* Compute T_{11} recursively
+*
+ CALL DTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, K, ALPHA,
+ $ T, LDT, V, LDV)
+*
+* Compute T_{22} recursively
+*
+ CALL DTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, N-K, ALPHA,
+ $ T(K+1, K+1), LDT, V(K+1, K+1), LDV)
+
+ END SUBROUTINE
diff --git a/SRC/lapack_64.h b/SRC/lapack_64.h
index e8000bf2c4..f899d642a8 100644
--- a/SRC/lapack_64.h
+++ b/SRC/lapack_64.h
@@ -331,6 +331,7 @@
#define CLARF1F CLARF1F_64
#define CLARF1L CLARF1L_64
#define CLARFB CLARFB_64
+#define CLARFB0C2 CLARFB0C2_64
#define CLARFB_GETT CLARFB_GETT_64
#define CLARFG CLARFG_64
#define CLARFGP CLARFGP_64
@@ -551,9 +552,17 @@
 #define CUNGQR CUNGQR_64
 #define CUNGR2 CUNGR2_64
 #define CUNGRQ CUNGRQ_64
+#define CUNGLK CUNGLK_64
+#define CUNGKL CUNGKL_64
+#define CUNGKR CUNGKR_64
+#define CUNGRK CUNGRK_64
#define CUNGTR CUNGTR_64
#define CUNGTSQR CUNGTSQR_64
#define CUNGTSQR_ROW CUNGTSQR_ROW_64
+#define CLUMM CLUMM_64
+#define CTRTRM CTRTRM_64
+#define CACXPY CACXPY_64
+#define CTRMMOOP CTRMMOOP_64
#define CUNHR_COL CUNHR_COL_64
#define CUNM22 CUNM22_64
#define CUNM2L CUNM2L_64
@@ -805,6 +815,7 @@
#define DLARF1F DLARF1F_64
#define DLARF1L DLARF1L_64
#define DLARFB DLARFB_64
+#define DLARFB0C2 DLARFB0C2_64
#define DLARFB_GETT DLARFB_GETT_64
#define DLARFG DLARFG_64
#define DLARFGP DLARFGP_64
@@ -905,9 +916,16 @@
#define DORGQR DORGQR_64
#define DORGR2 DORGR2_64
#define DORGRQ DORGRQ_64
+#define DORGLK DORGLK_64
+#define DORGKL DORGKL_64
+#define DORGKR DORGKR_64
+#define DORGRK DORGRK_64
#define DORGTR DORGTR_64
#define DORGTSQR DORGTSQR_64
#define DORGTSQR_ROW DORGTSQR_ROW_64
+#define DLUMM DLUMM_64
+#define DTRTRM DTRTRM_64
+#define DTRMMOOP DTRMMOOP_64
#define DORHR_COL DORHR_COL_64
#define DORM22 DORM22_64
#define DORM2L DORM2L_64
@@ -1400,6 +1418,7 @@
#define SLARF1F SLARF1F_64
#define SLARF1L SLARF1L_64
#define SLARFB SLARFB_64
+#define SLARFB0C2 SLARFB0C2_64
#define SLARFB_GETT SLARFB_GETT_64
#define SLARFG SLARFG_64
#define SLARFGP SLARFGP_64
@@ -1499,9 +1518,16 @@
#define SORGQR SORGQR_64
#define SORGR2 SORGR2_64
#define SORGRQ SORGRQ_64
+#define SORGLK SORGLK_64
+#define SORGKL SORGKL_64
+#define SORGKR SORGKR_64
+#define SORGRK SORGRK_64
#define SORGTR SORGTR_64
#define SORGTSQR SORGTSQR_64
#define SORGTSQR_ROW SORGTSQR_ROW_64
+#define SLUMM SLUMM_64
+#define STRTRM STRTRM_64
+#define STRMMOOP STRMMOOP_64
#define SORHR_COL SORHR_COL_64
#define SORM22 SORM22_64
#define SORM2L SORM2L_64
@@ -2046,6 +2072,7 @@
#define ZLARF1F ZLARF1F_64
#define ZLARF1L ZLARF1L_64
#define ZLARFB ZLARFB_64
+#define ZLARFB0C2 ZLARFB0C2_64
#define ZLARFB_GETT ZLARFB_GETT_64
#define ZLARFG ZLARFG_64
#define ZLARFGP ZLARFGP_64
@@ -2264,9 +2291,17 @@
 #define ZUNGQR ZUNGQR_64
 #define ZUNGR2 ZUNGR2_64
 #define ZUNGRQ ZUNGRQ_64
+#define ZUNGLK ZUNGLK_64
+#define ZUNGKL ZUNGKL_64
+#define ZUNGKR ZUNGKR_64
+#define ZUNGRK ZUNGRK_64
#define ZUNGTR ZUNGTR_64
#define ZUNGTSQR ZUNGTSQR_64
#define ZUNGTSQR_ROW ZUNGTSQR_ROW_64
+#define ZLUMM ZLUMM_64
+#define ZTRTRM ZTRTRM_64
+#define ZACXPY ZACXPY_64
+#define ZTRMMOOP ZTRMMOOP_64
#define ZUNHR_COL ZUNHR_COL_64
#define ZUNM22 ZUNM22_64
#define ZUNM2L ZUNM2L_64
diff --git a/SRC/slarfb0c2.f b/SRC/slarfb0c2.f
new file mode 100644
index 0000000000..f4c8ddd36f
--- /dev/null
+++ b/SRC/slarfb0c2.f
@@ -0,0 +1,556 @@
+*> \brief \b SLARFB0C2 applies a block reflector or its transpose to a
+*> rectangular matrix with a 0 block while constructing the explicit Q factor
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+* $ K, V, LDV, T, LDT, C, LDC)
+* ! Scalar arguments
+* INTEGER M, N, K, LDV, LDC, LDT
+* CHARACTER SIDE, TRANS, DIRECT, STOREV
+* ! True means that we are assuming C2 is the identity matrix
+* ! and thus don't reference whatever is present in C2
+* ! at the beginning.
+* LOGICAL C2I
+* ! Array arguments
+* REAL V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SLARFB0C2 applies a real block reflector H or its transpose H**T to a
+*> real m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*> C2I is LOGICAL
+*> = .TRUE.: Assume the nonzero block of C is the identity matrix
+*> = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> = 'L': apply H or H**T from the Left
+*> = 'R': apply H or H**T from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*> TRANS is CHARACTER*1
+*> = 'N': apply H (No transpose)
+*> = 'T': apply H**T (Transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*> DIRECT is CHARACTER*1
+*> Indicates how H is formed from a product of elementary
+*> reflectors
+*> = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*> = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*> STOREV is CHARACTER*1
+*> Indicates how the vectors which define the elementary
+*> reflectors are stored:
+*> = 'C': Columnwise
+*> = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*> K is INTEGER
+*> The order of the matrix T (= the number of elementary
+*> reflectors whose product defines the block reflector).
+*> If SIDE = 'L', M >= K >= 0;
+*> if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is REAL array, dimension
+*> (LDV,K) if STOREV = 'C'
+*> (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*> (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*> The matrix V. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> The leading dimension of the array V.
+*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*> if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is REAL array, dimension (LDT,K)
+*> The triangular k by k matrix T in the representation of the
+*> block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is REAL array, dimension (LDC,N)
+*> On entry, the m by n matrix C.
+*> On exit, C is overwritten by H*C or H**T*C or C*H or C*H**T.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The shape of the matrix V and the storage of the vectors which define
+*> the H(i) is best illustrated by the following example with n = 5 and
+*> k = 3. The triangular part of V (including its diagonal) is not
+*> referenced.
+*>
+*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+*>
+*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+*> ( v1 1 ) ( 1 v2 v2 v2 )
+*> ( v1 v2 1 ) ( 1 v3 v3 )
+*> ( v1 v2 v3 )
+*> ( v1 v2 v3 )
+*>
+*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+*>
+*> V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+*> ( v1 v2 v3 ) ( v2 v2 v2 1 )
+*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+*> ( 1 v3 )
+*> ( 1 )
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE SLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+ $ K, V, LDV, T, LDT, C, LDC)
+ ! Scalar arguments
+ INTEGER M, N, K, LDV, LDC, LDT
+ CHARACTER SIDE, TRANS, DIRECT, STOREV
+ ! True means that we are assuming C2 is the identity matrix
+ ! and thus don't reference whatever is present in C2
+ ! at the beginning.
+ LOGICAL C2I
+ ! Array arguments
+ REAL V(LDV,*), C(LDC,*), T(LDT,*)
+ ! Local scalars
+ LOGICAL QR, LQ, QL, RQ, DIRF, COLV, SIDEL, SIDER,
+ $ TRANST
+ INTEGER I, J
+ ! External functions
+ LOGICAL LSAME
+ EXTERNAL LSAME
+ ! External subroutines
+ EXTERNAL SGEMM, STRMM, XERBLA
+ ! Parameters
+ REAL ONE, ZERO, NEG_ONE
+ PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE = -1.0E+0)
+
+ ! Beginning of executable statements
+ ! Convert our character flags to logical values
+ DIRF = LSAME(DIRECT,'F')
+ COLV = LSAME(STOREV,'C')
+ SIDEL = LSAME(SIDE,'L')
+ SIDER = LSAME(SIDE,'R')
+ TRANST = LSAME(TRANS,'T')
+
+ ! Determine which of the 4 modes are using.
+ ! QR is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the first column
+ QR = DIRF.AND.COLV
+
+ ! LQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the first row
+ LQ = DIRF.AND.(.NOT.COLV)
+
+ ! QL is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the last column
+ QL = (.NOT.DIRF).AND.COLV
+
+ ! RQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the last row
+ RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+
+ IF (QR) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V1 ] and C = [ C1 ]
+ ! [ V2 ] [ C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ]
+ ! [ V2 ] [ C2 ]
+ ! = [ 0 ] - [ V1 ]T*V2'*C2
+ ! [ C2 ] [ V2 ]
+ !
+ ! = [ 0 ] - [ V1*T*V2'*C2 ]
+ ! [ C2 ] [ V2*T*V2'*C2 ]
+ !
+ ! = [ V1*T*V2'*C2 ]
+ ! [ C2 - V2*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('SLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('SLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF (C2I) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(I,J) = V(K+J,I)
+ END DO
+ END DO
+ ELSE
+ CALL SGEMM('Transpose', 'No Transpose', K, N, M - K,
+ $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+ $ C, LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF (C2I) THEN
+ CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+ $ C(K+1,1), LDC)
+ DO I = 1, N
+ C(K+I,I) = C(K+I,I) + ONE
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+ $ C(K+1,1), LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (LQ) THEN
+ ! We are computing C = C op(H) = C(I-V' op(T) V)
+ ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = C op(H) = [ 0, C2 ](I - [ V1' ]op(T)[ V1, V2 ])
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]op(T)[ V1, V2 ]
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - C2*V2'*op(T)[ V1, V2 ]
+ !
+ ! = [ -C2*V2'*op(T)*V1, C2 - C2*V2'*op(T)*V2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*op(T)
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('SLARFB0C2', 2)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,J) = V(J,K+I)
+ END DO
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'Transpose', M, K, N-K,
+ $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+ $ LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ IF (TRANST) THEN
+ CALL STRMM('Right', 'Upper', 'Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C, LDC)
+ ELSE
+ CALL STRMM('Right', 'Lower', 'No Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C, LDC)
+ END IF
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+ $ LDC)
+ DO I = 1, M
+ C(I,K+I) = C(I,K+I) + ONE
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+ $ LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL STRMM('Right', 'Upper', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (QL) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V2 ] and C = [ C2 ]
+ ! [ V1 ] [ C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{M-K\times K}
+ ! C1=0\in\R^{K\times N}
+ ! C2\in\R^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+ ! [ V1 ] [ 0 ]
+ !
+ ! = [ C2 ] - [ V2 ]T*V2'*C2
+ ! [ 0 ] [ V1 ]
+ !
+ ! = [ C2 ] - [ V2*T*V2'*C2 ]
+ ! [ 0 ] [ V1*T*V2'*C2 ]
+ !
+ ! = [ C2 - V2*T*V2'*C2 ]
+ ! [ - V1*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('SLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('SLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF( C2I ) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(M-K+I,J) = V(J,I)
+ END DO
+ END DO
+ ELSE
+ CALL SGEMM('Transpose', 'No Transpose', K, N, M-K,
+ $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF( C2I ) THEN
+ CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+ DO I = 1, N
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+ ELSE IF (RQ) THEN
+ ! We are computing C = C op(H) = C(I-V' op(T) V)
+ ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+ ! with the following dimensions:
+ ! V1\in\R^{K\times K}
+ ! V2\in\R^{K\times N-K}
+ ! C1=0\in\R^{M\times K}
+ ! C2\in\R^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = C op(H) = [ C2, 0 ] (I - [ V2' ]op(T)[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]op(T)[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - C2*V2'*op(T)[ V2, V1 ]
+ !
+ ! = [ C2, 0 ] - [ C2*V2'*op(T)*V2, C2*V2'*op(T)*V1 ]
+ !
+ ! = [ C2 - C2*V2'*op(T)*V2, -C2*V2'*op(T)*V1 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*op(T)
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side has the expected value
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('SLARFB0C2', 2)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,N-K+J) = V(J,I)
+ END DO
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'Transpose', M, K, N-K,
+ $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C1 = C1*op(T)
+ !
+ IF( TRANST ) THEN
+ CALL STRMM('Right', 'Lower', 'Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ ELSE
+ CALL STRMM('Right', 'Upper', 'No Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+ DO I = 1, M
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL SGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL STRMM('Right', 'Lower', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/slarft.f b/SRC/slarft.f
index 7a67fa57f4..515dcff5d1 100644
--- a/SRC/slarft.f
+++ b/SRC/slarft.f
@@ -176,23 +176,23 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* .. Parameters ..
*
- REAL ONE, NEG_ONE, ZERO
+ REAL ONE, NEG_ONE, ZERO
PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE=-1.0E+0)
*
* .. Local Scalars ..
*
- INTEGER I,J,L
- LOGICAL QR,LQ,QL,DIRF,COLV
+ INTEGER I,J,L
+ LOGICAL QR,LQ,QL,RQ,LQT,RQT,DIRF,COLV,TDIRF,TCOLV
*
* .. External Subroutines ..
*
- EXTERNAL STRMM,SGEMM,SLACPY
+ EXTERNAL STRMM,SGEMM,SLACPY
*
* .. External Functions..
*
- LOGICAL LSAME
- EXTERNAL LSAME
-*
+ LOGICAL LSAME
+ EXTERNAL LSAME
+*
* The general scheme used is inspired by the approach inside DGEQRT3
* which was (at the time of writing this code):
* Based on the algorithm of Elmroth and Gustavson,
@@ -223,26 +223,37 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* 'C' for STOREV, then they meant to provide 'R'
*
DIRF = LSAME(DIRECT,'F')
+ TDIRF = LSAME(DIRECT,'T')
COLV = LSAME(STOREV,'C')
+ TCOLV = LSAME(STOREV,'T')
*
* QR happens when we have forward direction in column storage
*
QR = DIRF.AND.COLV
*
-* LQ happens when we have forward direction in row storage
+* LQT happens when we have forward direction in row storage and want to compute the transpose of
+* the T we would normally compute
+*
+ LQT = DIRF.AND.TCOLV
+*
+* LQ happens when we have forward direction in row storage and want to compute the T we would
+* normally compute
*
- LQ = DIRF.AND.(.NOT.COLV)
+ LQ = DIRF.AND.(.NOT.LQT)
*
* QL happens when we have backward direction in column storage
*
QL = (.NOT.DIRF).AND.COLV
*
-* The last case is RQ. Due to how we structured this, if the
-* above 3 are false, then RQ must be true, so we never store
-* this
-* RQ happens when we have backward direction in row storage
-* RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+* RQT happens when we have backward direction in row storage and want to compute the transpose
+* of the T we would normally compute
*
+ RQT = TDIRF.AND.(.NOT.COLV)
+*
+* RQ happens when we have backward direction in row storage and want to compute the T that we
+* would normally compute
+*
+ RQ = (.NOT.RQT).AND.(.NOT.COLV)
IF(QR) THEN
*
* Break V apart into 6 components
@@ -256,17 +267,17 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{1,1}\in\R^{l,l} unit lower triangular
* V_{2,1}\in\R^{k-l,l} rectangular
* V_{3,1}\in\R^{n-k,l} rectangular
-*
+*
* V_{2,2}\in\R^{k-l,k-l} unit lower triangular
* V_{3,2}\in\R^{n-k,k-l} rectangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} T_{1,2}|
* |0 T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -277,17 +288,17 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
+*
* (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2')
* = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2'
-*
+*
* Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |-------|
* |V_1 V_2|
* |-------|
-*
+*
* So, our product is equivalent to the matrix product
* I - V*T*V'
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
@@ -299,30 +310,29 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{2,2} recursively
*
- CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV,
- $ TAU(L+1), T(L+1, L+1), LDT)
+ CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
*
-* Compute T_{1,2}
+* Compute T_{1,2}
* T_{1,2} = V_{2,1}'
*
DO J = 1, L
DO I = 1, K-L
- T(J, L+I) = V(L+I, J)
+ T(J,L+I) = V(L+I,J)
END DO
END DO
*
* T_{1,2} = T_{1,2}*V_{2,2}
*
CALL STRMM('Right', 'Lower', 'No transpose', 'Unit', L,
- $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
+ $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
*
* T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
- CALL SGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE,
- $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE,
- $ T(1, L+1), LDT)
+ CALL SGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE,
+ $ V(K+1, 1), LDV, V(K+1,L+1), LDV, ONE, T(1, L+1), LDT)
*
* At this point, we have that T_{1,2} = V_1'*V_2
* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
@@ -331,12 +341,12 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{1,2} = -T_{1,1}*T_{1,2}
*
CALL STRMM('Left', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
+ $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
*
* T_{1,2} = T_{1,2}*T_{2,2}
*
- CALL STRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT)
+ CALL STRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
+ $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT)
ELSE IF(LQ) THEN
*
@@ -350,19 +360,19 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{1,1}\in\R^{l,l} unit upper triangular
* V_{1,2}\in\R^{l,k-l} rectangular
* V_{1,3}\in\R^{l,n-k} rectangular
-*
+*
* V_{2,2}\in\R^{k-l,k-l} unit upper triangular
* V_{2,3}\in\R^{k-l,n-k} rectangular
*
* Where l = floor(k/2)
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} T_{1,2}|
* |0 T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -371,20 +381,20 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{1,2}\in\R^{l, k-l} rectangular
*
* Then, consider the product:
-*
-* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2)
-* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2
-*
+*
+* (I - V_1'*T_{1,1}'*V_1)*(I - V_2'*T_{2,2}'*V_2)
+* = I - V_1'*T_{1,1}'*V_1 - V_2'*T_{2,2}'*V_2 + V_1'*T_{1,1}'*V_1*V_2'*T_{2,2}'*V_2
+*
* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |---|
* |V_1|
* |V_2|
* |---|
-*
+*
* So, our product is equivalent to the matrix product
-* I - V'*T*V
+* I - V'*T'*V
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
* to compute T_{1,2}
*
@@ -394,27 +404,26 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{2,2} recursively
*
- CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV,
- $ TAU(L+1), T(L+1, L+1), LDT)
+ CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
*
* Compute T_{1,2}
* T_{1,2} = V_{1,2}
*
- CALL SLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT)
+ CALL SLACPY('All', L, K - L, V(1,L+1), LDV, T(1, L+1), LDT)
*
* T_{1,2} = T_{1,2}*V_{2,2}'
*
CALL STRMM('Right', 'Upper', 'Transpose', 'Unit', L, K-L,
- $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
+ $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
*
* T_{1,2} = V_{1,3}*V_{2,3}' + T_{1,2}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
CALL SGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE,
- $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE,
- $ T(1, L+1), LDT)
+ $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, T(1, L+1), LDT)
*
* At this point, we have that T_{1,2} = V_1*V_2'
* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
@@ -423,13 +432,106 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{1,2} = -T_{1,1}*T_{1,2}
*
CALL STRMM('Left', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
+ $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
*
* T_{1,2} = T_{1,2}*T_{2,2}
*
CALL STRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT)
+ $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT)
+ ELSE IF(LQT) THEN
+*
+* Break V apart into 6 components
+*
+* V = |----------------------|
+* |V_{1,1} V_{1,2} V{1,3}|
+* |0 V_{2,2} V{2,3}|
+* |----------------------|
+*
+* V_{1,1}\in\R^{l,l} unit upper triangular
+* V_{1,2}\in\R^{l,k-l} rectangular
+* V_{1,3}\in\R^{l,n-k} rectangular
+*
+* V_{2,2}\in\R^{k-l,k-l} unit upper triangular
+* V_{2,3}\in\R^{k-l,n-k} rectangular
+*
+* Where l = floor(k/2)
+*
+* We will construct the T matrix
+* T = |---------------|
+* |T_{1,1} 0 |
+* |T_{2,1} T_{2,2}|
+* |---------------|
+*
+* T is the triangular factor obtained from block reflectors.
+* To motivate the structure, assume we have already computed T_{1,1}
+* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
+*
+* T_{1,1}\in\R^{l, l} lower triangular
+* T_{2,2}\in\R^{k-l, k-l} lower triangular
+* T_{2,1}\in\R^{k-l, l} rectangular
+*
+* Then, consider the product:
+*
+* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2)
+* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2
+*
+* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1}
+*
+* Then, we can define the matrix V as
+* V = |---|
+* |V_1|
+* |V_2|
+* |---|
+*
+* So, our product is equivalent to the matrix product
+* I - V'*T*V
+* This means, we can compute T_{1,1} and T_{2,2}, then use this information
+* to compute T_{2,1}
+*
+* Compute T_{1,1} recursively
+*
+ CALL SLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT)
+*
+* Compute T_{2,2} recursively
+*
+ CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
+
+*
+* Compute T_{2,1}
+* T_{2,1} = V_{1,2}'
+*
+ DO I = 1, K-L
+ DO J = 1, L
+ T(L+I,J) = V(J,L+I)
+ END DO
+ END DO
+*
+* T_{2,1} = V_{2,2}*T_{2,1}
+*
+ CALL STRMM('Left', 'Upper', 'No Transpose', 'Unit', K-L, L,
+ $ ONE, V(L+1,L+1), LDV, T(L+1,1), LDT)
+*
+* T_{2,1} = V_{2,3}*V_{1,3}' + T_{2,1}
+* Note: We assume K <= N, and GEMM will do nothing if N=K
+*
+ CALL SGEMM('No Transpose', 'Transpose', K-L, L, N-K, ONE,
+ $ V(L+1,K+1), LDV, V(1, K+1), LDV, ONE, T(L+1,1), LDT)
+*
+* At this point, we have that T_{2,1} = V_2*V_1'
+* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
+* respectively.
+*
+* T_{2,1} = -T_{2,2}*T_{2,1}
+*
+ CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K-L, L, NEG_ONE, T(L+1,L+1), LDT, T(L+1,1), LDT)
+*
+* T_{2,1} = T_{2,1}*T_{1,1}
+*
+ CALL STRMM('Right', 'Lower', 'No Transpose', 'Non-unit',
+ $ K-L, L, ONE, T, LDT, T(L+1,1), LDT)
ELSE IF(QL) THEN
*
* Break V apart into 6 components
@@ -442,18 +544,18 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* V_{1,1}\in\R^{n-k,k-l} rectangular
* V_{2,1}\in\R^{k-l,k-l} unit upper triangular
-*
+*
* V_{1,2}\in\R^{n-k,l} rectangular
* V_{2,2}\in\R^{k-l,l} rectangular
* V_{3,2}\in\R^{l,l} unit upper triangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} 0 |
* |T_{2,1} T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -464,17 +566,17 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
+*
* (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1')
* = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1'
-*
+*
* Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |-------|
* |V_1 V_2|
* |-------|
-*
+*
* So, our product is equivalent to the matrix product
* I - V*T*V'
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
@@ -482,34 +584,34 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{1,1} recursively
*
- CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+ CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
*
* Compute T_{2,2} recursively
*
CALL SLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV,
- $ TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
*
* Compute T_{2,1}
* T_{2,1} = V_{2,2}'
*
DO J = 1, K-L
DO I = 1, L
- T(K-L+I, J) = V(N-K+J, K-L+I)
+ T(K-L+I,J) = V(N-K+J, K-L+I)
END DO
END DO
*
* T_{2,1} = T_{2,1}*V_{2,1}
*
CALL STRMM('Right', 'Upper', 'No transpose', 'Unit', L,
- $ K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT)
+ $ K-L, ONE, V(N-K+1,1), LDV, T(K-L+1,1), LDT)
*
* T_{2,1} = V_{2,2}'*V_{2,1} + T_{2,1}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
CALL SGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE,
- $ V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1),
- $ LDT)
+ $ V(1,K-L+1), LDV, V, LDV, ONE, T(K-L+1,1), LDT)
*
* At this point, we have that T_{2,1} = V_2'*V_1
* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
@@ -518,17 +620,13 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{2,1} = -T_{2,2}*T_{2,1}
*
CALL STRMM('Left', 'Lower', 'No transpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
- $ T(K-L+1, 1), LDT)
+ $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT)
*
* T_{2,1} = T_{2,1}*T_{1,1}
*
CALL STRMM('Right', 'Lower', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
- ELSE
-*
-* Else means RQ case
-*
+ $ K-L, ONE, T, LDT, T(K-L+1,1), LDT)
+ ELSE IF(RQ) THEN
* Break V apart into 6 components
*
* V = |-----------------------|
@@ -543,13 +641,13 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{2,2}\in\R^{l,k-l} rectangular
* V_{2,3}\in\R^{l,l} unit lower triangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} 0 |
* |T_{2,1} T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -560,51 +658,51 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
-* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1)
-* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1
-*
+*
+* (I - V_2'*T_{2,2}'*V_2)*(I - V_1'*T_{1,1}'*V_1)
+* = I - V_2'*T_{2,2}'*V_2 - V_1'*T_{1,1}'*V_1 + V_2'*T_{2,2}'*V_2*V_1'*T_{1,1}'*V_1
+*
* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |---|
* |V_1|
* |V_2|
* |---|
-*
+*
* So, our product is equivalent to the matrix product
-* I - V'TV
+* I - V'*T'*V
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
* to compute T_{2,1}
*
* Compute T_{1,1} recursively
*
- CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+ CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
*
* Compute T_{2,2} recursively
*
- CALL SLARFT(DIRECT, STOREV, N, L, V(K-L+1, 1), LDV,
- $ TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+ CALL SLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV,
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
*
* Compute T_{2,1}
* T_{2,1} = V_{2,2}
*
- CALL SLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV,
- $ T(K-L+1, 1), LDT)
+ CALL SLACPY('All', L, K-L, V(K-L+1,N-K+1), LDV, T(K-L+1,1),
+ $ LDT)
*
* T_{2,1} = T_{2,1}*V_{1,2}'
*
CALL STRMM('Right', 'Lower', 'Transpose', 'Unit', L, K-L,
- $ ONE, V(1, N-K+1), LDV, T(K-L+1, 1), LDT)
+ $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT)
*
-* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1}
+* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
- CALL SGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE,
- $ V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1),
- $ LDT)
+ CALL SGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE,
+ $ V(K-L+1,1), LDV, V, LDV, ONE, T(K-L+1,1), LDT)
*
* At this point, we have that T_{2,1} = V_2*V_1'
@@ -614,13 +712,103 @@ RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{2,1} = -T_{2,2}*T_{2,1}
*
CALL STRMM('Left', 'Lower', 'No tranpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
- $ T(K-L+1, 1), LDT)
+ $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT)
*
* T_{2,1} = T_{2,1}*T_{1,1}
*
CALL STRMM('Right', 'Lower', 'No tranpose', 'Non-unit', L,
- $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
+ $ K-L, ONE, T, LDT, T(K-L+1,1), LDT)
+ ELSE IF(RQT) THEN
+* Break V apart into 6 components
+*
+* V = |-----------------------|
+* |V_{1,1} V_{1,2} 0 |
+* |V_{2,1} V_{2,2} V_{2,3}|
+* |-----------------------|
+*
+* V_{1,1}\in\R^{k-l,n-k} rectangular
+* V_{1,2}\in\R^{k-l,k-l} unit lower triangular
+*
+* V_{2,1}\in\R^{l,n-k} rectangular
+* V_{2,2}\in\R^{l,k-l} rectangular
+* V_{2,3}\in\R^{l,l} unit lower triangular
+*
+* We will construct the T matrix
+* T = |---------------|
+* |T_{1,1} T_{1,2}|
+* | 0 T_{2,2}|
+* |---------------|
+*
+* T is the triangular factor obtained from block reflectors.
+* To motivate the structure, assume we have already computed T_{1,1}
+* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
+*
+* T_{1,1}\in\R^{k-l, k-l} non-unit upper triangular
+* T_{2,2}\in\R^{l, l} non-unit upper triangular
+* T_{1,2}\in\R^{k-l, l} rectangular
+*
+* Where l = floor(k/2)
+*
+* Then, consider the product:
+*
+* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1)
+* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1
+*
+* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2}
+*
+* Then, we can define the matrix V as
+* V = |---|
+* |V_1|
+* |V_2|
+* |---|
+*
+* So, our product is equivalent to the matrix product
+* I - V'*T*V
+* This means, we can compute T_{1,1} and T_{2,2}, then use this information
+* to compute T_{1,2}
+*
+* Compute T_{1,1} recursively
+*
+ CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
+*
+* Compute T_{2,2} recursively
+*
+ CALL SLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV,
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
+*
+* Compute T_{1,2}
+* T_{1,2} = V_{2,2}'
+*
+ DO I = 1, K-L
+ DO J = 1, L
+ T(I,K-L+J) = V(K-L+J, N-K+I)
+ END DO
+ END DO
+*
+*        T_{1,2} = V_{1,2}*T_{1,2}
+*
+ CALL STRMM('Left', 'Lower', 'No Transpose', 'Unit', K-L, L,
+ $ ONE, V(1,N-K+1), LDV, T(1,K-L+1), LDT)
+*
+* T_{1,2} = V_{1,1}V_{2,1}' + T_{1,2}
+*
+         CALL SGEMM('No Transpose', 'Transpose', K-L, L, N-K, ONE, V,
+ $ LDV, V(K-L+1,1), LDV, ONE, T(1, K-L+1), LDT)
+*
+* At this point, we have that T_{1,2} = V_1*V_2'
+* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
+* respectively.
+*
+* T_{1,2} = -T_{1,1}*T_{1,2}
+*
+ CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-Unit',
+ $ K-L, L, NEG_ONE, T, LDT, T(1, K-L+1), LDT)
+*
+* T_{1,2} = T_{1,2}*T_{2,2}
+*
+ CALL STRMM('Right', 'Upper', 'No Transpose', 'Non-Unit',
+ $ K-L, L, ONE, T(K-L+1,K-L+1), LDT, T(1, K-L+1), LDT)
END IF
END SUBROUTINE
diff --git a/SRC/slumm.f b/SRC/slumm.f
new file mode 100644
index 0000000000..1712ce32d0
--- /dev/null
+++ b/SRC/slumm.f
@@ -0,0 +1,350 @@
+*> \brief \b SLUMM computes an in-place triangular times triangular matrix multiplication
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE SLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA,
+* $ A, LDA)
+*
+* .. Scalar Arguments ..
+* INTEGER N, LDA
+* CHARACTER SIDEL, DIAGL, DIAGU
+* REAL ALPHA
+*
+* .. Array Arguments ..
+* REAL A(LDA,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SLUMM performs one of the matrix-matrix operations
+*>
+*> C = \alpha L * U
+*> or
+*> C = \alpha U * L
+*>
+*> where \alpha is a scalar, L is a unit, or non-unit, lower triangular matrix, and U is a unit, or
+*> non-unit, upper triangular matrix, and at most one of L and U are non-unit
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDEL
+*> \verbatim
+*> SIDEL is CHARACTER*1
+*>          On entry, SIDEL specifies whether L multiplies U from
+*>          the left or right as follows:
+*>
+*>             SIDEL = 'L' or 'l'   A = \alpha L * U
+*>
+*>             SIDEL = 'R' or 'r'   A = \alpha U * L
+*> \endverbatim
+*>
+*> \param[in] DIAGL
+*> \verbatim
+*> DIAGL is CHARACTER*1
+*> On entry, DIAGL specifies whether or not L is unit triangular
+*> as follows:
+*>
+*>             DIAGL = 'U' or 'u'   L is assumed to be unit triangular.
+*>
+*>             DIAGL = 'N' or 'n'   L is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] DIAGU
+*> \verbatim
+*> DIAGU is CHARACTER*1
+*> On entry, DIAGU specifies whether or not U is unit triangular
+*> as follows:
+*>
+*>             DIAGU = 'U' or 'u'   U is assumed to be unit triangular.
+*>
+*>             DIAGU = 'N' or 'n'   U is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>           On entry, N specifies the number of rows and columns of L and U. N must be at
+*>           least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is REAL.
+*>           On entry, ALPHA specifies the scalar alpha. When alpha is
+*>           zero then the input contents of A are not read (A need not
+*>           be set on entry) and A is set to zero on exit.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is REAL array, dimension ( LDA, N ) where
+*>           Before entry, the upper triangular part of the leading n-by-n block of the array
+*>           A must contain the upper triangular matrix U and the lower triangular part of
+*>           the leading n-by-n block must contain the lower triangular matrix L.
+*> If DIAGL != 'U', then the diagonal is assumed to be part of L, and if
+*> DIAGU != 'U', then the diagonal is assumed to be part of U.
+*> Note: At most one of DIAGL and DIAGU can be not equal to 'U'.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> On entry, LDA specifies the first dimension of A as declared
+*> in the calling (sub) program. LDA must be at least max( 1, n ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+c Cost: 2/3 * (n^3 - n)
+ RECURSIVE SUBROUTINE SLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA,
+ $ A, LDA)
+*
+* .. Scalar Arguments ..
+ INTEGER N, LDA
+ CHARACTER SIDEL, DIAGL, DIAGU
+ REAL ALPHA
+*
+* .. Array Arguments ..
+ REAL A(LDA,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* .. External Subroutines ..
+ EXTERNAL SGEMM, STRMM, SLASET,
+ $ XERBLA
+* ..
+* .. Local Scalars ..
+ INTEGER K
+ LOGICAL LLEFT, LUNIT, UUNIT
+* ..
+* .. Local Parameters ..
+ REAL ONE, ZERO
+ PARAMETER(ONE=1.0E+0, ZERO=0.0E+0)
+* ..
+*
+* Determine if our flags are valid or not. We can have at
+* most one of DIAGU, DIAGL not equal to 'U'
+*
+ LUNIT = LSAME(DIAGL, 'U')
+ UUNIT = LSAME(DIAGU, 'U')
+*
+* If both of the above are false, then it is impossible to have the
+* structure that we are exploiting in this routine
+* Note: It is possible to allow the matrices to share a non-unit
+* diagonal as long as the values are the exact same, but there is
+* currently no use case for this that I am aware of.
+*
+ IF ((.NOT.LUNIT).AND.(.NOT.UUNIT)) THEN
+*
+* We say the error is in the last set DIAG value as we cannot know
+* what the user actually meant.
+*
+ CALL XERBLA( 'SLUMM', 3 )
+ RETURN
+ END IF
+*
+* Determine which side L is on
+*
+ LLEFT = LSAME(SIDEL, 'L')
+*
+* Early exit if possible
+*
+ IF (N.EQ.0) THEN
+ RETURN
+ END IF
+ IF (ALPHA.EQ.ZERO) THEN
+ CALL SLASET('All', N, N, ZERO, ZERO, A, LDA)
+ RETURN
+ END IF
+*
+* Terminating Case
+*
+ IF (N.EQ.1) THEN
+*
+* Since at most one of L and U are non-unit triangular, whatever side L is on, we are still
+* always computing one of
+*
+* 1) A(1,1) = ALPHA*A(1,1)
+* 2) A(1,1) = ALPHA
+*
+* Where the first case happens when exactly one of L and U are unit triangular, while the
+* second case happens when both L and U are unit triangular
+*
+ IF (LUNIT.AND.UUNIT) THEN
+ A(1,1) = ALPHA
+ ELSE
+ A(1,1) = ALPHA*A(1,1)
+ END IF
+ RETURN
+ END IF
+*
+* Recursive Case
+*
+ K = N/2
+*
+* Regardless of us computing A = L*U or A = U*L, break A apart as follows:
+*
+* |---|
+* A = | U |
+* | L |
+* |---|
+*
+* Further break down L as
+* |---------------|
+* L = | L_{11} 0 |
+* | L_{21} L_{22} |
+* |---------------|
+*
+* Where:
+*
+* L_{11}\in\R^{k\times k} is lower triangular (assumed unit iff DIAGL == 'U')
+* L_{21}\in\R^{n-k\times k} is rectangular
+* L_{22}\in\R^{n-k\times n-k} is lower triangular (assumed unit iff DIAGL == 'U')
+*
+* Further break down U as
+* |---------------|
+* U = | U_{11} U_{12} |
+* | 0 U_{22} |
+* |---------------|
+*
+* Where:
+*
+* U_{11}\in\R^{k\times k} is upper triangular (assumed unit iff DIAGU == 'U')
+* U_{12}\in\R^{k\times n-k} is rectangular
+* U_{22}\in\R^{n-k\times n-k} is upper triangular (assumed unit iff DIAGU == 'U')
+ IF (LLEFT) THEN
+*
+* This means we are computing
+* |---------------| |---------------|
+* A = L*U = \alpha | L_{11} 0 | * | U_{11} U_{12} |
+* | L_{21} L_{22} | | 0 U_{22} |
+* |---------------| |---------------|
+*
+* |---------------------------------------------|
+* = \alpha | L_{11}*U_{11} L_{11}*U_{12} |
+* | L_{21}*U_{11} L_{21}*U_{12} + L_{22}*U_{22} |
+* |---------------------------------------------|
+*
+* We compute these in the following order
+*
+* A_{22} = \alpha*L_{22}*U_{22} (This routine)
+* A_{22} = \alpha*L_{21}*U_{12} + A_{22} (GEMM)
+*
+* A_{12} = \alpha*L_{11}*U_{12} (TRMM)
+* A_{21} = \alpha*L_{21}*U_{11} (TRMM)
+*
+* A_{11} = \alpha*L_{11}*U_{11} (This routine)
+*
+* Compute A_{22}
+*
+* A_{22} = \alpha*L_{22}*U_{22}
+*
+ CALL SLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA,
+ $ A(K+1, K+1), LDA)
+*
+* A_{22} = \alpha L_{21}*U_{12} + A_{22}
+*
+ CALL SGEMM('No Transpose', 'No Transpose', N-K, N-K, K,
+ $ ALPHA, A(K+1,1), LDA, A(1,K+1), LDA, ONE,
+ $ A(K+1,K+1), LDA)
+*
+* Compute A_{12}
+*
+* A_{12} = \alpha*L_{11}*U_{12}
+*
+ CALL STRMM('Left', 'Lower', 'No Transpose', DIAGL, K,
+ $ N-K, ALPHA, A, LDA, A(1,K+1), LDA)
+*
+* Compute A_{21}
+*
+* A_{21} = \alpha*L_{21}*U_{11}
+*
+ CALL STRMM('Right', 'Upper', 'No Transpose', DIAGU, N-K,
+ $ K, ALPHA, A, LDA, A(K+1,1), LDA)
+*
+* Compute A_{11}
+*
+* A_{11} = \alpha*L_{11}*U_{11}
+*
+ CALL SLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA)
+ ELSE
+*
+* This means we are computing
+* |---------------| |---------------|
+* A = U*L = \alpha | U_{11} U_{12} | * | L_{11} 0 |
+* | 0 U_{22} | | L_{21} L_{22} |
+* |---------------| |---------------|
+*
+* |---------------------------------------------|
+* = \alpha | U_{11}*L_{11} + U_{12}*L_{21} U_{12}*L_{22} |
+* | U_{22}*L_{21} U_{22}*L_{22} |
+* |---------------------------------------------|
+*
+* We compute these in the following order
+*
+* A_{11} = \alpha*U_{11}*L_{11} (This routine)
+* A_{11} = \alpha*U_{12}*L_{21} + A_{11} (GEMM)
+*
+* A_{12} = \alpha*U_{12}*L_{22} (TRMM)
+* A_{21} = \alpha*U_{22}*L_{21} (TRMM)
+*
+* A_{22} = \alpha*U_{22}*L_{22} (This routine)
+*
+* Compute A_{11}
+*
+* A_{11} = \alpha*U_{11}*L_{11}
+*
+ CALL SLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA)
+*
+* A_{11} = \alpha*U_{12}*L_{21} + A_{11}
+*
+ CALL SGEMM('No Transpose', 'No Transpose', K, K, N-K,
+ $ ALPHA, A(1,K+1), LDA, A(K+1,1), LDA, ONE, A, LDA)
+*
+* Compute A_{12}
+*
+* A_{12} = \alpha*U_{12}*L_{22}
+*
+ CALL STRMM('Right', 'Lower', 'No Transpose', DIAGL, K,
+ $ N-K, ALPHA, A(K+1,K+1), LDA, A(1,K+1), LDA)
+*
+* Compute A_{21}
+*
+* A_{21} = \alpha*U_{22}*L_{21}
+*
+ CALL STRMM('Left', 'Upper', 'No Transpose', DIAGU, N-K,
+ $ K, ALPHA, A(K+1, K+1), LDA, A(K+1,1), LDA)
+*
+* Compute A_{22}
+*
+* A_{22} = \alpha*U_{22}*L_{22}
+*
+ CALL SLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA,
+ $ A(K+1, K+1), LDA)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/sorgkl.f b/SRC/sorgkl.f
new file mode 100644
index 0000000000..e90f1c9ee3
--- /dev/null
+++ b/SRC/sorgkl.f
@@ -0,0 +1,173 @@
+*> \brief \b SORGKL computes the explicit Q factor from SGEQLF and SLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SORGKL(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* REAL Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SORGKL generates an m by n real matrix Q with orthonormal columns,
+*> which is defined as the last n columns of the product of n
+*> elementary reflectors
+*>
+*> Q = I - V*T*V**T = H(n) . . . H(2) H(1)
+*>
+*> Where V is an m by n matrix whose columns are householder reflectors
+*> as returned by SGEQLF and T is the n by n matrix returned by SLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V, and the order of T.
+*> N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is REAL array, dimension (LDQ,N)
+*> On entry, Q(1:m-n+i-1,i) contains the vector which defines the
+*> elementary reflector H(i), for i=1,...,n as returned by SGEQLF.
+*> In addition, the lower triangular portion of the submatrix given
+*>          by Q(m-n+1:m,1:n) will contain the array T as returned by SLARFT.
+*> See further details for more information.
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The storage of the V and T components inside Q is best illustrated by
+*> the following example with m = 5, n = 3.
+*>
+*> Q = |----------|
+*> | V1 V2 V3 |
+*> | V1 V2 V3 |
+*> | T1 V2 V3 |
+*> | T1 T2 V3 |
+*> | T1 T2 T3 |
+*> |----------|
+*>
+*> \endverbatim
+*>
+* =====================================================================
+
+ SUBROUTINE SORGKL(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ REAL Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL NEG_ONE, ONE
+ PARAMETER(NEG_ONE=-1.0E+0, ONE=1.0E+0)
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL STRMM, STRTRM, SLUMM
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |---|
+* Q = | V |
+* | T |
+* |---|
+*
+* Where T is an n-by-n lower triangular matrix, and V is as described
+* in the Further Details section
+*
+* In turn, break apart V as follows
+*
+* |-----|
+* V = | V_2 |
+* | V_1 |
+* |-----|
+*
+* Where:
+*
+* V_1 \in \R^{n\times n} assumed unit upper triangular
+* V_2 \in \R^{m-n\times n}
+*
+* Compute T = T*V_1**T
+*
+ CALL STRTRM('Right', 'Lower', 'Transpose', 'Non-Unit', 'Unit',
+ $ N, ONE, Q(M-N+1,1), LDQ, Q(M-N+1,1), LDQ)
+*
+* Compute Q = -VT. This means that we need to break apart
+* Our computation in two parts
+*
+* |--------|
+* Q = | -V_2*T |
+* | -V_1*T |
+* |--------|
+*
+* Q_2 = -V_2*T (TRMM) but only when necessary
+*
+ IF (M.GT.N) THEN
+ CALL STRMM('Right', 'Lower', 'No Transpose', 'Non-Unit',
+ $ M-N, N, NEG_ONE, Q(M-N+1,1), LDQ, Q, LDQ)
+ END IF
+*
+* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL SLUMM('Right', 'Non-Unit', 'Unit', N, NEG_ONE,
+ $ Q(M-N+1,1), LDQ)
+*
+* Q = "I" + Q
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(M-N+I,I) = Q(M-N+I,I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/sorgkr.f b/SRC/sorgkr.f
new file mode 100644
index 0000000000..4bd870cdd8
--- /dev/null
+++ b/SRC/sorgkr.f
@@ -0,0 +1,153 @@
+*> \brief \b SORGKR computes the explicit Q factor from SGEQRF and SLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SORGKR(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* REAL Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SORGKR generates an m by n real matrix Q with orthonormal columns,
+*> which is defined as the first n columns of the product of n
+*> elementary reflectors
+*>
+*> Q = I - V*T*V**T = H(1) H(2) . . . H(n)
+*>
+*> Where V is an m by n matrix whose columns are householder reflectors
+*> as returned by SGEQRF and T is the n by n matrix returned by SLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V, and the order of T.
+*> N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is REAL array, dimension (LDQ,N)
+*> On entry, the upper triangular part and diagonal contains
+*> The array T as returned from SLARFT. In addition, the
+*> strictly lower triangular portion of the i-th column contains
+*> the vector which defines the elementary reflector H(i),
+*> for i = 1,2,...,n, as returned by SGEQRF
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ SUBROUTINE SORGKR(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ REAL Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL NEG_ONE, ONE
+ PARAMETER(NEG_ONE=-1.0E+0, ONE=1.0E+0)
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL STRMM, STRTRM, SLUMM
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |---|
+* Q = | T |
+* | V |
+* |---|
+*
+* Where T is an n-by-n upper triangular matrix, and V is an
+* m-by-n assumed unit lower trapezoidal matrix
+*
+* In turn, break apart V as follows
+*
+* |-----|
+* V = | V_1 |
+* | V_2 |
+* |-----|
+*
+* Where:
+*
+* V_1 \in \R^{n\times n} assumed unit lower triangular
+* V_2 \in \R^{m-n\times n}
+*
+* Compute T = T*V_1**T
+*
+ CALL STRTRM('Right', 'Upper', 'Transpose', 'Non-unit', 'Unit',
+ $ N, ONE, Q, LDQ, Q, LDQ)
+*
+* Compute Q = -VT. This means that we need to break apart
+* Our computation in two parts
+*
+* |--------|
+* Q = | -V_1*T |
+* | -V_2*T |
+* |--------|
+*
+* Q_2 = -V_2*T (TRMM) but only when necessary
+*
+ IF (M.GT.N) THEN
+ CALL STRMM('Right', 'Upper', 'No Transpose', 'Non-unit',
+ $ M-N, N, NEG_ONE, Q, LDQ, Q(N+1,1), LDQ)
+ END IF
+*
+* Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL SLUMM('Left', 'Unit', 'Non-Unit', N, NEG_ONE, Q, LDQ)
+*
+* Q = "I" + Q
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(I,I) = Q(I,I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/sorglk.f b/SRC/sorglk.f
new file mode 100644
index 0000000000..f5a486ff79
--- /dev/null
+++ b/SRC/sorglk.f
@@ -0,0 +1,149 @@
+*> \brief \b SORGLK computes the explicit Q factor from SGELQF and SLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SORGLK(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* REAL Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SORGLK generates an m by n real matrix Q with orthonormal rows,
+*> which is defined as the first m rows of the product of m
+*> elementary reflectors
+*>
+*> Q = I - V'*T*V = H(m) . . . H(2) H(1)
+*>
+*> Where V is an m by n matrix whose rows are householder reflectors
+*> as returned by SGELQF and T is the m by m matrix returned by SLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V, and the order of T. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is REAL array, dimension (LDQ,N)
+*> On entry, the lower triangular part and diagonal contains
+*> The array T as returned from SLARFT. In addition, the
+*> strictly upper triangular portion of the i-th row contains
+*> the vector which defines the elementary reflector H(i),
+*> for i = 1,2,...,m, as returned by SGELQF
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ SUBROUTINE SORGLK(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ REAL Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL NEG_ONE, ONE
+ PARAMETER(NEG_ONE=-1.0E+0, ONE=1.0E+0)
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL STRMM, STRTRM, SLUMM
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |-----|
+* Q = | T V |
+* |-----|
+*
+* Where T is an m-by-m lower triangular matrix, and V is an
+* m-by-n assumed unit upper trapezoidal matrix
+*
+* In turn, break apart V as follows
+*
+* |---------|
+* V = | V_1 V_2 |
+* |---------|
+*
+* Where:
+*
+* V_1 \in \R^{m\times m} assumed unit upper triangular
+* V_2 \in \R^{m\times n-m}
+*
+* Compute T = V_1'*T
+*
+ CALL STRTRM('Left', 'Lower', 'Transpose', 'Non-unit', 'Unit',
+ $ M, ONE, Q, LDQ, Q, LDQ)
+*
+* Compute Q = -TV. This means that we need to break apart
+* Our computation in two parts
+*
+* |---------------|
+* Q = | -T*V_1 -T*V_2 |
+* |---------------|
+*
+* Q_2 = -T*V_2 (TRMM) but only when necessary
+*
+ IF (N.GT.M) THEN
+ CALL STRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ M, N-M, NEG_ONE, Q, LDQ, Q(1,M+1), LDQ)
+ END IF
+*
+* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL SLUMM('Left', 'Non-unit', 'Unit', M, NEG_ONE, Q, LDQ)
+*
+* Q = "I" + Q
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(I,I) = Q(I,I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/sorglq.f b/SRC/sorglq.f
index fb5ef79cce..61e0c4c968 100644
--- a/SRC/sorglq.f
+++ b/SRC/sorglq.f
@@ -146,7 +146,8 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL SLARFB, SLARFT, SORGL2, XERBLA
+ EXTERNAL SLARFB0C2, SLARFT, SORGL2,
+ $ SORGLK, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -162,7 +163,7 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'SORGLQ', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, M )*NB
+ LWKOPT = MAX( 1, M )
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -191,94 +192,92 @@ SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+ NX = MAX( 0, ILAENV( 3, 'SORGLQ', ' ', M, N, K, -1 ) )
IWS = M
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'SORGLQ', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Handle the first block assuming we are applying to the
+* identity, then resume regular blocking method after
*
- LDWORK = M
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked version
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'SORGLQ', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL SORGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
+ I = KK + 1
+ IB = NB
*
-* Use blocked code after the last block.
-* The first kk rows are handled by the block method.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ CALL SLARFT( 'Forward', 'Transpose', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), A( I, I ), LDA )
*
-* Set A(kk+1:m,1:kk) to zero.
+* Apply H to A(i+ib:m,i:n) from the right
*
- DO 20 J = 1, KK
- DO 10 I = KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL SLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I+IB,I), LDA)
*
-* Use unblocked code for the last or only block.
+* Apply H to columns i:n of current block
+
+ CALL SORGLK( IB, N-I+1, A( I, I ), LDA)
*
- IF( KK.LT.M )
- $ CALL SORGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+* Use blocked code
*
- IF( KK.GT.0 ) THEN
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
-* Use blocked code
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
+*
+ CALL SLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I),
+ $ LDA, TAU( I ), A( I, I ), LDA )
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.M ) THEN
+* Apply H to A(i+ib:m,i:n) from the right
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+ CALL SLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I+IB,I), LDA)
*
- CALL SLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
- $ I ),
- $ LDA, TAU( I ), WORK, LDWORK )
+* Apply H to columns i:n of current block
*
-* Apply H**T to A(i+ib:m,i:n) from the right
+ CALL SORGLK( IB, N-I+1, A( I, I ), LDA)
+ END DO
*
- CALL SLARFB( 'Right', 'Transpose', 'Forward',
- $ 'Rowwise',
- $ M-I-IB+1, N-I+1, IB, A( I, I ), LDA, WORK,
- $ LDWORK, A( I+IB, I ), LDA, WORK( IB+1 ),
- $ LDWORK )
- END IF
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
*
-* Apply H**T to columns i:n of current block
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
- CALL SORGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Set columns 1:i-1 of current block to zero
+ CALL SLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I),
+ $ LDA, TAU( I ), A( I, I ), LDA )
*
- DO 40 J = 1, I - 1
- DO 30 L = I, I + IB - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+* Apply H to A(i+ib:m,i:n) from the right
+*
+ CALL SLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I+IB,I), LDA)
+*
+* Apply H to columns i:n of current block
+*
+ CALL SORGLK( IB, N-I+1, A( I, I ), LDA)
+ END IF
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/sorgql.f b/SRC/sorgql.f
index 7257ff2d95..960b4857cc 100644
--- a/SRC/sorgql.f
+++ b/SRC/sorgql.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -143,11 +141,12 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
- $ NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, J, KK, LWKOPT, NB,
+ $ NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL SLARFB, SLARFT, SORG2L, XERBLA
+ EXTERNAL SLARFB0C2, SLARFT, SORG2L,
+ $ SORGKL, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -178,7 +177,8 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
LWKOPT = 1
ELSE
NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 )
- LWKOPT = N*NB
+         ! Only need a workspace for calls to sorg2l
+ LWKOPT = N
END IF
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
*
@@ -201,88 +201,74 @@ SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+ NX = MAX(0, ILAENV(3, 'SORGQL', ' ', M, N, K, -1))
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'SORGQL', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* We want to use the blocking method as long as our matrix is big enough
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KK = K
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Possibly bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'SORGQL', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL SORG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the first block.
-* The last kk columns are handled by the block method.
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+ I = 1
+ IB = NB
*
-* Set A(m-kk+1:m,1:n-kk) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- DO 20 J = 1, N - KK
- DO 10 I = M - KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA)
*
-* Use unblocked code for the first or only block.
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
*
- CALL SORG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward',
+ $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I),
+ $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
-* Use blocked code
+ CALL SORGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA)
+
+* Use blocked code on the remaining blocks if there are any.
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- IF( N-K+I.GT.1 ) THEN
+ DO I = NB+1, K, NB
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* The last block may be less than size NB
*
- CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
- $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+ IB = MIN(NB, K-I+1)
*
-* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL SLARFB( 'Left', 'No transpose', 'Backward',
- $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
- $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL SLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA )
*
-* Apply H to rows 1:m-k+i+ib-1 of current block
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
*
- CALL SORG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
- $ TAU( I ), WORK, IINFO )
+ CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
+ $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
-* Set rows m-k+i+ib:m of current block to zero
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
- DO 40 J = N - K + I, N - K + I + IB - 1
- DO 30 L = M - K + I + IB, M
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL SORGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA)
+ END DO
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/sorgqr.f b/SRC/sorgqr.f
index 47d1fd248c..c52acef089 100644
--- a/SRC/sorgqr.f
+++ b/SRC/sorgqr.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -147,7 +145,8 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL SLARFB, SLARFT, SORG2R, XERBLA
+ EXTERNAL SLARFB0C2, SLARFT, SORG2R,
+ $ SORGKR, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -163,7 +162,7 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, N )*NB
+ LWKOPT = MAX( 1, N )
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -192,94 +191,92 @@ SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+ NX = MAX(0, ILAENV(3, 'SORGQR', ' ', M, N, K, -1))
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'SORGQR', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Handle the first block assuming we are applying to the
+* identity, then resume regular blocking method after
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'SORGQR', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL SORG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
+ I = KK + 1
+ IB = NB
*
-* Use blocked code after the last block.
-* The first kk columns are handled by the block method.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
-* Set A(1:kk,kk+1:n) to zero.
+* Apply H to A(i:m,i+ib:n) from the left
*
- DO 20 J = KK + 1, N
- DO 10 I = 1, KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL SLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+ $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I,I+IB), LDA)
*
-* Use unblocked code for the last or only block.
+* Apply H to rows i:m of current block
*
- IF( KK.LT.N )
- $ CALL SORG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ CALL SORGKR(M-I+1, IB, A(I,I), LDA)
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
- IF( KK.GT.0 ) THEN
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Use blocked code
+ CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.N ) THEN
+* Apply H to A(i:m,i+ib:n) from the left
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+ CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
*
- CALL SLARFT( 'Forward', 'Columnwise', M-I+1, IB,
- $ A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+* Apply H to rows i:m of current block
*
-* Apply H to A(i:m,i+ib:n) from the left
+ CALL SORGKR(M-I+1, IB, A(I,I), LDA)
+ END DO
*
- CALL SLARFB( 'Left', 'No transpose', 'Forward',
- $ 'Columnwise', M-I+1, N-I-IB+1, IB,
- $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
- $ LDA, WORK( IB+1 ), LDWORK )
- END IF
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
*
-* Apply H to rows i:m of current block
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
- CALL SORG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Set rows 1:i-1 of current block to zero
+ CALL SLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- DO 40 J = I, I + IB - 1
- DO 30 L = 1, I - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
- END IF
+* Apply H to A(i:m,i+ib:n) from the left
*
+ CALL SLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+* Apply H to rows i:m of current block
+*
+ CALL SORGKR(M-I+1, IB, A(I,I), LDA)
+ END IF
+ END IF
WORK( 1 ) = SROUNDUP_LWORK(IWS)
RETURN
*
diff --git a/SRC/sorgrk.f b/SRC/sorgrk.f
new file mode 100644
index 0000000000..c2429d93e4
--- /dev/null
+++ b/SRC/sorgrk.f
@@ -0,0 +1,168 @@
+*> \brief \b SORGRK computes the explicit Q factor from SGERQF and SLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE SORGRK(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* REAL Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> SORGRK generates an m by n real matrix Q with orthonormal rows,
+*> which is defined as the last m rows of the product of m
+*> elementary reflectors
+*>
+*> Q = I - V'*T*V = H(m) . . . H(2) H(1)
+*>
+*> Where V is an m by n matrix whose columns are householder reflectors
+*> as returned by SGERQF and T is the m by m matrix returned by SLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V, and the order of T.
+*> M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is REAL array, dimension (LDQ,N)
+*> On entry, Q(i,1:n-m-1+i) contains the vector which defines the
+*> elementary reflector H(i), for i=1,...,m as returned by SGERQF.
+*> In addition, the upper triangular portion of the submatrix given
+*> by Q(1:m,n-m+1:n) will contain the array T as returned by SLARFT.
+*> See further details for more information.
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The storage of the V and T components inside Q is best illustrated by
+*> the following example with m = 3, n = 5.
+*>
+*> Q = |----------------|
+*> | V1 V1 T1 T1 T1 |
+*> | V2 V2 V2 T2 T2 |
+*> | V3 V3 V3 V3 T3 |
+*> |----------------|
+*>
+*> \endverbatim
+*>
+* =====================================================================
+
+ SUBROUTINE SORGRK(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ REAL Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ REAL NEG_ONE, ONE
+ PARAMETER(NEG_ONE=-1.0E+0, ONE=1.0E+0)
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL STRMM, STRTRM, SLUMM
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |-----|
+* Q = | V T |
+* |-----|
+*
+* Where T is an m-by-m upper triangular matrix, and V is as described
+* in the Further Details section
+*
+* In turn, break apart V as follows
+*
+* |---------|
+* V = | V_2 V_1 |
+* |---------|
+*
+* Where:
+*
+* V_1 \in \R^{m\times m} assumed unit lower triangular
+* V_2 \in \R^{m\times n-m}
+*
+* Compute T = V_1'*T
+*
+ CALL STRTRM('Left', 'Upper', 'Transpose', 'Non-Unit', 'Unit',
+ $ M, ONE, Q(1,N-M+1), LDQ, Q(1,N-M+1), LDQ)
+*
+* Compute Q = -TV. This means that we need to break apart
+* Our computation in two parts
+*
+* |---------------|
+* Q = | -T*V_2 -T*V_1 |
+* |---------------|
+*
+* Q_2 = -T*V_2 (TRMM) but only when necessary
+*
+ IF (N.GT.M) THEN
+ CALL STRMM('Left', 'Upper', 'No Transpose', 'Non-Unit',
+ $ M, N-M, NEG_ONE, Q(1,N-M+1), LDQ, Q, LDQ)
+ END IF
+*
+* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL SLUMM('Right', 'Unit', 'Non-Unit', M, NEG_ONE,
+ $ Q(1,N-M+1), LDQ)
+*
+* Q = "I" + Q
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(I,N-M+I) = Q(I,N-M+I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/sorgrq.f b/SRC/sorgrq.f
index 5fe685c1d8..27fd47803c 100644
--- a/SRC/sorgrq.f
+++ b/SRC/sorgrq.f
@@ -147,7 +147,8 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL SLARFB, SLARFT, SORGR2, XERBLA
+ EXTERNAL SLARFB0C2, SLARFT, SORGR2,
+ $ SORGRK, XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -178,7 +179,7 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
LWKOPT = 1
ELSE
NB = ILAENV( 1, 'SORGRQ', ' ', M, N, K, -1 )
- LWKOPT = M*NB
+ LWKOPT = M
END IF
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
*
@@ -208,83 +209,72 @@ SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
* Determine when to cross over from blocked to unblocked code.
*
NX = MAX( 0, ILAENV( 3, 'SORGRQ', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
-*
-* Determine if workspace is large enough for blocked code.
-*
- LDWORK = M
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
-*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
-*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'SORGRQ', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
END IF
*
IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Use blocked code after the first block.
-* The last kk rows are handled by the block method.
-*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+* We want to use the blocking method as long as our matrix is big enough
+* and it's deemed worthwhile
*
-* Set A(1:m-kk,n-kk+1:n) to zero.
-*
- DO 20 J = N - KK + 1, N
- DO 10 I = 1, M - KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
+ KK = K
ELSE
KK = 0
END IF
*
-* Use unblocked code for the first or only block.
+* Potentially bail to the unblocked code
*
- CALL SORGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ IF( KK.EQ.0 ) THEN
+ CALL SORGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
+ END IF
*
IF( KK.GT.0 ) THEN
*
-* Use blocked code
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- II = M - K + I
- IF( II.GT.1 ) THEN
+ I = 1
+ IB = NB
+ II = M - K + I
+*
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
+*
+ CALL SLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA )
+*
+* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+ CALL SLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Backward',
+ $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA,
+ $ A( II, N-K+I ), LDA, A, LDA)
*
- CALL SLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
- $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+* Apply H to columns 1:n-k+i+ib-1 of current block
+*
+ CALL SORGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA )
+
+ DO I = NB + 1, K, NB
+*
+* The last block may be less than size NB
+*
+ IB = MIN(NB, K-I+1)
+ II = M - K + I
*
-* Apply H**T to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL SLARFB( 'Right', 'Transpose', 'Backward',
- $ 'Rowwise',
- $ II-1, N-K+I+IB-1, IB, A( II, 1 ), LDA, WORK,
- $ LDWORK, A, LDA, WORK( IB+1 ), LDWORK )
- END IF
+ CALL SLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA )
*
-* Apply H**T to columns 1:n-k+i+ib-1 of current block
+* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- CALL SORGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
- $ TAU( I ),
- $ WORK, IINFO )
+ CALL SLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1),
+ $ LDA, A( II, N-K+I ), LDA, A, LDA)
*
-* Set columns n-k+i+ib:n of current block to zero
+* Apply H to columns 1:n-k+i+ib-1 of current block
*
- DO 40 L = N - K + I + IB, N
- DO 30 J = II, II + IB - 1
- A( J, L ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL SORGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA )
+ END DO
END IF
*
WORK( 1 ) = SROUNDUP_LWORK(IWS)
diff --git a/SRC/strmmoop.f b/SRC/strmmoop.f
new file mode 100644
index 0000000000..26776369b0
--- /dev/null
+++ b/SRC/strmmoop.f
@@ -0,0 +1,1625 @@
+*> \brief \b STRMMOOP computes an out of place triangular times general matrix multiplication
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE STRMMOOP(SIDE, UPLO, TRANSA, TRANSB,
+* $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* .. Scalar Arguments ..
+* REAL ALPHA, BETA
+* INTEGER M, N, LDA, LDB, LDC
+* CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG
+* ..
+* .. Array Arguments ..
+* REAL A(LDA,*), B(LDB,*), C(LDC,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> STRMMOOP performs one of the matrix-matrix operations
+*>
+*> C = \alpha op(A) * op(B) + \beta C
+*> or
+*> C = \alpha op(B) * op(A) + \beta C
+*>
+*> where \alpha and \beta are scalars, C is an m-by-n matrix, A is
+*> a unit, or non-unit, upper or lower triangular matrix, and op(A) is
+*> is one of
+*>
+*> op(A) = A or op(A) = A**T
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> On entry, SIDE specifies whether op(A) multiplies op(B) from
+*> the left or right as follows:
+*>
+*> SIDE = 'L' or 'l' C = \alpha op(A) * op(B) + \beta C
+*>
+*> SIDE = 'R' or 'r' C = \alpha op(B) * op(A) + \beta C
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> On entry, UPLO specifies whether the matrix A is an upper or
+*> lower triangular matrix as follows:
+*> UPLO = 'U' or 'u' A is upper triangular
+*>
+*> UPLO = 'L' or 'l' A is lower triangular
+*> \endverbatim
+*>
+*> \param[in] TRANSA
+*> \verbatim
+*> TRANSA is CHARACTER*1
+*> On entry, TRANSA specifies the form of op(A) to be used in
+*> the matrix multiplication as follows:
+*> TRANSA = 'N' or 'n' op(A) = A
+*>
+*> TRANSA = 'T' or 't' op(A) = A**T
+*>
+*> TRANSA = 'C' or 'c' op(A) = A**T
+*> \endverbatim
+*>
+*> \param[in] TRANSB
+*> \verbatim
+*> TRANSB is CHARACTER*1
+*> On entry, TRANSB specifies the form of op(B) to be used in
+*> the matrix multiplication as follows:
+*> TRANSB = 'N' or 'n' op(B) = B
+*>
+*> TRANSB = 'T' or 't' op(B) = B**T
+*>
+*> TRANSB = 'C' or 'c' op(B) = B**T
+*> \endverbatim
+*>
+*> \param[in] DIAG
+*> \verbatim
+*> DIAG is CHARACTER*1
+*> On entry, DIAG specifies whether or not A is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' A is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' A is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> On entry, M specifies the number of rows of C. M must be at
+*> least zero.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> On entry, N specifies the number of columns of C. N must be
+*> at least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is REAL
+*> On entry, ALPHA specifies the scalar alpha. When alpha is
+*> zero then A and B are not referenced, and A and B need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*> A is REAL array, dimension ( LDA, K ) where
+*> K is M when SIDE = 'L' and K is N when SIDE='R'
+*> Before entry with UPLO = 'U' or 'u', the leading k-by-k
+*> upper triangular part of the array A must contain the upper
+*> triangular matrix and the strictly lower triangular part of
+*> A is not referenced.
+*> Before entry with UPLO = 'L' or 'l', the leading k-by-k
+*> lower triangular part of the array A must contain the lower
+*> triangular matrix and the strictly upper triangular part of
+*> A is not referenced.
+*> Note that when DIAG = 'U' or 'u', the diagonal elements of
+*> A are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> On entry, LDA specifies the first dimension of A as declared
+*> in the calling (sub) program. When SIDE = 'L' or 'l' then
+*> LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+*> then LDA must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*> B is REAL array, dimension ( LDB, K ), where K is N
+*> when TRANSB = 'N' or 'n', and K is M otherwise. On entry,
+*> the leading part of the array B must contain the matrix
+*> B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*> LDB is INTEGER
+*> On entry, LDB specifies the first dimension of B as declared
+*> in the calling (sub) program. When TRANSB = 'N' or 'n'
+*> then LDB must be at least max( 1, m ), otherwise LDB must
+*> be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] BETA
+*> \verbatim
+*> BETA is REAL .
+*> On entry, BETA specifies the scalar beta. When beta is
+*> zero then C is not referenced on entry, and C need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is REAL array, dimension ( LDC, N )
+*> Before entry, the leading m-by-n part of the array C must
+*> contain the matrix C, and on exit is overwritten by the
+*> transformed matrix.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> On entry, LDC specifies the first dimension of C as declared
+*> in the calling (sub) program. LDC must be at least
+*> max( 1, m ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ RECURSIVE SUBROUTINE STRMMOOP(SIDE, UPLO, TRANSA, TRANSB,
+ $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* .. Scalar Arguments ..
+ REAL ALPHA, BETA
+ INTEGER M, N, LDA, LDB, LDC
+ CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG
+* ..
+* .. Array Arguments ..
+ REAL A(LDA,*), B(LDB,*), C(LDC,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ REAL SDOT
+ EXTERNAL LSAME, SDOT
+* ..
+* .. External Subroutines ..
+ EXTERNAL SGEMM, SAXPY, SLASET, SSCAL
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC MIN
+* ..
+* .. Local Scalars ..
+ INTEGER I, L, K, INCB
+ LOGICAL LSIDE, UPPER, UNIT, TRANST, TRANSG
+* ..
+* .. Local Parameters ..
+ REAL ONE, ZERO
+ PARAMETER(ONE=1.0E+0, ZERO=0.0E+0)
+* ..
+*
+* Beginning of Executable Statements
+*
+ LSIDE = LSAME(SIDE, 'L')
+ UPPER = LSAME(UPLO, 'U')
+ ! If we are transposing the triangular matrix (A)
+ TRANST= LSAME(TRANSA, 'T').OR.LSAME(TRANSA, 'C')
+ ! If we are transposing the general matrix (B)
+ TRANSG= LSAME(TRANSB, 'T').OR.LSAME(TRANSB, 'C')
+*
+* Terminating Case
+*
+ UNIT = LSAME(DIAG, 'U')
+ IF (M.EQ.1.AND.N.EQ.1) THEN
+*
+* This case is the simplest as we are just computing C = \alpha A*B +
+* \beta C where all components are 1-by-1 matrices
+*
+
+ IF (BETA.EQ.ZERO) THEN
+ C(1,1) = ZERO
+ ELSE
+ C(1,1) = C(1,1) * BETA
+ END IF
+ IF(ALPHA.NE.ZERO) THEN
+ IF(UNIT) THEN
+ C(1,1) = C(1,1) + ALPHA*B(1,1)
+ ELSE
+ C(1,1) = C(1,1) + ALPHA*A(1,1)*B(1,1)
+ END IF
+ END IF
+ RETURN
+ ELSE IF (M.EQ.1) THEN
+*
+* This means that C is a row vector. If BETA is 0, then we
+* set it explicitly, otherwise we overwrite it with BETA*C
+*
+ IF (BETA.EQ.ZERO) THEN
+ ! This ensures we don't reference C unless we need to
+ CALL SLASET('All', M, N, ZERO, ZERO, C, LDC)
+ ELSE
+ CALL SSCAL(N, BETA, C, LDC)
+ END IF
+ IF (ALPHA.NE.ZERO) THEN
+*
+* Recall that the number of columns of A is determined by SIDE
+*
+ IF (LSIDE) THEN
+*
+* Determine if B is a row or column vector
+*
+ IF (TRANSG) THEN
+ INCB = 1 ! This means that B is a column vector
+ ELSE
+ INCB = LDB ! This means that B is a row vector
+ END IF
+*
+* This means that A is a scalar, so it is either assumed to be
+* ONE or explicitly stored in A(1,1)
+*
+ IF (UNIT) THEN
+ CALL SAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ CALL SAXPY(N, ALPHA * A(1,1), B, INCB, C, LDC)
+ END IF
+ ELSE ! A is on the right
+*
+* Determine if B is a row or column vector
+*
+ IF (TRANSG) THEN
+ INCB = 1 ! This means that B is a column vector
+ ELSE
+ INCB = LDB ! This means that B is a row vector
+ END IF
+*
+* This means that A is an n-by-n matrix
+*
+ IF (UPPER) THEN
+ IF (TRANST) THEN
+ IF (TRANSG) THEN
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(N-I,
+ $ B(I+1,1), 1, A(I,I+1), LDA) +
+ $ C(1,I)
+ END DO
+ CALL SAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(N-I+1,
+ $ B(I,1), 1, A(I,I), LDA) +
+ $ C(1,I)
+ END DO
+ END IF
+ ELSE ! Not transposing B
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(N-I,
+ $ B(1,I+1), LDB, A(I,I+1), LDA) +
+ $ C(1,I)
+ END DO
+ CALL SAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(N-I+1,
+ $ B(1,I), LDB, A(I,I), LDA) +
+ $ C(1,I)
+ END DO
+ END IF
+ END IF
+ ELSE ! Not transposing A
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(I-1, B, INCB,
+ $ A(1,I), 1) + C(1,I)
+ END DO
+
+ CALL SAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(I, B, INCB,
+ $ A(1,I), 1) + C(1,I)
+ END DO
+ END IF
+ END IF
+ ELSE ! A is lower
+ IF (TRANST) THEN
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(I-1, B, INCB,
+ $ A(I,1), LDA) + C(1,I)
+ END DO
+
+ CALL SAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(I, B, INCB,
+ $ A(I,1), LDA) + C(1,I)
+ END DO
+ END IF
+ ELSE ! A is not transposed
+ IF (TRANSG) THEN
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(N-I,
+ $ B(I+1,1), 1, A(I+1,I), 1) +
+ $ C(1,I)
+ END DO
+ CALL SAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(N-I+1,
+ $ B(I,1), 1, A(I,I), 1) +
+ $ C(1,I)
+ END DO
+ END IF
+ ELSE! B is not transposed
+ IF (UNIT) THEN
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(N-I,
+ $ B(1,I+1), LDB, A(I+1,I), 1) +
+ $ C(1,I)
+ END DO
+ CALL SAXPY(N, ALPHA, B, INCB, C, LDC)
+ ELSE
+ DO I = 1, N
+ C(1,I) = ALPHA * SDOT(N-I+1,
+ $ B(1,I), LDB, A(I,I), 1) +
+ $ C(1,I)
+ END DO
+ END IF
+ END IF
+ END IF
+ END IF
+ END IF
+ END IF
+ RETURN
+ ELSE IF (N.EQ.1) THEN
+*
+* This means that C is a column vector. If BETA is 0, then we
+* set it explicitly, otherwise we overwrite it with BETA*C
+*
+ IF (BETA.EQ.ZERO) THEN
+ ! This ensures we don't reference C unless we need to
+ CALL SLASET('All', M, N, ZERO, ZERO, C, LDC)
+ ELSE
+ CALL SSCAL(M, BETA, C, 1)
+ END IF
+
+ IF (ALPHA.NE.ZERO) THEN
+ IF (TRANSG) THEN
+ INCB = LDB ! B is a row vector
+ ELSE
+ INCB = 1 ! B is a column vector
+ END IF
+ IF (LSIDE) THEN
+ IF (UPPER) THEN
+ IF (TRANST) THEN
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(I-1, A(1, I),
+ $ 1, B, INCB) + C(I,1)
+ END DO
+ CALL SAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(I, A(1, I), 1,
+ $ B, INCB) + C(I,1)
+ END DO
+ END IF
+ ELSE ! A is not transposed
+ IF (TRANSG) THEN
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(M-I,
+ $ A(I,I+1), LDA, B(1, I+1), LDB) +
+ $ C(I,1)
+ END DO
+
+ CALL SAXPY(M, ALPHA, B, LDB, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(M-I+1,
+ $ A(I,I), LDA, B(1,I), LDB) +
+ $ C(I,1)
+ END DO
+ END IF
+ ELSE ! B is not transposed
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(M-I,
+ $ A(I,I+1), LDA, B(I+1,1), 1) +
+ $ C(I,1)
+ END DO
+
+ CALL SAXPY(M, ALPHA, B, 1, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(M-I+1,
+ $ A(I,I), LDA, B(I,1), 1) +
+ $ C(I,1)
+ END DO
+ END IF
+ END IF
+ END IF
+ ELSE ! A is lower
+ IF (TRANST) THEN
+ IF (TRANSG) THEN
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(M-I,
+ $ A(I+1,I), 1, B(1,I+1), LDB) +
+ $ C(I,1)
+ END DO
+
+ CALL SAXPY(M, ALPHA, B, LDB, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(M-I+1,
+ $ A(I,I), 1, B(1,I), LDB) +
+ $ C(I,1)
+ END DO
+ END IF
+ ELSE ! A is not transposed
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(M-I,
+ $ A(I+1,I), 1, B(I+1,1), 1) +
+ $ C(I,1)
+ END DO
+
+ CALL SAXPY(M, ALPHA, B, 1, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(M-I+1,
+ $ A(I,I), 1, B(I,1), 1) +
+ $ C(I,1)
+ END DO
+ END IF
+ END IF
+ ELSE ! A is not transposed
+ IF (UNIT) THEN
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(I-1, A(I,1),
+ $ LDA, B, INCB) + C(I,1)
+ END DO
+ CALL SAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+ DO I = 1, M
+ C(I,1) = ALPHA * SDOT(I, A(I,1), LDA,
+ $ B, INCB) + C(I,1)
+ END DO
+ END IF
+ END IF
+ END IF
+ ELSE ! A is on the right
+ ! Since the trailing dimension of op(A) must be 1,
+ ! we know that A must be a scalar
+ IF (UNIT) THEN
+ CALL SAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+ CALL SAXPY(M, ALPHA*A(1,1), B, INCB, C, 1)
+ END IF
+ END IF
+ END IF
+ RETURN
+ END IF
+*
+* Recursive Case
+*
+ L = MIN(M,N)/2
+ IF (LSIDE) THEN
+*
+* We are multiplying A from the left IE we are computing
+* C = \alpha op(A)*op(B) + \beta C
+*
+ IF (UPPER) THEN
+*
+* A is upper triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing
+*
+* So we are computing
+* C = \alpha A**T * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11}**T + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{21}**T + \beta C_{12}
+* C_{21} = \alpha A_{12}**T * B_{11}**T + \alpha A_{22}**T * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{12}**T * B_{21}**T + \alpha A_{22}**T * B_{22}**T + \beta C_{22}
+*
+* Computing C_{12} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{12}**T * B_{11}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22}**T * B_{12}**T + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{12}**T * B_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22}**T * B_{22}**T + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C,
+ $ LDC)
+ ! C_{12}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL SGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(1, L+1), LDA, B, LDB, BETA, C(L+1,1),
+ $ LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1,L+1), LDA, B(1,L+1),
+ $ LDB, ONE, C(L+1,1), LDC)
+ ! C_{22}
+ CALL SGEMM(TRANSA, TRANSB, M-L, N-L, L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1,1), LDB, BETA,
+ $ C(L+1,L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1,L+1), LDA,
+ $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11} + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{12} + \beta C_{12}
+* C_{21} = \alpha A_{12}**T * B_{11} + \alpha A_{22}**T * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{12}**T * B_{12} + \alpha A_{22}**T * B_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{12}**T * B_{11} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22}**T * B_{21} + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{12}**T * B_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22}**T * B_{22} + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ ! C_{21}
+ CALL SGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(1, L+1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL SGEMM(TRANSA, TRANSB, M-L, N-L, L,
+ $ ALPHA, A(1, L+1), LDA, B(1, L+1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11}**T + \alpha A_{12} * B_{12}**T + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12}
+* C_{21} = \alpha A_{22} * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{22} * B_{22}**T + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{12} * B_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11} * B_{11}**T + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{12} * B_{22}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11} * B_{21}**T + C_{12} (This routine)
+*
+ ! C_{11}
+ CALL SGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(1, L+1), LDB, BETA, C, LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL SGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1,1), LDB, ONE,
+ $ C(1, L+1), LDC)
+ ! C_{21}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11} + \alpha A_{12} * B_{21} + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{12} + \alpha A_{12} * B_{22} + \beta C_{12}
+* C_{21} = \alpha A_{22} * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{22} * B_{22} + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{12} * B_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11} * B_{11} + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{12} * B_{22} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11} * B_{12} + C_{12} (This routine)
+*
+ ! C_{11}
+ CALL SGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, 1), LDB, BETA, C, LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL SGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ ONE, C(1, L+1), LDC)
+ ! C_{21}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ ELSE
+*
+* A is lower triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11}**T + \alpha A_{21}**T * B_{12}**T + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{21}**T + \alpha A_{21}**T * B_{22}**T + \beta C_{12}
+* C_{21} = \alpha A_{22}**T * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{22}**T * B_{22}**T + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{21}**T * B_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11}**T * B_{11}**T + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{21}**T * B_{22}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11}**T * B_{21}**T + C_{12} (This routine)
+*
+ ! C_{11}
+ CALL SGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(1, L+1), LDB, BETA, C, LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL SGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE,
+ $ C(1, L+1), LDC)
+ ! C_{21}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11} + \alpha A_{21}**T * B_{21} + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{12} + \alpha A_{21}**T * B_{22} + \beta C_{12}
+* C_{21} = \alpha A_{22}**T * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{22}**T * B_{22} + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{21}**T * B_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11}**T * B_{11} + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{21}**T * B_{22} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11}**T * B_{12} + C_{12} (This routine)
+*
+ ! C_{11}
+ CALL SGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, C, LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL SGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, ONE,
+ $ C(1, L+1), LDC)
+ ! C_{21}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11}**T + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{21}**T + \beta C_{12}
+* C_{21} = \alpha A_{21} * B_{11}**T + \alpha A_{22} * B_{12}**T + \beta * C_{21}
+* C_{22} = \alpha A_{21} * B_{21}**T + \alpha A_{22} * B_{22}**T + \beta * C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{21} * B_{11}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22} * B_{12}**T + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{21} * B_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22} * B_{22}**T + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL SGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL SGEMM(TRANSA, TRANSB, M-L, N-L, L,
+ $ ALPHA, A(L+1, 1), LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11} + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{12} + \beta C_{12}
+* C_{21} = \alpha A_{21} * B_{11} + \alpha A_{22} * B_{21} + \beta * C_{21}
+* C_{22} = \alpha A_{21} * B_{12} + \alpha A_{22} * B_{22} + \beta * C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{21} * B_{11} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22} * B_{21} + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{21} * B_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22} * B_{22} + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL SGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL SGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, A(L+1, 1), LDA, B(1, L+1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ END IF
+ ELSE
+*
+* We are multiplying A from the right IE we are computing
+* C = \alpha op(B)*op(A) + \beta C
+*
+ IF (UPPER) THEN
+*
+* A is upper triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11}**T + \alpha B_{21}**T * A_{12}**T + \beta C_{11}
+* C_{12} = \alpha B_{21}**T * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11}**T + \alpha B_{22}**T * A_{12}**T + \beta C_{21}
+* C_{22} = \alpha B_{22}**T * A_{22}**T + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{21}**T * A_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11}**T * A_{11}**T + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22}**T * A_{12}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{12}**T * A_{11}**T + C_{21} (This routine)
+*
+ ! C_{11}
+ CALL SGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(L+1, 1), LDB, A(1, L+1), LDA, BETA, C, LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL SGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ ONE, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11}**T + \alpha B_{12} * A_{12}**T + \beta C_{11}
+* C_{12} = \alpha B_{12} * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11}**T + \alpha B_{22} * A_{12}**T + \beta C_{21}
+* C_{22} = \alpha B_{22} * A_{22}**T + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{12} * A_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11} * A_{11}**T + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22} * A_{12}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{21} * A_{11}**T + C_{21} (This routine)
+*
+ ! C_{11}
+ CALL SGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(1,L+1), LDB, A(1,L+1), LDA, BETA, C, LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL SGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ ONE, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11} + \beta C_{11}
+* C_{12} = \alpha B_{11}**T * A_{12} + \alpha B_{21}**T * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11} + \beta C_{21}
+* C_{22} = \alpha B_{12}**T * A_{12} + \alpha B_{22}**T * A_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11}**T * A_{12} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{21}**T * A_{22} + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{12}**T * A_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22}**T * A_{22} + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL SGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(1, L+1), LDC)
+ ! C_{21}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL SGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, B(1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11}
+* C_{12} = \alpha B_{11} * A_{12} + \alpha B_{12} * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11} + \beta C_{21}
+* C_{22} = \alpha B_{21} * A_{12} + \alpha B_{22} * A_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11} * A_{12} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{12} * A_{22} + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{21} * A_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22} * A_{22} + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL SGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(1, L+1), LDC)
+ ! C_{21}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, 1), LDC)
+ ! C_{22}
+ CALL SGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, B(L+1, 1), LDB, A(1, L+1), LDA,
+ $ BETA, C(L+1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ ELSE
+*
+* A is lower triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11}**T + \beta C_{11}
+* C_{12} = \alpha B_{11}**T * A_{21}**T + \alpha B_{21}**T * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11}**T + \beta C_{21}
+* C_{22} = \alpha B_{12}**T * A_{21}**T + \alpha B_{22}**T * A_{22}**T + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11}**T * A_{21}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{21}**T * A_{22}**T + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{12}**T * A_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22}**T * A_{22}**T + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL SGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(1, L+1), LDC)
+ ! C_{21}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(L+1, 1), LDC)
+ ! C_{22}
+ CALL SGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA,
+ $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11}**T + \beta C_{11}
+* C_{12} = \alpha B_{11} * A_{21}**T + \alpha B_{12} * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11}**T + \beta C_{21}
+* C_{22} = \alpha B_{21} * A_{21}**T + \alpha B_{22} * A_{22}**T + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11} * A_{21}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{12} * A_{22}**T + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{21} * A_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22} * A_{22}**T + C_{22} (This routine)
+*
+ ! C_{11}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+ ! C_{12}
+ CALL SGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(1, L+1), LDC)
+ ! C_{21}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, 1), LDC)
+ ! C_{22}
+ CALL SGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA,
+ $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11} + \alpha B_{21}**T * A_{21} + \beta C_{11}
+* C_{12} = \alpha B_{21}**T * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11} + \alpha B_{22}**T * A_{21} + \beta C_{21}
+* C_{22} = \alpha B_{22}**T * A_{22} + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{21}**T * A_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11}**T * A_{11} + C_{11}(This routine)
+*
+* C_{21} = \alpha B_{22}**T * A_{21} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{12}**T * A_{11} + C_{21} (This routine)
+*
+ ! C_{11}
+ CALL SGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, C, LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL SGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, ONE,
+ $ C(L+1, 1), LDC)
+ ! C_{22}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11} + \alpha B_{12} * A_{21} + \beta C_{11}
+* C_{12} = \alpha B_{12} * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11} + \alpha B_{22} * A_{21} + \beta C_{21}
+* C_{22} = \alpha B_{22} * A_{22} + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{12} * A_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11} * A_{11} + C_{11}(This routine)
+*
+* C_{21} = \alpha B_{22} * A_{21} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{21} * A_{11} + C_{21} (This routine)
+*
+ ! C_{11}
+ CALL SGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, C, LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+ ! C_{12}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(1, L+1), LDC)
+ ! C_{21}
+ CALL SGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE,
+ $ C(L+1, 1), LDC)
+ ! C_{22}
+ CALL STRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ END IF
+ END IF
+ END SUBROUTINE
diff --git a/SRC/strtrm.f b/SRC/strtrm.f
new file mode 100644
index 0000000000..ba8f4a0dba
--- /dev/null
+++ b/SRC/strtrm.f
@@ -0,0 +1,577 @@
+*> \brief \b STRTRM computes an in place triangular-triangular matrix product
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE STRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV,
+* $ N, ALPHA, T, LDT, V, LDV)
+*
+* .. Scalar Arguments ..
+* INTEGER N, LDT, LDV
+* CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV
+* REAL ALPHA
+* ..
+* .. Array Arguments ..
+* REAL T(LDT,*), V(LDV,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> STRTRM performs one of the matrix-matrix operations
+*>
+*> T = \alpha op(V) * T
+*> or
+*> T = \alpha T * op(V)
+*> where \alpha is a scalar, T and V are unit, or non-unit, upper or
+*> lower triangular matrix, and op(V) is one of
+*>
+*> op(V) = V or op(V) = V**T
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> On entry, SIDE specifies whether op(V) multiplies T from
+*> the left or right as follows:
+*>
+*> SIDE = 'L' or 'l' T = \alpha op(V) * T
+*>
+*> SIDE = 'R' or 'r' T = \alpha T * op(V)
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> On entry, UPLO specifies whether the matrix T is an upper or
+*> lower triangular matrix as follows:
+*> UPLO = 'U' or 'u' T is upper triangular
+*>
+*> UPLO = 'L' or 'l' T is lower triangular
+*> \endverbatim
+*>
+*> \param[in] TRANSV
+*> \verbatim
+*> TRANSV is CHARACTER*1
+*> On entry, TRANSV specifies the form of op(V) to be used in
+*> the matrix multiplication as follows:
+*> TRANSV = 'N' or 'n' op(V) = V
+*>
+*> TRANSV = 'T' or 't' op(V) = V**T
+*>
+*> TRANSV = 'C' or 'c' op(V) = V**T
+*> \endverbatim
+*>
+*> \param[in] DIAGT
+*> \verbatim
+*> DIAGT is CHARACTER*1
+*> On entry, DIAGT specifies whether or not T is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' T is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' T is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] DIAGV
+*> \verbatim
+*> DIAGV is CHARACTER*1
+*> On entry, DIAGV specifies whether or not V is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' V is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' V is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> On entry, N specifies the number of rows and columns of T.
+*> N must be at least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is REAL .
+*> On entry, ALPHA specifies the scalar alpha. When alpha is
+*> zero then T and V are not referenced, and T and V need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is REAL array, dimension ( LDT, N )
+*> Before entry with UPLO = 'U' or 'u', the leading n-by-n
+*> upper triangular part of the array T must contain the upper
+*> triangular matrix and the strictly lower triangular part of
+*> T is not referenced.
+*> Before entry with UPLO = 'L' or 'l', the leading n-by-n
+*> lower triangular part of the array T must contain the lower
+*> triangular matrix and the strictly upper triangular part of
+*> T is not referenced.
+*> Note that when DIAGT = 'U' or 'u', the diagonal elements of
+*> T are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> On entry, LDT specifies the first dimension of T as declared
+*> in the calling (sub) program. LDT must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is REAL array, dimension ( LDV, N )
+*> Before entry with UPLO = 'U' or 'u', the leading n-by-n
+*> upper triangular part of the array op(V) must contain the upper
+*> triangular matrix and the strictly lower triangular part of
+*> V is not referenced.
+*> Before entry with UPLO = 'L' or 'l', the leading n-by-n
+*> lower triangular part of the array op(V) must contain the lower
+*> triangular matrix and the strictly upper triangular part of
+*> V is not referenced.
+*> Note that when DIAGV = 'U' or 'u', the diagonal elements of
+*> V are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> On entry, LDV specifies the first dimension of V as declared
+*> in the calling (sub) program. LDV must be at least max( 1, n ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ RECURSIVE SUBROUTINE STRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV,
+ $ N, ALPHA, T, LDT, V, LDV)
+*
+* .. Scalar Arguments ..
+ INTEGER N, LDT, LDV
+ CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV
+ REAL ALPHA
+* ..
+* .. Array Arguments ..
+ REAL T(LDT,*), V(LDV,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* .. External Subroutines ..
+ EXTERNAL STRMM, STRMMOOP, SLASET
+* ..
+* .. Local Scalars ..
+ INTEGER K, INFO
+ LOGICAL TLEFT, TUPPER, VTRANS, VUNIT, TUNIT
+* ..
+* .. Local Parameters ..
+ REAL ONE, ZERO
+ PARAMETER(ONE=1.0E+0, ZERO=0.0E+0)
+* ..
+*
+* Beginning of Executable Statements
+*
+*
+* Early Termination Criteria
+*
+ IF (ALPHA.EQ.ZERO) THEN
+*
+* If ALPHA is 0, then we are just setting T to be the 0 matrix
+*
+ CALL SLASET(UPLO, N, N, ZERO, ZERO, T, LDT)
+ RETURN
+ END IF
+ TUNIT = LSAME(DIAGT, 'U')
+ VUNIT = LSAME(DIAGV, 'U')
+*
+* Terminating Case
+*
+ IF (N.EQ.1) THEN
+ IF (VUNIT.AND.TUNIT) THEN
+ T(1,1) = ALPHA
+ ELSE IF (VUNIT) THEN
+ T(1,1) = ALPHA*T(1,1)
+ ELSE IF (TUNIT) THEN
+ T(1,1) = ALPHA*V(1,1)
+ ELSE
+ T(1,1) = ALPHA*T(1,1)*V(1,1)
+ END IF
+ RETURN
+ ELSE IF(N.LE.0) THEN
+ RETURN
+ END IF
+*
+* Recursive case
+*
+ TUPPER = LSAME(UPLO,'U')
+ TLEFT = LSAME(SIDE,'R')
+ VTRANS = LSAME(TRANSV,'T').OR.LSAME(TRANSV,'C')
+
+ K = N / 2
+ IF(TUPPER) THEN
+*
+* T is upper triangular
+*
+ IF(TLEFT) THEN
+*
+* Compute T = T*op(V)
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = T*V**T, which we break down as follows
+* |--------------| |--------------| |--------------------|
+* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11}**T V_{21}**T|
+* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}**T|
+* |--------------| |--------------| |--------------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}**T
+* T_{12} = \alpha T_{11}*V_{21}**T + \alpha T_{12}*V_{22}**T
+* T_{22} = \alpha T_{22}*V_{22}**T
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha T_{12}*V_{22}**T (STRMM)
+* T_{12} = \alpha T_{11}*V_{21}**T + T_{12} (STRMMOOP)
+*
+* T_{12} = \alpha T_{12}*V_{22}**T
+*
+ CALL STRMM('Right', 'Lower', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT)
+*
+* T_{12} = \alpha T_{11}*V_{21}**T + T_{12}
+*
+ CALL STRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT,
+ $ V(K+1, 1), LDV, ONE, T(1, K+1), LDT)
+ ELSE
+*
+* We are computing T = T*V, which we break down as follows
+* |--------------| |--------------| |-------------|
+* |T_{11} T_{12}| |T_{11} T_{12}| |V_{11} V_{12}|
+* |0 T_{22}| = \alpha |0 T_{22}| * |0 V_{22}|
+* |--------------| |--------------| |-------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}
+* T_{12} = \alpha T_{11}*V_{12} + \alpha T_{12}*V_{22}
+* T_{22} = \alpha T_{22}*V_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha T_{12}*V_{22} (STRMM)
+* T_{12} = \alpha T_{11}*V_{12} + T_{12} (STRMMOOP)
+*
+* T_{12} = \alpha T_{12}*V_{22}
+*
+ CALL STRMM('Right', 'Upper', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT)
+*
+* T_{12} = \alpha T_{11}*V_{12} + T_{12}
+*
+ CALL STRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T, LDT,
+ $ V(1, K+1), LDV, ONE, T(1, K+1), LDT)
+ END IF
+ ELSE
+*
+* Compute T = op(V)*T
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = V**T*T, which we break down as follows
+* |--------------| |--------------------| |--------------|
+* |T_{11} T_{12}| |V_{11}**T V_{21}**T| |T_{11} T_{12}|
+* |0 T_{22}| = \alpha |0 V_{22}**T| * |0 T_{22}|
+* |--------------| |--------------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}**T*T_{11}
+* T_{12} = \alpha V_{11}**T*T_{12} + \alpha V_{21}**T*T_{22}
+* T_{22} = \alpha V_{22}**T*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha V_{11}**T*T_{12} (STRMM)
+* T_{12} = \alpha V_{21}**T*T_{22} + T_{12} (STRMMOOP)
+*
+* T_{12} = \alpha V_{11}**T*T_{12}
+*
+ CALL STRMM('Left', 'Lower', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V, LDV, T(1, K+1), LDT)
+*
+* T_{12} = \alpha V_{21}**T*T_{22} + T_{12}
+*
+ CALL STRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1),
+ $ LDT, V(K+1, 1), LDV, ONE, T(1, K+1), LDT)
+ ELSE
+*
+* We are computing T = V*T, which we break down as follows
+* |--------------| |--------------| |--------------|
+* |T_{11} T_{12}| |V_{11} V_{12}| |T_{11} T_{12}|
+* |0 T_{22}| = \alpha |0 V_{22}| * |0 T_{22}|
+* |--------------| |--------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+* T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}*T_{11}
+* T_{12} = \alpha V_{11}*T_{12} + \alpha V_{12}*T_{22}
+* T_{22} = \alpha V_{22}*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{12} as follows
+*
+* T_{12} = \alpha V_{11}*T_{12} (STRMM)
+* T_{12} = \alpha V_{12}*T_{22} + T_{12} (STRMMOOP)
+*
+* T_{12} = \alpha V_{11}*T_{12}
+*
+ CALL STRMM('Left', 'Upper', TRANSV, DIAGV, K,
+ $ N-K, ALPHA, V, LDV, T(1, K+1), LDT)
+*
+* T_{12} = \alpha V_{12}*T_{22} + T_{12} (STRMMOOP)
+*
+ CALL STRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1),
+ $ LDT, V(1, K+1), LDV, ONE, T(1, K+1), LDT)
+ END IF
+ END IF
+ ELSE
+*
+* T is lower triangular
+*
+ IF(TLEFT) THEN
+*
+* Compute T = T*op(V)
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = T*V**T, which we break down as follows
+* |--------------| |--------------| |--------------------|
+* |T_{11} 0 | |T_{11} 0 | |V_{11}**T 0 |
+* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{12}**T V_{22}**T|
+* |--------------| |--------------| |--------------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}**T
+* T_{21} = \alpha T_{21}*V_{11}**T + \alpha T_{22}*V_{12}**T
+* T_{22} = \alpha T_{22}*V_{22}**T
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{21} as follows
+*
+* T_{21} = \alpha T_{21}*V_{11}**T (STRMM)
+* T_{21} = \alpha T_{22}*V_{12}**T + T_{21} (STRMMOOP)
+*
+* T_{21} = \alpha T_{21}*V_{11}**T
+*
+ CALL STRMM('Right', 'Upper', TRANSV, DIAGV, N-K,
+ $ K, ALPHA, V, LDV, T(K+1, 1), LDT)
+*
+* T_{21} = \alpha T_{22}*V_{12}**T + T_{21}
+*
+ CALL STRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1),
+ $ LDT, V(1, K+1), LDV, ONE, T(K+1, 1), LDT)
+ ELSE
+*
+* We are computing T = T*V, which we break down as follows
+* |--------------| |--------------| |-------------|
+* |T_{11} 0 | |T_{11} 0 | |V_{11} 0 |
+* |T_{21} T_{22}| = \alpha |T_{21} T_{22}| * |V_{21} V_{22}|
+* |--------------| |--------------| |-------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha T_{11}*V_{11}
+* T_{21} = \alpha T_{21}*V_{11} + \alpha T_{22}*V_{21}
+* T_{22} = \alpha T_{22}*V_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{21} as follows
+*
+* T_{21} = \alpha T_{21}*V_{11} (STRMM)
+* T_{21} = \alpha T_{22}*V_{21} + T_{21} (STRMMOOP)
+*
+* T_{21} = \alpha T_{21}*V_{11}
+*
+ CALL STRMM('Right', 'Lower', TRANSV, DIAGV, N-K,
+ $ K, ALPHA, V, LDV, T(K+1, 1), LDT)
+*
+* T_{21} = \alpha T_{22}*V_{21} + T_{21}
+*
+ CALL STRMMOOP('Left', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1),
+ $ LDT, V(K+1, 1), LDV, ONE, T(K+1, 1), LDT)
+ END IF
+ ELSE
+*
+* Compute T = op(V)*T
+*
+ IF(VTRANS) THEN
+*
+* We are computing T = V**T*T, which we break down as follows
+* |--------------| |--------------------| |--------------|
+* |T_{11} 0 | |V_{11}**T 0 | |T_{11} 0 |
+* |T_{21} T_{22}| = \alpha |V_{12}**T V_{22}**T| * |T_{21} T_{22}|
+* |--------------| |--------------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+* V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}**T*T_{11}
+* T_{21} = \alpha V_{12}**T*T_{11} + \alpha V_{22}**T*T_{21}
+* T_{22} = \alpha V_{22}**T*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{21} as follows
+*
+* T_{21} = \alpha V_{22}**T*T_{21} (STRMM)
+* T_{21} = \alpha V_{12}**T*T_{11} + T_{21} (STRMMOOP)
+*
+* T_{21} = \alpha V_{22}**T*T_{21}
+*
+ CALL STRMM('Left', 'Upper', TRANSV, DIAGV, N-K, K,
+ $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT)
+*
+* T_{21} = \alpha V_{12}**T*T_{11} + T_{21}
+*
+ CALL STRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT,
+ $ V(1, K+1), LDV, ONE, T(K+1, 1), LDT)
+ ELSE
+*
+* We are computing T = V*T, which we break down as follows
+* |--------------| |-------------| |--------------|
+* |T_{11} 0 | |V_{11} 0 | |T_{11} 0 |
+* |T_{21} T_{22}| = \alpha |V_{21} V_{22}| * |T_{21} T_{22}|
+* |--------------| |-------------| |--------------|
+*
+* Where
+* T_{11}\in\R^{k\times k}
+* T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+* V_{11}\in\R^{k\times k}
+* V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+* Which means that we get
+*
+* T_{11} = \alpha V_{11}*T_{11}
+* T_{21} = \alpha V_{21}*T_{11} + \alpha V_{22}*T_{21}
+* T_{22} = \alpha V_{22}*T_{22}
+*
+* Computing T_{11} and T_{22} are just recursive calls to this
+* routine, but we can break down computing T_{21} as follows
+*
+* T_{21} = \alpha V_{22}*T_{21} (STRMM)
+* T_{21} = \alpha V_{21}*T_{11} + T_{21} (STRMMOOP)
+*
+* T_{21} = \alpha V_{22}*T_{21}
+*
+ CALL STRMM('Left', 'Lower', TRANSV, DIAGV, N-K, K,
+ $ ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT)
+*
+* T_{21} = \alpha V_{21}*T_{11} + T_{21}
+*
+ CALL STRMMOOP('Right', UPLO, 'No Transpose',
+ $ TRANSV, DIAGT, N-K, K, ALPHA, T, LDT,
+ $ V(K+1, 1), LDV, ONE, T(K+1, 1), LDT)
+ END IF
+ END IF
+ END IF
+*
+* Since in all the above cases, we compute T_{11} and T_{22}
+* the same, we pass in our flags and call this routine recursively
+*
+* Compute T_{11} recursively
+*
+ CALL STRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, K, ALPHA,
+ $ T, LDT, V, LDV)
+*
+* Compute T_{22} recursively
+*
+ CALL STRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, N-K, ALPHA,
+ $ T(K+1, K+1), LDT, V(K+1, K+1), LDV)
+
+ END SUBROUTINE
diff --git a/SRC/zacxpy.f b/SRC/zacxpy.f
new file mode 100644
index 0000000000..14d07f77d2
--- /dev/null
+++ b/SRC/zacxpy.f
@@ -0,0 +1,133 @@
+*> \brief \b ZACXPY
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZACXPY(N,ZA,ZX,INCX,ZY,INCY)
+*
+* .. Scalar Arguments ..
+* COMPLEX*16 ZA
+* INTEGER INCX,INCY,N
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 ZX(*),ZY(*)
+* ..
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZACXPY constant times a conjugated vector plus a vector.
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> number of elements in input vector(s)
+*> \endverbatim
+*>
+*> \param[in] ZA
+*> \verbatim
+*> ZA is COMPLEX*16
+*> On entry, ZA specifies the scalar alpha.
+*> \endverbatim
+*>
+*> \param[in] ZX
+*> \verbatim
+*> ZX is COMPLEX*16 array, dimension ( 1 + ( N - 1 )*abs( INCX ) )
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*> INCX is INTEGER
+*> storage spacing between elements of ZX
+*> \endverbatim
+*>
+*> \param[in,out] ZY
+*> \verbatim
+*> ZY is COMPLEX*16 array, dimension ( 1 + ( N - 1 )*abs( INCY ) )
+*> \endverbatim
+*>
+*> \param[in] INCY
+*> \verbatim
+*> INCY is INTEGER
+*> storage spacing between elements of ZY
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup axpy
+*>
+* =====================================================================
+ SUBROUTINE ZACXPY(N,ZA,ZX,INCX,ZY,INCY)
+*
+* -- Reference BLAS level1 routine --
+* -- Reference BLAS is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ COMPLEX*16 ZA
+ INTEGER INCX,INCY,N
+* ..
+* .. Array Arguments ..
+ COMPLEX*16 ZX(*),ZY(*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ DOUBLE PRECISION DCABS1
+ EXTERNAL DCABS1
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC CONJG
+* .. Local Scalars ..
+ INTEGER I,IX,IY
+* ..
+ IF (N.LE.0) RETURN
+ IF (DCABS1(ZA).EQ.0.0d0) RETURN
+ IF (INCX.EQ.1 .AND. INCY.EQ.1) THEN
+*
+* code for both increments equal to 1
+*
+ DO I = 1,N
+ ZY(I) = ZY(I) + ZA*CONJG(ZX(I))
+ END DO
+ ELSE
+*
+* code for unequal increments or equal increments
+* not equal to 1
+*
+ IX = 1
+ IY = 1
+ IF (INCX.LT.0) IX = (-N+1)*INCX + 1
+ IF (INCY.LT.0) IY = (-N+1)*INCY + 1
+ DO I = 1,N
+ ZY(IY) = ZY(IY) + ZA*CONJG(ZX(IX))
+ IX = IX + INCX
+ IY = IY + INCY
+ END DO
+ END IF
+*
+ RETURN
+*
+* End of ZACXPY
+*
+ END
+
diff --git a/SRC/zlarfb0c2.f b/SRC/zlarfb0c2.f
new file mode 100644
index 0000000000..a73b693901
--- /dev/null
+++ b/SRC/zlarfb0c2.f
@@ -0,0 +1,564 @@
+*> \brief \b ZLARFB0C2 applies a block reflector or its conjugate-transpose
+* to a rectangular matrix with a 0 block while constructing the explicit Q
+* factor
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+* $ K, V, LDV, T, LDT, C, LDC)
+* ! Scalar arguments
+* INTEGER M, N, K, LDV, LDC, LDT
+* CHARACTER SIDE, TRANS, DIRECT, STOREV
+* ! True means that we are assuming C2 is the identity matrix
+* ! and thus don't reference whatever is present in C2
+* ! at the beginning.
+* LOGICAL C2I
+* ! Array arguments
+* COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*)
+*
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZLARFB0C2 applies a complex block reflector H or its conjugate transpose
+*> H**H to a complex m by n matrix C with a 0 block, while computing the explicit Q factor
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] C2I
+*> \verbatim
+*> C2I is LOGICAL
+*> = .TRUE.: Assume the nonzero block of C is the identity matrix
+*> = .FALSE.: Use existing data in the nonzero block of C
+*> \endverbatim
+*>
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> = 'L': apply H or H**H from the Left
+*> = 'R': apply H or H**H from the Right
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*> TRANS is CHARACTER*1
+*> = 'N': apply H (No transpose)
+*> = 'C': apply H**H (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] DIRECT
+*> \verbatim
+*> DIRECT is CHARACTER*1
+*> Indicates how H is formed from a product of elementary
+*> reflectors
+*> = 'F': H = H(1) H(2) . . . H(k) (Forward)
+*> = 'B': H = H(k) . . . H(2) H(1) (Backward)
+*> \endverbatim
+*>
+*> \param[in] STOREV
+*> \verbatim
+*> STOREV is CHARACTER*1
+*> Indicates how the vectors which define the elementary
+*> reflectors are stored:
+*> = 'C': Columnwise
+*> = 'R': Rowwise
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix C.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*> K is INTEGER
+*> The order of the matrix T (= the number of elementary
+*> reflectors whose product defines the block reflector).
+*> If SIDE = 'L', M >= K >= 0;
+*> if SIDE = 'R', N >= K >= 0.
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is COMPLEX*16 array, dimension
+*> (LDV,K) if STOREV = 'C'
+*> (LDV,M) if STOREV = 'R' and SIDE = 'L'
+*> (LDV,N) if STOREV = 'R' and SIDE = 'R'
+*> See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*> The leading dimension of the array V.
+*> If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M);
+*> if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N);
+*> if STOREV = 'R', LDV >= K.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is COMPLEX*16 array, dimension (LDT,K)
+*> The triangular K-by-K matrix T in the representation of the
+*> block reflector.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is COMPLEX*16 array, dimension (LDC,N)
+*> On entry, the M-by-N matrix C.
+*> On exit, C is overwritten by H*C or H**H*C or C*H or C*H**H.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> The leading dimension of the array C. LDC >= max(1,M).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup larfb
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The shape of the matrix V and the storage of the vectors which define
+*> the H(i) is best illustrated by the following example with n = 5 and
+*> k = 3. The triangular part of V (including its diagonal) is not
+*> referenced.
+*>
+*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R':
+*>
+*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 )
+*> ( v1 1 ) ( 1 v2 v2 v2 )
+*> ( v1 v2 1 ) ( 1 v3 v3 )
+*> ( v1 v2 v3 )
+*> ( v1 v2 v3 )
+*>
+*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R':
+*>
+*> V = ( v1 v2 v3 ) V = ( v1 v1 1 )
+*> ( v1 v2 v3 ) ( v2 v2 v2 1 )
+*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 )
+*> ( 1 v3 )
+*> ( 1 )
+*> \endverbatim
+*>
+* =====================================================================
+ SUBROUTINE ZLARFB0C2(C2I, SIDE, TRANS, DIRECT, STOREV, M, N,
+ $ K, V, LDV, T, LDT, C, LDC)
+ ! Scalar arguments
+ INTEGER M, N, K, LDV, LDC, LDT
+ CHARACTER SIDE, TRANS, DIRECT, STOREV
+ ! True means that we are assuming C2 is the identity matrix
+ ! and thus don't reference whatever is present in C2
+ ! at the beginning.
+ LOGICAL C2I
+ ! Array arguments
+ COMPLEX*16 V(LDV,*), C(LDC,*), T(LDT,*)
+ ! Local scalars
+ LOGICAL QR, LQ, QL, DIRF, COLV, SIDEL, SIDER,
+ $ TRANST
+ INTEGER I, J
+ ! Intrinsic Functions
+ INTRINSIC CONJG
+ ! External functions
+ LOGICAL LSAME
+ EXTERNAL LSAME
+ ! External Subroutines
+ EXTERNAL ZGEMM, ZTRMM
+ ! Parameters
+ COMPLEX*16 ONE, ZERO, NEG_ONE
+ PARAMETER(ONE=(1.0D+0, 0.0D+0),
+ $ ZERO = (0.0D+0, 0.0D+0),
+ $ NEG_ONE = (-1.0D+0, 0.0D+0))
+
+ ! Beginning of executable statements
+ ! Convert our character flags to logical values
+ DIRF = LSAME(DIRECT,'F')
+ COLV = LSAME(STOREV,'C')
+ SIDEL = LSAME(SIDE,'L')
+ SIDER = LSAME(SIDE,'R')
+ TRANST = LSAME(TRANS,'C')
+
+ ! Determine which of the 4 modes are using.
+ ! QR is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the first column
+ QR = DIRF.AND.COLV
+
+ ! LQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the first row
+ LQ = DIRF.AND.(.NOT.COLV)
+
+ ! QL is when we store the reflectors column by column and have the
+ ! 'first' reflector stored in the last column
+ QL = (.NOT.DIRF).AND.COLV
+
+ ! RQ is when we store the reflectors row by row and have the
+ ! 'first' reflector stored in the last row
+ ! RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+ ! Since we have exactly one of these 4 modes, we don't need to actually
+ ! store the value of RQ, instead we assume this is the case if we fail
+ ! the above 3 checks.
+
+ IF (QR) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V1 ] and C = [ C1 ]
+ ! [ V2 ] [ C2 ]
+ ! with the following dimensions:
+ ! V1\in\C^{K\times K}
+ ! V2\in\C^{M-K\times K}
+ ! C1=0\in\C^{K\times N}
+ ! C2\in\C^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I - [ V1 ]T [V1', V2'])[ 0 ]
+ ! [ V2 ] [ C2 ]
+ ! = [ 0 ] - [ V1 ]T*V2'*C2
+ ! [ C2 ] [ V2 ]
+ !
+ ! = [ 0 ] - [ V1*T*V2'*C2 ]
+ ! [ C2 ] [ V2*T*V2'*C2 ]
+ !
+ ! = [ V1*T*V2'*C2 ]
+ ! [ C2 - V2*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('ZLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('ZLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF (C2I) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(I,J) = CONJG(V(K+J,I))
+ END DO
+ END DO
+ ELSE
+ CALL ZGEMM('Conjugate', 'No Transpose', K, N, M - K,
+ $ ONE, V(K+1,1), LDV, C(K+1,1), LDC, ZERO,
+ $ C, LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C, LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF (C2I) THEN
+ CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ZERO,
+ $ C(K+1,1), LDC)
+ DO I = 1, N
+ C(K+I,I) = C(K+I,I) + ONE
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V(K+1,1), LDV, C, LDC, ONE,
+ $ C(K+1,1), LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (LQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V1 V2 ] and C = [ C1 C2 ]
+ ! with the following dimensions:
+ ! V1\in\C^{K\times K}
+ ! V2\in\C^{K\times N-K}
+ ! C1=0\in\C^{M\times K}
+ ! C2\in\C^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ 0, C2 ](I - [ V1' ]T'[ V1, V2 ])
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - [ 0, C2 ][ V1' ]T'[ V1, V2 ]
+ ! [ V2' ]
+ !
+ ! = [ 0, C2 ] - C2*V2'*T'[ V1, V2 ]
+ !
+ ! = [ -C2*V2'*T'*V1, C2 - C2*V2'*T'*V2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('ZLARFB0C2', 2)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,J) = CONJG(V(J,K+I))
+ END DO
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K,
+ $ ONE, C(1,K+1), LDC, V(1, K+1), LDV, ZERO, C,
+ $ LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ IF( TRANST ) THEN
+ CALL ZTRMM('Right', 'Upper', 'Conjugate', 'Non-unit',
+ $ M, K, ONE, T, LDT, C, LDC)
+ ELSE
+ CALL ZTRMM('Right', 'Lower', 'No Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C, LDC)
+ END IF
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ZERO, C(1,K+1),
+ $ LDC)
+ DO I = 1, M
+ C(I,K+I) = C(I,K+I) + ONE
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C, LDC, V(1,K+1), LDV, ONE, C(1,K+1),
+ $ LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V, LDV, C, LDC)
+ ELSE IF (QL) THEN
+ ! We are computing C = HC = (I - VTV')C
+ ! Where: V = [ V2 ] and C = [ C2 ]
+ ! [ V1 ] [ C1 ]
+ ! with the following dimensions:
+ ! V1\in\C^{K\times K}
+ ! V2\in\C^{M-K\times K}
+ ! C1=0\in\C^{K\times N}
+ ! C2\in\C^{M-K\times N}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = HC = (I-[ V2 ]T[ V2' V1' ])[ C2 ]
+ ! [ V1 ] [ 0 ]
+ !
+ ! = [ C2 ] - [ V2 ]T*V2'*C2
+ ! [ 0 ] [ V1 ]
+ !
+ ! = [ C2 ] - [ V2*T*V2'*C2 ]
+ ! [ 0 ] [ V1*T*V2'*C2 ]
+ !
+ ! = [ C2 - V2*T*V2'*C2 ]
+ ! [ - V1*T*V2'*C2 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = V2'*C2
+ ! C1 = T*C1
+ ! C2 = C2 - V2*C1
+ ! C1 = -V1*C1
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDEL ) THEN
+ CALL XERBLA('ZLARFB0C2', 2)
+ RETURN
+ ELSE IF(TRANST) THEN
+ CALL XERBLA('ZLARFB0C2', 3)
+ RETURN
+ END IF
+ !
+ ! C1 = V2'*C2
+ !
+ IF( C2I ) THEN
+ DO J = 1, N
+ DO I = 1, K
+ C(M-K+I,J) = CONJG(V(J,I))
+ END DO
+ END DO
+ ELSE
+ CALL ZGEMM('Conjugate', 'No Transpose', K, N, M-K,
+ $ ONE, V, LDV, C, LDC, ZERO, C(M-K+1, 1), LDC)
+ END IF
+ !
+ ! C1 = T*C1
+ !
+ CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K, N, ONE, T, LDT, C(M-K+1,1), LDC)
+ !
+ ! C2 = C2 - V2*C1 = -V2*C1 + C2
+ !
+ IF( C2I ) THEN
+ CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ZERO, C, LDC)
+ DO I = 1, N
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'No Transpose', M-K, N, K,
+ $ NEG_ONE, V, LDV, C(M-K+1,1), LDC, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -V1*C1
+ !
+ CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Unit',
+ $ K, N, NEG_ONE, V(M-K+1,1), LDV, C(M-K+1,1), LDC)
+ ELSE ! IF (RQ) THEN
+ ! We are computing C = CH' = C(I-V'T'V)
+ ! Where: V = [ V2 V1] and C = [ C2 C1 ]
+ ! with the following dimensions:
+ ! V1\in\C^{K\times K}
+ ! V2\in\C^{K\times N-K}
+ ! C1=0\in\C^{M\times K}
+ ! C2\in\C^{M\times N-K}
+ ! Since we are assuming that C1 is a zero matrix and it will be
+ ! overwritten on exit, we can use this spot as a temporary workspace
+ ! without having to allocate anything extra.
+ ! This lets us simplify our above equation to get
+ !
+ ! C = CH' = [ C2, 0 ] (I - [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - [ C2, 0 ] [ V2' ]T'[ V2, V1 ]
+ ! [ V1' ]
+ !
+ ! = [ C2, 0 ] - C2*V2'*T'[ V2, V1 ]
+ !
+ ! = [ C2, 0 ] - [ C2*V2'*T'*V2, C2*V2'*T'*V1 ]
+ !
+ ! = [ C2 - C2*V2'*T'*V2, -C2*V2'*T'*V1 ]
+ !
+ ! So, we can order our computations as follows:
+ !
+ ! C1 = C2*V2'
+ ! C1 = C1*T'
+ ! C2 = C2 - C1*V2
+ ! C1 = -C1*V1
+ !
+ !
+ ! To achieve the same end result
+ !
+ ! Check to ensure side and trans are the expected values
+ !
+ IF( .NOT.SIDER ) THEN
+ CALL XERBLA('ZLARFB0C2', 2)
+ RETURN
+ END IF
+ !
+ ! C1 = C2*V2'
+ !
+ IF( C2I ) THEN
+ DO J = 1, K
+ DO I = 1, M
+ C(I,N-K+J) = CONJG(V(J,I))
+ END DO
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'Conjugate', M, K, N-K,
+ $ ONE, C, LDC, V, LDV, ZERO, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C1 = C1*T'
+ !
+ IF( TRANST ) THEN
+ CALL ZTRMM('Right', 'Lower', 'Conjugate', 'Non-unit',
+ $ M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ ELSE
+ CALL ZTRMM('Right', 'Upper', 'No Transpose',
+ $ 'Non-unit', M, K, ONE, T, LDT, C(1, N-K+1), LDC)
+ END IF
+ !
+ ! C2 = C2 - C1*V2 = -C1*V2 + C2
+ !
+ IF( C2I ) THEN
+ CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ZERO, C, LDC)
+ DO I = 1, M
+ C(I,I) = C(I,I) + ONE
+ END DO
+ ELSE
+ CALL ZGEMM('No Transpose', 'No Transpose', M, N-K, K,
+ $ NEG_ONE, C(1, N-K+1), LDC, V, LDV, ONE, C, LDC)
+ END IF
+ !
+ ! C1 = -C1*V1
+ !
+ CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Unit',
+ $ M, K, NEG_ONE, V(1, N-K+1), LDV, C(1,N-K+1), LDC)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/zlarft.f b/SRC/zlarft.f
index 10e90d3346..47ed80fa60 100644
--- a/SRC/zlarft.f
+++ b/SRC/zlarft.f
@@ -6,11 +6,11 @@
* http://www.netlib.org/lapack/explore-html/
*
*> Download ZLARFT + dependencies
-*>
+*>
*> [TGZ]
-*>
+*>
*> [ZIP]
-*>
+*>
*> [TXT]
*
* Definition:
@@ -23,7 +23,7 @@
* INTEGER K, LDT, LDV, N
* ..
* .. Array Arguments ..
-* COMPLEX*16 T( LDT, * ), TAU( * ), V( LDV, * )
+* COMPLEX*16 T( LDT, * ), TAU( * ), V( LDV, * )
* ..
*
*
@@ -32,7 +32,7 @@
*>
*> \verbatim
*>
-*> ZLARFT forms the triangular factor T of a complex block reflector H
+*> ZLARFT forms the triangular factor T of a complex block reflector H
*> of order n, which is defined as a product of k elementary reflectors.
*>
*> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
@@ -42,12 +42,12 @@
*> If STOREV = 'C', the vector which defines the elementary reflector
*> H(i) is stored in the i-th column of the array V, and
*>
-*> H = I - V * T * V**H
+*>    H = I - V * T * V**H
*>
*> If STOREV = 'R', the vector which defines the elementary reflector
*> H(i) is stored in the i-th row of the array V, and
*>
-*> H = I - V**H * T * V
+*>    H = I - V**H * T * V
*> \endverbatim
*
* Arguments:
@@ -166,23 +166,25 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* .. Scalar Arguments
*
- CHARACTER DIRECT, STOREV
- INTEGER K, LDT, LDV, N
+ CHARACTER DIRECT, STOREV
+ INTEGER K, LDT, LDV, N
* ..
* .. Array Arguments ..
*
- COMPLEX*16 T( LDT, * ), TAU( * ), V( LDV, * )
+ COMPLEX*16 T( LDT, * ), TAU( * ), V( LDV, * )
* ..
*
* .. Parameters ..
*
- COMPLEX*16 ONE, NEG_ONE, ZERO
- PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE=-1.0D+0)
+ COMPLEX*16 ONE, NEG_ONE, ZERO
+ PARAMETER(ONE=(1.0D+0,0.0D+0),
+ $ ZERO = (0.0D+0,0.0D+0),
+ $ NEG_ONE=(-1.0D+0,0.0D+0))
*
* .. Local Scalars ..
*
INTEGER I,J,L
- LOGICAL QR,LQ,QL,DIRF,COLV
+ LOGICAL QR,LQ,QL,RQ,LQT,RQT,DIRF,COLV,TDIRF,TCOLV
*
* .. External Subroutines ..
*
@@ -196,7 +198,7 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* .. Intrinsic Functions..
*
INTRINSIC CONJG
-*
+*
* The general scheme used is inspired by the approach inside DGEQRT3
* which was (at the time of writing this code):
* Based on the algorithm of Elmroth and Gustavson,
@@ -210,13 +212,6 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
RETURN
END IF
*
-* Base case
-*
- IF(N.EQ.1.OR.K.EQ.1) THEN
- T(1,1) = TAU(1)
- RETURN
- END IF
-*
* Beginning of executable statements
*
L = K / 2
@@ -227,26 +222,48 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* 'C' for STOREV, then they meant to provide 'R'
*
DIRF = LSAME(DIRECT,'F')
+ TDIRF = LSAME(DIRECT,'T')
COLV = LSAME(STOREV,'C')
+ TCOLV = LSAME(STOREV,'T')
*
* QR happens when we have forward direction in column storage
*
QR = DIRF.AND.COLV
*
-* LQ happens when we have forward direction in row storage
+* LQT happens when we have forward direction in row storage and want to compute the transpose of
+* the T we would normally compute
+*
+ LQT = DIRF.AND.TCOLV
+*
+* LQ happens when we have forward direction in row storage and want to compute the T we would
+* normally compute
*
- LQ = DIRF.AND.(.NOT.COLV)
+ LQ = DIRF.AND.(.NOT.LQT)
*
* QL happens when we have backward direction in column storage
*
QL = (.NOT.DIRF).AND.COLV
*
-* The last case is RQ. Due to how we structured this, if the
-* above 3 are false, then RQ must be true, so we never store
-* this
-* RQ happens when we have backward direction in row storage
-* RQ = (.NOT.DIRF).AND.(.NOT.COLV)
+* RQT happens when we have backward direction in row storage and want to compute the transpose
+* of the T we would normally compute
+*
+ RQT = TDIRF.AND.(.NOT.COLV)
+*
+* RQ happens when we have backward direction in row storage and want to compute the T that we
+* would normally compute
+*
+ RQ = (.NOT.RQT).AND.(.NOT.COLV)
*
+* Base case
+*
+ IF(N.EQ.1.OR.K.EQ.1) THEN
+ IF( RQT.OR.LQT ) THEN
+ T(1,1) = CONJG(TAU(1))
+ ELSE
+ T(1,1) = TAU(1)
+ END IF
+ RETURN
+ END IF
IF(QR) THEN
*
* Break V apart into 6 components
@@ -260,17 +277,17 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{1,1}\in\C^{l,l} unit lower triangular
* V_{2,1}\in\C^{k-l,l} rectangular
* V_{3,1}\in\C^{n-k,l} rectangular
-*
+*
* V_{2,2}\in\C^{k-l,k-l} unit lower triangular
* V_{3,2}\in\C^{n-k,k-l} rectangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} T_{1,2}|
* |0 T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -281,17 +298,17 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
+*
* (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2')
* = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2'
-*
+*
* Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |-------|
* |V_1 V_2|
* |-------|
-*
+*
* So, our product is equivalent to the matrix product
* I - V*T*V'
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
@@ -303,30 +320,29 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{2,2} recursively
*
- CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV,
- $ TAU(L+1), T(L+1, L+1), LDT)
+ CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
*
-* Compute T_{1,2}
+* Compute T_{1,2}
* T_{1,2} = V_{2,1}'
*
DO J = 1, L
DO I = 1, K-L
- T(J, L+I) = CONJG(V(L+I, J))
+ T(J,L+I) = CONJG(V(L+I,J))
END DO
END DO
*
* T_{1,2} = T_{1,2}*V_{2,2}
*
CALL ZTRMM('Right', 'Lower', 'No transpose', 'Unit', L,
- $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
+ $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
*
* T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
- CALL ZGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE,
- $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE,
- $ T(1, L+1), LDT)
+ CALL ZGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE,
+ $ V(K+1, 1), LDV, V(K+1,L+1), LDV, ONE, T(1, L+1), LDT)
*
* At this point, we have that T_{1,2} = V_1'*V_2
* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
@@ -335,12 +351,12 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{1,2} = -T_{1,1}*T_{1,2}
*
CALL ZTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
+ $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
*
* T_{1,2} = T_{1,2}*T_{2,2}
*
- CALL ZTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT)
+ CALL ZTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
+ $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT)
ELSE IF(LQ) THEN
*
@@ -354,41 +370,41 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{1,1}\in\C^{l,l} unit upper triangular
* V_{1,2}\in\C^{l,k-l} rectangular
* V_{1,3}\in\C^{l,n-k} rectangular
-*
+*
* V_{2,2}\in\C^{k-l,k-l} unit upper triangular
* V_{2,3}\in\C^{k-l,n-k} rectangular
*
* Where l = floor(k/2)
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} T_{1,2}|
* |0 T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
-* T_{1,1}\in\C^{l, l} upper triangular
-* T_{2,2}\in\C^{k-l, k-l} upper triangular
-* T_{1,2}\in\C^{l, k-l} rectangular
+* T_{1,1}\in\C^{l, l} upper triangular
+* T_{2,2}\in\C^{k-l, k-l} upper triangular
+* T_{1,2}\in\C^{l, k-l} rectangular
*
* Then, consider the product:
-*
-* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2)
-* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2
-*
+*
+* (I - V_1'*T_{1,1}'*V_1)*(I - V_2'*T_{2,2}'*V_2)
+* = I - V_1'*T_{1,1}'*V_1 - V_2'*T_{2,2}'*V_2 + V_1'*T_{1,1}'*V_1*V_2'*T_{2,2}'*V_2
+*
* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |---|
* |V_1|
* |V_2|
* |---|
-*
+*
* So, our product is equivalent to the matrix product
-* I - V'*T*V
+* I - V'*T'*V
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
* to compute T_{1,2}
*
@@ -398,27 +414,26 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{2,2} recursively
*
- CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV,
- $ TAU(L+1), T(L+1, L+1), LDT)
+ CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
*
* Compute T_{1,2}
* T_{1,2} = V_{1,2}
*
- CALL ZLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT)
+ CALL ZLACPY('All', L, K - L, V(1,L+1), LDV, T(1, L+1), LDT)
*
* T_{1,2} = T_{1,2}*V_{2,2}'
*
CALL ZTRMM('Right', 'Upper', 'Conjugate', 'Unit', L, K-L,
- $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
+ $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT)
*
* T_{1,2} = V_{1,3}*V_{2,3}' + T_{1,2}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
CALL ZGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE,
- $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE,
- $ T(1, L+1), LDT)
+ $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, T(1, L+1), LDT)
*
* At this point, we have that T_{1,2} = V_1*V_2'
* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
@@ -427,13 +442,106 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{1,2} = -T_{1,1}*T_{1,2}
*
CALL ZTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
+ $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT)
*
* T_{1,2} = T_{1,2}*T_{2,2}
*
CALL ZTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT)
+ $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT)
+ ELSE IF(LQT) THEN
+*
+* Break V apart into 6 components
+*
+* V = |----------------------|
+* |V_{1,1} V_{1,2} V{1,3}|
+* |0 V_{2,2} V{2,3}|
+* |----------------------|
+*
+* V_{1,1}\in\C^{l,l} unit upper triangular
+* V_{1,2}\in\C^{l,k-l} rectangular
+* V_{1,3}\in\C^{l,n-k} rectangular
+*
+* V_{2,2}\in\C^{k-l,k-l} unit upper triangular
+* V_{2,3}\in\C^{k-l,n-k} rectangular
+*
+* Where l = floor(k/2)
+*
+* We will construct the T matrix
+* T = |---------------|
+* |T_{1,1} 0 |
+* |T_{2,1} T_{2,2}|
+* |---------------|
+*
+* T is the triangular factor obtained from block reflectors.
+* To motivate the structure, assume we have already computed T_{1,1}
+* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
+*
+* T_{1,1}\in\C^{l, l} lower triangular
+* T_{2,2}\in\C^{k-l, k-l} lower triangular
+* T_{2,1}\in\C^{k-l, l} rectangular
+*
+* Then, consider the product:
+*
+* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2)
+* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2
+*
+* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1}
+*
+* Then, we can define the matrix V as
+* V = |---|
+* |V_1|
+* |V_2|
+* |---|
+*
+* So, our product is equivalent to the matrix product
+* I - V'*T*V
+* This means, we can compute T_{1,1} and T_{2,2}, then use this information
+* to compute T_{2,1}
+*
+* Compute T_{1,1} recursively
+*
+ CALL ZLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT)
+*
+* Compute T_{2,2} recursively
+*
+ CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1,L+1), LDV,
+ $ TAU(L+1), T(L+1,L+1), LDT)
+
+*
+* Compute T_{2,1}
+* T_{2,1} = V_{1,2}'
+*
+ DO I = 1, K-L
+ DO J = 1, L
+ T(L+I,J) = CONJG(V(J,L+I))
+ END DO
+ END DO
+*
+* T_{2,1} = V_{2,2}*T_{2,1}
+*
+ CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Unit', K-L, L,
+ $ ONE, V(L+1,L+1), LDV, T(L+1,1), LDT)
+*
+* T_{2,1} = V_{2,3}*V_{1,3}' + T_{2,1}
+* Note: We assume K <= N, and GEMM will do nothing if N=K
+*
+ CALL ZGEMM('No Transpose', 'Conjugate', K-L, L, N-K, ONE,
+ $ V(L+1,K+1), LDV, V(1, K+1), LDV, ONE, T(L+1,1), LDT)
+*
+* At this point, we have that T_{2,1} = V_2*V_1'
+* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
+* respectively.
+*
+* T_{2,1} = -T_{2,2}*T_{2,1}
+*
+ CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+ $ K-L, L, NEG_ONE, T(L+1,L+1), LDT, T(L+1,1), LDT)
+*
+* T_{2,1} = T_{2,1}*T_{1,1}
+*
+ CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Non-unit',
+ $ K-L, L, ONE, T, LDT, T(L+1,1), LDT)
ELSE IF(QL) THEN
*
* Break V apart into 6 components
@@ -446,18 +554,18 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* V_{1,1}\in\C^{n-k,k-l} rectangular
* V_{2,1}\in\C^{k-l,k-l} unit upper triangular
-*
+*
* V_{1,2}\in\C^{n-k,l} rectangular
* V_{2,2}\in\C^{k-l,l} rectangular
* V_{3,2}\in\C^{l,l} unit upper triangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} 0 |
* |T_{2,1} T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -468,17 +576,17 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
+*
* (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1')
* = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1'
-*
+*
* Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |-------|
* |V_1 V_2|
* |-------|
-*
+*
* So, our product is equivalent to the matrix product
* I - V*T*V'
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
@@ -486,34 +594,34 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
*
* Compute T_{1,1} recursively
*
- CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+ CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
*
* Compute T_{2,2} recursively
*
CALL ZLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV,
- $ TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
*
* Compute T_{2,1}
* T_{2,1} = V_{2,2}'
*
DO J = 1, K-L
DO I = 1, L
- T(K-L+I, J) = CONJG(V(N-K+J, K-L+I))
+ T(K-L+I,J) = CONJG(V(N-K+J, K-L+I))
END DO
END DO
*
* T_{2,1} = T_{2,1}*V_{2,1}
*
CALL ZTRMM('Right', 'Upper', 'No transpose', 'Unit', L,
- $ K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT)
+ $ K-L, ONE, V(N-K+1,1), LDV, T(K-L+1,1), LDT)
*
* T_{2,1} = V_{2,2}'*V_{2,1} + T_{2,1}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
CALL ZGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE,
- $ V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1),
- $ LDT)
+ $ V(1,K-L+1), LDV, V, LDV, ONE, T(K-L+1,1), LDT)
*
* At this point, we have that T_{2,1} = V_2'*V_1
* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
@@ -522,17 +630,13 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{2,1} = -T_{2,2}*T_{2,1}
*
CALL ZTRMM('Left', 'Lower', 'No transpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
- $ T(K-L+1, 1), LDT)
+ $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT)
*
* T_{2,1} = T_{2,1}*T_{1,1}
*
CALL ZTRMM('Right', 'Lower', 'No transpose', 'Non-unit', L,
- $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
- ELSE
-*
-* Else means RQ case
-*
+ $ K-L, ONE, T, LDT, T(K-L+1,1), LDT)
+ ELSE IF(RQ) THEN
* Break V apart into 6 components
*
* V = |-----------------------|
@@ -547,13 +651,13 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* V_{2,2}\in\C^{l,k-l} rectangular
* V_{2,3}\in\C^{l,l} unit lower triangular
*
-* We will construct the T matrix
+* We will construct the T matrix
* T = |---------------|
* |T_{1,1} 0 |
* |T_{2,1} T_{2,2}|
* |---------------|
*
-* T is the triangular factor obtained from block reflectors.
+* T is the triangular factor obtained from block reflectors.
* To motivate the structure, assume we have already computed T_{1,1}
* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
*
@@ -564,51 +668,51 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* Where l = floor(k/2)
*
* Then, consider the product:
-*
-* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1)
-* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1
-*
+*
+* (I - V_2'*T_{2,2}'*V_2)*(I - V_1'*T_{1,1}'*V_1)
+* = I - V_2'*T_{2,2}'*V_2 - V_1'*T_{1,1}'*V_1 + V_2'*T_{2,2}'*V_2*V_1'*T_{1,1}'*V_1
+*
* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1}
-*
-* Then, we can define the matrix V as
+*
+* Then, we can define the matrix V as
* V = |---|
* |V_1|
* |V_2|
* |---|
-*
+*
* So, our product is equivalent to the matrix product
-* I - V'*T*V
+* I - V'*T'*V
* This means, we can compute T_{1,1} and T_{2,2}, then use this information
* to compute T_{2,1}
*
* Compute T_{1,1} recursively
*
- CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+ CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
*
* Compute T_{2,2} recursively
*
- CALL ZLARFT(DIRECT, STOREV, N, L, V(K-L+1, 1), LDV,
- $ TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+ CALL ZLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV,
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
*
* Compute T_{2,1}
* T_{2,1} = V_{2,2}
*
- CALL ZLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV,
- $ T(K-L+1, 1), LDT)
+ CALL ZLACPY('All', L, K-L, V(K-L+1,N-K+1), LDV, T(K-L+1,1),
+ $ LDT)
*
* T_{2,1} = T_{2,1}*V_{1,2}'
*
CALL ZTRMM('Right', 'Lower', 'Conjugate', 'Unit', L, K-L,
- $ ONE, V(1, N-K+1), LDV, T(K-L+1, 1), LDT)
+ $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT)
*
-* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1}
+* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1}
* Note: We assume K <= N, and GEMM will do nothing if N=K
*
- CALL ZGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE,
- $ V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1),
- $ LDT)
+ CALL ZGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE,
+ $ V(K-L+1,1), LDV, V, LDV, ONE, T(K-L+1,1), LDT)
*
* At this point, we have that T_{2,1} = V_2*V_1'
@@ -618,13 +722,103 @@ RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
* T_{2,1} = -T_{2,2}*T_{2,1}
*
CALL ZTRMM('Left', 'Lower', 'No tranpose', 'Non-unit', L,
- $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
- $ T(K-L+1, 1), LDT)
+ $ K-L, NEG_ONE, T(K-L+1,K-L+1), LDT, T(K-L+1,1), LDT)
*
* T_{2,1} = T_{2,1}*T_{1,1}
*
CALL ZTRMM('Right', 'Lower', 'No tranpose', 'Non-unit', L,
- $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
+ $ K-L, ONE, T, LDT, T(K-L+1,1), LDT)
+ ELSE IF(RQT) THEN
+* Break V apart into 6 components
+*
+* V = |-----------------------|
+* |V_{1,1} V_{1,2} 0 |
+* |V_{2,1} V_{2,2} V_{2,3}|
+* |-----------------------|
+*
+* V_{1,1}\in\C^{k-l,n-k} rectangular
+* V_{1,2}\in\C^{k-l,k-l} unit lower triangular
+*
+* V_{2,1}\in\C^{l,n-k} rectangular
+* V_{2,2}\in\C^{l,k-l} rectangular
+* V_{2,3}\in\C^{l,l} unit lower triangular
+*
+* We will construct the T matrix
+* T = |---------------|
+* |T_{1,1} T_{1,2}|
+* | 0 T_{2,2}|
+* |---------------|
+*
+* T is the triangular factor obtained from block reflectors.
+* To motivate the structure, assume we have already computed T_{1,1}
+* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2
+*
+* T_{1,1}\in\C^{k-l, k-l} non-unit upper triangular
+* T_{2,2}\in\C^{l, l} non-unit upper triangular
+* T_{1,2}\in\C^{k-l, l} rectangular
+*
+* Where l = floor(k/2)
+*
+* Then, consider the product:
+*
+* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1)
+* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1
+*
+* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2}
+*
+* Then, we can define the matrix V as
+* V = |---|
+* |V_1|
+* |V_2|
+* |---|
+*
+* So, our product is equivalent to the matrix product
+* I - V'*T*V
+* This means, we can compute T_{1,1} and T_{2,2}, then use this information
+* to compute T_{1,2}
+*
+* Compute T_{1,1} recursively
+*
+ CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T,
+ $ LDT)
+*
+* Compute T_{2,2} recursively
+*
+ CALL ZLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV,
+ $ TAU(K-L+1), T(K-L+1,K-L+1), LDT)
+*
+* Compute T_{1,2}
+* T_{1,2} = V_{2,2}'
+*
+ DO I = 1, K-L
+ DO J = 1, L
+ T(I,K-L+J) = CONJG(V(K-L+J, N-K+I))
+ END DO
+ END DO
+*
+* T_{1,2} = V_{1,2}T_{1,2}
+*
+ CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Unit', K-L, L,
+ $ ONE, V(1,N-K+1), LDV, T(1,K-L+1), LDT)
+*
+* T_{1,2} = V_{1,1}V_{2,1}' + T_{1,2}
+*
+      CALL ZGEMM('No Transpose', 'Conjugate', K-L, L, N-K, ONE, V,
+ $ LDV, V(K-L+1,1), LDV, ONE, T(1, K-L+1), LDT)
+*
+* At this point, we have that T_{1,2} = V_1*V_2'
+* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2}
+* respectively.
+*
+* T_{1,2} = -T_{1,1}*T_{1,2}
+*
+ CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit',
+ $ K-L, L, NEG_ONE, T, LDT, T(1, K-L+1), LDT)
+*
+* T_{1,2} = T_{1,2}*T_{2,2}
+*
+ CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Non-Unit',
+ $ K-L, L, ONE, T(K-L+1,K-L+1), LDT, T(1, K-L+1), LDT)
END IF
END SUBROUTINE
diff --git a/SRC/zlumm.f b/SRC/zlumm.f
new file mode 100644
index 0000000000..f6d1fdb9e6
--- /dev/null
+++ b/SRC/zlumm.f
@@ -0,0 +1,349 @@
+*> \brief \b ZLUMM computes an in place triangular times triangular matrix multiplication
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE ZLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA,
+* $ A, LDA)
+*
+* .. Scalar Arguments ..
+* INTEGER N, LDA
+* CHARACTER SIDEL, DIAGL, DIAGU
+* COMPLEX*16 ALPHA
+*
+* .. Array Arguments ..
+* COMPLEX*16 A(LDA,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZLUMM performs one of the matrix-matrix operations
+*>
+*> C = \alpha L * U
+*> or
+*> C = \alpha U * L
+*>
+*> where \alpha is a scalar, L is a unit, or non-unit, lower triangular matrix, and U is a unit, or
+*> non-unit, upper triangular matrix, and at most one of L and U are non-unit
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDEL
+*> \verbatim
+*> SIDEL is CHARACTER*1
+*> On entry, SIDE specifies whether L multiplies U from
+*> the left or right as follows:
+*>
+*> SIDE = 'L' or 'l' A = \alpha L * U
+*>
+*> SIDE = 'R' or 'r' A = \alpha U * L
+*> \endverbatim
+*>
+*> \param[in] DIAGL
+*> \verbatim
+*> DIAGL is CHARACTER*1
+*> On entry, DIAGL specifies whether or not L is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' L is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' L is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] DIAGU
+*> \verbatim
+*> DIAGU is CHARACTER*1
+*> On entry, DIAGU specifies whether or not U is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' U is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' U is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          On entry, N specifies the number of rows and columns of L and U. N must be at
+*> least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is COMPLEX*16 .
+*> On entry, ALPHA specifies the scalar alpha. When alpha is
+*>          zero then the input contents of A are not referenced, and
+*>          A is set to zero on exit.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*> A is COMPLEX*16 array, dimension ( LDA, N ) where
+*> Before entry the leading n-by-n strictly upper triangular part of the array
+*> A must contain the upper triangular matrix U and the strictly lower triangular part of
+*> the leading n-by-n submatrix must contain the lower triangular matrix L.
+*> If DIAGL != 'U', then the diagonal is assumed to be part of L, and if
+*> DIAGU != 'U', then the diagonal is assumed to be part of U.
+*> Note: At most one of DIAGL and DIAGU can be not equal to 'U'.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> On entry, LDA specifies the first dimension of A as declared
+*> in the calling (sub) program. LDA must be at least max( 1, n ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+* Cost: 2/3 * (n^3 - n)
+ RECURSIVE SUBROUTINE ZLUMM(SIDEL, DIAGL, DIAGU, N, ALPHA,
+ $ A, LDA)
+*
+* .. Scalar Arguments ..
+ INTEGER N, LDA
+ CHARACTER SIDEL, DIAGL, DIAGU
+ COMPLEX*16 ALPHA
+*
+* .. Array Arguments ..
+ COMPLEX*16 A(LDA,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ EXTERNAL LSAME
+* ..
+* .. External Subroutines ..
+ EXTERNAL ZGEMM, ZTRMM, ZLASET, XERBLA
+* ..
+* .. Local Scalars ..
+ INTEGER K
+ LOGICAL LLEFT, LUNIT, UUNIT
+* ..
+* .. Local Parameters ..
+ COMPLEX*16 ONE, ZERO
+ PARAMETER(ONE=(1.0D+0,0.0D+0), ZERO=(0.0D+0,0.0D+0))
+* ..
+*
+* Determine if our flags are valid or not. We can have at
+* most one of DIAGU, DIAGL not equal to 'U'
+*
+ LUNIT = LSAME(DIAGL, 'U')
+ UUNIT = LSAME(DIAGU, 'U')
+*
+* If both of the above are false, then it is impossible to have the
+* structure that we are exploiting in this routine
+* Note: It is possible to allow the matrices to share a non-unit
+* diagonal as long as the values are the exact same, but there is
+* currently no use case for this that I am aware of.
+*
+ IF ((.NOT.LUNIT).AND.(.NOT.UUNIT)) THEN
+*
+* We say the error is in the last set DIAG value as we cannot know
+* what the user actually meant.
+*
+ CALL XERBLA( 'ZLUMM', 3 )
+ RETURN
+ END IF
+*
+* Determine which side L is on
+*
+ LLEFT = LSAME(SIDEL, 'L')
+*
+* Early exit if possible
+*
+ IF (N.EQ.0) THEN
+ RETURN
+ END IF
+ IF (ALPHA.EQ.ZERO) THEN
+ CALL ZLASET('All', N, N, ZERO, ZERO, A, LDA)
+ RETURN
+ END IF
+*
+* Terminating Case
+*
+ IF (N.EQ.1) THEN
+*
+* Since at most one of L and U are non-unit triangular, whatever side L is on, we are still
+* always computing one of
+*
+* 1) A(1,1) = ALPHA*A(1,1)
+* 2) A(1,1) = ALPHA
+*
+* Where the first case happens when exactly one of L and U are unit triangular, while the
+* second case happens when both L and U are unit triangular
+*
+ IF (LUNIT.AND.UUNIT) THEN
+ A(1,1) = ALPHA
+ ELSE
+ A(1,1) = ALPHA*A(1,1)
+ END IF
+ RETURN
+ END IF
+*
+* Recursive Case
+*
+ K = N/2
+*
+* Regardless of us computing A = L*U or A = U*L, break A apart as follows:
+*
+* |---|
+* A = | U |
+* | L |
+* |---|
+*
+* Further break down L as
+* |---------------|
+* L = | L_{11} 0 |
+* | L_{21} L_{22} |
+* |---------------|
+*
+* Where:
+*
+* L_{11}\in\C^{k\times k} is lower triangular (assumed unit iff DIAGL == 'U')
+* L_{21}\in\C^{n-k\times k} is rectangular
+* L_{22}\in\C^{n-k\times n-k} is lower triangular (assumed unit iff DIAGL == 'U')
+*
+* Further break down U as
+* |---------------|
+* U = | U_{11} U_{12} |
+* | 0 U_{22} |
+* |---------------|
+*
+* Where:
+*
+* U_{11}\in\C^{k\times k} is upper triangular (assumed unit iff DIAGU == 'U')
+* U_{12}\in\C^{n\times n-k} is rectangular
+* U_{22}\in\C^{n-k\times n-k} is upper triangular (assumed unit iff DIAGU == 'U')
+ IF (LLEFT) THEN
+*
+* This means we are computing
+* |---------------| |---------------|
+* A = L*U = \alpha | L_{11} 0 | * | U_{11} U_{12} |
+* | L_{21} L_{22} | | 0 U_{22} |
+* |---------------| |---------------|
+*
+* |---------------------------------------------|
+* = \alpha | L_{11}*U_{11} L_{11}*U_{12} |
+* | L_{21}*U_{11} L_{21}*U_{12} + L_{22}*U_{22} |
+* |---------------------------------------------|
+*
+* We compute these in the following order
+*
+* A_{22} = \alpha*L_{22}*U_{22} (This routine)
+* A_{22} = \alpha*L_{21}*U_{12} + A_{22} (GEMM)
+*
+* A_{12} = \alpha*L_{11}*U_{12} (TRMM)
+* A_{21} = \alpha*L_{21}*U_{11} (TRMM)
+*
+* A_{11} = \alpha*L_{11}*U_{11} (This routine)
+*
+* Compute A_{22}
+*
+* A_{22} = \alpha*L_{22}*U_{22}
+*
+ CALL ZLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA,
+ $ A(K+1, K+1), LDA)
+*
+* A_{22} = \alpha L_{21}*U_{12} + A_{22}
+*
+ CALL ZGEMM('No Transpose', 'No Transpose', N-K, N-K, K,
+ $ ALPHA, A(K+1,1), LDA, A(1,K+1), LDA, ONE, A(K+1,K+1),
+ $ LDA)
+*
+* Compute A_{12}
+*
+* A_{12} = \alpha*L_{11}*U_{12}
+*
+ CALL ZTRMM('Left', 'Lower', 'No Transpose', DIAGL, K, N-K,
+ $ ALPHA, A, LDA, A(1,K+1), LDA)
+*
+* Compute A_{21}
+*
+* A_{21} = \alpha*L_{21}*U_{11}
+*
+ CALL ZTRMM('Right', 'Upper', 'No Transpose', DIAGU, N-K, K,
+ $ ALPHA, A, LDA, A(K+1,1), LDA)
+*
+* Compute A_{11}
+*
+* A_{11} = \alpha*L_{11}*U_{11}
+*
+ CALL ZLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA)
+ ELSE
+*
+* This means we are computing
+* |---------------| |---------------|
+* A = U*L = \alpha | U_{11} U_{12} | * | L_{11} 0 |
+* | 0 U_{22} | | L_{21} L_{22} |
+* |---------------| |---------------|
+*
+* |---------------------------------------------|
+* = \alpha | U_{11}*L_{11} + U_{12}*L_{21} U_{12}*L_{22} |
+* | U_{22}*L_{21} U_{22}*L_{22} |
+* |---------------------------------------------|
+*
+* We compute these in the following order
+*
+* A_{11} = \alpha*U_{11}*L_{11} (This routine)
+* A_{11} = \alpha*U_{12}*L_{21} + A_{11} (GEMM)
+*
+* A_{12} = \alpha*U_{12}*L_{22} (TRMM)
+* A_{21} = \alpha*U_{22}*L_{21} (TRMM)
+*
+* A_{22} = \alpha*U_{22}*L_{22} (This routine)
+*
+* Compute A_{11}
+*
+* A_{11} = \alpha*U_{11}*L_{11}
+*
+ CALL ZLUMM(SIDEL, DIAGL, DIAGU, K, ALPHA, A, LDA)
+*
+* A_{11} = \alpha*U_{12}*L_{21} + A_{11}
+*
+ CALL ZGEMM('No Transpose', 'No Transpose', K, K, N-K,
+ $ ALPHA, A(1,K+1), LDA, A(K+1,1), LDA, ONE, A, LDA)
+*
+* Compute A_{12}
+*
+* A_{12} = \alpha*U_{12}*L_{22}
+*
+ CALL ZTRMM('Right', 'Lower', 'No Transpose', DIAGL, K, N-K,
+ $ ALPHA, A(K+1,K+1), LDA, A(1,K+1), LDA)
+*
+* Compute A_{21}
+*
+* A_{21} = \alpha*U_{22}*L_{21}
+*
+ CALL ZTRMM('Left', 'Upper', 'No Transpose', DIAGU, N-K, K,
+ $ ALPHA, A(K+1, K+1), LDA, A(K+1,1), LDA)
+*
+* Compute A_{22}
+*
+* A_{22} = \alpha*U_{22}*L_{22}
+*
+ CALL ZLUMM(SIDEL, DIAGL, DIAGU, N-K, ALPHA,
+ $ A(K+1, K+1), LDA)
+ END IF
+ END SUBROUTINE
diff --git a/SRC/ztrmmoop.f b/SRC/ztrmmoop.f
new file mode 100644
index 0000000000..d55b3f8523
--- /dev/null
+++ b/SRC/ztrmmoop.f
@@ -0,0 +1,2572 @@
+*> \brief \b ZTRMMOOP computes an out of place triangular times general matrix multiplication
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB,
+* $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* .. Scalar Arguments ..
+* COMPLEX*16 ALPHA, BETA
+* INTEGER M, N, LDA, LDB, LDC
+* CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 A(LDA,*), B(LDB,*), C(LDC,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZTRMMOOP performs one of the matrix-matrix operations
+*>
+*> C = \alpha op(A) * op(B) + \beta C
+*> or
+*> C = \alpha op(B) * op(A) + \beta C
+*>
+*> where \alpha and \beta are scalars, C is an m-by-n matrix, A is
+*> a unit, or non-unit, upper or lower triangular matrix, and op(A)
+*> is one of
+*>
+*>    op(A) = A   or   op(A) = A**T   or   op(A) = A**H
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> On entry, SIDE specifies whether op(A) multiplies op(B) from
+*> the left or right as follows:
+*>
+*> SIDE = 'L' or 'l' C = \alpha op(A) * op(B) + \beta C
+*>
+*> SIDE = 'R' or 'r' C = \alpha op(B) * op(A) + \beta C
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*> On entry, UPLO specifies whether the matrix A is an upper or
+*> lower triangular matrix as follows:
+*> UPLO = 'U' or 'u' A is upper triangular
+*>
+*> UPLO = 'L' or 'l' A is lower triangular
+*> \endverbatim
+*>
+*> \param[in] TRANSA
+*> \verbatim
+*> TRANSA is CHARACTER*1
+*> On entry, TRANSA specifies the form of op(A) to be used in
+*> the matrix multiplication as follows:
+*> TRANSA = 'N' or 'n' op(A) = A
+*>
+*> TRANSA = 'T' or 't' op(A) = A**T
+*>
+*> TRANSA = 'C' or 'c' op(A) = A**H
+*> \endverbatim
+*>
+*> \param[in] TRANSB
+*> \verbatim
+*> TRANSB is CHARACTER*1
+*> On entry, TRANSB specifies the form of op(B) to be used in
+*> the matrix multiplication as follows:
+*> TRANSB = 'N' or 'n' op(B) = B
+*>
+*> TRANSB = 'T' or 't' op(B) = B**T
+*>
+*> TRANSB = 'C' or 'c' op(B) = B**H
+*> \endverbatim
+*>
+*> \param[in] DIAG
+*> \verbatim
+*> DIAG is CHARACTER*1
+*> On entry, DIAG specifies whether or not A is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' A is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' A is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> On entry, M specifies the number of rows of C. M must be at
+*> least zero.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> On entry, N specifies the number of columns of C. N must be
+*> at least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is COMPLEX*16.
+*> On entry, ALPHA specifies the scalar alpha. When alpha is
+*> zero then A and B are not referenced, and A and B need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*> A is COMPLEX*16 array, dimension ( LDA, K ) where
+*> K is M when SIDE = 'L' and K is N when SIDE='R'
+*> Before entry with UPLO = 'U' or 'u', the leading k-by-k
+*> upper triangular part of the array A must contain the upper
+*> triangular matrix and the strictly lower triangular part of
+*> A is not referenced.
+*> Before entry with UPLO = 'L' or 'l', the leading k-by-k
+*> lower triangular part of the array A must contain the lower
+*> triangular matrix and the strictly upper triangular part of
+*> A is not referenced.
+*> Note that when DIAG = 'U' or 'u', the diagonal elements of
+*> A are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*> LDA is INTEGER
+*> On entry, LDA specifies the first dimension of A as declared
+*> in the calling (sub) program. When SIDE = 'L' or 'l' then
+*> LDA must be at least max( 1, m ), when SIDE = 'R' or 'r'
+*> then LDA must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension ( LDB, K ), where K is N
+*>          when TRANSB = 'N' and K is M otherwise. On entry, the
+*>          leading m-by-n (when TRANSB = 'N') or n-by-m (otherwise)
+*>          part of the array B must contain the matrix B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*> LDB is INTEGER
+*> On entry, LDB specifies the first dimension of B as declared
+*>          in the calling (sub) program. When TRANSB = 'N' then LDB
+*>          must be at least max( 1, m ); otherwise LDB must be at
+*>          least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] BETA
+*> \verbatim
+*> BETA is COMPLEX*16.
+*> On entry, BETA specifies the scalar beta. When beta is
+*> zero then C is not referenced on entry, and C need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*> C is COMPLEX*16 array, dimension ( LDC, N )
+*> Before entry, the leading m-by-n part of the array C must
+*> contain the matrix C, and on exit is overwritten by the
+*> transformed matrix.
+*> \endverbatim
+*>
+*> \param[in] LDC
+*> \verbatim
+*> LDC is INTEGER
+*> On entry, LDC specifies the first dimension of C as declared
+*> in the calling (sub) program. LDC must be at least
+*> max( 1, m ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+ RECURSIVE SUBROUTINE ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB,
+ $ DIAG, M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* .. Scalar Arguments ..
+ COMPLEX*16 ALPHA, BETA
+ INTEGER M, N, LDA, LDB, LDC
+ CHARACTER SIDE, UPLO, TRANSA, TRANSB, DIAG
+* ..
+* .. Array Arguments ..
+ COMPLEX*16 A(LDA,*), B(LDB,*), C(LDC,*)
+* ..
+*
+* =====================================================================
+*
+* .. External Functions ..
+ LOGICAL LSAME
+ COMPLEX*16 ZDOTC, ZDOTU
+ EXTERNAL LSAME, ZDOTC, ZDOTU
+* ..
+* .. External Subroutines ..
+ EXTERNAL ZGEMM, ZAXPY, ZACXPY,
+ $ ZSCAL, ZLASET
+* ..
+* .. Intrinsic Functions ..
+ INTRINSIC CONJG, MIN
+* ..
+* .. Local Scalars ..
+ INTEGER I, J, L, K, INCB
+ LOGICAL LSIDE, UPPER, UNIT, TRANST, TRANSG,
+ $ CONJA, CONJB
+* ..
+* .. Local Parameters ..
+ COMPLEX*16 ONE, ZERO
+ PARAMETER(ONE=(1.0D+0,0.0D+0), ZERO=(0.0D+0,0.0D+0))
+* ..
+*
+* Beginning of Executable Statements
+*
+ LSIDE = LSAME(SIDE, 'L')
+ UPPER = LSAME(UPLO, 'U')
+*
+* If we are transposing the triangular matrix (A)
+*
+ CONJA = LSAME(TRANSA, 'C')
+ TRANST= LSAME(TRANSA, 'T').OR.CONJA
+*
+* If we are transposing the general matrix (B)
+*
+ CONJB = LSAME(TRANSB, 'C')
+ TRANSG= LSAME(TRANSB, 'T').OR.CONJB
+*
+* Terminating Case
+*
+ UNIT = LSAME(DIAG, 'U')
+ IF (M.EQ.1.AND.N.EQ.1) THEN
+*
+* This case is the simplest as we are just computing C = \alpha A*B +
+* \beta C where all components are 1-by-1 matrices
+*
+
+ IF (BETA.EQ.ZERO) THEN
+ C(1,1) = ZERO
+ ELSE
+ C(1,1) = C(1,1) * BETA
+ END IF
+*
+* Now, we compute C = \alpha op(A)*op(B)
+*
+ IF(ALPHA.NE.ZERO) THEN
+*
+* A = 1, so we do not care if A is conjugated or not
+*
+ IF (UNIT) THEN
+ IF (CONJB) THEN
+ C(1,1) = C(1,1) + ALPHA*CONJG(B(1,1))
+ ELSE
+ C(1,1) = C(1,1) + ALPHA*B(1,1)
+ END IF
+ ELSE
+*
+* A is not assumed unit, so we need to keep op(A) in mind
+*
+ IF (CONJA) THEN
+ IF (CONJB) THEN
+ C(1,1) = C(1,1) +
+ $ ALPHA*CONJG(B(1,1))*CONJG(A(1,1))
+ ELSE
+ C(1,1) = C(1,1) + ALPHA*B(1,1)*CONJG(A(1,1))
+ END IF
+ ELSE
+ IF (CONJB) THEN
+ C(1,1) = C(1,1) + ALPHA*CONJG(B(1,1))*A(1,1)
+ ELSE
+ C(1,1) = C(1,1) + ALPHA*B(1,1)*A(1,1)
+ END IF
+ END IF
+ END IF
+ END IF
+ RETURN
+ ELSE IF (M.EQ.1) THEN
+*
+* This means that C is a row vector. If BETA is 0, then we
+* set it explicitly, otherwise we overwrite it with BETA*C
+*
+ IF (BETA.EQ.ZERO) THEN
+*
+* This ensures we don't reference C unless we need to
+*
+ CALL ZLASET('All', M, N, ZERO, ZERO, C, LDC)
+ ELSE
+ CALL ZSCAL(N, BETA, C, LDC)
+ END IF
+ IF (ALPHA.NE.ZERO) THEN
+ IF (LSIDE) THEN
+*
+* We are computing C = \alpha op(A)*op(B) + \beta C
+* Note: This means that A is a scalar
+*
+ IF (CONJA) THEN
+*
+* op(A) = CONJG(A)
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL ZACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL ZACXPY(N, ALPHA*CONJG(A(1,1)), B, 1,
+ $ C, LDC)
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL ZAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL ZAXPY(N, ALPHA*CONJG(A(1,1)), B, 1,
+ $ C, LDC)
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL ZAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL ZAXPY(N, ALPHA*CONJG(A(1,1)), B,
+ $ LDB, C, LDC)
+ END IF
+ END IF
+ ELSE
+*
+* op(A) = A or op(A) = A**T = A
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL ZACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL ZACXPY(N, ALPHA*A(1,1), B, 1,
+ $ C, LDC)
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL ZAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL ZAXPY(N, ALPHA*A(1,1), B, 1, C, LDC)
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL ZAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL ZAXPY(N, ALPHA*A(1,1), B, LDB,
+ $ C, LDC)
+ END IF
+ END IF
+ END IF
+ ELSE
+*
+* We are computing C = \alpha op(B)*op(A) + \beta C
+*
+ IF (UPPER) THEN
+*
+* A is upper triangular
+*
+ IF (CONJA) THEN
+*
+* op(A) = CONJG(A)
+* This is lower triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CONJG(ZDOTU(N-J,
+ $ A(J,J+1), LDA, B(J+1,1), 1)) +
+ $ C(1,J)
+ END DO
+ CALL ZACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CONJG(ZDOTU(N-J+1,
+ $ A(J,J), LDA, B(J,1), 1)) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(N-J,
+ $ A(J,J+1), LDA, B(J+1,1), 1) +
+ $ C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(N-J+1,
+ $ A(J,J), LDA, B(J,1), 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(N-J,
+ $ A(J,J+1), LDA, B(1,J+1), LDB) +
+ $ C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(N-J+1,
+ $ A(J,J), LDA, B(1,J), LDB) + C(1,J)
+ END DO
+ END IF
+ END IF
+ ELSE IF (TRANST) THEN
+*
+* op(A) = A**T
+* This is lower triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(N-J,
+ $ B(J+1,1), 1, A(J,J+1), LDA) +
+ $ C(1,J)
+ END DO
+ CALL ZACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(N-J+1,
+ $ B(J,1), 1, A(J,J), LDA) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(N-J,
+ $ A(J,J+1), LDA, B(J+1,1), 1) +
+ $ C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(N-J+1,
+ $ A(J,J), LDA, B(J,1), 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(N-J,
+ $ A(J,J+1), LDA, B(1,J+1), LDB) +
+ $ C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(N-J+1,
+ $ A(J,J), LDA, B(1,J), LDB) + C(1,J)
+ END DO
+ END IF
+ END IF
+ ELSE
+*
+* op(A) = A
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(J-1, B, 1,
+ $ A(1,J), 1) + C(1,J)
+ END DO
+ CALL ZACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(J, B, 1,
+ $ A(1,J), 1) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(J-1,
+ $ A(1,J), 1, B, 1) +
+ $ C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(J,
+ $ A(1,J), 1, B, 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(J-1,
+ $ A(1,J), 1, B, LDB) +
+ $ C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(J,
+ $ A(1,J), 1, B, LDB) + C(1,J)
+ END DO
+ END IF
+ END IF
+ END IF
+ ELSE
+*
+* A is lower triangular
+*
+ IF (CONJA) THEN
+*
+* op(A) = CONJG(A)
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CONJG(ZDOTU(J-1,
+ $ B, 1, A(J,1), LDA)) + C(1,J)
+ END DO
+ CALL ZACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * CONJG(ZDOTU(J, B,
+ $ 1, A(J,1), LDA)) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(J-1,
+ $ A(J,1), LDA, B, 1) + C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(J,
+ $ A(J,1), LDA, B, 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(J-1,
+ $ A(J,1), LDA, B, LDB) + C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(J,
+ $ A(J,1), LDA, B, LDB) + C(1,J)
+ END DO
+ END IF
+ END IF
+ ELSE IF (TRANST) THEN
+*
+* op(A) = A**T
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(J-1,
+ $ B, 1, A(J,1), LDA) + C(1,J)
+ END DO
+ CALL ZACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(J, B,
+ $ 1, A(J,1), LDA) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(J-1,
+ $ A(J,1), LDA, B, 1) + C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(J,
+ $ A(J,1), LDA, B, 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(J-1,
+ $ A(J,1), LDA, B, LDB) + C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(J,
+ $ A(J,1), LDA, B, LDB) + C(1,J)
+ END DO
+ END IF
+ END IF
+ ELSE
+*
+* op(A) = A
+* This is lower triangular
+*
+ IF (CONJB) THEN
+*
+* op(B) = CONJG(B)
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(N-J,
+ $ B(J+1,1), 1, A(J+1,J), 1) + C(1,J)
+ END DO
+ CALL ZACXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTC(N-J+1,
+ $ B(J,1), 1, A(J,J), 1) + C(1,J)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* op(B) = B**T
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(N-J,
+ $ B(J+1,1), 1, A(J+1,J), 1) + C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, 1, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(N-J+1,
+ $ B(J,1), 1, A(J,J), 1) + C(1,J)
+ END DO
+ END IF
+ ELSE
+*
+* op(B) = B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(N-J,
+ $ B(1,J+1), LDB, A(J+1,J), 1) + C(1,J)
+ END DO
+ CALL ZAXPY(N, ALPHA, B, LDB, C, LDC)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO J = 1,N
+ C(1,J) = ALPHA * ZDOTU(N-J+1,
+ $ B(1,J), LDB, A(J,J), 1) + C(1,J)
+ END DO
+ END IF
+ END IF
+ END IF
+ END IF
+ END IF
+ END IF
+ RETURN
+ ELSE IF (N.EQ.1) THEN
+*
+* This means that C is a column vector. If BETA is 0, then we
+* set it explicitly, otherwise we overwrite it with BETA*C
+*
+ IF (BETA.EQ.ZERO) THEN
+*
+* This ensures we don't reference C unless we need to
+*
+ CALL ZLASET('All', M, N, ZERO, ZERO, C, LDC)
+ ELSE
+ CALL ZSCAL(M, BETA, C, 1)
+ END IF
+
+*
+* If alpha is 0, we are done
+*
+ IF (ALPHA.NE.ZERO) THEN
+ IF (TRANSG) THEN
+ INCB = LDB
+ ELSE
+ INCB = 1
+ END IF
+ IF (LSIDE) THEN
+*
+* This means we are computing
+* C = \alpha op(A) * op(B) + \beta C
+*
+ IF (UPPER) THEN
+*
+* This means A is upper triangular
+*
+ IF (CONJA) THEN
+*
+* This means op(A) = CONJG(A)
+* This is lower triangular
+*
+ IF (CONJB) THEN
+*
+* This means that we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CONJG(ZDOTU(I-1, B,
+ $ INCB, A(1,I), 1)) + C(I,1)
+ END DO
+ CALL ZACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CONJG(ZDOTU(I, B,
+ $ INCB, A(1,I), 1)) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTC(I-1, A(1,I),
+ $ 1, B, INCB) + C(I,1)
+ END DO
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTC(I, A(1,I),
+ $ 1, B, INCB) + C(I,1)
+ END DO
+ END IF
+ END IF
+ ELSE IF (TRANST) THEN
+*
+* This means op(A) = A**T
+* This is lower triangular
+*
+ IF (CONJB) THEN
+*
+* This means that we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTC(I-1, B, INCB,
+ $ A(1,I), 1) + C(I,1)
+ END DO
+ CALL ZACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTC(I, B, INCB,
+ $ A(1,I), 1) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTU(I-1, B, INCB,
+ $ A(1,I), 1) + C(I,1)
+ END DO
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTU(I, B, INCB,
+ $ A(1,I), 1) + C(I,1)
+ END DO
+ END IF
+ END IF
+ ELSE
+*
+* This means op(A) = A
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* This means that we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*ZDOTC(M-I, B(1,I+1),
+ $ INCB, A(I,I+1), LDA) + C(I,1)
+ END DO
+ CALL ZACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTC(M-I+1, B(1,I),
+ $ INCB, A(I,I), LDA) + C(I,1)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* This means that B is a row vector but not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*ZDOTU(M-I, B(1,I+1),
+ $ INCB, A(I,I+1), LDA) + C(I,1)
+ END DO
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTU(M-I+1, B(1,I),
+ $ INCB, A(I,I), LDA) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is a column vector and not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*ZDOTU(M-I, B(I+1,1),
+ $ INCB, A(I,I+1), LDA) + C(I,1)
+ END DO
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTU(M-I+1, B(I,1),
+ $ INCB, A(I,I), LDA) + C(I,1)
+ END DO
+ END IF
+ END IF
+ END IF
+*
+* This means A is lower triangular
+*
+ ELSE
+ IF (CONJA) THEN
+*
+* This means op(A) = CONJG(A)
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* This means that we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*CONJG(ZDOTU(M-I,
+ $ B(1,I+1), INCB, A(I+1,I), 1))
+ & + C(I,1)
+ END DO
+ CALL ZACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*CONJG(ZDOTU(M-I+1,
+ $ B(1,I), INCB, A(I,I), 1))
+ & + C(I,1)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* This means that B is a row vector but not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*ZDOTC(M-I, A(I+1,I),
+ $ 1, B(1,I+1), INCB) + C(I,1)
+ END DO
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTC(M-I+1, A(I,I),
+ $ 1, B(1,I), INCB) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is a column vector and not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*ZDOTC(M-I, A(I+1,I),
+ $ 1, B(I+1,1), INCB) + C(I,1)
+ END DO
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTC(M-I+1, A(I,I),
+ $ 1, B(I,1), INCB) + C(I,1)
+ END DO
+ END IF
+ END IF
+ ELSE IF (TRANST) THEN
+*
+* This means op(A) = A**T
+* This is upper triangular
+*
+ IF (CONJB) THEN
+*
+* This means that we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*ZDOTC(M-I, B(1,I+1),
+ $ INCB, A(I+1,I), 1) + C(I,1)
+ END DO
+ CALL ZACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTC(M-I+1, B(1,I),
+ $ INCB, A(I,I), 1) + C(I,1)
+ END DO
+ END IF
+ ELSE IF (TRANSG) THEN
+*
+* This means that B is a row vector but not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*ZDOTU(M-I, B(1,I+1),
+ $ INCB, A(I+1,I), 1) + C(I,1)
+ END DO
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTU(M-I+1, B(1,I),
+ $ INCB, A(I,I), 1) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is a column vector and not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M-1
+ C(I,1) = ALPHA*ZDOTU(M-I, B(I+1,1),
+ $ INCB, A(I+1,I), 1) + C(I,1)
+ END DO
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTU(M-I+1, B(I,1),
+ $ INCB, A(I,I), 1) + C(I,1)
+ END DO
+ END IF
+ END IF
+*
+* This means op(A) = A
+*           This is lower triangular
+*
+*
+ ELSE
+ IF (CONJB) THEN
+*
+* This means that B is conjugated and transposed
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTC(I-1, B, INCB,
+ $ A(I,1), LDA) + C(I,1)
+ END DO
+ CALL ZACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTC(I, B, INCB,
+ $ A(I,1), LDA) + C(I,1)
+ END DO
+ END IF
+ ELSE
+*
+* This means that B is not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTU(I-1, B, INCB,
+ $ A(I,1), LDA) + C(I,1)
+ END DO
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ DO I=1,M
+ C(I,1) = ALPHA*ZDOTU(I, B, INCB,
+ $ A(I,1), LDA) + C(I,1)
+ END DO
+ END IF
+ END IF
+ END IF
+ END IF
+ ELSE
+*
+* This means we are computing
+* C = \alpha op(B) * op(A) + \beta C
+* Note: This means A is a scalar
+*
+ IF (CONJA) THEN
+*
+* This means op(A) = CONJG(A)
+*
+ IF (CONJB) THEN
+*
+* This means we must conjugate B
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL ZACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL ZACXPY(M, ALPHA*CONJG(A(1,1)), B,
+ $ INCB, C, 1)
+ END IF
+ ELSE
+*
+* This means B is not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL ZAXPY(M, ALPHA*CONJG(A(1,1)), B,
+ $ INCB, C, 1)
+ END IF
+ END IF
+ ELSE
+*
+* This means op(A) = A or op(A) = A**T = A
+*
+ IF (CONJB) THEN
+*
+* This means B is conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL ZACXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL ZACXPY(M, ALPHA*A(1,1), B, INCB, C,
+ $ 1)
+ END IF
+ ELSE
+*
+* This means B is not conjugated
+*
+ IF (UNIT) THEN
+*
+* A is assumed unit triangular
+*
+ CALL ZAXPY(M, ALPHA, B, INCB, C, 1)
+ ELSE
+*
+* A is not assumed unit triangular
+*
+ CALL ZAXPY(M, ALPHA*A(1,1), B, INCB, C,
+ $ 1)
+ END IF
+ END IF
+ END IF
+ END IF
+ END IF
+ RETURN
+ END IF
+*
+* Recursive Case
+*
+ L = MIN(M,N)/2
+ IF (LSIDE) THEN
+*
+* We are multiplying A from the left IE we are computing
+* C = \alpha op(A)*op(B) + \beta C
+*
+ IF (UPPER) THEN
+*
+* A is upper triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing
+*
+* So we are computing
+* C = \alpha A**T * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11}**T + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{21}**T + \beta C_{12}
+* C_{21} = \alpha A_{12}**T * B_{11}**T + \alpha A_{22}**T * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{12}**T * B_{21}**T + \alpha A_{22}**T * B_{22}**T + \beta C_{22}
+*
+*          Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{12}**T * B_{11}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22}**T * B_{12}**T + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{12}**T * B_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22}**T * B_{22}**T + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C,
+ $ LDC)
+*
+* C_{12}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(1, L+1), LDA, B, LDB, BETA, C(L+1,1),
+ $ LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1,L+1), LDA, B(1,L+1),
+ $ LDB, ONE, C(L+1,1), LDC)
+*
+* C_{22}
+*
+ CALL ZGEMM(TRANSA, TRANSB, M-L, N-L, L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1,1), LDB, BETA,
+ $ C(L+1,L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1,L+1), LDA,
+ $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11} + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{12} + \beta C_{12}
+* C_{21} = \alpha A_{12}**T * B_{11} + \alpha A_{22}**T * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{12}**T * B_{12} + \alpha A_{22}**T * B_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{12}**T * B_{11} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22}**T * B_{21} + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{12}**T * B_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22}**T * B_{22} + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(1, L+1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZGEMM(TRANSA, TRANSB, M-L, N-L, L,
+ $ ALPHA, A(1, L+1), LDA, B(1, L+1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1,L+1), LDB, ONE, C(L+1,L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11}**T + \alpha A_{12} * B_{12}**T + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{21}**T + \alpha A_{12} * B_{22}**T + \beta C_{12}
+* C_{21} = \alpha A_{22} * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{22} * B_{22}**T + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{12} * B_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11} * B_{11}**T + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{12} * B_{22}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11} * B_{21}**T + C_{12} (This routine)
+*
+* C_{11}
+*
+ CALL ZGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(1, L+1), LDB, BETA, C, LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL ZGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1,1), LDB, ONE,
+ $ C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11} + \alpha A_{12} * B_{21} + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{12} + \alpha A_{12} * B_{22} + \beta C_{12}
+* C_{21} = \alpha A_{22} * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{22} * B_{22} + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{12} * B_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11} * B_{11} + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{12} * B_{22} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11} * B_{12} + C_{12} (This routine)
+*
+* C_{11}
+*
+ CALL ZGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, 1), LDB, BETA, C, LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL ZGEMM(TRANSB, TRANSA, L, N-L, M-L, ALPHA,
+ $ A(1, L+1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ ONE, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ ELSE
+*
+* A is lower triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11}**T + \alpha A_{21}**T * B_{12}**T + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{21}**T + \alpha A_{21}**T * B_{22}**T + \beta C_{12}
+* C_{21} = \alpha A_{22}**T * B_{12}**T + \beta C_{21}
+* C_{22} = \alpha A_{22}**T * B_{22}**T + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{21}**T * B_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11}**T * B_{11}**T + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{21}**T * B_{22}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11}**T * B_{21}**T + C_{12} (This routine)
+*
+* C_{11}
+*
+ CALL ZGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(1, L+1), LDB, BETA, C, LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL ZGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE,
+ $ C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A**T * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11}**T * B_{11} + \alpha A_{21}**T * B_{21} + \beta C_{11}
+* C_{12} = \alpha A_{11}**T * B_{12} + \alpha A_{21}**T * B_{22} + \beta C_{12}
+* C_{21} = \alpha A_{22}**T * B_{21} + \beta C_{21}
+* C_{22} = \alpha A_{22}**T * B_{22} + \beta C_{22}
+*
+* Computing C_{21} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{12} as follows
+*
+* C_{11} = \alpha A_{21}**T * B_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha A_{11}**T * B_{11} + C_{11} (This routine)
+*
+* C_{12} = \alpha A_{21}**T * B_{22} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha A_{11}**T * B_{12} + C_{12} (This routine)
+*
+* C_{11}
+*
+ CALL ZGEMM(TRANSA, TRANSB, L, L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, 1), LDB, BETA, C, LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL ZGEMM(TRANSA, TRANSB, L, N-L, M-L, ALPHA,
+ $ A(L+1, 1), LDA, B(L+1, L+1), LDB, BETA,
+ $ C(1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB, ONE,
+ $ C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha A * B**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11}**T + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{21}**T + \beta C_{12}
+* C_{21} = \alpha A_{21} * B_{11}**T + \alpha A_{22} * B_{12}**T + \beta * C_{21}
+* C_{22} = \alpha A_{21} * B_{21}**T + \alpha A_{22} * B_{22}**T + \beta * C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{21} * B_{11}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22} * B_{12}**T + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{21} * B_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22} * B_{22}**T + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZGEMM(TRANSA, TRANSB, M-L, N-L, L,
+ $ ALPHA, A(L+1, 1), LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha A * B + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times m-\ell}
+* A_{21}\in\R^{m-\ell\times\ell} A_{22}\in\R^{m-\ell\times m-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha A_{11} * B_{11} + \beta C_{11}
+* C_{12} = \alpha A_{11} * B_{12} + \beta C_{12}
+* C_{21} = \alpha A_{21} * B_{11} + \alpha A_{22} * B_{21} + \beta * C_{21}
+* C_{22} = \alpha A_{21} * B_{12} + \alpha A_{22} * B_{22} + \beta * C_{22}
+*
+* Computing C_{11} and C_{12} is just a recursive call to
+* this routine but we can break down computing
+* C_{21} and C_{22} as follows
+*
+* C_{21} = \alpha A_{21} * B_{11} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha A_{22} * B_{21} + C_{21} (This routine)
+*
+* C_{22} = \alpha A_{21} * B_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha A_{22} * B_{22} + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZGEMM(TRANSA, TRANSB, M-L, L, L, ALPHA,
+ $ A(L+1, 1), LDA, B, LDB, BETA, C(L+1, 1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, A(L+1, 1), LDA, B(1, L+1), LDB, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ END IF
+ ELSE
+*
+* We are multiplying A from the right IE we are computing
+* C = \alpha op(B)*op(A) + \beta C
+*
+ IF (UPPER) THEN
+*
+* A is upper triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11}**T + \alpha B_{21}**T * A_{12}**T + \beta C_{11}
+* C_{12} = \alpha B_{21}**T * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11}**T + \alpha B_{22}**T * A_{12}**T + \beta C_{21}
+* C_{22} = \alpha B_{22}**T * A_{22}**T + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{21}**T * A_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11}**T * A_{11}**T + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22}**T * A_{12}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{12}**T * A_{11}**T + C_{21} (This routine)
+*
+* C_{11}
+*
+ CALL ZGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(L+1, 1), LDB, A(1, L+1), LDA, BETA, C, LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ ONE, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T 0 |
+* |C_{21} C_{22}| |A_{12}**T A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11}**T + \alpha B_{12} * A_{12}**T + \beta C_{11}
+* C_{12} = \alpha B_{12} * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11}**T + \alpha B_{22} * A_{12}**T + \beta C_{21}
+* C_{22} = \alpha B_{22} * A_{22}**T + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{12} * A_{12}**T + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11} * A_{11}**T + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22} * A_{12}**T + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{21} * A_{11}**T + C_{21} (This routine)
+*
+* C_{11}
+*
+ CALL ZGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(1,L+1), LDB, A(1,L+1), LDA, BETA, C, LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB,
+ $ ONE, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11} + \beta C_{11}
+* C_{12} = \alpha B_{11}**T * A_{12} + \alpha B_{21}**T * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11} + \beta C_{21}
+* C_{22} = \alpha B_{12}**T * A_{12} + \alpha B_{22}**T * A_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11}**T * A_{12} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{21}**T * A_{22} + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{12}**T * A_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22}**T * A_{22} + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL ZGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, B(1, L+1), LDB, A(1, L+1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} A_{12}|
+* |C_{21} C_{22}| |0 A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11} + \beta C_{11}
+* C_{12} = \alpha B_{11} * A_{12} + \alpha B_{12} * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11} + \beta C_{21}
+* C_{22} = \alpha B_{21} * A_{12} + \alpha B_{22} * A_{22} + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11} * A_{12} + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{12} * A_{22} + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{21} * A_{12} + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22} * A_{22} + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL ZGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(1, L+1), LDA, BETA, C(1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZGEMM(TRANSB, TRANSA, M-L, N-L, L,
+ $ ALPHA, B(L+1, 1), LDB, A(1, L+1), LDA,
+ $ BETA, C(L+1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ ELSE
+*
+* A is lower triangular
+*
+ IF (TRANST) THEN
+*
+* We are transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11}**T + \beta C_{11}
+* C_{12} = \alpha B_{11}**T * A_{21}**T + \alpha B_{21}**T * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11}**T + \beta C_{21}
+* C_{22} = \alpha B_{12}**T * A_{21}**T + \alpha B_{22}**T * A_{22}**T + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11}**T * A_{21}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{21}**T * A_{22}**T + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{12}**T * A_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22}**T * A_{22}**T + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL ZGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, ONE, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB,
+ $ BETA, C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA,
+ $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A**T + \beta C. We break this down as follows
+*
+* |-------------| |-------------------|
+* C = |C_{11} C_{12}| A**T = |A_{11}**T A_{21}**T|
+* |C_{21} C_{22}| |0 A_{22}**T|
+* |-------------| |-------------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11}**T + \beta C_{11}
+* C_{12} = \alpha B_{11} * A_{21}**T + \alpha B_{12} * A_{22}**T + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11}**T + \beta C_{21}
+* C_{22} = \alpha B_{21} * A_{21}**T + \alpha B_{22} * A_{22}**T + \beta C_{22}
+*
+* Computing C_{11} and C_{21} is just a recursive call to
+* this routine but we can break down computing
+* C_{12} and C_{22} as follows
+*
+* C_{12} = \alpha B_{11} * A_{21}**T + \beta C_{12} (GEMM call)
+* C_{12} = \alpha B_{12} * A_{22}**T + C_{12} (This routine)
+*
+* C_{22} = \alpha B_{21} * A_{21}**T + \beta C_{22} (GEMM call)
+* C_{22} = \alpha B_{22} * A_{22}**T + C_{22} (This routine)
+*
+* C_{11}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, BETA, C, LDC)
+*
+* C_{12}
+*
+ CALL ZGEMM(TRANSB, TRANSA, L, N-L, L, ALPHA,
+ $ B, LDB, A(L+1, 1), LDA, BETA, C(1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, ONE, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, BETA,
+ $ C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZGEMM(TRANSB, TRANSA, M-L, N-L, L, ALPHA,
+ $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, L+1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, ONE, C(L+1, L+1), LDC)
+ ENDIF
+ ELSE
+*
+* We are not transposing A
+*
+ IF (TRANSG) THEN
+*
+* We are transposing B.
+*
+* So we are computing
+* C = \alpha B**T * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------------|
+* B**T =|B_{11}**T B_{21}**T|
+* |B_{12}**T B_{22}**T|
+* |-------------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times m-\ell}
+* B_{21}\in\R^{n-\ell\times\ell} B_{22}\in\R^{n-\ell\times m-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11}**T * A_{11} + \alpha B_{21}**T * A_{21} + \beta C_{11}
+* C_{12} = \alpha B_{21}**T * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{12}**T * A_{11} + \alpha B_{22}**T * A_{21} + \beta C_{21}
+* C_{22} = \alpha B_{22}**T * A_{22} + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{21}**T * A_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11}**T * A_{11} + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22}**T * A_{21} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{12}**T * A_{11} + C_{21} (This routine)
+*
+* C_{11}
+*
+ CALL ZGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(L+1, 1), LDB, A(L+1, 1), LDA, BETA, C, LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(L+1, 1),
+ $ LDB, BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(1, L+1), LDB, ONE,
+ $ C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ELSE
+*
+* We are not transposing B.
+*
+* So we are computing
+* C = \alpha B * A + \beta C. We break this down as follows
+*
+* |-------------| |-------------|
+* C = |C_{11} C_{12}| A = |A_{11} 0 |
+* |C_{21} C_{22}| |A_{21} A_{22}|
+* |-------------| |-------------|
+*
+* |-------------|
+* B = |B_{11} B_{12}|
+* |B_{21} B_{22}|
+* |-------------|
+*
+* Where
+* C_{11}\in\R^{\ell\times\ell} C_{12}\in\R^{\ell\times n-\ell}
+* C_{21}\in\R^{m-\ell\times\ell} C_{22}\in\R^{m-\ell\times n-\ell}
+*
+* A_{11}\in\R^{\ell\times\ell} A_{12}\in\R^{\ell\times n-\ell}
+* A_{21}\in\R^{n-\ell\times\ell} A_{22}\in\R^{n-\ell\times n-\ell}
+*
+* B_{11}\in\R^{\ell\times\ell} B_{12}\in\R^{\ell\times n-\ell}
+* B_{21}\in\R^{m-\ell\times\ell} B_{22}\in\R^{m-\ell\times n-\ell}
+*
+* Which means that we get
+* C_{11} = \alpha B_{11} * A_{11} + \alpha B_{12} * A_{21} + \beta C_{11}
+* C_{12} = \alpha B_{12} * A_{22} + \beta C_{12}
+* C_{21} = \alpha B_{21} * A_{11} + \alpha B_{22} * A_{21} + \beta C_{21}
+* C_{22} = \alpha B_{22} * A_{22} + \beta C_{22}
+*
+* Computing C_{12} and C_{22} is just a recursive call to
+* this routine but we can break down computing
+* C_{11} and C_{21} as follows
+*
+* C_{11} = \alpha B_{12} * A_{21} + \beta C_{11} (GEMM call)
+* C_{11} = \alpha B_{11} * A_{11} + C_{11} (This routine)
+*
+* C_{21} = \alpha B_{22} * A_{21} + \beta C_{21} (GEMM call)
+* C_{21} = \alpha B_{21} * A_{11} + C_{21} (This routine)
+*
+* C_{11}
+*
+ CALL ZGEMM(TRANSB, TRANSA, L, L, N-L, ALPHA,
+ $ B(1, L+1), LDB, A(L+1, 1), LDA, BETA, C, LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, L, ALPHA, A, LDA, B, LDB, ONE, C, LDC)
+*
+* C_{12}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ L, N-L, ALPHA, A(L+1, L+1), LDA, B(1, L+1),
+ $ LDB, BETA, C(1, L+1), LDC)
+*
+* C_{21}
+*
+ CALL ZGEMM(TRANSB, TRANSA, M-L, L, N-L, ALPHA,
+ $ B(L+1, L+1), LDB, A(L+1, 1), LDA, BETA,
+ $ C(L+1, 1), LDC)
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, L, ALPHA, A, LDA, B(L+1, 1), LDB, ONE,
+ $ C(L+1, 1), LDC)
+*
+* C_{22}
+*
+ CALL ZTRMMOOP(SIDE, UPLO, TRANSA, TRANSB, DIAG,
+ $ M-L, N-L, ALPHA, A(L+1, L+1), LDA,
+ $ B(L+1, L+1), LDB, BETA, C(L+1, L+1), LDC)
+ ENDIF
+ END IF
+ END IF
+ END IF
+ END SUBROUTINE
diff --git a/SRC/ztrtrm.f b/SRC/ztrtrm.f
new file mode 100644
index 0000000000..5f839e42d6
--- /dev/null
+++ b/SRC/ztrtrm.f
@@ -0,0 +1,584 @@
+*> \brief \b ZTRTRM computes an in place triangular-triangular matrix product
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* RECURSIVE SUBROUTINE ZTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV,
+* $ N, ALPHA, T, LDT, V, LDV)
+*
+* .. Scalar Arguments ..
+* INTEGER N, LDT, LDV
+* CHARACTER SIDE, UPLO, TRANSV, DIAGT, DIAGV
+* COMPLEX*16 ALPHA
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 T(LDT,*), V(LDV,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZTRTRM performs one of the matrix-matrix operations
+*>
+*> T = \alpha op(V) * T
+*> or
+*> T = \alpha T * op(V)
+*> where \alpha is a scalar, T and V are unit, or non-unit, upper or
+*> lower triangular matrix, and op(V) is one of
+*>
+*> op(V) = V or op(V) = V**T or op(V) = V**H
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] SIDE
+*> \verbatim
+*> SIDE is CHARACTER*1
+*> On entry, SIDE specifies whether op(V) multiplies T from
+*> the left or right as follows:
+*>
+*> SIDE = 'L' or 'l' T = \alpha op(V) * T
+*>
+*> SIDE = 'R' or 'r' T = \alpha T * op(V)
+*> \endverbatim
+*>
+*> \param[in] UPLO
+*> \verbatim
+*> UPLO is CHARACTER*1
+*>          On entry, UPLO specifies whether T and op(V) are upper or
+*> lower triangular as follows:
+*> UPLO = 'U' or 'u' T and op(V) are upper triangular
+*>
+*> UPLO = 'L' or 'l' T and op(V) are lower triangular
+*> \endverbatim
+*>
+*> \param[in] TRANSV
+*> \verbatim
+*> TRANSV is CHARACTER*1
+*> On entry, TRANSV specifies the form of op(V) to be used in
+*> the matrix multiplication as follows:
+*> TRANSV = 'N' or 'n' op(V) = V
+*>
+*> TRANSV = 'T' or 't' op(V) = V**T
+*>
+*> TRANSV = 'C' or 'c' op(V) = V**H
+*> \endverbatim
+*>
+*> \param[in] DIAGT
+*> \verbatim
+*> DIAGT is CHARACTER*1
+*> On entry, DIAGT specifies whether or not T is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' T is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' T is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] DIAGV
+*> \verbatim
+*> DIAGV is CHARACTER*1
+*> On entry, DIAGV specifies whether or not V is unit triangular
+*> as follows:
+*>
+*> DIAG = 'U' or 'u' V is assumed to be unit triangular.
+*>
+*> DIAG = 'N' or 'n' V is not assumed to be unit
+*> triangular.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> On entry, N specifies the number of rows and columns of T.
+*> N must be at least zero.
+*> \endverbatim
+*>
+*> \param[in] ALPHA
+*> \verbatim
+*> ALPHA is COMPLEX*16.
+*> On entry, ALPHA specifies the scalar alpha. When alpha is
+*> zero then T and V are not referenced, and T and V need not
+*> be set before entry.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*> T is COMPLEX*16 array, dimension ( LDT, N )
+*> Before entry with UPLO = 'U' or 'u', the leading k-by-k
+*> upper triangular part of the array T must contain the upper
+*> triangular matrix and the strictly lower triangular part of
+*> T is not referenced.
+*> Before entry with UPLO = 'L' or 'l', the leading k-by-k
+*> lower triangular part of the array T must contain the lower
+*> triangular matrix and the strictly upper triangular part of
+*> T is not referenced.
+*> Note that when DIAGT = 'U' or 'u', the diagonal elements of
+*> T are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*> LDT is INTEGER
+*> On entry, LDT specifies the first dimension of T as declared
+*> in the calling (sub) program. LDT must be at least max( 1, n ).
+*> \endverbatim
+*>
+*> \param[in] V
+*> \verbatim
+*> V is COMPLEX*16 array, dimension ( LDV, N )
+*> Before entry with UPLO = 'U' or 'u', the leading k-by-k
+*> upper triangular part of the array op(V) must contain the upper
+*> triangular matrix and the strictly lower triangular part of
+*> V is not referenced.
+*> Before entry with UPLO = 'L' or 'l', the leading k-by-k
+*> lower triangular part of the array op(V) must contain the lower
+*> triangular matrix and the strictly upper triangular part of
+*> V is not referenced.
+*> Note that when DIAGV = 'U' or 'u', the diagonal elements of
+*> V are not referenced either, but are assumed to be unity.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*> LDV is INTEGER
+*>          On entry, LDV specifies the first dimension of V as declared
+*> in the calling (sub) program. LDV must be at least max( 1, n ).
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+      RECURSIVE SUBROUTINE ZTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV,
+     $                            N, ALPHA, T, LDT, V, LDV)
+*
+*     .. Scalar Arguments ..
+      INTEGER           N, LDT, LDV
+      CHARACTER         SIDE, UPLO, TRANSV, DIAGT, DIAGV
+      COMPLEX*16        ALPHA
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16        T(LDT,*), V(LDV,*)
+*     ..
+*
+*  =====================================================================
+*
+*     .. External Functions ..
+      LOGICAL           LSAME
+      EXTERNAL          LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL          ZTRMM, ZTRMMOOP, ZLASET
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC         CONJG
+*     ..
+*     .. Local Scalars ..
+      INTEGER           K
+      COMPLEX*16        VAL
+      LOGICAL           TLEFT, TUPPER, VTRANS, VUNIT, TUNIT, CONJV
+*     ..
+*     .. Local Parameters ..
+      COMPLEX*16        ONE, ZERO
+      PARAMETER(ONE=(1.0D+0,0.D+0), ZERO=(0.0D+0,0.0D+0))
+*     ..
+*
+*     Beginning of Executable Statements
+*
+*     Early Termination Criteria
+*
+      IF (ALPHA.EQ.ZERO) THEN
+*
+*        If ALPHA is 0, then we are just setting T to be the 0 matrix
+*
+         CALL ZLASET(UPLO, N, N, ZERO, ZERO, T, LDT)
+         RETURN
+      END IF
+      TUNIT = LSAME(DIAGT, 'U')
+      VUNIT = LSAME(DIAGV, 'U')
+      CONJV = LSAME(TRANSV, 'C')
+*
+*     Terminating Case
+*
+      IF (N.EQ.1) THEN
+         VAL = ONE
+         IF (.NOT.VUNIT) THEN
+            IF (CONJV) THEN
+               VAL = CONJG(V(1,1))
+            ELSE
+               VAL = V(1,1)
+            END IF
+         END IF
+         IF (TUNIT) THEN
+            T(1,1) = ALPHA*VAL
+         ELSE
+            T(1,1) = ALPHA*T(1,1)*VAL
+         END IF
+         RETURN
+      ELSE IF(N.LE.0) THEN
+         RETURN
+      END IF
+*
+*     Recursive case
+*
+      TLEFT = LSAME(SIDE, 'R')
+      TUPPER = LSAME(UPLO, 'U')
+      VTRANS = CONJV.OR.LSAME(TRANSV, 'T')
+
+      K = N / 2
+      IF(TUPPER) THEN
+*
+*        T is upper triangular
+*
+         IF(TLEFT) THEN
+*
+*           Compute T = T*op(V)
+*
+            IF(VTRANS) THEN
+*
+*              We are computing T = T*V**T, which we break down as follows
+*              |--------------|          |--------------|   |--------------------|
+*              |T_{11} T_{12}|           |T_{11} T_{12}|    |V_{11}**T V_{21}**T|
+*              |0      T_{22}| = \alpha  |0      T_{22}| *  |0         V_{22}**T|
+*              |--------------|          |--------------|   |--------------------|
+*
+*              Where
+*              T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+*              T_{22}\in\R^{n-k\times n-k}
+*
+*              V_{11}\in\R^{k\times k}
+*              V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+*              Which means that we get
+*
+*              T_{11} = \alpha T_{11}*V_{11}**T
+*              T_{12} = \alpha T_{11}*V_{21}**T + \alpha T_{12}*V_{22}**T
+*              T_{22} = \alpha T_{22}*V_{22}**T
+*
+*              Computing T_{11} and T_{22} are just recursive calls to this
+*              routine, but we can break down computing T_{12} as follows
+*
+*              T_{12} = \alpha T_{12}*V_{22}**T (ZTRMM)
+*              T_{12} = \alpha T_{11}*V_{21}**T + T_{12} (ZTRMMOOP)
+*
+*              T_{12} = \alpha T_{12}*V_{22}**T
+*
+               CALL ZTRMM('Right', 'Lower', TRANSV, DIAGV, K,
+     $                    N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT)
+*
+*              T_{12} = \alpha T_{11}*V_{21}**T + T_{12}
+*
+               CALL ZTRMMOOP('Left', UPLO, 'No Transpose',
+     $                       TRANSV, DIAGT, K, N-K, ALPHA, T, LDT,
+     $                       V(K+1, 1), LDV, ONE, T(1, K+1), LDT)
+            ELSE
+*
+*              We are computing T = T*V, which we break down as follows
+*              |--------------|          |--------------|   |-------------|
+*              |T_{11} T_{12}|           |T_{11} T_{12}|    |V_{11} V_{12}|
+*              |0      T_{22}| = \alpha  |0      T_{22}| *  |0      V_{22}|
+*              |--------------|          |--------------|   |-------------|
+*
+*              Where
+*              T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+*              T_{22}\in\R^{n-k\times n-k}
+*
+*              V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+*              V_{22}\in\R^{n-k\times n-k}
+*
+*              Which means that we get
+*
+*              T_{11} = \alpha T_{11}*V_{11}
+*              T_{12} = \alpha T_{11}*V_{12} + \alpha T_{12}*V_{22}
+*              T_{22} = \alpha T_{22}*V_{22}
+*
+*              Computing T_{11} and T_{22} are just recursive calls to this
+*              routine, but we can break down computing T_{12} as follows
+*
+*              T_{12} = \alpha T_{12}*V_{22} (ZTRMM)
+*              T_{12} = \alpha T_{11}*V_{12} + T_{12} (ZTRMMOOP)
+*
+*              T_{12} = \alpha T_{12}*V_{22}
+*
+               CALL ZTRMM('Right', 'Upper', TRANSV, DIAGV, K,
+     $                    N-K, ALPHA, V(K+1, K+1), LDV, T(1, K+1), LDT)
+*
+*              T_{12} = \alpha T_{11}*V_{12} + T_{12}
+*
+               CALL ZTRMMOOP('Left', UPLO, 'No Transpose',
+     $                       TRANSV, DIAGT, K, N-K, ALPHA, T, LDT,
+     $                       V(1, K+1), LDV, ONE, T(1, K+1), LDT)
+            END IF
+         ELSE
+*
+*           Compute T = op(V)*T
+*
+            IF(VTRANS) THEN
+*
+*              We are computing T = V**T*T, which we break down as follows
+*              |--------------|          |--------------------|   |--------------|
+*              |T_{11} T_{12}|           |V_{11}**T V_{21}**T|    |T_{11} T_{12}|
+*              |0      T_{22}| = \alpha  |0         V_{22}**T| *  |0      T_{22}|
+*              |--------------|          |--------------------|   |--------------|
+*
+*              Where
+*              T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+*              T_{22}\in\R^{n-k\times n-k}
+*
+*              V_{11}\in\R^{k\times k}
+*              V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+*              Which means that we get
+*
+*              T_{11} = \alpha V_{11}**T*T_{11}
+*              T_{12} = \alpha V_{11}**T*T_{12} + \alpha V_{21}**T*T_{22}
+*              T_{22} = \alpha V_{22}**T*T_{22}
+*
+*              Computing T_{11} and T_{22} are just recursive calls to this
+*              routine, but we can break down computing T_{12} as follows
+*
+*              T_{12} = \alpha V_{11}**T*T_{12} (ZTRMM)
+*              T_{12} = \alpha V_{21}**T*T_{22} + T_{12} (ZTRMMOOP)
+*
+*              T_{12} = \alpha V_{11}**T*T_{12}
+*
+               CALL ZTRMM('Left', 'Lower', TRANSV, DIAGV, K,
+     $                    N-K, ALPHA, V, LDV, T(1, K+1), LDT)
+*
+*              T_{12} = \alpha V_{21}**T*T_{22} + T_{12}
+*
+               CALL ZTRMMOOP('Right', UPLO, 'No Transpose',
+     $                       TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1),
+     $                       LDT, V(K+1, 1), LDV, ONE, T(1, K+1), LDT)
+            ELSE
+*
+*              We are computing T = V*T, which we break down as follows
+*              |--------------|          |--------------|   |--------------|
+*              |T_{11} T_{12}|           |V_{11} V_{12}|    |T_{11} T_{12}|
+*              |0      T_{22}| = \alpha  |0      V_{22}| *  |0      T_{22}|
+*              |--------------|          |--------------|   |--------------|
+*
+*              Where
+*              T_{11}\in\R^{k\times k} T_{12}\in\R^{k\times n-k}
+*              T_{22}\in\R^{n-k\times n-k}
+*
+*              V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+*              V_{22}\in\R^{n-k\times n-k}
+*
+*              Which means that we get
+*
+*              T_{11} = \alpha V_{11}*T_{11}
+*              T_{12} = \alpha V_{11}*T_{12} + \alpha V_{12}*T_{22}
+*              T_{22} = \alpha V_{22}*T_{22}
+*
+*              Computing T_{11} and T_{22} are just recursive calls to this
+*              routine, but we can break down computing T_{12} as follows
+*
+*              T_{12} = \alpha V_{11}*T_{12} (ZTRMM)
+*              T_{12} = \alpha V_{12}*T_{22} + T_{12} (ZTRMMOOP)
+*
+*              T_{12} = \alpha V_{11}*T_{12}
+*
+               CALL ZTRMM('Left', 'Upper', TRANSV, DIAGV, K,
+     $                    N-K, ALPHA, V, LDV, T(1, K+1), LDT)
+*
+*              T_{12} = \alpha V_{12}*T_{22} + T_{12} (ZTRMMOOP)
+*
+               CALL ZTRMMOOP('Right', UPLO, 'No Transpose',
+     $                       TRANSV, DIAGT, K, N-K, ALPHA, T(K+1, K+1),
+     $                       LDT, V(1, K+1), LDV, ONE, T(1, K+1), LDT)
+            END IF
+         END IF
+      ELSE
+*
+*        T is lower triangular
+*
+         IF(TLEFT) THEN
+*
+*           Compute T = T*op(V)
+*
+            IF(VTRANS) THEN
+*
+*              We are computing T = T*V**T, which we break down as follows
+*              |--------------|          |--------------|   |--------------------|
+*              |T_{11} 0     |           |T_{11} 0     |    |V_{11}**T 0        |
+*              |T_{21} T_{22}| = \alpha  |T_{21} T_{22}| *  |V_{12}**T V_{22}**T|
+*              |--------------|          |--------------|   |--------------------|
+*
+*              Where
+*              T_{11}\in\R^{k\times k}
+*              T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+*              V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+*              V_{22}\in\R^{n-k\times n-k}
+*
+*              Which means that we get
+*
+*              T_{11} = \alpha T_{11}*V_{11}**T
+*              T_{21} = \alpha T_{21}*V_{11}**T + \alpha T_{22}*V_{12}**T
+*              T_{22} = \alpha T_{22}*V_{22}**T
+*
+*              Computing T_{11} and T_{22} are just recursive calls to this
+*              routine, but we can break down computing T_{21} as follows
+*
+*              T_{21} = \alpha T_{21}*V_{11}**T (ZTRMM)
+*              T_{21} = \alpha T_{22}*V_{12}**T + T_{21} (ZTRMMOOP)
+*
+*              T_{21} = \alpha T_{21}*V_{11}**T
+*
+               CALL ZTRMM('Right', 'Upper', TRANSV, DIAGV, N-K,
+     $                    K, ALPHA, V, LDV, T(K+1, 1), LDT)
+*
+*              T_{21} = \alpha T_{22}*V_{12}**T + T_{21}
+*
+               CALL ZTRMMOOP('Left', UPLO, 'No Transpose',
+     $                       TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1),
+     $                       LDT, V(1, K+1), LDV, ONE, T(K+1, 1), LDT)
+            ELSE
+*
+*              We are computing T = T*V, which we break down as follows
+*              |--------------|          |--------------|   |-------------|
+*              |T_{11} 0     |           |T_{11} 0     |    |V_{11} 0    |
+*              |T_{21} T_{22}| = \alpha  |T_{21} T_{22}| *  |V_{21} V_{22}|
+*              |--------------|          |--------------|   |-------------|
+*
+*              Where
+*              T_{11}\in\R^{k\times k}
+*              T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+*              V_{11}\in\R^{k\times k}
+*              V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+*              Which means that we get
+*
+*              T_{11} = \alpha T_{11}*V_{11}
+*              T_{21} = \alpha T_{21}*V_{11} + \alpha T_{22}*V_{21}
+*              T_{22} = \alpha T_{22}*V_{22}
+*
+*              Computing T_{11} and T_{22} are just recursive calls to this
+*              routine, but we can break down computing T_{21} as follows
+*
+*              T_{21} = \alpha T_{21}*V_{11} (ZTRMM)
+*              T_{21} = \alpha T_{22}*V_{21} + T_{21} (ZTRMMOOP)
+*
+*              T_{21} = \alpha T_{21}*V_{11}
+*
+               CALL ZTRMM('Right', 'Lower', TRANSV, DIAGV, N-K,
+     $                    K, ALPHA, V, LDV, T(K+1, 1), LDT)
+*
+*              T_{21} = \alpha T_{22}*V_{21} + T_{21}
+*
+               CALL ZTRMMOOP('Left', UPLO, 'No Transpose',
+     $                       TRANSV, DIAGT, N-K, K, ALPHA, T(K+1, K+1),
+     $                       LDT, V(K+1, 1), LDV, ONE, T(K+1, 1), LDT)
+            END IF
+         ELSE
+*
+*           Compute T = op(V)*T
+*
+            IF(VTRANS) THEN
+*
+*              We are computing T = V**T*T, which we break down as follows
+*              |--------------|          |--------------------|   |--------------|
+*              |T_{11} 0     |           |V_{11}**T 0        |    |T_{11} 0     |
+*              |T_{21} T_{22}| = \alpha  |V_{12}**T V_{22}**T| *  |T_{21} T_{22}|
+*              |--------------|          |--------------------|   |--------------|
+*
+*              Where
+*              T_{11}\in\R^{k\times k}
+*              T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+*              V_{11}\in\R^{k\times k} V_{12}\in\R^{k\times n-k}
+*              V_{22}\in\R^{n-k\times n-k}
+*
+*              Which means that we get
+*
+*              T_{11} = \alpha V_{11}**T*T_{11}
+*              T_{21} = \alpha V_{12}**T*T_{11} + \alpha V_{22}**T*T_{21}
+*              T_{22} = \alpha V_{22}**T*T_{22}
+*
+*              Computing T_{11} and T_{22} are just recursive calls to this
+*              routine, but we can break down computing T_{21} as follows
+*
+*              T_{21} = \alpha V_{22}**T*T_{21} (ZTRMM)
+*              T_{21} = \alpha V_{12}**T*T_{11} + T_{21} (ZTRMMOOP)
+*
+*              T_{21} = \alpha V_{22}**T*T_{21}
+*
+               CALL ZTRMM('Left', 'Upper', TRANSV, DIAGV, N-K, K,
+     $                    ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT)
+*
+*              T_{21} = \alpha V_{12}**T*T_{11} + T_{21}
+*
+               CALL ZTRMMOOP('Right', UPLO, 'No Transpose',
+     $                       TRANSV, DIAGT, N-K, K, ALPHA, T, LDT,
+     $                       V(1, K+1), LDV, ONE, T(K+1, 1), LDT)
+            ELSE
+*
+*              We are computing T = V*T, which we break down as follows
+*              |--------------|          |-------------|   |--------------|
+*              |T_{11} 0     |           |V_{11} 0    |    |T_{11} 0     |
+*              |T_{21} T_{22}| = \alpha  |V_{21} V_{22}| * |T_{21} T_{22}|
+*              |--------------|          |-------------|   |--------------|
+*
+*              Where
+*              T_{11}\in\R^{k\times k}
+*              T_{21}\in\R^{n-k\times k} T_{22}\in\R^{n-k\times n-k}
+*
+*              V_{11}\in\R^{k\times k}
+*              V_{21}\in\R^{n-k\times k} V_{22}\in\R^{n-k\times n-k}
+*
+*              Which means that we get
+*
+*              T_{11} = \alpha V_{11}*T_{11}
+*              T_{21} = \alpha V_{21}*T_{11} + \alpha V_{22}*T_{21}
+*              T_{22} = \alpha V_{22}*T_{22}
+*
+*              Computing T_{11} and T_{22} are just recursive calls to this
+*              routine, but we can break down computing T_{21} as follows
+*
+*              T_{21} = \alpha V_{22}*T_{21} (ZTRMM)
+*              T_{21} = \alpha V_{21}*T_{11} + T_{21} (ZTRMMOOP)
+*
+*              T_{21} = \alpha V_{22}*T_{21}
+*
+               CALL ZTRMM('Left', 'Lower', TRANSV, DIAGV, N-K, K,
+     $                    ALPHA, V(K+1, K+1), LDV, T(K+1, 1), LDT)
+*
+*              T_{21} = \alpha V_{21}*T_{11} + T_{21}
+*
+               CALL ZTRMMOOP('Right', UPLO, 'No Transpose',
+     $                       TRANSV, DIAGT, N-K, K, ALPHA, T, LDT,
+     $                       V(K+1, 1), LDV, ONE, T(K+1, 1), LDT)
+            END IF
+         END IF
+      END IF
+*
+*     Since in all the above cases, we compute T_{11} and T_{22}
+*     the same, we pass in our flags and call this routine recursively
+*
+*     Compute T_{11} recursively
+*
+      CALL ZTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, K, ALPHA,
+     $            T, LDT, V, LDV)
+*
+*     Compute T_{22} recursively
+*
+      CALL ZTRTRM(SIDE, UPLO, TRANSV, DIAGT, DIAGV, N-K, ALPHA,
+     $            T(K+1, K+1), LDT, V(K+1, K+1), LDV)
+      END SUBROUTINE
diff --git a/SRC/zungkl.f b/SRC/zungkl.f
new file mode 100644
index 0000000000..bf3d61b301
--- /dev/null
+++ b/SRC/zungkl.f
@@ -0,0 +1,174 @@
+*> \brief \b ZUNGKL computes the explicit Q factor from ZGEQLF and ZLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZUNGKL(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZUNGKL generates an m by n complex matrix Q with orthonormal columns,
+*> which is defined as the last n columns of the product of n
+*> elementary reflectors
+*>
+*> Q = I - V*T*V**H = H(n) . . . H(2) H(1)
+*>
+*> Where V is an m by n matrix whose columns are householder reflectors
+*> as returned by ZGEQLF and T is the n by n matrix returned by ZLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V, and the order of T.
+*> N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is COMPLEX*16 array, dimension (LDQ,N)
+*> On entry, Q(1:m-n+i-1,i) contains the vector which defines the
+*> elementary reflector H(i), for i=1,...,n as returned by ZGEQLF.
+*> In addition, the lower triangular portion of the submatrix given
+*>          by Q(m-n+1:m,1:n) will contain the array T as returned by ZLARFT.
+*> See further details for more information.
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The storage of the V and T components inside Q is best illustrated by
+*> the following example with m = 5, n = 3.
+*>
+*> Q = |----------|
+*> | V1 V2 V3 |
+*> | V1 V2 V3 |
+*> | T1 V2 V3 |
+*> | T1 T2 V3 |
+*> | T1 T2 T3 |
+*> |----------|
+*>
+*> \endverbatim
+*>
+* =====================================================================
+
+      SUBROUTINE ZUNGKL(M, N, Q, LDQ)
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee, --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER M, N, LDQ
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16 Q(LDQ,*)
+*     ..
+*
+*  =====================================================================
+*
+*     .. External Subroutines ..
+      EXTERNAL ZTRMM, ZTRTRM, ZLUMM
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC MIN
+*     ..
+*     .. Parameters ..
+      COMPLEX*16 NEG_ONE, ONE
+      PARAMETER(NEG_ONE=(-1.0D+0,0.0D+0), ONE=(1.0D+0,0.0D+0))
+*     ..
+*     .. Local Scalars ..
+      INTEGER I, J
+*     ..
+*     .. Executable Statements ..
+*
+*     Break Q apart as follows
+*
+*         |---|
+*     Q = | V |
+*         | T |
+*         |---|
+*
+*     Where T is an n-by-n lower triangular matrix, and V is as described
+*     in the Further Details section
+*
+*     In turn, break apart V as follows
+*
+*         |-----|
+*     V = | V_2 |
+*         | V_1 |
+*         |-----|
+*
+*     Where:
+*
+*     V_1 \in \C^{n\times n} assumed unit upper triangular
+*     V_2 \in \C^{m-n\times n}
+*
+*     Compute T = T*V_1**H (ZTRTRM is called with 'Conjugate Transpose')
+*
+      CALL ZTRTRM('Right', 'Lower', 'Conjugate Transpose',
+     $      'Non-Unit', 'Unit', N, ONE, Q(M-N+1,1), LDQ, Q(M-N+1,1),
+     $      LDQ)
+*
+*     Compute Q = -VT. This means that we need to break apart
+*     Our computation in two parts
+*
+*         |--------|
+*     Q = | -V_2*T |
+*         | -V_1*T |
+*         |--------|
+*
+*     Q_2 = -V_2*T (TRMM), skipped when M = N since V_2 is empty
+*
+      IF (M.GT.N) THEN
+         CALL ZTRMM('Right', 'Lower', 'No Transpose', 'Non-Unit',
+     $         M-N, N, NEG_ONE, Q(M-N+1,1), LDQ, Q, LDQ)
+      END IF
+*
+*     Q_1 = -V_1*T via ZLUMM (in-place triangular-triangular product)
+*
+      CALL ZLUMM('Right', 'Non-Unit', 'Unit', N, NEG_ONE,
+     $      Q(M-N+1,1), LDQ)
+*
+*     Q = "I" + Q
+*
+      J = MIN(M,N)
+      DO I = 1, J
+         Q(M-N+I,I) = Q(M-N+I,I) + ONE
+      END DO
+      END SUBROUTINE
diff --git a/SRC/zungkr.f b/SRC/zungkr.f
new file mode 100644
index 0000000000..27c51d85e6
--- /dev/null
+++ b/SRC/zungkr.f
@@ -0,0 +1,154 @@
+*> \brief \b ZUNGKR computes the explicit Q factor from ZGEQRF and ZLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZUNGKR(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZUNGKR generates an m by n complex matrix Q with orthonormal columns,
+*> which is defined as the first n columns of the product of n
+*> elementary reflectors
+*>
+*> Q = I - V*T*V**H = H(1) H(2) . . . H(n)
+*>
+*> Where V is an m by n matrix whose columns are householder reflectors
+*> as returned by ZGEQRF and T is the n by n matrix returned by ZLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V, and the order of T.
+*> N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is COMPLEX*16 array, dimension (LDQ,N)
+*> On entry, the upper triangular part and diagonal contains
+*> The array T as returned from ZLARFT. In addition, the
+*> strictly lower triangular portion of the i-th column contains
+*> the vector which defines the elementary reflector H(i),
+*> for i = 1,2,...,n, as returned by ZGEQRF
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+* Cost: (2mn**2 + n**2 - n)/2
+      SUBROUTINE ZUNGKR(M, N, Q, LDQ)
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee, --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER M, N, LDQ
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16 Q(LDQ,*)
+*     ..
+*
+*  =====================================================================
+*
+*     .. External Subroutines ..
+      EXTERNAL ZTRMM, ZTRTRM, ZLUMM
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC MIN
+*     ..
+*     .. Parameters ..
+      COMPLEX*16 NEG_ONE, ONE
+      PARAMETER(NEG_ONE=(-1.0D+0,0.0D+0), ONE=(1.0D+0,0.0D+0))
+*     ..
+*     .. Local Scalars ..
+      INTEGER I, J
+*     ..
+*     .. Executable Statements ..
+*
+*     Break Q apart as follows
+*
+*         |---|
+*     Q = | T |
+*         | V |
+*         |---|
+*
+*     Where T is an n-by-n upper triangular matrix, and V is an
+*     m-by-n assumed unit lower trapezoidal matrix
+*
+*     In turn, break apart V as follows
+*
+*         |-----|
+*     V = | V_1 |
+*         | V_2 |
+*         |-----|
+*
+*     Where:
+*
+*     V_1 \in \C^{n\times n} assumed unit lower triangular
+*     V_2 \in \C^{m-n\times n}
+*
+*     Compute T = T*V_1**H
+*
+      CALL ZTRTRM('Right', 'Upper', 'Conjugate Transpose',
+     $      'Non-unit', 'Unit', N, ONE, Q, LDQ, Q, LDQ)
+*
+*     Compute Q = -VT. This means that we need to break apart
+*     Our computation in two parts
+*
+*         |--------|
+*     Q = | -V_1*T |
+*         | -V_2*T |
+*         |--------|
+*
+*     Q_2 = -V_2*T (TRMM), skipped when M = N since V_2 is empty
+*
+      IF (M.GT.N) THEN
+         CALL ZTRMM('Right', 'Upper', 'No Transpose', 'Non-unit',
+     $         M-N, N, NEG_ONE, Q, LDQ, Q(N+1,1), LDQ)
+      END IF
+*
+*     Q_1 = -V_1*T (Lower-Upper Matrix-Matrix multiplication)
+*
+      CALL ZLUMM('Left', 'Unit', 'Non-Unit', N, NEG_ONE, Q, LDQ)
+*
+*     Q = "I" + Q (restore the identity contribution on the diagonal)
+*
+      J = MIN(M,N)
+      DO I = 1, J
+         Q(I,I) = Q(I,I) + ONE
+      END DO
+      END SUBROUTINE
diff --git a/SRC/zunglk.f b/SRC/zunglk.f
new file mode 100644
index 0000000000..b4f89ca2ec
--- /dev/null
+++ b/SRC/zunglk.f
@@ -0,0 +1,149 @@
+*> \brief \b ZUNGLK computes the explicit Q factor from ZGELQF and ZLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZUNGLK(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZUNGLK generates an m by n complex matrix Q with orthonormal rows,
+*> which is defined as the first m rows of the product of m
+*> elementary reflectors
+*>
+*> Q = I - V**H*T*V = H(1) H(2) . . . H(m)
+*>
+*> Where V is an m by n matrix whose rows are householder reflectors
+*> as returned by ZGELQF and T is the m by m matrix returned by ZLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V, and the order of T. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*> Q is COMPLEX*16 array, dimension (LDQ,N)
+*> On entry, the lower triangular part and diagonal contains
+*> The array T as returned from ZLARFT. In addition, the
+*> strictly upper triangular portion of the i-th row contains
+*> the vector which defines the elementary reflector H(i),
+*> for i = 1,2,...,m, as returned by ZGELQF
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+* =====================================================================
+      SUBROUTINE ZUNGLK(M, N, Q, LDQ)
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee, --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER M, N, LDQ
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16 Q(LDQ,*)
+*     ..
+*
+*  =====================================================================
+*
+*     .. External Subroutines ..
+      EXTERNAL ZTRMM, ZTRTRM, ZLUMM
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC MIN
+*     ..
+*     .. Parameters ..
+      COMPLEX*16 NEG_ONE, ONE
+      PARAMETER(NEG_ONE=(-1.0D+0,0.0D+0), ONE=(1.0D+0,0.0D+0))
+*     ..
+*     .. Local Scalars ..
+      INTEGER I, J
+*     ..
+*     .. Executable Statements ..
+*
+*     Break Q apart as follows
+*
+*         |-----|
+*     Q = | T V |
+*         |-----|
+*
+*     Where T is an m-by-m lower triangular matrix, and V is an
+*     m-by-n assumed unit upper trapezoidal matrix
+*
+*     In turn, break apart V as follows
+*
+*         |---------|
+*     V = | V_1 V_2 |
+*         |---------|
+*
+*     Where:
+*
+*     V_1 \in \C^{m\times m} assumed unit upper triangular
+*     V_2 \in \C^{m\times n-m}
+*
+*     Compute T = V_1**H * T
+*
+      CALL ZTRTRM('Left', 'Lower', 'Conjugate Transpose',
+     $      'Non-unit', 'Unit', M, ONE, Q, LDQ, Q, LDQ)
+*
+*     Compute Q = -TV. This means that we need to break apart
+*     Our computation in two parts
+*
+*         |---------------|
+*     Q = | -T*V_1 -T*V_2 |
+*         |---------------|
+*
+*     Q_2 = -T*V_2 (TRMM), skipped when N = M since V_2 is empty
+*
+      IF (N.GT.M) THEN
+         CALL ZTRMM('Left', 'Lower', 'No Transpose', 'Non-unit',
+     $         M, N-M, NEG_ONE, Q, LDQ, Q(1,M+1), LDQ)
+      END IF
+*
+*     Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication)
+*
+      CALL ZLUMM('Left', 'Non-unit', 'Unit', M, NEG_ONE, Q, LDQ)
+*
+*     Q = "I" + Q
+*
+      J = MIN(M,N)
+      DO I = 1, J
+         Q(I,I) = Q(I,I) + ONE
+      END DO
+      END SUBROUTINE
diff --git a/SRC/zunglq.f b/SRC/zunglq.f
index 3cc107560d..4da17af295 100644
--- a/SRC/zunglq.f
+++ b/SRC/zunglq.f
@@ -94,8 +94,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,M).
-*> For optimum performance LWORK >= M*NB, where NB is
-*> the optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -136,20 +134,17 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX*16 ZERO
- PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, LDWORK,
+ $ NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNGL2
+ EXTERNAL XERBLA, ZLARFB0C2, ZLARFT,
+ $ ZUNGL2, ZUNGLK
* ..
* .. Intrinsic Functions ..
- INTRINSIC MAX, MIN
+ INTRINSIC MAX
* ..
* .. External Functions ..
INTEGER ILAENV
@@ -161,7 +156,7 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'ZUNGLQ', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, M )*NB
+ LWKOPT = MAX( 1, M )
WORK( 1 ) = LWKOPT
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -189,95 +184,97 @@ SUBROUTINE ZUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
RETURN
END IF
*
- NBMIN = 2
- NX = 0
+ NBMIN = MAX( 2, ILAENV( 2, 'ZUNGLQ', ' ', M, N, K, -1 ) )
+ NX = MAX( 0, ILAENV( 3, 'ZUNGLQ', ' ', M, N, K, -1 ) )
IWS = M
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-* Determine when to cross over from blocked to unblocked code.
*
- NX = MAX( 0, ILAENV( 3, 'ZUNGLQ', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Treat the last NB block starting at KK+1 specially then use our blocking
+* method from the block starting at KI+1 to 1
*
- LDWORK = M
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked version
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'ZUNGLQ', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL ZUNGL2( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the last block.
-* The first kk rows are handled by the block method.
+* Factor the last block assuming that our first application
+* will be on the Identity matrix
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ I = KK + 1
+ IB = NB
*
-* Set A(kk+1:m,1:kk) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 20 J = 1, KK
- DO 10 I = KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL ZLARFT( 'Forward', 'Transpose', N-I+1, IB, A( I, I ),
+ $ LDA, TAU( I ), A( I, I ), LDA )
*
-* Use unblocked code for the last or only block.
+* Apply H to A(i+ib:m,i:n) from the right
*
- IF( KK.LT.M )
- $ CALL ZUNGL2( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ CALL ZLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Forward',
+ $ 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I+IB,I), LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to columns i:n of current block
+
+ CALL ZUNGLK( IB, N-I+1, A( I, I ), LDA)
*
* Use blocked code
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.M ) THEN
+ DO I = KI + 1, 1, -NB
+ IB = NB
+*
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
+*
+ CALL ZLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I),
+ $ LDA, TAU( I ), A( I, I ), LDA )
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+* Apply H to A(i+ib:m,i:n) from the right
*
- CALL ZLARFT( 'Forward', 'Rowwise', N-I+1, IB, A( I,
- $ I ),
- $ LDA, TAU( I ), WORK, LDWORK )
+ CALL ZLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I+IB,I), LDA)
*
-* Apply H**H to A(i+ib:m,i:n) from the right
+* Apply H to columns i:n of current block
*
- CALL ZLARFB( 'Right', 'Conjugate transpose',
- $ 'Forward',
- $ 'Rowwise', M-I-IB+1, N-I+1, IB, A( I, I ),
- $ LDA, WORK, LDWORK, A( I+IB, I ), LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL ZUNGLK( IB, N-I+1, A( I, I ), LDA)
+ END DO
*
-* Apply H**H to columns i:n of current block
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
*
- CALL ZUNGL2( IB, N-I+1, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
-* Set columns 1:i-1 of current block to zero
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 40 J = 1, I - 1
- DO 30 L = I, I + IB - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL ZLARFT( 'Forward', 'Transpose', N-I+1, IB, A(I,I),
+ $ LDA, TAU( I ), A( I, I ), LDA )
+*
+* Apply H to A(i+ib:m,i:n) from the right
+*
+ CALL ZLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Forward', 'Rowwise', M-I-IB+1, N-I+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I+IB,I), LDA)
+*
+* Apply H to columns i:n of current block
+*
+ CALL ZUNGLK( IB, N-I+1, A( I, I ), LDA)
+ END IF
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/zungql.f b/SRC/zungql.f
index a64de501d3..7f8200a148 100644
--- a/SRC/zungql.f
+++ b/SRC/zungql.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -136,18 +134,15 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
* ..
*
* =====================================================================
-*
-* .. Parameters ..
- COMPLEX*16 ZERO
- PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-* ..
+*
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KK, L, LDWORK, LWKOPT,
- $ NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KK, LWKOPT, NB, NBMIN,
+ $ NX
* ..
* .. External Subroutines ..
- EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNG2L
+ EXTERNAL XERBLA, ZLARFB0C2, ZLARFT,
+ $ ZUNG2L, ZUNGKL
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -177,7 +172,11 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
LWKOPT = 1
ELSE
NB = ILAENV( 1, 'ZUNGQL', ' ', M, N, K, -1 )
- LWKOPT = N*NB
+*
+* Only need a workspace for zung2l in case of bailout
+* and for the panel factorization
+*
+ LWKOPT = N
END IF
WORK( 1 ) = LWKOPT
*
@@ -200,88 +199,75 @@ SUBROUTINE ZUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+ NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ))
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-* Determine when to cross over from blocked to unblocked code.
*
- NX = MAX( 0, ILAENV( 3, 'ZUNGQL', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* We use blocked code for the entire construction
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KK = K
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Possibly bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQL', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL ZUNG2L( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the first block.
-* The last kk columns are handled by the block method.
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+ I = 1
+ IB = NB
*
-* Set A(m-kk+1:m,1:n-kk) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- DO 20 J = 1, N - KK
- DO 10 I = M - KK + 1, M
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA)
*
-* Use unblocked code for the first or only block.
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Exploit the fact that we are applying to an identity
*
- CALL ZUNG2L( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Backward',
+ $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB, A(1, N-K+I),
+ $ LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
-* Use blocked code
+ CALL ZUNGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA)
+
+* Use blocked code on the remaining blocks if there are any.
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- IF( N-K+I.GT.1 ) THEN
+ DO I = NB+1, K, NB
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* The last block may be less than size NB
*
- CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
- $ A( 1, N-K+I ), LDA, TAU( I ), WORK, LDWORK )
+ IB = MIN(NB, K-I+1)
*
-* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL ZLARFB( 'Left', 'No transpose', 'Backward',
- $ 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
- $ A( 1, N-K+I ), LDA, WORK, LDWORK, A, LDA,
- $ WORK( IB+1 ), LDWORK )
- END IF
+ CALL ZLARFT( 'Backward', 'Columnwise', M-K+I+IB-1, IB,
+ $ A( 1, N-K+I ), LDA, TAU( I ),
+ $ A( M-K+I, N-K+I ), LDA )
*
-* Apply H to rows 1:m-k+i+ib-1 of current block
+* Apply H to A(1:m-k+i+ib-1,1:n-k+i-1) from the left
*
- CALL ZUNG2L( M-K+I+IB-1, IB, IB, A( 1, N-K+I ), LDA,
- $ TAU( I ), WORK, IINFO )
+ CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Backward', 'Columnwise', M-K+I+IB-1, N-K+I-1, IB,
+ $ A(1, N-K+I), LDA, A( M-K+I, N-K+I ), LDA, A, LDA)
*
-* Set rows m-k+i+ib:m of current block to zero
+* Apply H to rows 1:m-k+i+ib-1 of current block
*
- DO 40 J = N - K + I, N - K + I + IB - 1
- DO 30 L = M - K + I + IB, M
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL ZUNGKL( M-K+I+IB-1, IB, A( 1, N-K+I ), LDA)
+ END DO
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/zungqr.f b/SRC/zungqr.f
index 6c9f2e3ff5..56022eeefa 100644
--- a/SRC/zungqr.f
+++ b/SRC/zungqr.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,N).
-*> For optimum performance LWORK >= N*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -137,20 +135,17 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX*16 ZERO
- PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, IINFO, IWS, J, KI, KK, L, LDWORK,
- $ LWKOPT, NB, NBMIN, NX
+ INTEGER I, IB, IINFO, IWS, KI, KK, LWKOPT, NB,
+ $ NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNG2R
+ EXTERNAL XERBLA, ZLARFB0C2, ZLARFT,
+ $ ZUNG2R, ZUNGKR
* ..
* .. Intrinsic Functions ..
- INTRINSIC MAX, MIN
+ INTRINSIC MAX
* ..
* .. External Functions ..
INTEGER ILAENV
@@ -162,7 +157,11 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
INFO = 0
NB = ILAENV( 1, 'ZUNGQR', ' ', M, N, K, -1 )
- LWKOPT = MAX( 1, N )*NB
+*
+* Only need a workspace for zung2r in case of bailout
+* and for the panel factorization
+*
+ LWKOPT = MAX( 1, N )
WORK( 1 ) = LWKOPT
LQUERY = ( LWORK.EQ.-1 )
IF( M.LT.0 ) THEN
@@ -191,92 +190,100 @@ SUBROUTINE ZUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
NBMIN = 2
- NX = 0
+* Determine when to cross over from unblocked to blocked
+ NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) )
IWS = N
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
*
-* Determine when to cross over from blocked to unblocked code.
-*
- NX = MAX( 0, ILAENV( 3, 'ZUNGQR', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Treat the last NB block starting at KK+1 specially then use our blocking
+* method from the block starting at KI+1 to 1
*
- LDWORK = N
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KI = K - 2 * NB
+ KK = K - NB
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked code.
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'ZUNGQR', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL ZUNG2R( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the last block.
-* The first kk columns are handled by the block method.
+* Factor the last block assuming that our first application
+* will be on the Identity matrix
*
- KI = ( ( K-NX-1 ) / NB )*NB
- KK = MIN( K, KI+NB )
+ I = KK + 1
+ IB = NB
*
-* Set A(1:kk,kk+1:n) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 20 J = KK + 1, N
- DO 10 I = 1, KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
-* Use unblocked code for the last or only block.
+* Apply H to A(i:m,i+ib:n) from the left
+* Exploit the fact that we are applying to an identity
*
- IF( KK.LT.N )
- $ CALL ZUNG2R( M-KK, N-KK, K-KK, A( KK+1, KK+1 ), LDA,
- $ TAU( KK+1 ), WORK, IINFO )
+ CALL ZLARFB0C2(.TRUE., 'Left', 'No Transpose', 'Forward',
+ $ 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I), LDA, A(I,I),
+ $ LDA, A(I,I+IB), LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to rows i:m of current block
*
-* Use blocked code
+ CALL ZUNGKR(M-I+1, IB, A(I,I), LDA)
*
- DO 50 I = KI + 1, 1, -NB
- IB = MIN( NB, K-I+1 )
- IF( I+IB.LE.N ) THEN
+* Use our standard blocking method after the last block
*
-* Form the triangular factor of the block reflector
-* H = H(i) H(i+1) . . . H(i+ib-1)
+ DO I = KI + 1, 1, -NB
+ IB = NB
*
- CALL ZLARFT( 'Forward', 'Columnwise', M-I+1, IB,
- $ A( I, I ), LDA, TAU( I ), WORK, LDWORK )
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
-* Apply H to A(i:m,i+ib:n) from the left
+ CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
*
- CALL ZLARFB( 'Left', 'No transpose', 'Forward',
- $ 'Columnwise', M-I+1, N-I-IB+1, IB,
- $ A( I, I ), LDA, WORK, LDWORK, A( I, I+IB ),
- $ LDA, WORK( IB+1 ), LDWORK )
- END IF
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
*
* Apply H to rows i:m of current block
*
- CALL ZUNG2R( M-I+1, IB, IB, A( I, I ), LDA, TAU( I ),
- $ WORK,
- $ IINFO )
+ CALL ZUNGKR(M-I+1, IB, A(I,I), LDA)
+ END DO
+*
+* This checks for if K was a perfect multiple of NB
+* so that we only have a special case for the last block when
+* necessary
+*
+ IF(I.LT.1) THEN
+ IB = I + NB - 1
+ I = 1
*
-* Set rows 1:i-1 of current block to zero
+* Form the triangular factor of the block reflector
+* H = H(i) H(i+1) . . . H(i+ib-1)
*
- DO 40 J = I, I + IB - 1
- DO 30 L = 1, I - 1
- A( L, J ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL ZLARFT('Forward', 'Column', M-I+1, IB, A(I,I),
+ $ LDA, TAU(I), A(I,I), LDA)
+*
+* Apply H to A(i:m,i+ib:n) from the left
+*
+ CALL ZLARFB0C2(.FALSE., 'Left', 'No Transpose',
+ $ 'Forward', 'Column', M-I+1, N-(I+IB)+1, IB, A(I,I),
+ $ LDA, A(I,I), LDA, A(I,I+IB), LDA)
+
+*
+* Apply H to rows i:m of current block
+*
+ CALL ZUNGKR(M-I+1, IB, A(I,I), LDA)
+ END IF
END IF
*
WORK( 1 ) = IWS
diff --git a/SRC/zungrk.f b/SRC/zungrk.f
new file mode 100644
index 0000000000..f73b8bb095
--- /dev/null
+++ b/SRC/zungrk.f
@@ -0,0 +1,169 @@
+*> \brief \b ZUNGRK computes the explicit Q factor from ZGERQF and ZLARFT
+*
+* =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+* http://www.netlib.org/lapack/explore-html/
+*
+* Definition:
+* ===========
+*
+* SUBROUTINE ZUNGRK(M, N, Q, LDQ)
+*
+* .. Scalar Arguments ..
+* INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+* COMPLEX*16 Q(LDQ,*)
+* ..
+*
+*> \par Purpose:
+* =============
+*>
+*> \verbatim
+*>
+*> ZUNGRK generates an m by n complex matrix Q with orthonormal rows,
+*> which is defined as the last m rows of the product of m
+*> elementary reflectors
+*>
+*> Q = I - V'*T*V = H(m) . . . H(2) H(1)
+*>
+*> Where V is an m by n matrix whose rows are householder reflectors
+*> as returned by ZGERQF and T is the m by m matrix returned by ZLARFT
+*> \endverbatim
+*
+* Arguments:
+* ==========
+*
+*> \param[in] M
+*> \verbatim
+*> M is INTEGER
+*> The number of rows of the matrix V, and the order of T.
+*> M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*> N is INTEGER
+*> The number of columns of the matrix V. N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] Q
+*> \verbatim
+*>          Q is COMPLEX*16 array, dimension (LDQ,N)
+*> On entry, Q(i,1:n-m-1+i) contains the vector which defines the
+*>          elementary reflector H(i), for i=1,...,m as returned by ZGERQF.
+*>          In addition, the upper triangular portion of the submatrix given
+*>          by Q(1:m,n-m+1:n) will contain the array T as returned by ZLARFT.
+*> See further details for more information.
+*> On exit, the m-by-n matrix Q.
+*> \endverbatim
+*
+* Authors:
+* ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \par Further Details:
+* =====================
+*>
+*> \verbatim
+*>
+*> The storage of the V and T components inside Q is best illustrated by
+*> the following example with m = 3, n = 5.
+*>
+*> Q = |----------------|
+*> | V1 V1 T1 T1 T1 |
+*> | V2 V2 V2 T2 T2 |
+*> | V3 V3 V3 V3 T3 |
+*> |----------------|
+*>
+*> \endverbatim
+*>
+* =====================================================================
+
+ SUBROUTINE ZUNGRK(M, N, Q, LDQ)
+*
+* -- LAPACK computational routine --
+* -- LAPACK is a software package provided by Univ. of Tennessee, --
+* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+* .. Scalar Arguments ..
+ INTEGER M, N, LDQ
+* ..
+* .. Array Arguments ..
+ COMPLEX*16 Q(LDQ,*)
+* ..
+*
+* =====================================================================
+*
+* .. Parameters ..
+ COMPLEX*16 NEG_ONE, ONE
+ PARAMETER(NEG_ONE=(-1.0D+0,0.0D+0), ONE=(1.0D+0,0.0D+0))
+* ..
+* .. Local Scalars ..
+ INTEGER I, J
+* ..
+* .. External Subroutines ..
+ EXTERNAL ZTRMM, ZTRTRM, ZLUMM
+* ..
+* .. Intrinsic Functions..
+ INTRINSIC MIN
+* ..
+* .. Executable Statements ..
+*
+* Break Q apart as follows
+*
+* |-----|
+* Q = | V T |
+* |-----|
+*
+* Where T is an m-by-m upper triangular matrix, and V is as described
+* in the Further Details section
+*
+* In turn, break apart V as follows
+*
+* |---------|
+* V = | V_2 V_1 |
+* |---------|
+*
+* Where:
+*
+* V_1 \in \R^{m\times m} assumed unit lower triangular
+* V_2 \in \R^{m\times n-m}
+*
+* Compute T = V_1'*T
+*
+ CALL ZTRTRM('Left', 'Upper', 'Conjugate Transpose',
+ $ 'Non-Unit', 'Unit', M, ONE, Q(1,N-M+1), LDQ, Q(1,N-M+1),
+     $     LDQ)
+*
+* Compute Q = -TV. This means that we need to break apart
+* Our computation in two parts
+*
+* |---------------|
+* Q = | -T*V_2 -T*V_1 |
+* |---------------|
+*
+* Q_2 = -T*V_2 (TRMM) but only when necessary
+*
+ IF (N.GT.M) THEN
+ CALL ZTRMM('Left', 'Upper', 'No Transpose', 'Non-Unit',
+ $ M, N-M, NEG_ONE, Q(1,N-M+1), LDQ, Q, LDQ)
+ END IF
+*
+* Q_1 = -T*V_1 (Lower-Upper Matrix-Matrix multiplication)
+*
+ CALL ZLUMM('Right', 'Unit', 'Non-Unit', M, NEG_ONE,
+ $ Q(1,N-M+1), LDQ)
+*
+* Q = "I" + Q
+*
+ J = MIN(M,N)
+ DO I = 1, J
+ Q(I,N-M+I) = Q(I,N-M+I) + ONE
+ END DO
+ END SUBROUTINE
diff --git a/SRC/zungrq.f b/SRC/zungrq.f
index dcc2772d67..46ccf3c4aa 100644
--- a/SRC/zungrq.f
+++ b/SRC/zungrq.f
@@ -95,8 +95,6 @@
*> \verbatim
*> LWORK is INTEGER
*> The dimension of the array WORK. LWORK >= max(1,M).
-*> For optimum performance LWORK >= M*NB, where NB is the
-*> optimal blocksize.
*>
*> If LWORK = -1, then a workspace query is assumed; the routine
*> only calculates the optimal size of the WORK array, returns
@@ -137,17 +135,14 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
*
* =====================================================================
*
-* .. Parameters ..
- COMPLEX*16 ZERO
- PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) )
-* ..
* .. Local Scalars ..
LOGICAL LQUERY
- INTEGER I, IB, II, IINFO, IWS, J, KK, L, LDWORK,
+      INTEGER            I, IB, II, IINFO, IWS, KK,
$ LWKOPT, NB, NBMIN, NX
* ..
* .. External Subroutines ..
- EXTERNAL XERBLA, ZLARFB, ZLARFT, ZUNGR2
+      EXTERNAL           XERBLA, ZLARFB0C2, ZLARFT,
+ $ ZUNGR2, ZUNGRK
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@@ -173,12 +168,8 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
END IF
*
IF( INFO.EQ.0 ) THEN
- IF( M.LE.0 ) THEN
- LWKOPT = 1
- ELSE
- NB = ILAENV( 1, 'ZUNGRQ', ' ', M, N, K, -1 )
- LWKOPT = M*NB
- END IF
+ LWKOPT = MAX(1, M)
+ NB = ILAENV( 1, 'ZUNGRQ', ' ', M, N, K, -1 )
WORK( 1 ) = LWKOPT
*
IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
@@ -199,92 +190,74 @@ SUBROUTINE ZUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
RETURN
END IF
*
- NBMIN = 2
- NX = 0
+ NBMIN = MAX( 2, ILAENV( 2, 'ZUNGRQ', ' ', M, N, K, -1 ) )
+ NX = MAX( 0, ILAENV( 3, 'ZUNGRQ', ' ', M, N, K, -1 ) )
IWS = M
- IF( NB.GT.1 .AND. NB.LT.K ) THEN
-*
-* Determine when to cross over from blocked to unblocked code.
*
- NX = MAX( 0, ILAENV( 3, 'ZUNGRQ', ' ', M, N, K, -1 ) )
- IF( NX.LT.K ) THEN
+ IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
*
-* Determine if workspace is large enough for blocked code.
+* Use blocked code after the first block.
+* The last kk rows are handled by the block method.
*
- LDWORK = M
- IWS = LDWORK*NB
- IF( LWORK.LT.IWS ) THEN
+ KK = K
+ ELSE
+ KK = 0
+ END IF
*
-* Not enough workspace to use optimal NB: reduce NB and
-* determine the minimum value of NB.
+* Potentially bail to the unblocked code
*
- NB = LWORK / LDWORK
- NBMIN = MAX( 2, ILAENV( 2, 'ZUNGRQ', ' ', M, N, K,
- $ -1 ) )
- END IF
- END IF
+ IF( KK.EQ.0 ) THEN
+ CALL ZUNGR2( M, N, K, A, LDA, TAU, WORK, IINFO )
END IF
*
- IF( NB.GE.NBMIN .AND. NB.LT.K .AND. NX.LT.K ) THEN
+ IF( KK.GT.0 ) THEN
*
-* Use blocked code after the first block.
-* The last kk rows are handled by the block method.
+* Factor the first block assuming that our first application
+* will be on the Identity matrix
*
- KK = MIN( K, ( ( K-NX+NB-1 ) / NB )*NB )
+ I = 1
+ IB = NB
+ II = M - K + I
*
-* Set A(1:m-kk,n-kk+1:n) to zero.
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- DO 20 J = N - KK + 1, N
- DO 10 I = 1, M - KK
- A( I, J ) = ZERO
- 10 CONTINUE
- 20 CONTINUE
- ELSE
- KK = 0
- END IF
+ CALL ZLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA )
*
-* Use unblocked code for the first or only block.
+* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- CALL ZUNGR2( M-KK, N-KK, K-KK, A, LDA, TAU, WORK, IINFO )
+ CALL ZLARFB0C2(.TRUE., 'Right', 'No Transpose', 'Backward',
+ $ 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1), LDA,
+ $ A( II, N-K+I ), LDA, A, LDA)
*
- IF( KK.GT.0 ) THEN
+* Apply H to columns 1:n-k+i+ib-1 of current block
*
-* Use blocked code
+ CALL ZUNGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA )
*
- DO 50 I = K - KK + 1, K, NB
- IB = MIN( NB, K-I+1 )
- II = M - K + I
- IF( II.GT.1 ) THEN
+ DO I = NB+1, K, NB
*
-* Form the triangular factor of the block reflector
-* H = H(i+ib-1) . . . H(i+1) H(i)
+* The last block may be less than size NB
*
- CALL ZLARFT( 'Backward', 'Rowwise', N-K+I+IB-1, IB,
- $ A( II, 1 ), LDA, TAU( I ), WORK, LDWORK )
+ IB = MIN(NB, K-I+1)
+ II = M - K + I
*
-* Apply H**H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
+* Form the triangular factor of the block reflector
+* H = H(i+ib-1) . . . H(i+1) H(i)
*
- CALL ZLARFB( 'Right', 'Conjugate transpose',
- $ 'Backward',
- $ 'Rowwise', II-1, N-K+I+IB-1, IB, A( II, 1 ),
- $ LDA, WORK, LDWORK, A, LDA, WORK( IB+1 ),
- $ LDWORK )
- END IF
+ CALL ZLARFT( 'Transpose', 'Rowwise', N-K+I+IB-1, IB,
+ $ A( II, 1 ), LDA, TAU( I ), A( II, N-K+I ), LDA )
*
-* Apply H**H to columns 1:n-k+i+ib-1 of current block
+* Apply H to A(1:m-k+i-1,1:n-k+i+ib-1) from the right
*
- CALL ZUNGR2( IB, N-K+I+IB-1, IB, A( II, 1 ), LDA,
- $ TAU( I ),
- $ WORK, IINFO )
+ CALL ZLARFB0C2(.FALSE., 'Right', 'No Transpose',
+ $ 'Backward', 'Rowwise', II-1, N-K+I+IB-1, IB, A(II,1),
+ $ LDA, A( II, N-K+I ), LDA, A, LDA)
*
-* Set columns n-k+i+ib:n of current block to zero
+* Apply H to columns 1:n-k+i+ib-1 of current block
*
- DO 40 L = N - K + I + IB, N
- DO 30 J = II, II + IB - 1
- A( J, L ) = ZERO
- 30 CONTINUE
- 40 CONTINUE
- 50 CONTINUE
+ CALL ZUNGRK( IB, N-K+I+IB-1, A( II, 1 ), LDA )
+ END DO
END IF
*
WORK( 1 ) = IWS