SciML · ChrisRackauckas · Aug 18, 2021 · Aug 17, 2021 · Aug 17, 2021 · Aug 17, 2021
diff --git a/README.md b/README.md
@@ -90,10 +90,10 @@ For the other keyword arguments, `m` determines the dimension of the Krylov subs
 ## `_exp!`
 
 ```julia
-_exp(A)
+_exp!(A)
 ```
 
-A pure Julia implementation of a non-allocating matrix exponential using the Destructive matrix exponential using algorithm 
+A pure Julia implementation of a non-allocating matrix exponential using the Destructive matrix exponential using algorithm
 from Higham, 2008. Mostly generic, though the coefficients are geared towards 64-bit floating point calculations, and the
 use of BLAS requires a `StridedMatrix`.
 
@@ -102,9 +102,9 @@ use of BLAS requires a `StridedMatrix`.
 exp(x, vk=Val{10}())
 ```
 
-A pure Julia generic implementation of the exponential function using the 
-[scaling and squaring method](https://doi.org/10.1137/04061101X), working on any `x` for which the functions 
-`LinearAlgebra.opnorm`, `+`, `*`, `^`, and `/` (including addition with UniformScaling objects) are defined. 
+A pure Julia generic implementation of the exponential function using the
+[scaling and squaring method](https://doi.org/10.1137/04061101X), working on any `x` for which the functions
+`LinearAlgebra.opnorm`, `+`, `*`, `^`, and `/` (including addition with UniformScaling objects) are defined.
 Use the argument `vk` to adjust the number of terms used in the Pade approximants at compile time.
 
 ## Advanced features

diff --git a/src/ExponentialUtilities.jl b/src/ExponentialUtilities.jl
@@ -14,6 +14,7 @@ macro diagview(A,d::Integer=0)
 end
 
 include("exp.jl")
+include("exp_noalloc.jl")
 include("phi.jl")
 include("arnoldi.jl")
 include("krylov_phiv.jl")
@@ -56,7 +57,7 @@ function __init__()
             lmul!(beta, mul!(w, @view(V[:, 1:m]), dexpHe)) # exp(A) ≈ norm(b) * V * exp(H)e
         end
     end
-    
+
     @require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
         function ExponentialUtilities.expv!(w::CUDA.CuVector{Tw},
                                     t::Real, Ks::KrylovSubspace{T, U};

diff --git a/src/exp.jl b/src/exp.jl
@@ -3,7 +3,7 @@
 ##
 ## Non-allocating version of `LinearAlgebra.exp!`. Modifies `A` to
 ## become (approximately) `exp(A)`.
-function _exp!(A::StridedMatrix{T}; caches=nothing) where T <: LinearAlgebra.BlasFloat
+function _baseexp!(A::StridedMatrix{T}; caches=nothing) where T <: LinearAlgebra.BlasFloat
     X = A
     n = LinearAlgebra.checksquare(A)
     # if ishermitian(A)
@@ -192,7 +192,7 @@ _const(A::Array) = Base.Experimental.Const(A)
                     end
                     n += $NU
                 end
-                $(NU > 1 ? nrem_quote : nothing)               
+                $(NU > 1 ? nrem_quote : nothing)
             end
         end
         C

diff --git a/src/exp_generated/exp_1.jl b/src/exp_generated/exp_1.jl
@@ -0,0 +1,50 @@
+using LinearAlgebra
+
+
+@inline function exp_gen!(cache,A,::Val{1})
+    T=promote_type(eltype(A),Float64) # Make it work for many 'bigger' types (matrices and scalars)
+    # max_memslots=4
+    n=size(A,1)
+    # The first slots are precomputed nodes [:A]
+    memslots2 = getmem(cache,2)
+    memslots3 = getmem(cache,3)
+    memslots4 = getmem(cache,4)
+    # Assign precomputed nodes memslots 
+    memslots1=A # overwrite A
+    # Uniform scaling is exploited.
+    # No matrix I explicitly allocated.
+    # Computation order: A2 Ua U V Z X P
+    # Computing A2 with operation: mult
+    mul!(memslots2,memslots1,memslots1)
+    # Computing Ua = x*I+x*A2
+    coeff1=60.0
+    coeff2=1.0
+    memslots3 .= coeff2.*memslots2
+    inplace_add!(memslots3,I*coeff1)
+    # Computing U with operation: mult
+    mul!(memslots4,memslots3,memslots1)
+    # Deallocating Ua in slot 3
+    # Deallocating A in slot 1
+    # Computing V = x*I+x*A2
+    coeff1=120.0
+    coeff2=12.0
+    # Smart lincomb recycle A2
+    memslots2 .= coeff2.*memslots2
+    inplace_add!(memslots2,I*coeff1)
+    # Computing Z = x*V+x*U
+    coeff1=1.0
+    coeff2=-1.0
+    memslots1 .= coeff1.*memslots2 .+ coeff2.*memslots4
+    # Computing X = x*V+x*U
+    coeff1=1.0
+    coeff2=1.0
+    # Smart lincomb recycle V
+    memslots2 .= coeff1.*memslots2 .+ coeff2.*memslots4
+    # Deallocating U in slot 4
+    # Computing P with operation: ldiv
+    ldiv_for_generated!(memslots3, memslots1, memslots2)
+    # Deallocating Z in slot 1
+    # Deallocating X in slot 2
+    copyto!(A,memslots3) # Returning P
+end
+
diff --git a/src/exp_generated/exp_10.jl b/src/exp_generated/exp_10.jl
@@ -0,0 +1,103 @@
+using LinearAlgebra
+
+
+@inline function exp_gen!(cache,A,::Val{10})
+    T=promote_type(eltype(A),Float64) # Make it work for many 'bigger' types (matrices and scalars)
+    # max_memslots=6
+    n=size(A,1)
+    # The first slots are precomputed nodes [:A]
+    memslots2 = getmem(cache,2)
+    memslots3 = getmem(cache,3)
+    memslots4 = getmem(cache,4)
+    memslots5 = getmem(cache,5)
+    memslots6 = getmem(cache,6)
+    # Assign precomputed nodes memslots 
+    memslots1=A # overwrite A
+    # Uniform scaling is exploited.
+    # No matrix I explicitly allocated.
+    # Computation order: C A2 A4 A6 Ub3 Ub Uc U Vb3 Vb V Z X P S1 S2 S3 S4 S5
+    # Computing C = x*A+x*I
+    coeff1=0.03125
+    coeff2=0.0
+    # Smart lincomb recycle A
+    memslots1 .= coeff1.*memslots1
+    inplace_add!(memslots1,I*coeff2)
+    # Computing A2 with operation: mult
+    mul!(memslots2,memslots1,memslots1)
+    # Computing A4 with operation: mult
+    mul!(memslots3,memslots2,memslots2)
+    # Computing A6 with operation: mult
+    mul!(memslots4,memslots2,memslots3)
+    # Computing Ub3 = x*A2+x*A4+x*A6
+    coeff1=4.08408e7
+    coeff2=16380.0
+    coeff3=1.0
+    memslots5 .= coeff1.*memslots2 .+ coeff2.*memslots3 .+ coeff3.*memslots4
+    # Computing Ub with operation: mult
+    mul!(memslots6,memslots5,memslots4)
+    # Deallocating Ub3 in slot 5
+    # Computing Uc = x*I+x*A2+x*A4+x*A6+x*Ub
+    coeff1=3.238237626624e16
+    coeff2=1.1873537964288e15
+    coeff3=1.05594705216e13
+    coeff4=3.352212864e10
+    coeff5=1.0
+    # Smart lincomb recycle Ub
+    memslots6 .= coeff2.*memslots2 .+ coeff3.*memslots3 .+ coeff4.*memslots4 .+ coeff5.*memslots6
+    inplace_add!(memslots6,I*coeff1)
+    # Computing U with operation: mult
+    mul!(memslots5,memslots1,memslots6)
+    # Deallocating C in slot 1
+    # Deallocating Uc in slot 6
+    # Computing Vb3 = x*A2+x*A4+x*A6
+    coeff1=1.32324192e9
+    coeff2=960960.0
+    coeff3=182.0
+    memslots1 .= coeff1.*memslots2 .+ coeff2.*memslots3 .+ coeff3.*memslots4
+    # Computing Vb with operation: mult
+    mul!(memslots6,memslots1,memslots4)
+    # Deallocating Vb3 in slot 1
+    # Computing V = x*I+x*A2+x*A4+x*A6+x*Vb
+    coeff1=6.476475253248e16
+    coeff2=7.7717703038976e15
+    coeff3=1.29060195264e14
+    coeff4=6.704425728e11
+    coeff5=1.0
+    # Smart lincomb recycle A2
+    memslots2 .= coeff2.*memslots2 .+ coeff3.*memslots3 .+ coeff4.*memslots4 .+ coeff5.*memslots6
+    inplace_add!(memslots2,I*coeff1)
+    # Deallocating A4 in slot 3
+    # Deallocating A6 in slot 4
+    # Deallocating Vb in slot 6
+    # Computing Z = x*V+x*U
+    coeff1=1.0
+    coeff2=-1.0
+    memslots1 .= coeff1.*memslots2 .+ coeff2.*memslots5
+    # Computing X = x*V+x*U
+    coeff1=1.0
+    coeff2=1.0
+    # Smart lincomb recycle V
+    memslots2 .= coeff1.*memslots2 .+ coeff2.*memslots5
+    # Deallocating U in slot 5
+    # Computing P with operation: ldiv
+    ldiv_for_generated!(memslots3, memslots1, memslots2)
+    # Deallocating Z in slot 1
+    # Deallocating X in slot 2
+    # Computing S1 with operation: mult
+    mul!(memslots1,memslots3,memslots3)
+    # Deallocating P in slot 3
+    # Computing S2 with operation: mult
+    mul!(memslots2,memslots1,memslots1)
+    # Deallocating S1 in slot 1
+    # Computing S3 with operation: mult
+    mul!(memslots1,memslots2,memslots2)
+    # Deallocating S2 in slot 2
+    # Computing S4 with operation: mult
+    mul!(memslots2,memslots1,memslots1)
+    # Deallocating S3 in slot 1
+    # Computing S5 with operation: mult
+    mul!(memslots1,memslots2,memslots2)
+    # Deallocating S4 in slot 2
+    copyto!(A,memslots1) # Returning S5
+end
+
diff --git a/src/exp_generated/exp_11.jl b/src/exp_generated/exp_11.jl
@@ -0,0 +1,106 @@
+using LinearAlgebra
+
+
+@inline function exp_gen!(cache,A,::Val{11})
+    T=promote_type(eltype(A),Float64) # Make it work for many 'bigger' types (matrices and scalars)
+    # max_memslots=6
+    n=size(A,1)
+    # The first slots are precomputed nodes [:A]
+    memslots2 = getmem(cache,2)
+    memslots3 = getmem(cache,3)
+    memslots4 = getmem(cache,4)
+    memslots5 = getmem(cache,5)
+    memslots6 = getmem(cache,6)
+    # Assign precomputed nodes memslots 
+    memslots1=A # overwrite A
+    # Uniform scaling is exploited.
+    # No matrix I explicitly allocated.
+    # Computation order: C A2 A4 A6 Ub3 Ub Uc U Vb3 Vb V Z X P S1 S2 S3 S4 S5 S6
+    # Computing C = x*A+x*I
+    coeff1=0.015625
+    coeff2=0.0
+    # Smart lincomb recycle A
+    memslots1 .= coeff1.*memslots1
+    inplace_add!(memslots1,I*coeff2)
+    # Computing A2 with operation: mult
+    mul!(memslots2,memslots1,memslots1)
+    # Computing A4 with operation: mult
+    mul!(memslots3,memslots2,memslots2)
+    # Computing A6 with operation: mult
+    mul!(memslots4,memslots2,memslots3)
+    # Computing Ub3 = x*A2+x*A4+x*A6
+    coeff1=4.08408e7
+    coeff2=16380.0
+    coeff3=1.0
+    memslots5 .= coeff1.*memslots2 .+ coeff2.*memslots3 .+ coeff3.*memslots4
+    # Computing Ub with operation: mult
+    mul!(memslots6,memslots5,memslots4)
+    # Deallocating Ub3 in slot 5
+    # Computing Uc = x*I+x*A2+x*A4+x*A6+x*Ub
+    coeff1=3.238237626624e16
+    coeff2=1.1873537964288e15
+    coeff3=1.05594705216e13
+    coeff4=3.352212864e10
+    coeff5=1.0
+    # Smart lincomb recycle Ub
+    memslots6 .= coeff2.*memslots2 .+ coeff3.*memslots3 .+ coeff4.*memslots4 .+ coeff5.*memslots6
+    inplace_add!(memslots6,I*coeff1)
+    # Computing U with operation: mult
+    mul!(memslots5,memslots1,memslots6)
+    # Deallocating C in slot 1
+    # Deallocating Uc in slot 6
+    # Computing Vb3 = x*A2+x*A4+x*A6
+    coeff1=1.32324192e9
+    coeff2=960960.0
+    coeff3=182.0
+    memslots1 .= coeff1.*memslots2 .+ coeff2.*memslots3 .+ coeff3.*memslots4
+    # Computing Vb with operation: mult
+    mul!(memslots6,memslots1,memslots4)
+    # Deallocating Vb3 in slot 1
+    # Computing V = x*I+x*A2+x*A4+x*A6+x*Vb
+    coeff1=6.476475253248e16
+    coeff2=7.7717703038976e15
+    coeff3=1.29060195264e14
+    coeff4=6.704425728e11
+    coeff5=1.0
+    # Smart lincomb recycle A2
+    memslots2 .= coeff2.*memslots2 .+ coeff3.*memslots3 .+ coeff4.*memslots4 .+ coeff5.*memslots6
+    inplace_add!(memslots2,I*coeff1)
+    # Deallocating A4 in slot 3
+    # Deallocating A6 in slot 4
+    # Deallocating Vb in slot 6
+    # Computing Z = x*V+x*U
+    coeff1=1.0
+    coeff2=-1.0
+    memslots1 .= coeff1.*memslots2 .+ coeff2.*memslots5
+    # Computing X = x*V+x*U
+    coeff1=1.0
+    coeff2=1.0
+    # Smart lincomb recycle V
+    memslots2 .= coeff1.*memslots2 .+ coeff2.*memslots5
+    # Deallocating U in slot 5
+    # Computing P with operation: ldiv
+    ldiv_for_generated!(memslots3, memslots1, memslots2)
+    # Deallocating Z in slot 1
+    # Deallocating X in slot 2
+    # Computing S1 with operation: mult
+    mul!(memslots1,memslots3,memslots3)
+    # Deallocating P in slot 3
+    # Computing S2 with operation: mult
+    mul!(memslots2,memslots1,memslots1)
+    # Deallocating S1 in slot 1
+    # Computing S3 with operation: mult
+    mul!(memslots1,memslots2,memslots2)
+    # Deallocating S2 in slot 2
+    # Computing S4 with operation: mult
+    mul!(memslots2,memslots1,memslots1)
+    # Deallocating S3 in slot 1
+    # Computing S5 with operation: mult
+    mul!(memslots1,memslots2,memslots2)
+    # Deallocating S4 in slot 2
+    # Computing S6 with operation: mult
+    mul!(memslots2,memslots1,memslots1)
+    # Deallocating S5 in slot 1
+    copyto!(A,memslots2) # Returning S6
+end
+