A friend helped me optimize this. The end result is the following:
using Memoization
using FFTW
using BenchmarkTools
# This structure is type unstable
struct Operators{F1<:Function, F2<:Function}
K̂::F1
T̂::F2
end # Simulation
function Operators(α, x_order, ω)
# Select function
function K̂(α)
fun = if α == 0
@memoize function K_cubic(dx::Real)
println("Computing and caching K(dx = $dx) for cubic NLSE")
ifftshift(cis.(dx*ω.^2/2))
end
K_cubic
elseif α > 0
@memoize function K_hirota(dx::Real)
println("Computing and caching K(dx = $dx) for Hirota Equation")
fftshift(cis.(dx*α*ω.^3))
end
K_hirota
end
return fun
end
# Select function 2
T̂ = T₂ˢ
if x_order == 2 && α == 0
T̂ = T₂ˢ
elseif x_order == 2 && α > 0
T̂ = T₂ˢʰ
#elseif ...
# many more functions to select from
end
ops = Operators(K̂(α), T̂)
return ops
end
function T₂ˢ(ψ, dx, ops)
# 3 allocs
ψ .= ops.K̂(dx) .* ψ
end #T₂ˢ
function run(N)
M = 512
x_order = 2
α = 0
dx = 1e-4
ψ = Array{Complex{Float64}}(undef, M, N)
ψ[:, 1] = rand(M) + im*rand(M)
ω = rand(M)
# Decide which functions to use
ops = Operators(α, x_order, ω)
# 3 allocs per loop iteration
myloop(N, ψ, dx, ops)
end
function myloop(N, ψ, dx, ops)
for i = 1:N-1
@time @views ψ[:, i+1] .= ops.T̂(ψ[:, i], dx, ops) # 0 allocs from calling ops.T̂
end
end
run(10)
Using let to define K significantly improves the allocations since the compiler doesn’t have to deal with it. We also use a function barrier in the form of myloop which decreases allocations since myloop already knows the type of ops. Currently, there are 3 allocations per loop iteration due to the lookups done by @memoize, I think.
Any other suggestions are still welcome however.