Mh, I thought the whole point of providing a minimal working example to demonstrate the problem was to isolate the identified issue in a more complex code. Obviously, this leads to information loss - such as which other parts might be slowing down the computation. Here is the entire objective function that I have to minimize:
function jumpResidual(x::T...) where {T<:Real}
#function jumpResidual(x::Vector,a::T) where {T}
global globlibs,globexp,ω,Ie,loc,params,evalfloat,evaldual
print("Allocation: ")
@time begin
# DeVectorization of fitting parameters
devectorizeParams!(params,x,loc)
#params.T = params.T*10
#params.X["N2"] = params.X["N2"]/1000
# Library parameters
available_species = unique(vcat([collect(keys(lib)) for lib in globlibs]...))
γ = convert(T,globlibs[1][available_species[1]].γ)
ω = globlibs[1][collect(keys(globlibs[1]))[1]].ω
ωres = convert(T,globlibs[1][available_species[1]].ωres) # wavenumber resolution used in library
num_pr = length(globlibs) # number of pumping regions
# get the temperature index
# if interpolation is enabled, get the two neighboring ones
if CARSOptions["Interpolate"]
num_T = 1
idx0 = findlower(globlibs[1][available_species[1]].T,params.T)
idx1 = idx0+1
idxs = [idx0 idx1]
else
num_T = 1
idxs = findnearest(globlibs[1][available_species[1]].T,params.T)
end
# PREALLOCATIONS (optimize with memoizing?)
χM = zeros(T,num_pr,length(ω),num_T)
χR = zeros(T,num_pr,length(ω),num_T)
χI = zeros(T,num_pr,length(ω),num_T)
I = zeros(T,length(ω),num_T)
# depending on the input type, some operations are implemented differently
# the automatic differentiation in ForwardDiff uses the Dual type (which is <:Real)
# but for function evaluations, the type may be Float64/Float32. This allows to use
# optimizations from LoopVectorization.jl such as @avxt or @tturbo which makes it much faster.
dual = occursin("Dual",string(typeof(I)))
#χNR = convert(T,0)
P = globlibs[1][available_species[1]].P
N = NA*273.15/params.T*(P/molvol); # number density based on temperature and pressure
end
print("NR Signal:")
@time begin
# add the correct non-resonant signal contributions as a dictionary to params
χNR = Dict(name => convert(T,0) for name in available_species)
for species in available_species
# find species in one of the libraries
if haskey(globlibs[1],species)
χNR[species]=globlibs[1][species].χNR
continue
elseif haskey(globlibs[2],species)
χNR[species]=globlibs[2][species].χNR
continue
else
error("Couldn't find χNR for species $(species). Aborting.")
end
end
# store in params struct
params.χNR = χNR
χB = params.χB/NA*molvol;
XB = 1-sum(values(params.X))
χNR = convert(T,N*(XB*χB + sum([params.X[species]*params.χNR[species] for species in available_species])))
end
# library access
print("Get Tinterp: ")
@time begin
T0 = globlibs[1][available_species[1]].T[idxs[1]]
T1 = globlibs[1][available_species[1]].T[idxs[2]]
Tx = params.T
w0 = (T1-Tx)/(T1-T0)
w1 = (Tx-T0)/(T1-T0)
end
#for i in 1:2
print("Library Access:")
@time for pr in 1:size(χM,1)
for species in available_species
if haskey(globlibs[pr],species)
X = convert(T,params.X[species]);
if dual
@. χM[pr,:] += (w0*globlibs[pr][species].χM[idxs[1],:]+w1*globlibs[pr][species].χM[idxs[2],:]) * X^2
@. χR[pr,:] += (w0*globlibs[pr][species].χR[idxs[1],:]+w1*globlibs[pr][species].χR[idxs[2],:]) * X
@. χI[pr,:] += (w0*globlibs[pr][species].χI[idxs[1],:]+w1*globlibs[pr][species].χI[idxs[2],:]) * X
else
@avxt for j in 1:size(χM,2); χM[pr,j] += (w0*globlibs[pr][species].χM[idxs[1],j]+w1*globlibs[pr][species].χM[idxs[2],j]) * X^2;end
@avxt for j in 1:size(χM,2); χR[pr,j] += (w0*globlibs[pr][species].χR[idxs[1],j]+w1*globlibs[pr][species].χR[idxs[2],j]) * X;end
@avxt for j in 1:size(χM,2); χI[pr,j] += (w0*globlibs[pr][species].χI[idxs[1],j]+w1*globlibs[pr][species].χI[idxs[2],j]) * X;end
end
end
end
end
#end
i=1
# convolution to final laser linewidths
print("Convolution: ")
@time for pr in 1:size(χM,1)
# convolve each pumping region with the "leftover" gaussian kernel
kern = gaussiankernelT(ωres,sqrt(params.LineWidth[pr]^2-γ^2));
χM[pr,:,i]=fastconv(χM[pr,:,i],kern,dual)
χI[pr,:,i]=fastconv(χI[pr,:,i],kern,dual)
χR[pr,:,i]=fastconv(χR[pr,:,i],kern,dual)
end
if dual
print("DUAL Summation: ")
evaldual += 1
@time if size(χM,1) == 1 # single pump case: reduced from dual pump case with 1 == 2
@. I[:,i] = 2*(χM[1,:,i] + 2*χNR*χR[1,:,i] + χNR^2) +
2*(χR[1,:,i]^2 + 2*χNR*χR[1,:,i] + χNR^2 + χI[1,:,i]^2)
else # dual pump case. divide amplitudes by 2? or is the non resonant background added twice and everything is fine?
@. I[:,i] = χM[1,:,i] + 2*χNR*χR[1,:,i] + χNR^2 +
χM[2,:,i] + 2*χNR*χR[2,:,i] + χNR^2 +
2*(χR[1,:,i]*χR[2,:,i] + χNR*χR[1,:,i] + χNR*χR[2,:,i] + χNR^2 + χI[1,:,i]*χI[2,:,i])
end
else
print("AVXT Summation: ")
evalfloat += 1
if size(χM,1) == 1 # single pump case: reduced from dual pump case with 1 == 2
@time @avxt for j in 1:size(I,1)
I[j,i] = 2*(χM[1,j,i] + 2*χNR*χR[1,j,i] + χNR^2) +
2*(χR[1,j,i]^2 + 2*χNR*χR[1,j,i] + χNR^2 + χI[1,j,i]^2)
end
else
@time @avxt for j in 1:size(I,1)
I[j,i] = χM[1,j,i] + 2*χNR*χR[1,j,i] + χNR^2 +
χM[2,j,i] + 2*χNR*χR[2,j,i] + χNR^2 +
2*(χR[1,j,i]*χR[2,j,i] + χNR*χR[1,j,i] + χNR*χR[2,j,i] + χNR^2 + χI[1,j,i]*χI[2,j,i])
end
end
end
#end
# if interpolation is used, do it now. better to just interpolate one final spectrum rather than up to 6.
#=
print("Temp. Interpolation: ")
@time if CARSOptions["Interpolate"]
T0 = globlibs[1][available_species[1]].T[idxs[1]]
T1 = globlibs[1][available_species[1]].T[idxs[2]]
Tx = params.T
# overwrite first element in I to save memory
I[:,1] = I[:,1] + (Tx-T0)/(T1-T0) * (I[:,2]-I[:,1])
end
#println([params.T, params.X["N2"]])
=#
# do the last convolution
print("Final convolution: ")
@time I[:,1] = fastconv(I[:,1],gaussiankernelT(ωres,convert(T,params.Instrumental)),dual)
print("Rest: ")
@time begin
@. I[:,1] = sqrt(I[:,1])
# interpolate on final grid
intf = linear_interpolation(ω, I[:,1])
Ie = intf[globexp[:,1]]
# normalize to sum
Ie = Ie ./ sum(Ie)
# calculate the residual
res = globexp[:,2] - Ie
val = sum(res.^2)
#println(params.T)
#println(val)
end
return val
end
This function is heavy work in progress and there are useless parts in it - I know that. These are artifacts from attempting to use different kinds of solvers, which may not need a gradient at all, such as BlackBoxOptim or a Genetic Algorithm.
The code is at the moment wrapped in begin...end
blocks to time them individually.
I do already treat Dual numbers and Floats differently (and yes, this can be done more elegantly by having two functions accepting different types, this was just to try it out and see if there is some benefit), which leads to some performance gain for the cases where I simply have to evaluate the function using Floats.
If I run this code, that is evaluating the function and getting the gradient with respect to 4 decision variables:
T = rand(300.:2000.)
X1 = rand(0.:0.001:1.)
X2 = rand(0.:0.001:1.)
L = rand(0.8:0.001:1.2)
println("####### FUNCEVAL #######")
@time jumpResidual(T,X1,X2,L)
println("####### GRADIENT #######")
@time ForwardDiff.gradient(y -> jumpResidual(y...), [T,X1,X2,L])
This is the output after running it multiple times to get rid of compilation time:
####### FUNCEVAL #######
Allocation: 0.002627 seconds (308 allocations: 1.643 MiB)
NR Signal: 0.000106 seconds (30 allocations: 1.328 KiB)
Get Tinterp: 0.000031 seconds (10 allocations: 480 bytes)
Library Access: 0.002455 seconds (231 allocations: 9.156 KiB)
Convolution: 0.006463 seconds (513 allocations: 6.154 MiB)
AVXT Summation: 0.000353 seconds (45 allocations: 1.688 KiB)
Final convolution: 0.001642 seconds (84 allocations: 1.129 MiB)
Rest: 0.005513 seconds (57 allocations: 1.943 MiB)
0.025435 seconds (1.50 k allocations: 10.895 MiB)
####### GRADIENT #######
Allocation: 0.006881 seconds (352 allocations: 8.054 MiB)
NR Signal: 0.000121 seconds (30 allocations: 2.234 KiB)
Get Tinterp: 0.000040 seconds (10 allocations: 608 bytes)
Library Access: 0.013246 seconds (188 allocations: 8.252 MiB)
Convolution: 0.304534 seconds (135 allocations: 20.833 MiB)
DUAL Summation: 0.006778 seconds (116 allocations: 11.453 MiB)
Final convolution: 0.061118 seconds (23 allocations: 3.518 MiB)
Rest: 0.005144 seconds (57 allocations: 7.131 MiB)
0.403787 seconds (1.13 k allocations: 59.259 MiB)
So yes, the gradient is the bottleneck - and within the gradient, it is the convolution, which is 50 times slower than for the single evaluation, because I have to fall back to direct convolution instead of using DSP.conv()/LoopVectorization/FFT because none of them can handle Dual Numbers.
To see how often the function is evaluated with Dual Number vs. Float64, I keep track of the number of calls with each input type during an optimization using the globals evaldual, evalfloat
and it turns out, that even though Ipopt returns the following:
Number of objective function evaluations = 107
Number of objective gradient evaluations = 27
Number of equality constraint evaluations = 0
Number of inequality constraint evaluations = 0
Number of equality constraint Jacobian evaluations = 0
Number of inequality constraint Jacobian evaluations = 0
Number of Lagrangian Hessian evaluations = 0
Total seconds in IPOPT = 21.346
EXIT: Optimal Solution Found.
println([evaldual evalfloat])
[84 85]
Meaning that every other call is using Dual Numbers as input type.
And again, it is not the gradient calculation itself, that slows me down. It is that I cannot use an efficient way to calculate the convolution of two Dual number vectors during the calculation of the gradient. I totally accept that there is a penalty on calculating the gradient - but the difference in runtime just arises from the aforementioned limitation of Dual Number convolution.
Providing an analytical gradient - well, I did think about it and for some of the parameters, it is actually not that hard. But for example T
, which is the temperature dependency of the spectra, this get’s really tricky. In part, this is due to the strategy of accessing a library of precomputed spectra for different molecules, which are all just f(T)
but have a vastly different physical model behind them and thus a very different gradient with respect to T. And the whole point of using this library is to get rid of the really computationally expensive stuff. Essentially, for every molecule we have to solve a complex Lorentzian on a much finer grid resolution for every transition in the area of interest. For CO2, this is in the order of 10,000 transitions, each on an array with 400,000 Elements. Depending on the model parameters, these individual transitions all depend on temperature - but not in the same way. So no, getting the gradient analytically is really not an option.