The biggest problem is probably that `sum(1 ./ (ot[nn] .+ 0.001*im .- st))` allocates a fresh temporary array of length `length(st)` on every loop iteration. If you replace it with a generator, which accumulates the sum without ever materializing that array, it’s much faster:
begin
    ot = range(start=-3, length=3600*4, stop=3)  # lazy range — no allocation
    st = rand(10^4)
    sgf = zeros(ComplexF64, length(ot))

    """
        work!(sgf, st, ot)

    Fill `sgf[nn]` with `sum(1 / (ot[nn] + 0.001im - s) for s in st)` for every
    index `nn`, and return `sgf`. The generator form accumulates the sum
    without allocating a temporary array per iteration (the broadcast version
    `sum(1 ./ (ot[nn] .+ 0.001im .- st))` allocated 2.151 GiB in total).

    Throws `DimensionMismatch` if `sgf` and `ot` do not share axes.
    """
    function work!(sgf::AbstractVector, st::AbstractVector, ot::AbstractVector)
        # eachindex(sgf, ot) both iterates and enforces matching axes,
        # replacing a manual length check.
        for nn in eachindex(sgf, ot)
            sgf[nn] = sum(1 / (ot[nn] + 0.001im - s) for s in st)
        end
        return sgf  # convention: mutating f! returns the mutated argument
    end

    @time work!(sgf, st, ot); # 1.2 seconds, 7x faster
end
begin
# Baseline kept verbatim for the timing comparison above: the broadcast
# `1 ./ (... .- st)` materializes a length-10^4 temporary array on every
# one of the 14400 iterations.
ot=collect(range(start=-3,length=3600*4,stop=3));
st=rand(10^4) # 1000_000);
sgf=zeros(ComplexF64, length(ot));
# NOTE(review): this loop runs at top level over non-const globals, so each
# iteration also pays dynamic-dispatch overhead — part of why wrapping the
# loop in a function (as in work! above) helps.
@time for nn in 1:length(ot)
sgf[nn]=sum(1 ./ (ot[nn] + 0.001*im .- st));
end
end; # original code, 10^4 not 10^6, takes 8.8 seconds
Edit: a much faster way still is to let Tullio.jl generate the loops:
using Tullio
# Fill sgf in place via Tullio: an index (here `k`) that appears only on the
# right-hand side is implicitly summed over, and the remaining loop over `nn`
# is multi-threaded by the macro.
function work_t!(sgf::AbstractVector, st::AbstractVector, ot::AbstractVector)
@tullio sgf[nn] = 1 / (ot[nn] + 0.001*im - st[k])
end
# First call includes compilation of the generated kernel; second call is the
# steady-state timing.
@time work_t!(sgf, st, ot); # 0.21 seconds, first run
@time work_t!(sgf, st, ot); # 0.04 seconds, second run
Tullio multi-threads the outer loop, which accounts for roughly a factor of 4 here on a 4-core machine. It also applies `@fastmath` internally, which relaxes strict IEEE floating-point semantics — so the comparison might be considered slightly unfair.