How to optimize computation within a vectorized list operation over a large array?

The biggest problem is probably that `sum(1 ./ (ot[nn] .+ 0.001*im .- st))` allocates a large temporary array at every step. If you replace it with a generator, it’s much faster:

begin
  ot = range(start=-3, length=3600*4, stop=3)  # lazy range, never materialized
  st = rand(10^4)
  sgf = zeros(ComplexF64, length(ot))

  # Summing a generator avoids building a 10^4-element temporary array on
  # every outer iteration, unlike the broadcast version (which allocated
  # 2.151 GiB in total). Putting the loop in a function lets it compile.
  function work!(sgf::AbstractVector, st::AbstractVector, ot::AbstractVector)
    length(sgf) == length(ot) || error("bad lengths")
    for idx in eachindex(ot)
      shifted = ot[idx] + 0.001*im  # loop-invariant w.r.t. the inner sum
      sgf[idx] = sum(1 / (shifted - s) for s in st)
    end
  end

  @time work!(sgf, st, ot);  # 1.2 seconds, 7x faster
end

begin
  # Baseline for comparison: materializes the range and allocates a fresh
  # temporary array on every iteration via the broadcast expression.
  ot = collect(range(start=-3, length=3600*4, stop=3));
  st = rand(10^4)  # 10^4 sources here, not the 10^6 of the original question
  sgf = zeros(ComplexF64, length(ot));
  @time for idx in eachindex(ot)
      sgf[idx] = sum(1 ./ (ot[idx] + 0.001*im .- st));
  end
end;  # original code at 10^4, takes 8.8 seconds

Edit: A much faster approach is this:

using Tullio
# Same computation via Tullio: the index `k` appears only on the right-hand
# side, so @tullio sums over it, writing the reduction into sgf[nn].
# NOTE(review): @tullio generates a multi-threaded loop and applies @fastmath
# internally (see remark below), so results may differ in the last bits.
function work_t!(sgf::AbstractVector, st::AbstractVector, ot::AbstractVector)
  @tullio sgf[nn] = 1 / (ot[nn] + 0.001*im - st[k])
end
@time work_t!(sgf, st, ot); # 0.21 seconds, first run (includes compilation)
@time work_t!(sgf, st, ot); # 0.04 seconds, second run

This version is multi-threaded, which accounts for roughly a factor of 4 here on a 4-core machine. It also applies @fastmath internally, which might be cheating.

1 Like