In this case, using a `for` loop is interesting in itself, since it allows:
- fusing the two loops, and
- avoiding the materialization of the intermediate vector `P`.

For example:
"""
    reweight_for(A, w, ω)

Compute the weighted average of `A` with weights `exp.(-ω .* w)`,
accumulating numerator and denominator in a single fused loop so no
intermediate weight vector is ever materialized.
"""
function reweight_for(A::AbstractArray, w::AbstractArray, ω::Real)
    # Accumulators typed from eltype(A) to keep the loop type-stable.
    num = zero(eltype(A))
    den = zero(eltype(A))
    @inbounds for idx in eachindex(A)
        weight = exp(-ω * w[idx])
        num += A[idx] * weight
        den += weight
    end
    return num / den
end
julia> N = 100_000_000;
julia> A = rand(N);
julia> w = rand(N);
julia> ω = rand();
julia> @test reweight(A, w, ω) ≈ reweight_for(A, w, ω)
Test Passed
julia> @btime reweight($A, $w, $ω);
949.582 ms (4 allocations: 1.49 GiB)
julia> @btime reweight_for($A, $w, $ω);
782.361 ms (0 allocations: 0 bytes)
From there, it is more easily parallelized, although I don’t think your case features enough arithmetic intensity for it to be useful:
"""
    reweight_threads(A, w, ω)

Multithreaded version of `reweight_for`: computes the weighted average of
`A` with weights `exp.(-ω .* w)` by splitting the index range into one
chunk per thread, reducing each chunk in its own task, and combining the
per-task partial sums at the end.

NOTE: the original version accumulated into `s1[Threads.threadid()]`,
which is unsound on modern Julia — tasks can migrate between threads at
yield points, so `threadid()` is not a stable slot index, and adjacent
slots also suffer false sharing. Chunked per-task reduction avoids both
problems.
"""
function reweight_threads(A::AbstractArray, w::AbstractArray, ω::Real)
    # eachindex(A, w) also checks that A and w have compatible indices.
    idx = eachindex(A, w)
    nchunks = max(Threads.nthreads(), 1)
    chunks = Iterators.partition(idx, cld(length(idx), nchunks))
    # Each task reduces its own chunk into local accumulators and
    # returns a (numerator, denominator) pair — no shared mutable state.
    tasks = map(chunks) do chunk
        Threads.@spawn begin
            s1 = zero(eltype(A))
            s2 = zero(eltype(A))
            @inbounds for i in chunk
                p = exp(-ω * w[i])
                s1 += A[i] * p
                s2 += p
            end
            (s1, s2)
        end
    end
    parts = fetch.(tasks)
    return sum(first, parts) / sum(last, parts)
end
julia> @test reweight(A, w, ω) ≈ reweight_threads(A, w, ω)
Test Passed
julia> Threads.nthreads()
4
julia> @btime reweight_threads($A, $w, $ω);
1.700 s (3 allocations: 288 bytes)
There might be some gain to be had from vectorization, though:
"""
    reweight_simd(A, w, ω)

Weighted average of `A` with weights `exp.(-ω .* w)`, computed in a
single pass annotated with `@simd` so the compiler may vectorize and
reassociate the two reductions.
"""
function reweight_simd(A::AbstractArray, w::AbstractArray, ω::Real)
    acc_num = zero(eltype(A))
    acc_den = zero(eltype(A))
    @inbounds @simd for k in eachindex(A)
        f = exp(-ω * w[k])
        acc_num += A[k] * f
        acc_den += f
    end
    return acc_num / acc_den
end
julia> @test reweight(A, w, ω) ≈ reweight_simd(A, w, ω)
Test Passed
julia> @btime reweight_simd($A, $w, $ω)
672.069 ms (0 allocations: 0 bytes)