@time let m = 10^7 + 19
    v = fill(0, m)
    w = copy(v)
    for _ in 1:100
        i = mod(rand(Int), m)
        v[1:i] .+= @view w[end-i+1:end]
        v[i+1:end] .+= @view w[1:end-i]
    end
end
import time
import numpy as np
import random

t0 = time.time()
m = 10000019
v = np.array([0]*m)
w = np.copy(v)
for _ in range(100):
    i = random.randint(0, m-1)
    if i > 0:
        v[:i] += w[-i:]
        v[i:] += w[:-i]
    else:
        v += w
print(time.time() - t0)
On my machine, the Julia version takes 4.39 seconds, and the Python version takes only 1.85 seconds.
Is there any way to improve the Julia code to match NumPy’s performance?
P.S. This came up while I was working on Project Euler Problem 655, where I learned a DP method involving array broadcasting.
Single-threaded, I’m getting Base < FastBroadcast.jl < NumPy < LoopVectorization.jl:
julia> @time let m = 10^7 + 19
           v = fill(0, m)
           w = copy(v)
           for _ in 1:100
               i = mod(rand(Int), m)
               @views @. v[1:i] += w[end-i+1:end]
               @views @. v[i+1:end] += w[1:end-i]
           end
       end
3.530138 seconds (4 allocations: 152.588 MiB, 0.82% gc time)
julia> using FastBroadcast
julia> @time let m = 10^7 + 19
           v = fill(0, m)
           w = copy(v)
           for _ in 1:100
               i = mod(rand(Int), m)
               @views @.. v[1:i] += w[end-i+1:end]
               @views @.. v[i+1:end] += w[1:end-i]
           end
       end
3.177666 seconds (4 allocations: 152.588 MiB, 0.91% gc time)
julia> using LoopVectorization
julia> @time let m = 10^7 + 19
           v = fill(0, m)
           w = copy(v)
           for _ in 1:100
               i = mod(rand(Int), m)
               @turbo view(v, 1:i) .+= view(w, m-i+1:m)
               @turbo view(v, i+1:m) .+= view(w, 1:m-i)
           end
       end
1.564778 seconds (4 allocations: 152.588 MiB)
Python:
In [3]: import time
   ...: import numpy as np
   ...: import random
   ...:
   ...: t0 = time.time()
   ...: m = 10000019
   ...: v = np.array([0]*m)
   ...: w = np.copy(v)
   ...: for _ in range(100):
   ...:     i = random.randint(0, m-1)
   ...:     if i > 0:
   ...:         v[:i] += w[-i:]
   ...:         v[i:] += w[:-i]
   ...:     else:
   ...:         v += w
   ...: print(time.time() - t0)
1.9152345657348633
FastBroadcast.jl and LoopVectorization.jl are easy to multithread:
julia> @time let m = 10^7 + 19
           v = fill(0, m)
           w = copy(v)
           for _ in 1:100
               i = mod(rand(Int), m)
               @views @.. thread=true v[1:i] += w[end-i+1:end]
               @views @.. thread=true v[i+1:end] += w[1:end-i]
           end
       end
0.361589 seconds (4 allocations: 152.588 MiB, 2.10% gc time)
julia> @time let m = 10^7 + 19
           v = fill(0, m)
           w = copy(v)
           for _ in 1:100
               i = mod(rand(Int), m)
               @tturbo view(v, 1:i) .+= view(w, m-i+1:m)
               @tturbo view(v, i+1:m) .+= view(w, 1:m-i)
           end
       end
0.372530 seconds (4 allocations: 152.588 MiB)
FastBroadcast and base broadcast are failing to SIMD here, but this code is so memory-bound when multithreaded that it doesn’t matter.
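If you want to check that yourself, one option is to inspect the generated code for a comparable kernel and look for vector types like <4 x i64> in the IR. A sketch; f! here is just an illustrative stand-in, not the exact benchmark kernel:

using InteractiveUtils  # provides @code_llvm (already loaded in the REPL)
f!(v, w) = @views v[1:end] .+= w[1:end]
@code_llvm debuginfo=:none f!(fill(0, 100), fill(1, 100))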
The problem, as it so often is, was with timing in the global scope.
julia> broadbench() = let m = 10^7 + 19
           v = fill(0, m)
           w = copy(v)
           for _ in 1:100
               i = mod(rand(Int), m)
               @views @. v[1:i] += w[end-i+1:end]
               @views @. v[i+1:end] += w[1:end-i]
           end
       end
broadbench (generic function with 1 method)
julia> fastbroadbench() = let m = 10^7 + 19
           v = fill(0, m)
           w = copy(v)
           for _ in 1:100
               i = mod(rand(Int), m)
               @views @.. v[1:i] += w[end-i+1:end]
               @views @.. v[i+1:end] += w[1:end-i]
           end
       end
fastbroadbench (generic function with 1 method)
julia> turbobroadbench() = let m = 10^7 + 19
           v = fill(0, m)
           w = copy(v)
           for _ in 1:100
               i = mod(rand(Int), m)
               @turbo view(v, 1:i) .+= view(w, m-i+1:m)
               @turbo view(v, i+1:m) .+= view(w, 1:m-i)
           end
       end
turbobroadbench (generic function with 1 method)
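With the work inside functions, the usual pattern is to warm each one up once so compilation isn’t measured (timings omitted here since they’re machine-dependent):

broadbench();       # first call compiles the method
@time broadbench()  # subsequent calls measure only execution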
I’m not sure if this is a simpler way to look at it, but to quote the docs, “X .+= Y etcetera is equivalent to X .= X .+ Y”. So v[1:i] .+= @view w[end-i+1:end] is equivalent to v[1:i] .= v[1:i] .+ @view w[end-i+1:end], where the right-hand v[1:i] is a plain getindex and therefore allocates a copy. It’s a hazard of the .+= shorthand: its left side actually stands for duplicate subexpressions on both the left and right sides of the actual assignment.
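To make that concrete, here is a minimal sketch with illustrative small arrays:

v = collect(1:10)
w = collect(1:10)
i = 4

# v[1:i] .+= w[1:i] lowers to v[1:i] .= v[1:i] .+ w[1:i].
# The left-hand v[1:i] becomes a view (Base.dotview), so writes go
# straight into v, but the right-hand v[1:i] is an ordinary getindex,
# i.e. an allocated copy.
v[1:i] .= v[1:i] .+ w[1:i]

# Wrapping the statement in @views turns the right-hand slice into a
# view as well, eliminating the temporary:
@views v[1:i] .+= w[1:i]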
I am actually surprised here, because the code was in a local let block to begin with, and all the benchmarks are first-call @time. I can’t spot any globals, and the fact that it’s always 4 allocations and 152 MiB suggests to me there weren’t any type instabilities being fixed.
Perhaps it is more accurate to say the problem is that the top-level code isn’t compiled, and the overhead is high. The dispatches themselves often result in zero allocations, which fits the constant 4-allocation count. This version, by contrast, allocates a lot precisely because it compiles:
julia> @time let m = 10^7 + 19
           (m -> begin
               v = fill(0, m)
               w = copy(v)
               for _ in 1:100
                   i = mod(rand(Int), m)
                   @views @. v[1:i] += w[end-i+1:end]
                   @views @. v[i+1:end] += w[1:end-i]
               end
           end)(m)
       end
1.451741 seconds (369.68 k allocations: 177.840 MiB, 1.52% gc time, 6.56% compilation time)
Ah, that’s right, Julia compiles on the first call of each method. The let block may have no type instabilities, but it’s not going to benefit from the optimizing compiler. Makes sense: why bother optimizing something that can’t run repeatedly?
Note also that it’s a lot easier to just apply @views to the whole let block (or the whole function); there’s typically not much reason to apply it line-by-line.
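For example, the first benchmark could be written with a single @views in front of the whole block; a sketch, which should behave the same as the line-by-line version:

@time @views let m = 10^7 + 19
    v = fill(0, m)
    w = copy(v)
    for _ in 1:100
        i = mod(rand(Int), m)
        @. v[1:i] += w[end-i+1:end]  # every slice in the block is now a view
        @. v[i+1:end] += w[1:end-i]
    end
end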
The @view docs seem pretty comprehensive? @view for individual indexing subexpressions, @views to slap @view on every place in an expression that would be a getindex, and @view works on the left side of a broadcasted assignment (.=) even with updating operators (.+=).
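A tiny illustration of those three cases (the array names here are just for demonstration):

A = zeros(3, 4)
b = @view A[1, 2:end]               # a SubArray into A, no copy
@views A[1, 2:end] .+= A[2, 2:end]  # every slice in the expression is a view
@view(A[1, 2:end]) .+= 1            # @view on the target of an updating broadcast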
While we’re optimizing, you could try making m an Int by writing it out the same way you did in Python (and can we maybe have 1E7 or something be an integer in Julia 2.0?).