"""
    testThread00()

Baseline benchmark with the allocation and the hot loop in a single function.

Allocates a 1000-element `Int` vector, overwrites every slot `i` one million
times with `j + i` (only the value from the last `j` survives), and prints the
sum of the vector. Returns `nothing`, the result of `println`.
"""
function testThread00()
    slots = zeros(Int, 1000)
    # Comma form is the same nested iteration: `idx` outer, `step` inner.
    for idx in 1:length(slots), step in 1:1000000
        slots[idx] = step + idx
    end
    total = sum(slots)
    println(total)
end
"""
    testThreada0()

Same benchmark as the single-function variant, but with the hot loop moved
into the separate kernel `testThread_10`.

Allocates a 1000-element `Int` vector, lets `testThread_10` fill it in place,
and prints the sum. Returns `nothing`, the result of `println`.
"""
function testThreada0()
    slots = zeros(Int, 1000)
    testThread_10(slots)
    total = sum(slots)
    println(total)
end
"""
    testThread_10(a::Array{Int})

Loop kernel of the benchmark: overwrite every element of `a` in place.

Each element at linear index `i` is assigned `j + i` for `j` from 1 to
1000000, so it ends up holding `1000000 + i`. Returns `nothing`.
"""
function testThread_10(a::Array{Int})
    # Comma form is the same nested iteration: `idx` outer, `step` inner.
    for idx in 1:length(a), step in 1:1000000
        a[idx] = step + idx
    end
    return nothing
end
$ julia
_
_ _ _(_)_ | A fresh approach to technical computing
(_) | (_) (_) | Documentation: https://docs.julialang.org
_ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help.
| | | | | | |/ _` | |
| | |_| | | | (_| | | Version 0.7.0 (2018-08-08 06:46 UTC)
_/ |\__'_|_|_|\__'_| | Official http://julialang.org/ release
|__/ | x86_64-pc-linux-gnu
julia> include("testThreads.jl")
testThread_1 (generic function with 1 method)
julia> @timev testThread00()
1000500500
0.463337 seconds (69.65 k allocations: 3.470 MiB)
elapsed time (ns): 463337432
bytes allocated: 3638054
pool allocs: 69635
non-pool GC allocs:13
julia> @timev testThread00()
1000500500
0.370684 seconds (13 allocations: 8.297 KiB)
elapsed time (ns): 370683786
bytes allocated: 8496
pool allocs: 12
non-pool GC allocs:1
julia> @timev testThreada0()
1000500500
0.031791 seconds (28.52 k allocations: 1.440 MiB)
elapsed time (ns): 31790655
bytes allocated: 1509658
pool allocs: 28522
non-pool GC allocs:2
julia> @timev testThreada0()
1000500500
0.000027 seconds (13 allocations: 8.297 KiB)
elapsed time (ns): 27118
bytes allocated: 8496
pool allocs: 12
non-pool GC allocs:1
I know the documentation recommends putting core computations in separate functions, but it sounds like that advice has more to do with type matching. In my test there is no type uncertainty involved, so why is there such a huge difference in performance?
I think the second version manages to elide bounds checks, while the first does not. I don’t know why that is, but:
julia> function testThread00ib()
# Single-function variant of the benchmark with bounds checks skipped on the
# loop nest via @inbounds; it returns the sum instead of printing it.
a = zeros(Int, 1000)
@inbounds for i=1:length(a)
for j = 1:1000000
# Every pass overwrites a[i]; only the value from the last j remains.
a[i] = j+i
end
end
sum(a)
end
testThread00ib (generic function with 1 method)
julia> function testThread_a0ib()
# Two-function variant: allocates the vector, passes it to testThread_10ib
# (which writes its elements in place), and returns the sum of the vector.
a = zeros(Int, 1000)
testThread_10ib(a)
sum(a)
end
testThread_a0ib (generic function with 1 method)
julia> function testThread_10ib(a::Array{Int})
# Loop kernel used by testThread_a0ib: overwrites every element of `a` in
# place, with bounds checks skipped on the loop nest via @inbounds.
# The `for` loop is the last expression, so the call evaluates to `nothing`.
@inbounds for i=1:length(a)
for j = 1:1000000
# Every pass overwrites a[i]; only the value from the last j remains.
a[i] = j+i
end
end
end
testThread_10ib (generic function with 1 method)
julia> @btime testThread00ib()
946.156 ns (1 allocation: 7.94 KiB)
1000500500
julia> @btime testThread_a0ib()
947.273 ns (1 allocation: 7.94 KiB)
1000500500
Now they’re equally fast (and faster than either version before).
I was actually testing how much using threads improves performance. With these results, it appears threads are irrelevant; using multiple threads actually hurts. See the two additional tests:
# Variant of the benchmark that threads the INNER loop: for each slot `i`, the
# one million `j` iterations are distributed with `Threads.@threads`.
# NOTE(review): every `j` iteration writes the same element `a[i]`, so the
# threaded iterations are not independent. With more than one thread the value
# left in `a[i]` depends on which thread writes last, so the printed sum is not
# guaranteed to match the serial result.
function testThread()
# a = SharedArray{Int}(1000)
a = zeros(Int, 1000)
for i=1:length(a)
# Unsynchronized writes from all participating threads to the single a[i].
Threads.@threads for j = 1:1000000
a[i] = j+i
end
end
# Prints the sum of the vector; the function's value is that of println.
println(sum(a))
end
"""
    testThread1()

Variant of the benchmark that threads the OUTER loop.

Allocates a 1000-element `Int` vector and distributes the slots with
`Threads.@threads`; each outer iteration writes only its own slot `idx`, which
it overwrites one million times with `step + idx`. Prints the sum of the
vector and returns `nothing`, the result of `println`.
"""
function testThread1()
    # a = SharedArray{Int}(1000)
    slots = zeros(Int, 1000)
    Threads.@threads for idx in 1:length(slots)
        for step in 1:1000000
            slots[idx] = step + idx
        end
    end
    total = sum(slots)
    println(total)
end
$ export JULIA_NUM_THREADS=2
$ julia --check-bounds=no --math-mode=fast
_
_ _ _(_)_ | A fresh approach to technical computing
(_) | (_) (_) | Documentation: https://docs.julialang.org
_ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help.
| | | | | | |/ _` | |
| | |_| | | | (_| | | Version 0.7.0 (2018-08-08 06:46 UTC)
_/ |\__'_|_|_|\__'_| | Official http://julialang.org/ release
|__/ | x86_64-pc-linux-gnu
julia> Threads.nthreads()
2
julia> include("testThreads.jl")
testThread_1 (generic function with 1 method)
julia> @timev testThread00()
1000500500
0.072504 seconds (69.64 k allocations: 3.428 MiB)
elapsed time (ns): 72503629
bytes allocated: 3594534
pool allocs: 69635
non-pool GC allocs:10
julia> @timev testThread00()
1000500500
0.000039 seconds (13 allocations: 8.297 KiB)
elapsed time (ns): 38903
bytes allocated: 8496
pool allocs: 12
non-pool GC allocs:1
julia> @timev testThread()
999000500
0.047719 seconds (65.30 k allocations: 3.341 MiB)
elapsed time (ns): 47718671
bytes allocated: 3502871
pool allocs: 65288
non-pool GC allocs:15
julia> @timev testThread()
1000500500
0.000498 seconds (1.01 k allocations: 55.172 KiB)
elapsed time (ns): 498414
bytes allocated: 56496
pool allocs: 1012
non-pool GC allocs:1
julia> @timev testThread1()
1000500500
0.058109 seconds (61.24 k allocations: 3.134 MiB)
elapsed time (ns): 58108797
bytes allocated: 3286419
pool allocs: 61232
non-pool GC allocs:9
julia> @timev testThread1()
1000500500
0.011405 seconds (14 allocations: 8.328 KiB)
elapsed time (ns): 11405298
bytes allocated: 8528
pool allocs: 13
non-pool GC allocs:1
It seems like in some cases the compiler is able to remove the inner loop, the one over j:
"""
    foo()

Benchmark case with a fixed inner trip count of one million.

Fills a 1000-element `Int` vector so that slot `i` is overwritten with `j + i`
for every `j` in `1:1_000_000`, then returns the sum of the vector.
"""
function foo()
    slots = zeros(Int, 1000)
    # Comma form is the same nested iteration: `idx` outer, `step` inner.
    for idx in 1:length(slots), step in 1:1_000_000
        slots[idx] = step + idx
    end
    return sum(slots)
end
"""
    foo2()

Benchmark case with a fixed inner trip count of one thousand.

Fills a 1000-element `Int` vector so that slot `i` is overwritten with `j + i`
for every `j` in `1:1_000`, then returns the sum of the vector.
"""
function foo2()
    slots = zeros(Int, 1000)
    # Comma form is the same nested iteration: `idx` outer, `step` inner.
    for idx in 1:length(slots), step in 1:1_000
        slots[idx] = step + idx
    end
    return sum(slots)
end
"""
    bar(N)

Benchmark case whose inner trip count `N` is a runtime argument.

Fills a 1000-element `Int` vector so that slot `i` is overwritten with `j + i`
for every `j` in `1:N`, then returns the sum of the vector. When `1:N` is
empty the vector stays all zeros.
"""
function bar(N)
    slots = zeros(Int, 1000)
    # Comma form is the same nested iteration: `idx` outer, `step` inner.
    for idx in 1:length(slots), step in 1:N
        slots[idx] = step + idx
    end
    return sum(slots)
end
The first N-1 iterations of the inner loop are irrelevant, so you can skip straight to the last one. I don’t get why this happens in one case but not the other.
So from that it seems like the optimisation is still there. 0.7 and 1.0 should also both have the same times, as the only difference there should be removed deprecations.
That is indeed the weirdest part…
I would guess macOS Mojave has something to do with it. What is it, though? Is there something more than the removal of deprecations in version 1.0?
I’ve tried removing 1.0 completely and doing a clean install, but the issue persists.
For me it is reproducible on two macOS Mojave (18A371a) installations.
I’ve checked on 1.0.1-pre.0 (commit 1dd2f8b397) and 1.1.0-DEV.126 (commit 5013cfa897)… No problem with performance there.
Never mind… let’s focus on the loop optimisation problem.