An extended version of Seif Sheblās excellent benchmark test
I tried the benchmark tests for N = 100, 200, 500, 1000, and 2000.
The results look different for each N.
Julia script:
# maxabstest.jl
using BenchmarkTools
N = isempty(ARGS) ? 500 : parse(Int, ARGS[1])
function max_abs(a, b)
m = zero(promote_type(eltype(a), eltype(b)))
for i in eachindex(a, b)
tmp = abs(a[i] - b[i])
tmp > m && (m = tmp)
end
m
end
a = randn(N, N)
b = randn(N, N)
@show VERSION
@show Threads.nthreads()
println()
@show N
print("simple for loop: ")
@btime max_abs($a, $b)
print("mapreduce absā- max: ")
@btime mapreduce(absā-, max, $a, $b)
print("maximum(abs, a - b): ")
@btime maximum(abs, $a - $b)
print("maximum(generator): ")
@btime maximum(abs(i - j) for (i, j) in zip($a, $b))
print("maximum splat(absā-):")
@btime maximum(Base.splat(absā-), zip($a, $b))
using Tullio
print("Tullio: ")
@btime @tullio (max) _ := abs($a[i] - $b[i])
print("Tullio (LoopVect.): ")
using LoopVectorization
@btime @tullio (max) _ := abs($a[i] - $b[i])
function max_abs_turbo(a, b)
m = zero(promote_type(eltype(a), eltype(b)))
@turbo for i in eachindex(a, b)
m = max(m, abs(a[i] - b[i]))
end
m
end
function max_abs_tturbo(a, b)
m = zero(promote_type(eltype(a), eltype(b)))
@tturbo for i in eachindex(a, b)
m = max(m, abs(a[i] - b[i]))
end
m
end
print("LoopVect. @turbo: ")
@btime max_abs_turbo($a, $b)
print("LoopVect. @tturbo: ")
@btime max_abs_tturbo($a, $b)
Results for N = 100, 200, 500, 1000, 2000
$ julia maxabstest.jl 100
VERSION = v"1.6.1"
Threads.nthreads() = 12
N = 100
simple for loop: 8.733 Ī¼s (0 allocations: 0 bytes)
maximum(abs, a - b): 10.100 Ī¼s (2 allocations: 78.20 KiB)
maximum(generator): 18.100 Ī¼s (0 allocations: 0 bytes)
maximum splat(absā-): 18.100 Ī¼s (0 allocations: 0 bytes)
mapreduce absā- max: 11.300 Ī¼s (2 allocations: 78.20 KiB)
Tullio: 8.767 Ī¼s (1 allocation: 16 bytes)
Tullio (LoopVect.): 1.290 Ī¼s (1 allocation: 16 bytes)
LoopVect. @turbo: 1.240 Ī¼s (0 allocations: 0 bytes)
LoopVect. @tturbo: 722.689 ns (0 allocations: 0 bytes)
$ julia maxabstest.jl 200
VERSION = v"1.6.1"
Threads.nthreads() = 12
N = 200
simple for loop: 34.900 Ī¼s (0 allocations: 0 bytes)
maximum(abs, a - b): 34.500 Ī¼s (2 allocations: 312.58 KiB)
maximum(generator): 71.600 Ī¼s (0 allocations: 0 bytes)
maximum splat(absā-): 71.700 Ī¼s (0 allocations: 0 bytes)
mapreduce absā- max: 42.300 Ī¼s (2 allocations: 312.58 KiB)
Tullio: 34.900 Ī¼s (1 allocation: 16 bytes)
Tullio (LoopVect.): 8.400 Ī¼s (1 allocation: 16 bytes)
LoopVect. @turbo: 9.600 Ī¼s (0 allocations: 0 bytes)
LoopVect. @tturbo: 2.010 Ī¼s (0 allocations: 0 bytes)
$ julia maxabstest.jl 500
VERSION = v"1.6.1"
Threads.nthreads() = 12
N = 500
simple for loop: 213.400 Ī¼s (0 allocations: 0 bytes)
maximum(abs, a - b): 444.500 Ī¼s (2 allocations: 1.91 MiB)
maximum(generator): 447.000 Ī¼s (0 allocations: 0 bytes)
maximum splat(absā-): 447.100 Ī¼s (0 allocations: 0 bytes)
mapreduce absā- max: 454.600 Ī¼s (2 allocations: 1.91 MiB)
Tullio: 213.400 Ī¼s (1 allocation: 16 bytes)
Tullio (LoopVect.): 55.900 Ī¼s (1 allocation: 16 bytes)
LoopVect. @turbo: 56.100 Ī¼s (0 allocations: 0 bytes)
LoopVect. @tturbo: 12.700 Ī¼s (0 allocations: 0 bytes)
$ julia maxabstest.jl 1000
VERSION = v"1.6.1"
Threads.nthreads() = 12
N = 1000
simple for loop: 1.053 ms (0 allocations: 0 bytes)
mapreduce absā- max: 2.354 ms (2 allocations: 7.63 MiB)
maximum(abs, a - b): 2.240 ms (2 allocations: 7.63 MiB)
maximum(generator): 1.939 ms (0 allocations: 0 bytes)
maximum splat(absā-): 1.937 ms (0 allocations: 0 bytes)
Tullio: 263.000 Ī¼s (93 allocations: 4.73 KiB)
Tullio (LoopVect.): 257.600 Ī¼s (93 allocations: 4.73 KiB)
LoopVect. @turbo: 410.200 Ī¼s (0 allocations: 0 bytes)
LoopVect. @tturbo: 207.600 Ī¼s (0 allocations: 0 bytes)
$ julia maxabstest.jl 2000
VERSION = v"1.6.1"
Threads.nthreads() = 12
N = 2000
simple for loop: 4.193 ms (0 allocations: 0 bytes)
mapreduce absā- max: 10.075 ms (2 allocations: 30.52 MiB)
maximum(abs, a - b): 11.004 ms (2 allocations: 30.52 MiB)
maximum(generator): 7.903 ms (0 allocations: 0 bytes)
maximum splat(absā-): 7.896 ms (0 allocations: 0 bytes)
Tullio: 2.388 ms (199 allocations: 10.17 KiB)
Tullio (LoopVect.): 2.855 ms (200 allocations: 10.20 KiB)
LoopVect. @turbo: 2.540 ms (0 allocations: 0 bytes)
LoopVect. @tturbo: 2.275 ms (0 allocations: 0 bytes)