When I tried to reproduce the result, I think I found another possible source of slowdown: the `Options` type.
With overall setup
# Dynamic-dispatch wrapper: fetch the i-th operator from options.binops and
# forward it to the op-specialized method, which acts as a function barrier.
# The cost of this call depends on how precisely the type of options.binops
# (and hence of `op`) is known to the compiler — that is what the three
# Options variants below are benchmarking.
function BINOP!(x::Array{Float32, 1}, y::Array{Float32, 1}, i::Int, clen::Int, options::Options)
op = options.binops[i]  # inferred type of `op` depends on the eltype of options.binops
BINOP!(op, x, y, clen, options)
end
# Kernel method: the `where F` parameter forces specialization on the concrete
# type of `op`, so when `op` is known at compile time `op(x[j], y[j])` becomes
# a direct (inlinable) call inside the @inbounds @simd loop.
# Overwrites x[1:clen] in place with op.(x[1:clen], y[1:clen]).
function BINOP!(op::F, x::Array{Float32, 1}, y::Array{Float32, 1}, clen::Int, options::Options) where F
@inbounds @simd for j=1:clen
x[j] = op(x[j], y[j])
end
end
"""
    BINOP2!(x, y, i, clen, options)

Hard-coded baseline for the benchmark: when `i == 1`, overwrite the first
`clen` entries of `x` with `x[j] + y[j]`. Any other `i` is a no-op.
The `options` argument is accepted for signature parity but unused.
"""
function BINOP2!(x, y, i, clen, options)
    i == 1 || return nothing
    @inbounds @simd for j in 1:clen
        x[j] = x[j] + y[j]
    end
end
# Benchmark fixtures: two random Float32 vectors of a common length.
clen = 100
x = rand(Float32, clen)
y = rand(Float32, clen)
julia> @btime BINOP2!($x, $y, 1, $clen, $options)
7.923 ns (0 allocations: 0 bytes)
I have tried three versions
1.
# Version 1: untyped field. `binops` is implicitly `::Any`, so the compiler
# knows nothing about the operator fetched from it — every lookup boxes and
# the call is fully dynamic (slowest version: 44 ns, 1 allocation below).
struct Options
binops
end
# Stored as a Vector of the three binary operators.
options = Options([+, -, *])
julia> @btime BINOP!($x, $y, 1, $clen, $options)
44.011 ns (1 allocation: 16 bytes)
# Version 2: parametric field, but constructed from a Vector, so T becomes
# Vector{Function}. `Function` is an abstract eltype, so the fetched operator
# is still not concretely typed and the call still allocates (31 ns below).
struct Options{T}
binops::T
end
options = Options([+, -, *])
julia> @btime BINOP!($x, $y, 1, $clen, $options)
31.602 ns (1 allocation: 16 bytes)
# Version 3: parametric field constructed from a Tuple, so T is the fully
# concrete Tuple{typeof(+), typeof(-), typeof(*)} — each operator's type is
# known statically, giving the fastest, allocation-free dispatch (23 ns below).
struct Options{T}
binops::T
end
options = Options((+, -, *))
julia> @btime BINOP!($x, $y, 1, $clen, $options)
23.229 ns (0 allocations: 0 bytes)
So, of all these versions, the fully typed tuple version of the operations is the fastest. I think the difference (23 − 8 = 15 ns)
is the price one has to pay to look up `op`
dynamically, and it cannot be improved further.
If this performance is not enough, then the approach closest to this issue is union-splitting: write a macro along the lines of ManualDispatch.jl, or use this snippet. But for that to work, all operations must be known at compile time.