I think I figured it out, there was an error on my code and the loop of the remaining was actually computing almost all coordinates. Fixing this the runtimes are more reasonable. I am giving better names to the methods to make it more readable.
Also making the function parametrizable with the simd width (c_a_times_b_SIMD_handmade_parametrized ) makes the method faster than c_a_times_b_SIMD_handmade!.
I guess one could use metaprograming to, given the SIMD width and length of the array, unroll the loop from n_remaining:n_elements. Anyone know examples along those lines? or in general, examples of generated code to look at?
The updated MWE from below produces
Length divisible by SIMD width
c_a_times_b! Trial(313.750 μs)
c_a_times_b_SIMD! Trial(309.250 μs)
c_a_times_b_SIMD_handmade! Trial(314.667 μs)
c_a_times_b_SIMD_handmade_parametrized! Trial(300.458 μs)
Length NOT divisible by SIMD width
c_a_times_b! Trial(306.291 μs)
c_a_times_b_SIMD! Trial(307.542 μs)
c_a_times_b_SIMD_handmade! Trial(324.708 μs)
c_a_times_b_SIMD_handmade_parametrized! Trial(303.625 μs)
Updated MWE
using SIMD
using BenchmarkTools
using Random
Random.seed!(1234)
function c_a_times_b!(c::Array{T}, a::Array{T}, b::Array{T}) where T
@assert length(c) == length(a) == length(b)
for i in 1:length(c)
c[i] = a[i]*b[i]
end
end
function c_a_times_b_SIMD!(c::Array{T}, a::Array{T}, b::Array{T}) where T
@assert length(c) == length(a) == length(b)
@simd for i in 1:length(c)
c[i] = a[i]*b[i]
end
end
function c_a_times_b_SIMD_handmade!(c, a, b)
N = 4
T = eltype(c)
n_elements = length(a)
n_remaining = mod(n_elements, N)
n_first = n_elements - n_remaining
vtype = Vec{N,T}
@inbounds @fastmath for i in 1:N:n_first
a_chunk = vload(vtype, a, i)
b_chunk = vload(vtype, b, i)
a_chunk *= b_chunk
vstore(a_chunk,c,i)
end
@inbounds for i in (n_first+1):n_elements
c[i] = a[i]*b[i]
end
end
function c_a_times_b_SIMD_handmade_parametrized!(c, a, b, ::Val{W}) where {W}
T = eltype(c)
vtype = Vec{W, T}
len_a = length(a)
i_last = len_a - rem(len_a, W)
@inbounds @simd for i in 1:W:i_last
va = vload(vtype, a, i)
vb = vload(vtype, b, i)
vc = va * vb
vstore(vc, c, i)
end
@inbounds for i in (i_last + 1):len_a
c[i] = a[i] * b[i]
end
end
T = Float64
M = 1_000_000
a = Vector{T}(1:M)
b = Vector{T}(1:M)
c = zeros(T,M)
println("Length divisible by SIMD width")
println("c_a_times_b!\t", @benchmark c_a_times_b!($c, $a, $b))
println("c_a_times_b_SIMD!\t", @benchmark c_a_times_b_SIMD!($c, $a, $b))
println("c_a_times_b_SIMD_handmade!\t", @benchmark c_a_times_b_SIMD_handmade!($c, $a, $b))
println("c_a_times_b_SIMD_handmade_parametrized!\t", @benchmark c_a_times_b_SIMD_handmade_parametrized!($c, $a, $b, Val(8)))
a = Vector{T}(1:M+10)
b = Vector{T}(1:M+10)
c = zeros(T,M+10)
println("\nLength NOT divisible by SIMD width")
println("c_a_times_b!\t", @benchmark c_a_times_b!($c, $a, $b))
println("c_a_times_b_SIMD!\t", @benchmark c_a_times_b_SIMD!($c, $a, $b))
println("c_a_times_b_SIMD_handmade!\t", @benchmark c_a_times_b_SIMD_handmade!($c, $a, $b))
println("c_a_times_b_SIMD_handmade_parametrized!\t", @benchmark c_a_times_b_SIMD_handmade_parametrized!($c, $a, $b, Val(8)))