How to make the most of SIMD.jl when number of data elements is not divisible by SIMD width

I think I figured it out, there was an error on my code and the loop of the remaining was actually computing almost all coordinates. Fixing this the runtimes are more reasonable. I am giving better names to the methods to make it more readable.

Also making the function parametrizable with the simd width (c_a_times_b_SIMD_handmade_parametrized ) makes the method faster than c_a_times_b_SIMD_handmade!.

I guess one could use metaprograming to, given the SIMD width and length of the array, unroll the loop from n_remaining:n_elements. Anyone know examples along those lines? or in general, examples of generated code to look at?

The updated MWE from below produces

Length divisible by SIMD width
c_a_times_b!	Trial(313.750 μs)
c_a_times_b_SIMD!	Trial(309.250 μs)
c_a_times_b_SIMD_handmade!	Trial(314.667 μs)
c_a_times_b_SIMD_handmade_parametrized!	Trial(300.458 μs)

Length NOT divisible by SIMD width
c_a_times_b!	Trial(306.291 μs)
c_a_times_b_SIMD!	Trial(307.542 μs)
c_a_times_b_SIMD_handmade!	Trial(324.708 μs)
c_a_times_b_SIMD_handmade_parametrized!	Trial(303.625 μs)

Updated MWE


using SIMD
using BenchmarkTools
using Random
Random.seed!(1234)

function c_a_times_b!(c::Array{T}, a::Array{T}, b::Array{T}) where T
    @assert length(c) == length(a) == length(b)
    for i in 1:length(c)
        c[i] = a[i]*b[i]
    end
end

function c_a_times_b_SIMD!(c::Array{T}, a::Array{T}, b::Array{T}) where T
    @assert length(c) == length(a) == length(b)
    @simd for i in 1:length(c)
        c[i] = a[i]*b[i]
    end
end


function c_a_times_b_SIMD_handmade!(c, a, b)
    N = 4
    T = eltype(c)
    n_elements = length(a)
    n_remaining = mod(n_elements, N)
    n_first = n_elements - n_remaining
    vtype = Vec{N,T}
    
    @inbounds @fastmath for i in 1:N:n_first
        a_chunk = vload(vtype, a, i) 
        b_chunk = vload(vtype, b, i) 
        a_chunk *=  b_chunk
        vstore(a_chunk,c,i)
    end
    
    @inbounds for i in (n_first+1):n_elements
        c[i] = a[i]*b[i]
    end
end

function c_a_times_b_SIMD_handmade_parametrized!(c, a, b, ::Val{W}) where {W}
    T = eltype(c)
    vtype = Vec{W, T}
    len_a = length(a)
    i_last = len_a - rem(len_a, W)

    @inbounds @simd for i in 1:W:i_last
        va = vload(vtype, a, i)
        vb = vload(vtype, b, i)
        vc = va * vb
        vstore(vc, c, i)
    end

    @inbounds for i in (i_last + 1):len_a
        c[i] = a[i] * b[i]
    end
end


T = Float64
M = 1_000_000
a = Vector{T}(1:M)
b = Vector{T}(1:M)
c = zeros(T,M)
println("Length divisible by SIMD width")
println("c_a_times_b!\t", @benchmark c_a_times_b!($c, $a, $b))
println("c_a_times_b_SIMD!\t", @benchmark c_a_times_b_SIMD!($c, $a, $b))
println("c_a_times_b_SIMD_handmade!\t", @benchmark c_a_times_b_SIMD_handmade!($c, $a, $b))
println("c_a_times_b_SIMD_handmade_parametrized!\t", @benchmark c_a_times_b_SIMD_handmade_parametrized!($c, $a, $b, Val(8)))

a = Vector{T}(1:M+10)
b = Vector{T}(1:M+10)
c = zeros(T,M+10)

println("\nLength NOT divisible by SIMD width")
println("c_a_times_b!\t", @benchmark c_a_times_b!($c, $a, $b))
println("c_a_times_b_SIMD!\t", @benchmark c_a_times_b_SIMD!($c, $a, $b))
println("c_a_times_b_SIMD_handmade!\t", @benchmark c_a_times_b_SIMD_handmade!($c, $a, $b))
println("c_a_times_b_SIMD_handmade_parametrized!\t", @benchmark c_a_times_b_SIMD_handmade_parametrized!($c, $a, $b, Val(8)))