For element-wise SIMD operations, when an SVector doesn’t fill the whole length of a SIMD register, we could load the whole register width from memory anyway. For example, an SVector{3, Float32} would load junk as the last float in a <4 x float> (LLVM IR term), perform the operation, and then do a masked store, which would disregard the junk at the end of the SIMD register. I don’t know whether LLVM IR makes this feasible.
Looking at the LLVM IR, I noticed that the SVector version already starts out with much more complex code than the SIMD.Vec version:
julia> @code_llvm debuginfo=:none broadcast(min, a, b)
define void @julia_broadcast_20236([1 x [4 x float]]* noalias nocapture sret, [1 x [4 x float]] addrspace(11)* nocapture nonnull readonly dereferenceable(16), [1 x [4 x float]] addrspace(11)* nocapture nonnull readonly dereferenceable(16)) {
top:
%3 = bitcast [1 x [4 x float]] addrspace(11)* %1 to <4 x float> addrspace(11)*
%4 = load <4 x float>, <4 x float> addrspace(11)* %3, align 1
%5 = bitcast [1 x [4 x float]] addrspace(11)* %2 to <4 x float> addrspace(11)*
%6 = load <4 x float>, <4 x float> addrspace(11)* %5, align 1
%7 = fcmp olt <4 x float> %6, %4
%8 = bitcast <4 x float> %6 to <4 x i32>
%9 = bitcast <4 x float> %4 to <4 x i32>
%10 = icmp sgt <4 x i32> %9, <i32 -1, i32 -1, i32 -1, i32 -1>
%11 = icmp slt <4 x i32> %8, zeroinitializer
%12 = and <4 x i1> %10, %11
%13 = or <4 x i1> %7, %12
%14 = fcmp ord <4 x float> %4, zeroinitializer
%15 = select <4 x i1> %14, <4 x float> %6, <4 x float> %4
%16 = fcmp ord <4 x float> %6, zeroinitializer
%17 = select <4 x i1> %16, <4 x float> %4, <4 x float> %6
%18 = select <4 x i1> %13, <4 x float> %15, <4 x float> %17
%19 = bitcast [1 x [4 x float]]* %0 to <4 x float>*
store <4 x float> %18, <4 x float>* %19, align 4
ret void
}
vs
julia> @code_llvm debuginfo=:none min(c, d)
define void @julia_min_20071([1 x <4 x float>]* noalias nocapture sret, [1 x <4 x float>] addrspace(11)* nocapture nonnull readonly dereferenceable(16), [1 x <4 x float>] addrspace(11)* nocapture nonnull readonly dereferenceable(16)) {
top:
%3 = getelementptr inbounds [1 x <4 x float>], [1 x <4 x float>] addrspace(11)* %1, i64 0, i64 0
%4 = getelementptr inbounds [1 x <4 x float>], [1 x <4 x float>] addrspace(11)* %2, i64 0, i64 0
%5 = load <4 x float>, <4 x float> addrspace(11)* %3, align 16
%6 = load <4 x float>, <4 x float> addrspace(11)* %4, align 16
%res.i = call <4 x float> @llvm.minnum.v4f32(<4 x float> %5, <4 x float> %6)
%7 = getelementptr inbounds [1 x <4 x float>], [1 x <4 x float>]* %0, i64 0, i64 0
store <4 x float> %res.i, <4 x float>* %7, align 16
ret void
}
The extra work in the SVector code (ordered compares, sign-bit tests and selects) in lieu of a single call to llvm.minnum.v4f32 doesn’t pay off in a simple benchmark:
# Number of 4-lane vectors in each benchmark input.
const n = 100

# `a` and `b` are built from generators seeded with the same value, so both
# containers are filled from the same random stream; only the element type
# (SIMD.Vec vs. SVector) differs.
rng = MersenneTwister(314159)
a = map(1:n) do _
    SIMD.Vec{4,Float32}(rand.(rng, (Float32, Float32, Float32, Float32)))
end

# Re-seed before filling `b` so that it does not continue the stream used for `a`.
rng = MersenneTwister(314159)
b = map(1:n) do _
    SVector{4, Float32}(rand.(rng, (Float32, Float32, Float32, Float32)))
end
"""
    fsimd(a)

Reduce the collection `a` with `min`, starting from the fixed
`SIMD.Vec{4,Float32}` seed `(3, 1, 4, 1)`, and return the final accumulator.
Elements are combined strictly left to right as `min(acc, x)`; an empty `a`
yields the seed unchanged.
"""
function fsimd(a)
    seed = SIMD.Vec{4,Float32}((3, 1, 4, 1))
    # foldl applies min(acc, x) in iteration order, starting from `seed`.
    return foldl(min, a; init = seed)
end
"""
    fsvec(a)

Reduce the collection `a` with the broadcasted `min.`, starting from the fixed
`SVector{4, Float32}` seed `(3, 1, 4, 1)`, and return the final accumulator.
Elements are combined strictly left to right as `min.(acc, x)`; an empty `a`
yields the seed unchanged.
"""
function fsvec(a)
    seed = SVector{4, Float32}((3, 1, 4, 1))
    # Keep the broadcasted form `min.`: it is the operation being benchmarked.
    return foldl(a; init = seed) do acc, x
        min.(acc, x)
    end
end
julia> @benchmark fsimd(a)
minimum time: 209.007 ns (0.00% GC)
julia> @benchmark fsvec(b)
minimum time: 259.254 ns (0.00% GC)