Fast short Float32 vector

For element-wise SIMD operations, when an SVector doesn’t fill the whole SIMD register, we could load the full register width from memory anyway. For example, SVector{3, Float32} would load junk as the last float in <4 x float> (LLVM IR term), perform the operation, and then do a masked store, which discards the junk lane at the end of the SIMD register. I don’t know whether LLVM IR makes this feasible.

Looking at the generated LLVM IR, I noticed that even for four elements SVector already produces much more complex code than SIMD.Vec:

julia> @code_llvm debuginfo=:none broadcast(min, a, b)

define void @julia_broadcast_20236([1 x [4 x float]]* noalias nocapture sret, [1 x [4 x float]] addrspace(11)* nocapture nonnull readonly dereferenceable(16), [1 x [4 x float]] addrspace(11)* nocapture nonnull readonly dereferenceable(16)) {
top:
  %3 = bitcast [1 x [4 x float]] addrspace(11)* %1 to <4 x float> addrspace(11)*
  %4 = load <4 x float>, <4 x float> addrspace(11)* %3, align 1
  %5 = bitcast [1 x [4 x float]] addrspace(11)* %2 to <4 x float> addrspace(11)*
  %6 = load <4 x float>, <4 x float> addrspace(11)* %5, align 1
  %7 = fcmp olt <4 x float> %6, %4
  %8 = bitcast <4 x float> %6 to <4 x i32>
  %9 = bitcast <4 x float> %4 to <4 x i32>
  %10 = icmp sgt <4 x i32> %9, <i32 -1, i32 -1, i32 -1, i32 -1>
  %11 = icmp slt <4 x i32> %8, zeroinitializer
  %12 = and <4 x i1> %10, %11
  %13 = or <4 x i1> %7, %12
  %14 = fcmp ord <4 x float> %4, zeroinitializer
  %15 = select <4 x i1> %14, <4 x float> %6, <4 x float> %4
  %16 = fcmp ord <4 x float> %6, zeroinitializer
  %17 = select <4 x i1> %16, <4 x float> %4, <4 x float> %6
  %18 = select <4 x i1> %13, <4 x float> %15, <4 x float> %17
  %19 = bitcast [1 x [4 x float]]* %0 to <4 x float>*
  store <4 x float> %18, <4 x float>* %19, align 4
  ret void
}

vs

julia> @code_llvm debuginfo=:none min(c, d)

define void @julia_min_20071([1 x <4 x float>]* noalias nocapture sret, [1 x <4 x float>] addrspace(11)* nocapture nonnull readonly dereferenceable(16), [1 x <4 x float>] addrspace(11)* nocapture nonnull readonly dereferenceable(16)) {
top:
  %3 = getelementptr inbounds [1 x <4 x float>], [1 x <4 x float>] addrspace(11)* %1, i64 0, i64 0
  %4 = getelementptr inbounds [1 x <4 x float>], [1 x <4 x float>] addrspace(11)* %2, i64 0, i64 0
  %5 = load <4 x float>, <4 x float> addrspace(11)* %3, align 16
  %6 = load <4 x float>, <4 x float> addrspace(11)* %4, align 16
  %res.i = call <4 x float> @llvm.minnum.v4f32(<4 x float> %5, <4 x float> %6)
  %7 = getelementptr inbounds [1 x <4 x float>], [1 x <4 x float>]* %0, i64 0, i64 0
  store <4 x float> %res.i, <4 x float>* %7, align 16
  ret void
}

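The extra work in the SVector code (the compare/select sequence comes from Base.min, which propagates NaN and orders -0.0 before 0.0, whereas llvm.minnum.v4f32 returns the non-NaN operand) doesn’t pay off compared with just using llvm.minnum.v4f32: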

# Number of 4-element vectors in each benchmark input.
const n = 100

# Input for `fsimd`: `n` SIMD.Vec{4,Float32} values built from Float32 draws on a seeded RNG.
rng = MersenneTwister(314159)
a = [SIMD.Vec{4,Float32}((rand.(rng, (Float32, Float32, Float32, Float32)))) for _ in 1:n]
# Re-create the RNG with the same seed before building the SVector input;
# presumably so that `b` holds the same values as `a` (same seed, same draw pattern).
rng = MersenneTwister(314159)
b = [SVector{4, Float32}(rand.(rng, (Float32, Float32, Float32, Float32))) for _ in 1:n]

"""
    fsimd(a)

Reduce `a` with `min`, starting from `SIMD.Vec{4,Float32}((3, 1, 4, 1))`,
and return the final accumulator. Every element of `a` is combined with the
accumulator via `min(acc, v)`; for an empty `a` the starting value is returned.
"""
function fsimd(a)
    acc = SIMD.Vec{4,Float32}((3, 1, 4, 1))
    for v in a
        # Lane-wise minimum as provided by `min` for the element type of `a`.
        acc = min(acc, v)
    end
    acc
end

"""
    fsvec(a)

Reduce `a` with broadcast `min.`, starting from `SVector{4, Float32}((3, 1, 4, 1))`,
and return the final accumulator. Every element of `a` is combined with the
accumulator via `min.(acc, v)`; for an empty `a` the starting value is returned.
"""
function fsvec(a)
    acc = SVector{4, Float32}((3, 1, 4, 1))
    for v in a
        # Element-wise minimum through broadcasting of the scalar `min`.
        acc = min.(acc, v)
    end
    acc
end
julia> @benchmark fsimd(a)
minimum time:     209.007 ns (0.00% GC)

julia> @benchmark fsvec(b)
minimum time:     259.254 ns (0.00% GC)