Note that it does add extra compiler latency, so it'd be your call as the author on whether the likelihood of alternate supported types is worth the trade-off.

But it should do better even in relatively simple cases:

```
using LoopVectorization, BenchmarkTools, Random
"""
    mydotsimd(A, B)

Return the sum of `A[i] * B[i]` over the indices produced by `eachindex(A, B)`.

The accumulator starts at `zero` of the promoted element type of `A` and `B`,
so an empty input yields that zero. The loop is annotated with
`@inbounds @simd` and serves as the plain-Julia baseline for the benchmarks.
"""
function mydotsimd(A, B)
    T = promote_type(eltype(A), eltype(B))
    acc = zero(T)
    # Indices come from `eachindex(A, B)`, which is what justifies `@inbounds`.
    @inbounds @simd for idx in eachindex(A, B)
        acc += A[idx] * B[idx]
    end
    return acc
end
"""
    mydotavx(A, B)

Return the sum of `A[i] * B[i]` over the indices produced by `eachindex(A, B)`,
with the loop handed to LoopVectorization's `@avx` macro.

The accumulator starts at `zero` of the promoted element type of `A` and `B`.
"""
function mydotavx(A, B)
    T = promote_type(eltype(A), eltype(B))
    acc = zero(T)
    # NOTE(review): newer LoopVectorization releases spell this macro `@turbo`;
    # confirm against the installed version before renaming.
    @avx for idx in eachindex(A, B)
        acc += A[idx] * B[idx]
    end
    return acc
end
# Benchmark inputs: two random vectors of length N, plus every prefix length
# 1:N visited in shuffled order.
N = 512;
x, y = rand(N), rand(N);
Ns = shuffle(1:N);
"""
    testsizes(f, x::AbstractVector, y::AbstractVector, Ns)

For each `n` in `Ns`, in order, call `f` on views of the first `n` elements of
`x` and `y`. The results of `f` are discarded and `nothing` is returned.
"""
function testsizes(f::F, x::AbstractVector, y::AbstractVector, Ns) where {F}
    for n in Ns
        head = 1:n
        # Views, not copies, so slicing does not allocate.
        f(view(x, head), view(y, head))
    end
    return nothing
end
# Time both kernels over the same shuffled prefix lengths. The `$` interpolation
# passes the globals into the benchmark as values.
@btime testsizes(mydotsimd, $x, $y, $Ns)
@btime testsizes(mydotavx, $x, $y, $Ns)
```

For random sized vectors, yields:

```
julia> @btime testsizes(mydotsimd, $x, $y, $Ns)
10.696 μs (0 allocations: 0 bytes)
julia> @btime testsizes(mydotavx, $x, $y, $Ns)
8.156 μs (0 allocations: 0 bytes)
```

Now, getting a bit more complicated:

```
# Benchmark inputs: two random M×M matrices, plus every leading-block size 1:M
# visited in shuffled order.
M = 32;
A, B = rand(M, M), rand(M, M);
Ms = shuffle(1:M);
"""
    testsizes(f, x::AbstractMatrix, y::AbstractMatrix, Ns)

For each `n` in `Ns`, in order, call `f` on views of the leading `n`×`n` blocks
of `x` and `y`. The results of `f` are discarded and `nothing` is returned.
"""
function testsizes(f::F, x::AbstractMatrix, y::AbstractMatrix, Ns) where {F}
    for n in Ns
        corner = 1:n
        # Views, not copies, so slicing does not allocate.
        f(view(x, corner, corner), view(y, corner, corner))
    end
    return nothing
end
# Time both kernels on every combination of plain and adjoint (`'`) arguments.
# Neither argument wrapped:
@btime testsizes(mydotsimd, $A, $B, $Ms)
@btime testsizes(mydotavx, $A, $B, $Ms)
# Only the first argument is an adjoint:
@btime testsizes(mydotsimd, $A', $B, $Ms)
@btime testsizes(mydotavx, $A', $B, $Ms)
# Only the second argument is an adjoint:
@btime testsizes(mydotsimd, $A, $B', $Ms)
@btime testsizes(mydotavx, $A, $B', $Ms)
# Both arguments are adjoints:
@btime testsizes(mydotsimd, $A', $B', $Ms)
@btime testsizes(mydotavx, $A', $B', $Ms)
```

And now we get:

```
julia> @btime testsizes(mydotsimd, $A, $B, $Ms)
9.208 μs (0 allocations: 0 bytes)
julia> @btime testsizes(mydotavx, $A, $B, $Ms)
1.056 μs (0 allocations: 0 bytes)
julia> @btime testsizes(mydotsimd, $A', $B, $Ms)
9.630 μs (0 allocations: 0 bytes)
julia> @btime testsizes(mydotavx, $A', $B, $Ms)
2.344 μs (0 allocations: 0 bytes)
julia> @btime testsizes(mydotsimd, $A, $B', $Ms)
9.575 μs (0 allocations: 0 bytes)
julia> @btime testsizes(mydotavx, $A, $B', $Ms)
2.329 μs (0 allocations: 0 bytes)
julia> @btime testsizes(mydotsimd, $A', $B', $Ms)
9.628 μs (0 allocations: 0 bytes)
julia> @btime testsizes(mydotavx, $A', $B', $Ms)
1.035 μs (0 allocations: 0 bytes)
```

Also helpful to look at specifically the 32x32 case, which is ideal for LLVM:

```
julia> @btime mydotsimd($A, $B)
33.207 ns (0 allocations: 0 bytes)
255.8607178195125
julia> @btime mydotavx($A, $B)
34.313 ns (0 allocations: 0 bytes)
255.8607178195125
julia> @btime mydotsimd($A', $B)
852.364 ns (0 allocations: 0 bytes)
257.12741470722705
julia> @btime mydotavx($A', $B)
157.423 ns (0 allocations: 0 bytes)
257.12741470722733
julia> @btime mydotsimd($A, $B')
847.691 ns (0 allocations: 0 bytes)
257.12741470722705
julia> @btime mydotavx($A, $B')
148.625 ns (0 allocations: 0 bytes)
257.12741470722733
julia> @btime mydotsimd($A', $B')
853.938 ns (0 allocations: 0 bytes)
255.86071781951276
julia> @btime mydotavx($A', $B')
51.592 ns (0 allocations: 0 bytes)
255.86071781951247
```

`mydotavx(A', B')` should be just as fast as `mydotavx(A, B)`, but that doesn't seem to be the case.

I'm currently working on rewriting the library, and will try to address this then.

I'm currently working on adding 1.6 support to the old version of the library, and will hopefully have that done reasonably soon. The rewrite will probably take a few months.