Here, I am comparing to MKL:
julia> using LinearAlgebra
julia> BLAS.vendor()
:mkl
julia> versioninfo()
Julia Version 0.7.0-beta.200
Commit 9277d3af2d (2018-07-07 19:47 UTC)
Platform Info:
OS: Linux (x86_64-redhat-linux)
CPU: Intel(R) Core(TM) i9-7900X CPU @ 3.30GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.0 (ORCJIT, skylake)
Currently, SIMD.jl is broken on master: https://github.com/JuliaLang/julia/issues/27694
So pardon this work around:
struct Vec{N,T} <: AbstractVector{T}
# data::NTuple{N,Core.VecElement{T}}
data::NTuple{N,T}
Vec(data::Vararg{T,N}) where {N,T} = new{N,T}(data)
end
@inline Base.getindex(x::Vec, i) = x.data[i]
@inline Base.length(::Vec{N}) where N = N
@inline Base.size(::Vec{N}) where N = (N,)
@inline Base.eltype(::Vec{N,T}) where {N,T} = T
@inline function vload(::Type{Vec{N,T}}, x::AbstractArray{T}, i) where {N,T}
unsafe_load(Base.unsafe_convert(Ptr{Vec{N,T}}, pointer(x)) + sizeof(T)*i)
end
@inline function vstore!(x::AbstractArray{T}, v::Vec{N,T}, i) where {N,T}
unsafe_store!(Base.unsafe_convert(Ptr{Vec{N,T}}, pointer(x)) + sizeof(T)*i, v)
end
function create_quote()
q = quote
@inbounds @fastmath begin
$(Expr(:meta, :inline))
Vec()
end
end
q, q.args[2].args[3].args[3].args[4].args
end
@generated function Base.fma(x::Vec{N,T}, y::T, z::Vec{N,T}) where {N,T}
q, qa = create_quote()
for n ∈ 1:N
push!(qa, :(x[$n] * y + z[$n]))
end
q
end
LLVM is good enough at autovectorizing that we don’t really need to be explicit anyway.
Here is a new version that uses the above functions; I probably should have just called it Syrk:
@generated function add_mggt_ut_only!(Ke, gradN, mult, ::Val{mdim}) where mdim
L = 8
quote
@assert size(Ke, 1) == size(Ke, 2)
Kedim = size(Ke, 1)
# nne, mdim = size(gradN)
V = Vec{$L,Float64}
SIMD_loop_max = Kedim ÷ $L * $L -1
@inbounds for c = 1:Kedim# ÷ $L * $L
Base.Cartesian.@nexprs $mdim m -> gradNc_m = gradN[c, m] * mult
for r = 0:$L:min(c-1,SIMD_loop_max)
Ktemp = vload(V, Ke, r + (c-1)*Kedim)
Base.Cartesian.@nexprs $mdim m -> (Ktemp = fma(vload(V, gradN, r+(m-1)*Kedim), gradNc_m, Ktemp))
vstore!(Ke, Ktemp, r + (c-1)*Kedim)
end
end
remainder = Kedim % $L
if remainder > 0
bound = Kedim-remainder+1
@inbounds for c = bound:Kedim
Base.Cartesian.@nexprs $mdim m -> gradNc_m = gradN[c, m] * mult
for r = bound:c
Ktemp = Ke[r,c]
Base.Cartesian.@nexprs $mdim m -> (Ktemp = fma(gradN[r,m], gradNc_m, Ktemp))
Ke[r,c] = Ktemp
end
end
end
return true
end
end
It’s kind of naive, and I wrote it assuming that mdim
is very small, For example, 3.
It also gives jagged results.
The upper triangle is complete, but there’s lots of stuff extending into the lower triangle. The numbers there aren’t wrong, I just didn’t bother continuing in the lower triangle any further beyond completing a vector.
It’s the version taking a Val
argument, for the number of columns in gradN
:
julia> using LinearAlgebra, BenchmarkTools
julia> mult = 1.4
1.4
julia> gradN = rand(64, 3);
julia> Ke = fill(0.0, size(gradN, 1), size(gradN, 1));
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult);
4.397 μs (0 allocations: 0 bytes)
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult, Val(3));
688.748 ns (0 allocations: 0 bytes)
julia> @btime BLAS.syrk!('U', 'N', $mult, $gradN, 0.0, $Ke);
1.425 μs (0 allocations: 0 bytes)
julia> @btime BLAS.gemm!('N', 'T', $mult, $gradN, $gradN, 0.0, $Ke);
713.936 ns (0 allocations: 0 bytes)
julia> gradN = rand(63, 3);
julia> Ke = fill(0.0, size(gradN, 1), size(gradN, 1));
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult);
4.262 μs (0 allocations: 0 bytes)
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult, Val(3));
780.188 ns (0 allocations: 0 bytes)
julia> @btime BLAS.syrk!('U', 'N', $mult, $gradN, 0.0, $Ke);
1.555 μs (0 allocations: 0 bytes)
julia> @btime BLAS.gemm!('N', 'T', $mult, $gradN, $gradN, 0.0, $Ke);
907.872 ns (0 allocations: 0 bytes)
julia> gradN = rand(65, 3);
julia> Ke = fill(0.0, size(gradN, 1), size(gradN, 1));
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult);
4.521 μs (0 allocations: 0 bytes)
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult, Val(3));
853.724 ns (0 allocations: 0 bytes)
julia> @btime BLAS.syrk!('U', 'N', $mult, $gradN, 0.0, $Ke);
1.679 μs (0 allocations: 0 bytes)
julia> @btime BLAS.gemm!('N', 'T', $mult, $gradN, $gradN, 0.0, $Ke);
1.094 μs (0 allocations: 0 bytes)
EDIT:
Comparing to OpenBLAS:
julia> mult = 1.4
1.4
julia> gradN = rand(64, 3);
julia> Ke = fill(0.0, size(gradN, 1), size(gradN, 1));
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult);
4.396 μs (0 allocations: 0 bytes)
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult, Val(3));
741.113 ns (0 allocations: 0 bytes)
julia> @btime BLAS.syrk!('U', 'N', $mult, $gradN, 0.0, $Ke);
2.922 μs (0 allocations: 0 bytes)
julia> @btime BLAS.gemm!('N', 'T', $mult, $gradN, $gradN, 0.0, $Ke);
2.778 μs (0 allocations: 0 bytes)
julia> gradN = rand(63, 3);
julia> Ke = fill(0.0, size(gradN, 1), size(gradN, 1));
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult);
4.255 μs (0 allocations: 0 bytes)
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult, Val(3));
745.946 ns (0 allocations: 0 bytes)
julia> @btime BLAS.syrk!('U', 'N', $mult, $gradN, 0.0, $Ke);
2.993 μs (0 allocations: 0 bytes)
julia> @btime BLAS.gemm!('N', 'T', $mult, $gradN, $gradN, 0.0, $Ke);
2.909 μs (0 allocations: 0 bytes)
julia> gradN = rand(65, 3);
julia> Ke = fill(0.0, size(gradN, 1), size(gradN, 1));
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult);
4.522 μs (0 allocations: 0 bytes)
julia> @btime add_mggt_ut_only!($Ke, $gradN, $mult, Val(3));
705.270 ns (0 allocations: 0 bytes)
julia> @btime BLAS.syrk!('U', 'N', $mult, $gradN, 0.0, $Ke);
3.076 μs (0 allocations: 0 bytes)
julia> @btime BLAS.gemm!('N', 'T', $mult, $gradN, $gradN, 0.0, $Ke);
2.996 μs (0 allocations: 0 bytes)