I’m working on a series of blog posts about trying to optimize matrix multiplication in Julia.
This is on:
julia> versioninfo()
Julia Version 1.0.0-DEV.0
Commit ce2aa22d47 (2018-08-02 23:11 UTC)
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: Intel(R) Core(TM) i3-4010U CPU @ 1.70GHz
WORD_SIZE: 64
LIBM: libimf
LLVM: libLLVM-6.0.0 (ORCJIT, haswell)
Environment:
JULIA_NUM_THREADS = 4
Consider this function:
using SIMD, StaticArrays, Base.Cartesian, LinearAlgebra
@generated function genkernel!(D::MMatrix{M,P,Float64},
A::MMatrix{M,N,Float64},
X::MMatrix{N,P,Float64}) where {M,N,P}
quote
pD, pA = pointer(D), pointer(A)
V = Vec{$M,Float64}
vA = vload(V, pA)
@inbounds begin
@nexprs $P p -> Dx_p = vA * V(X[1,p])
# for n ∈ 1:N-1
@nexprs $(N-1) n -> begin # Not better than a for loop!!!
vA = vload(V, pA + 64n) # 8 bytes/element * 8 elements/column
@nexprs $P p -> Dx_p = fma(vA, V(X[n+1,p]), Dx_p)
end
end
@nexprs $P p -> vstore(Dx_p, pD + 64p-64)
D
end
end
(It’s better to use the for loop “n ∈ 1:N” than “@nexprs $(N-1) n → begin”, but a for loop was a little complicated for me to get to compile correctly, because I had to switch the names of things like %rax and %rdx.)
This gives,
julia> D = @MMatrix randn(8,6);
julia> A = @MMatrix randn(8,16);
julia> X = @MMatrix randn(16,6);
julia> genkernel!(D, A, X)
8×6 MArray{Tuple{8,6},Float64,2,48}:
2.67373 3.78727 2.35039 0.520375 2.35468 3.25158
5.32136 1.59395 -6.38805 0.25491 4.62566 3.32113
-1.12869 -2.66418 -6.36596 2.30542 2.53305 3.01892
-2.1665 0.206013 -9.1473 -7.42947 0.693025 -4.36451
-2.19986 0.524508 9.07341 6.09982 -3.33561 1.10499
-4.90321 0.0642439 0.541991 -1.03617 -0.800158 -0.469398
-0.720618 8.21422 6.17857 1.12147 -2.51622 -1.84389
-1.09624 -10.2427 -1.39919 -1.55714 4.05241 4.56794
julia> mul!(D, A, X)
8×6 MArray{Tuple{8,6},Float64,2,48}:
2.67373 3.78727 2.35039 0.520375 2.35468 3.25158
5.32136 1.59395 -6.38805 0.25491 4.62566 3.32113
-1.12869 -2.66418 -6.36596 2.30542 2.53305 3.01892
-2.1665 0.206013 -9.1473 -7.42947 0.693025 -4.36451
-2.19986 0.524508 9.07341 6.09982 -3.33561 1.10499
-4.90321 0.0642439 0.541991 -1.03617 -0.800158 -0.469398
-0.720618 8.21422 6.17857 1.12147 -2.51622 -1.84389
-1.09624 -10.2427 -1.39919 -1.55714 4.05241 4.56794
julia> @benchmark genkernel!($D, $A, $X)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 87.963 ns (0.00% GC)
median time: 87.978 ns (0.00% GC)
mean time: 88.975 ns (0.00% GC)
maximum time: 191.660 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 960
Now, using @code_native genkernel!(D, A, X)
, copy and pasting the code into an editor and cleaning it up (deleting all the lines starting with ;
, renaming registers like rax
and rdx
to match the pattern of code compiled with gcc using -S
, and also copy and pasting the end – which is why the end says “GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0”, so I can get it to compile correctly):
.file "jmulkernel.s"
.text
.p2align 4,,15
.globl jmulkernel
.type jmulkernel, @function
jmulkernel:
.LFB0:
.cfi_startproc
vbroadcastsd (%rdx), %ymm1
vmovupd (%rsi), %ymm13
vmovupd 32(%rsi), %ymm12
vmovupd 64(%rsi), %ymm10
vmovupd 96(%rsi), %ymm11
vmulpd %ymm1, %ymm12, %ymm0
vmulpd %ymm1, %ymm13, %ymm1
vbroadcastsd 128(%rdx), %ymm3
vmulpd %ymm3, %ymm12, %ymm2
vmulpd %ymm3, %ymm13, %ymm3
vbroadcastsd 256(%rdx), %ymm5
vmulpd %ymm5, %ymm12, %ymm4
vmulpd %ymm5, %ymm13, %ymm5
vbroadcastsd 384(%rdx), %ymm7
vmulpd %ymm7, %ymm12, %ymm6
vmulpd %ymm7, %ymm13, %ymm7
vbroadcastsd 512(%rdx), %ymm9
vmulpd %ymm9, %ymm12, %ymm8
vmulpd %ymm9, %ymm13, %ymm9
vbroadcastsd 640(%rdx), %ymm14
vmulpd %ymm14, %ymm12, %ymm12
vmulpd %ymm14, %ymm13, %ymm13
vbroadcastsd 8(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm1
vfmadd231pd %ymm14, %ymm11, %ymm0
vbroadcastsd 136(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm3
vfmadd231pd %ymm14, %ymm11, %ymm2
vbroadcastsd 264(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm5
vfmadd231pd %ymm14, %ymm11, %ymm4
vbroadcastsd 392(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm7
vfmadd231pd %ymm14, %ymm11, %ymm6
vbroadcastsd 520(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm9
vfmadd231pd %ymm14, %ymm11, %ymm8
vbroadcastsd 648(%rdx), %ymm14
vfmadd213pd %ymm13, %ymm14, %ymm10
vfmadd231pd %ymm14, %ymm11, %ymm12
vmovupd 128(%rsi), %ymm13
vmovupd 160(%rsi), %ymm11
vbroadcastsd 16(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm0
vfmadd231pd %ymm14, %ymm13, %ymm1
vbroadcastsd 144(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm2
vfmadd231pd %ymm14, %ymm13, %ymm3
vbroadcastsd 272(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm4
vfmadd231pd %ymm14, %ymm13, %ymm5
vbroadcastsd 400(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm6
vfmadd231pd %ymm14, %ymm13, %ymm7
vbroadcastsd 528(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm8
vfmadd231pd %ymm14, %ymm13, %ymm9
vbroadcastsd 656(%rdx), %ymm14
vfmadd213pd %ymm12, %ymm14, %ymm11
vfmadd231pd %ymm14, %ymm13, %ymm10
vmovupd 224(%rsi), %ymm13
vmovupd 192(%rsi), %ymm12
vbroadcastsd 24(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm1
vfmadd231pd %ymm14, %ymm13, %ymm0
vbroadcastsd 152(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm3
vfmadd231pd %ymm14, %ymm13, %ymm2
vbroadcastsd 280(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm5
vfmadd231pd %ymm14, %ymm13, %ymm4
vbroadcastsd 408(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm7
vfmadd231pd %ymm14, %ymm13, %ymm6
vbroadcastsd 536(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm9
vfmadd231pd %ymm14, %ymm13, %ymm8
vbroadcastsd 664(%rdx), %ymm14
vfmadd213pd %ymm10, %ymm14, %ymm12
vfmadd231pd %ymm14, %ymm13, %ymm11
vmovupd 256(%rsi), %ymm13
vmovupd 288(%rsi), %ymm10
vbroadcastsd 32(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm0
vfmadd231pd %ymm14, %ymm13, %ymm1
vbroadcastsd 160(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm2
vfmadd231pd %ymm14, %ymm13, %ymm3
vbroadcastsd 288(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm4
vfmadd231pd %ymm14, %ymm13, %ymm5
vbroadcastsd 416(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm6
vfmadd231pd %ymm14, %ymm13, %ymm7
vbroadcastsd 544(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm8
vfmadd231pd %ymm14, %ymm13, %ymm9
vbroadcastsd 672(%rdx), %ymm14
vfmadd213pd %ymm11, %ymm14, %ymm10
vfmadd231pd %ymm14, %ymm13, %ymm12
vmovupd 352(%rsi), %ymm13
vmovupd 320(%rsi), %ymm11
vbroadcastsd 40(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm1
vfmadd231pd %ymm14, %ymm13, %ymm0
vbroadcastsd 168(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm3
vfmadd231pd %ymm14, %ymm13, %ymm2
vbroadcastsd 296(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm5
vfmadd231pd %ymm14, %ymm13, %ymm4
vbroadcastsd 424(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm7
vfmadd231pd %ymm14, %ymm13, %ymm6
vbroadcastsd 552(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm9
vfmadd231pd %ymm14, %ymm13, %ymm8
vbroadcastsd 680(%rdx), %ymm14
vfmadd213pd %ymm12, %ymm14, %ymm11
vfmadd231pd %ymm14, %ymm13, %ymm10
vmovupd 384(%rsi), %ymm13
vmovupd 416(%rsi), %ymm12
vbroadcastsd 48(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm0
vfmadd231pd %ymm14, %ymm13, %ymm1
vbroadcastsd 176(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm2
vfmadd231pd %ymm14, %ymm13, %ymm3
vbroadcastsd 304(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm4
vfmadd231pd %ymm14, %ymm13, %ymm5
vbroadcastsd 432(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm6
vfmadd231pd %ymm14, %ymm13, %ymm7
vbroadcastsd 560(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm8
vfmadd231pd %ymm14, %ymm13, %ymm9
vbroadcastsd 688(%rdx), %ymm14
vfmadd213pd %ymm10, %ymm14, %ymm12
vfmadd231pd %ymm14, %ymm13, %ymm11
vmovupd 480(%rsi), %ymm13
vmovupd 448(%rsi), %ymm10
vbroadcastsd 56(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm1
vfmadd231pd %ymm14, %ymm13, %ymm0
vbroadcastsd 184(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm3
vfmadd231pd %ymm14, %ymm13, %ymm2
vbroadcastsd 312(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm5
vfmadd231pd %ymm14, %ymm13, %ymm4
vbroadcastsd 440(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm7
vfmadd231pd %ymm14, %ymm13, %ymm6
vbroadcastsd 568(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm9
vfmadd231pd %ymm14, %ymm13, %ymm8
vbroadcastsd 696(%rdx), %ymm14
vfmadd213pd %ymm11, %ymm14, %ymm10
vfmadd231pd %ymm14, %ymm13, %ymm12
vmovupd 512(%rsi), %ymm13
vmovupd 544(%rsi), %ymm11
vbroadcastsd 64(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm0
vfmadd231pd %ymm14, %ymm13, %ymm1
vbroadcastsd 192(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm2
vfmadd231pd %ymm14, %ymm13, %ymm3
vbroadcastsd 320(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm4
vfmadd231pd %ymm14, %ymm13, %ymm5
vbroadcastsd 448(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm6
vfmadd231pd %ymm14, %ymm13, %ymm7
vbroadcastsd 576(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm8
vfmadd231pd %ymm14, %ymm13, %ymm9
vbroadcastsd 704(%rdx), %ymm14
vfmadd213pd %ymm12, %ymm14, %ymm11
vfmadd231pd %ymm14, %ymm13, %ymm10
vmovupd 608(%rsi), %ymm13
vmovupd 576(%rsi), %ymm12
vbroadcastsd 72(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm1
vfmadd231pd %ymm14, %ymm13, %ymm0
vbroadcastsd 200(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm3
vfmadd231pd %ymm14, %ymm13, %ymm2
vbroadcastsd 328(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm5
vfmadd231pd %ymm14, %ymm13, %ymm4
vbroadcastsd 456(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm7
vfmadd231pd %ymm14, %ymm13, %ymm6
vbroadcastsd 584(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm9
vfmadd231pd %ymm14, %ymm13, %ymm8
vbroadcastsd 712(%rdx), %ymm14
vfmadd213pd %ymm10, %ymm14, %ymm12
vfmadd231pd %ymm14, %ymm13, %ymm11
vmovupd 640(%rsi), %ymm13
vmovupd 672(%rsi), %ymm10
vbroadcastsd 80(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm0
vfmadd231pd %ymm14, %ymm13, %ymm1
vbroadcastsd 208(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm2
vfmadd231pd %ymm14, %ymm13, %ymm3
vbroadcastsd 336(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm4
vfmadd231pd %ymm14, %ymm13, %ymm5
vbroadcastsd 464(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm6
vfmadd231pd %ymm14, %ymm13, %ymm7
vbroadcastsd 592(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm8
vfmadd231pd %ymm14, %ymm13, %ymm9
vbroadcastsd 720(%rdx), %ymm14
vfmadd213pd %ymm11, %ymm14, %ymm10
vfmadd231pd %ymm14, %ymm13, %ymm12
vmovupd 736(%rsi), %ymm13
vmovupd 704(%rsi), %ymm11
vbroadcastsd 88(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm1
vfmadd231pd %ymm14, %ymm13, %ymm0
vbroadcastsd 216(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm3
vfmadd231pd %ymm14, %ymm13, %ymm2
vbroadcastsd 344(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm5
vfmadd231pd %ymm14, %ymm13, %ymm4
vbroadcastsd 472(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm7
vfmadd231pd %ymm14, %ymm13, %ymm6
vbroadcastsd 600(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm9
vfmadd231pd %ymm14, %ymm13, %ymm8
vbroadcastsd 728(%rdx), %ymm14
vfmadd213pd %ymm12, %ymm14, %ymm11
vfmadd231pd %ymm14, %ymm13, %ymm10
vmovupd 768(%rsi), %ymm13
vmovupd 800(%rsi), %ymm12
vbroadcastsd 96(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm0
vfmadd231pd %ymm14, %ymm13, %ymm1
vbroadcastsd 224(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm2
vfmadd231pd %ymm14, %ymm13, %ymm3
vbroadcastsd 352(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm4
vfmadd231pd %ymm14, %ymm13, %ymm5
vbroadcastsd 480(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm6
vfmadd231pd %ymm14, %ymm13, %ymm7
vbroadcastsd 608(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm8
vfmadd231pd %ymm14, %ymm13, %ymm9
vbroadcastsd 736(%rdx), %ymm14
vfmadd213pd %ymm10, %ymm14, %ymm12
vfmadd231pd %ymm14, %ymm13, %ymm11
vmovupd 864(%rsi), %ymm13
vmovupd 832(%rsi), %ymm10
vbroadcastsd 104(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm1
vfmadd231pd %ymm14, %ymm13, %ymm0
vbroadcastsd 232(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm3
vfmadd231pd %ymm14, %ymm13, %ymm2
vbroadcastsd 360(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm5
vfmadd231pd %ymm14, %ymm13, %ymm4
vbroadcastsd 488(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm7
vfmadd231pd %ymm14, %ymm13, %ymm6
vbroadcastsd 616(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm10, %ymm9
vfmadd231pd %ymm14, %ymm13, %ymm8
vbroadcastsd 744(%rdx), %ymm14
vfmadd213pd %ymm11, %ymm14, %ymm10
vfmadd231pd %ymm14, %ymm13, %ymm12
vmovupd 896(%rsi), %ymm13
vmovupd 928(%rsi), %ymm11
vbroadcastsd 112(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm0
vfmadd231pd %ymm14, %ymm13, %ymm1
vbroadcastsd 240(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm2
vfmadd231pd %ymm14, %ymm13, %ymm3
vbroadcastsd 368(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm4
vfmadd231pd %ymm14, %ymm13, %ymm5
vbroadcastsd 496(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm6
vfmadd231pd %ymm14, %ymm13, %ymm7
vbroadcastsd 624(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm11, %ymm8
vfmadd231pd %ymm14, %ymm13, %ymm9
vbroadcastsd 752(%rdx), %ymm14
vfmadd213pd %ymm12, %ymm14, %ymm11
vfmadd231pd %ymm14, %ymm13, %ymm10
vmovupd 992(%rsi), %ymm13
vmovupd 960(%rsi), %ymm12
vbroadcastsd 120(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm1
vfmadd231pd %ymm14, %ymm13, %ymm0
vbroadcastsd 248(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm3
vfmadd231pd %ymm14, %ymm13, %ymm2
vbroadcastsd 376(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm5
vfmadd231pd %ymm14, %ymm13, %ymm4
vbroadcastsd 504(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm7
vfmadd231pd %ymm14, %ymm13, %ymm6
vbroadcastsd 632(%rdx), %ymm14
vfmadd231pd %ymm14, %ymm12, %ymm9
vfmadd231pd %ymm14, %ymm13, %ymm8
vbroadcastsd 760(%rdx), %ymm14
vfmadd213pd %ymm10, %ymm14, %ymm12
vfmadd231pd %ymm14, %ymm13, %ymm11
vmovupd %ymm0, 32(%rdi)
vmovupd %ymm1, (%rdi)
vmovupd %ymm2, 96(%rdi)
vmovupd %ymm3, 64(%rdi)
vmovupd %ymm4, 160(%rdi)
vmovupd %ymm5, 128(%rdi)
vmovupd %ymm6, 224(%rdi)
vmovupd %ymm7, 192(%rdi)
vmovupd %ymm8, 288(%rdi)
vmovupd %ymm9, 256(%rdi)
vmovupd %ymm11, 352(%rdi)
vmovupd %ymm12, 320(%rdi)
vzeroupper
ret
.cfi_endproc
.LFE0:
.size jmulkernel, .-jmulkernel
.ident "GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0"
.section .note.GNU-stack,"",@progbits
I compiled this with gcc -shared -fPIC jmulkernel.s -o libjmulkernel.so
, where jmulkernel.s
is the filename.
Now,
julia> asmpath = "path/to/the/file"
julia> const jmullib = joinpath(asmpath, "libjmulkernel.so")
"/home/chris/Documents/progwork/fortran/libjmulkernel.so"
julia> function jmul!(D::MMatrix{8,6,Float64},A::MMatrix{8,16,Float64},X::MMatrix{16,6,Float64})
ccall((:jmulkernel, jmullib), Cvoid, (Ptr{Float64},Ptr{Float64},Ptr{Float64}), D, A, X)
D
end
jmul! (generic function with 1 method)
julia> D .= 0;
julia> jmul!(D, A, X)
8×6 MArray{Tuple{8,6},Float64,2,48}:
2.67373 3.78727 2.35039 0.520375 2.35468 3.25158
5.32136 1.59395 -6.38805 0.25491 4.62566 3.32113
-1.12869 -2.66418 -6.36596 2.30542 2.53305 3.01892
-2.1665 0.206013 -9.1473 -7.42947 0.693025 -4.36451
-2.19986 0.524508 9.07341 6.09982 -3.33561 1.10499
-4.90321 0.0642439 0.541991 -1.03617 -0.800158 -0.469398
-0.720618 8.21422 6.17857 1.12147 -2.51622 -1.84389
-1.09624 -10.2427 -1.39919 -1.55714 4.05241 4.56794
julia> @benchmark jmul!($D, $A, $X)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 65.779 ns (0.00% GC)
median time: 66.399 ns (0.00% GC)
mean time: 67.271 ns (0.00% GC)
maximum time: 242.758 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 979
That is roughly one third faster!
Any idea why this may be?
This is important, because the above is almost good enough to match MKL if we scale it up:
julia> BLAS.vendor()
:mkl
julia> Dbig = randn(800,600);
julia> Abig = randn(800,1600);
julia> Xbig = randn(1600,600);
julia> BLAS.set_num_threads(1)
julia> @benchmark mul!($Dbig, $Abig, $Xbig)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 65.495 ms (0.00% GC)
median time: 65.948 ms (0.00% GC)
mean time: 66.375 ms (0.00% GC)
maximum time: 72.479 ms (0.00% GC)
--------------
samples: 76
evals/sample: 1
Is there any reason why simply running code should by about one third slower than taking the output of @code_native
, compiling that assembly, and then using ccall?
Is there some way I can get that performance directly with Julia?
I hadn’t noticed this before, so I could try checking out other commits and seeing how reproducible this is.