Julia one third slower than ccall-ing `@code_native` assembly compiled with gcc?


#1

I’m working on a series of blog posts about trying to optimize matrix multiplication in Julia.
This is on:

julia> versioninfo()
Julia Version 1.0.0-DEV.0
Commit ce2aa22d47 (2018-08-02 23:11 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: Intel(R) Core(TM) i3-4010U CPU @ 1.70GHz
  WORD_SIZE: 64
  LIBM: libimf
  LLVM: libLLVM-6.0.0 (ORCJIT, haswell)
Environment:
  JULIA_NUM_THREADS = 4

Consider this function:

using SIMD, StaticArrays, Base.Cartesian, LinearAlgebra
@generated function genkernel!(D::MMatrix{M,P,Float64},
                A::MMatrix{M,N,Float64},
                X::MMatrix{N,P,Float64}) where {M,N,P}
    quote
    pD, pA = pointer(D), pointer(A)
    V = Vec{$M,Float64}
    vA = vload(V, pA)
    @inbounds begin
        @nexprs $P p -> Dx_p = vA * V(X[1,p])
        # for n ∈ 1:N-1
        @nexprs $(N-1) n -> begin # Not better than a for loop!!!
            vA = vload(V, pA + 64n) # 8 bytes/element * 8 elements/column
            @nexprs $P p -> Dx_p = fma(vA, V(X[n+1,p]), Dx_p)
        end
    end
    @nexprs $P p -> vstore(Dx_p, pD + 64p-64)
    D
    end
end

(It’s better to use the for loop “n ∈ 1:N” than “@nexprs $(N-1) n -> begin”, but a for loop was a little complicated for me to get to compile correctly, because I had to switch the names of things like %rax and %rdx.)

This gives,

julia> D = @MMatrix randn(8,6);

julia> A = @MMatrix randn(8,16);

julia> X = @MMatrix randn(16,6);

julia> genkernel!(D, A, X)
8×6 MArray{Tuple{8,6},Float64,2,48}:
  2.67373     3.78727     2.35039    0.520375   2.35468    3.25158 
  5.32136     1.59395    -6.38805    0.25491    4.62566    3.32113 
 -1.12869    -2.66418    -6.36596    2.30542    2.53305    3.01892 
 -2.1665      0.206013   -9.1473    -7.42947    0.693025  -4.36451 
 -2.19986     0.524508    9.07341    6.09982   -3.33561    1.10499 
 -4.90321     0.0642439   0.541991  -1.03617   -0.800158  -0.469398
 -0.720618    8.21422     6.17857    1.12147   -2.51622   -1.84389 
 -1.09624   -10.2427     -1.39919   -1.55714    4.05241    4.56794 

julia> mul!(D, A, X)
8×6 MArray{Tuple{8,6},Float64,2,48}:
  2.67373     3.78727     2.35039    0.520375   2.35468    3.25158 
  5.32136     1.59395    -6.38805    0.25491    4.62566    3.32113 
 -1.12869    -2.66418    -6.36596    2.30542    2.53305    3.01892 
 -2.1665      0.206013   -9.1473    -7.42947    0.693025  -4.36451 
 -2.19986     0.524508    9.07341    6.09982   -3.33561    1.10499 
 -4.90321     0.0642439   0.541991  -1.03617   -0.800158  -0.469398
 -0.720618    8.21422     6.17857    1.12147   -2.51622   -1.84389 
 -1.09624   -10.2427     -1.39919   -1.55714    4.05241    4.56794 

julia> @benchmark genkernel!($D, $A, $X)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     87.963 ns (0.00% GC)
  median time:      87.978 ns (0.00% GC)
  mean time:        88.975 ns (0.00% GC)
  maximum time:     191.660 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     960

Now, using @code_native genkernel!(D, A, X), copy and pasting the code into an editor and cleaning it up (deleting all the lines starting with ;, renaming registers like rax and rdx to match the pattern of code compiled with gcc using -S, and also copy and pasting the end – which is why the end says “GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0”, so I can get it to compile correctly):

	.file	"jmulkernel.s"
	.text
	.p2align 4,,15
	.globl	jmulkernel
	.type	jmulkernel, @function
jmulkernel:
.LFB0:
	.cfi_startproc
	vbroadcastsd	(%rdx), %ymm1
	vmovupd	(%rsi), %ymm13
	vmovupd	32(%rsi), %ymm12
	vmovupd	64(%rsi), %ymm10
	vmovupd	96(%rsi), %ymm11
	vmulpd	%ymm1, %ymm12, %ymm0
	vmulpd	%ymm1, %ymm13, %ymm1
	vbroadcastsd	128(%rdx), %ymm3
	vmulpd	%ymm3, %ymm12, %ymm2
	vmulpd	%ymm3, %ymm13, %ymm3
	vbroadcastsd	256(%rdx), %ymm5
	vmulpd	%ymm5, %ymm12, %ymm4
	vmulpd	%ymm5, %ymm13, %ymm5
	vbroadcastsd	384(%rdx), %ymm7
	vmulpd	%ymm7, %ymm12, %ymm6
	vmulpd	%ymm7, %ymm13, %ymm7
	vbroadcastsd	512(%rdx), %ymm9
	vmulpd	%ymm9, %ymm12, %ymm8
	vmulpd	%ymm9, %ymm13, %ymm9
	vbroadcastsd	640(%rdx), %ymm14
	vmulpd	%ymm14, %ymm12, %ymm12
	vmulpd	%ymm14, %ymm13, %ymm13
	vbroadcastsd	8(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm1
	vfmadd231pd	%ymm14, %ymm11, %ymm0
	vbroadcastsd	136(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm3
	vfmadd231pd	%ymm14, %ymm11, %ymm2
	vbroadcastsd	264(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm5
	vfmadd231pd	%ymm14, %ymm11, %ymm4
	vbroadcastsd	392(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm7
	vfmadd231pd	%ymm14, %ymm11, %ymm6
	vbroadcastsd	520(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm9
	vfmadd231pd	%ymm14, %ymm11, %ymm8
	vbroadcastsd	648(%rdx), %ymm14
	vfmadd213pd	%ymm13, %ymm14, %ymm10
	vfmadd231pd	%ymm14, %ymm11, %ymm12
	vmovupd	128(%rsi), %ymm13
	vmovupd	160(%rsi), %ymm11
	vbroadcastsd	16(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	144(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	272(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	400(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	528(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	656(%rdx), %ymm14
	vfmadd213pd	%ymm12, %ymm14, %ymm11
	vfmadd231pd	%ymm14, %ymm13, %ymm10
	vmovupd	224(%rsi), %ymm13
	vmovupd	192(%rsi), %ymm12
	vbroadcastsd	24(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	152(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	280(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	408(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	536(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	664(%rdx), %ymm14
	vfmadd213pd	%ymm10, %ymm14, %ymm12
	vfmadd231pd	%ymm14, %ymm13, %ymm11
	vmovupd	256(%rsi), %ymm13
	vmovupd	288(%rsi), %ymm10
	vbroadcastsd	32(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	160(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	288(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	416(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	544(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	672(%rdx), %ymm14
	vfmadd213pd	%ymm11, %ymm14, %ymm10
	vfmadd231pd	%ymm14, %ymm13, %ymm12
	vmovupd	352(%rsi), %ymm13
	vmovupd	320(%rsi), %ymm11
	vbroadcastsd	40(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	168(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	296(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	424(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	552(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	680(%rdx), %ymm14
	vfmadd213pd	%ymm12, %ymm14, %ymm11
	vfmadd231pd	%ymm14, %ymm13, %ymm10
	vmovupd	384(%rsi), %ymm13
	vmovupd	416(%rsi), %ymm12
	vbroadcastsd	48(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	176(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	304(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	432(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	560(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	688(%rdx), %ymm14
	vfmadd213pd	%ymm10, %ymm14, %ymm12
	vfmadd231pd	%ymm14, %ymm13, %ymm11
	vmovupd	480(%rsi), %ymm13
	vmovupd	448(%rsi), %ymm10
	vbroadcastsd	56(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	184(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	312(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	440(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	568(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	696(%rdx), %ymm14
	vfmadd213pd	%ymm11, %ymm14, %ymm10
	vfmadd231pd	%ymm14, %ymm13, %ymm12
	vmovupd	512(%rsi), %ymm13
	vmovupd	544(%rsi), %ymm11
	vbroadcastsd	64(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	192(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	320(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	448(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	576(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	704(%rdx), %ymm14
	vfmadd213pd	%ymm12, %ymm14, %ymm11
	vfmadd231pd	%ymm14, %ymm13, %ymm10
	vmovupd	608(%rsi), %ymm13
	vmovupd	576(%rsi), %ymm12
	vbroadcastsd	72(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	200(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	328(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	456(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	584(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	712(%rdx), %ymm14
	vfmadd213pd	%ymm10, %ymm14, %ymm12
	vfmadd231pd	%ymm14, %ymm13, %ymm11
	vmovupd	640(%rsi), %ymm13
	vmovupd	672(%rsi), %ymm10
	vbroadcastsd	80(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	208(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	336(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	464(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	592(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	720(%rdx), %ymm14
	vfmadd213pd	%ymm11, %ymm14, %ymm10
	vfmadd231pd	%ymm14, %ymm13, %ymm12
	vmovupd	736(%rsi), %ymm13
	vmovupd	704(%rsi), %ymm11
	vbroadcastsd	88(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	216(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	344(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	472(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	600(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	728(%rdx), %ymm14
	vfmadd213pd	%ymm12, %ymm14, %ymm11
	vfmadd231pd	%ymm14, %ymm13, %ymm10
	vmovupd	768(%rsi), %ymm13
	vmovupd	800(%rsi), %ymm12
	vbroadcastsd	96(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	224(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	352(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	480(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	608(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	736(%rdx), %ymm14
	vfmadd213pd	%ymm10, %ymm14, %ymm12
	vfmadd231pd	%ymm14, %ymm13, %ymm11
	vmovupd	864(%rsi), %ymm13
	vmovupd	832(%rsi), %ymm10
	vbroadcastsd	104(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	232(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	360(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	488(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	616(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	744(%rdx), %ymm14
	vfmadd213pd	%ymm11, %ymm14, %ymm10
	vfmadd231pd	%ymm14, %ymm13, %ymm12
	vmovupd	896(%rsi), %ymm13
	vmovupd	928(%rsi), %ymm11
	vbroadcastsd	112(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	240(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	368(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	496(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	624(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	752(%rdx), %ymm14
	vfmadd213pd	%ymm12, %ymm14, %ymm11
	vfmadd231pd	%ymm14, %ymm13, %ymm10
	vmovupd	992(%rsi), %ymm13
	vmovupd	960(%rsi), %ymm12
	vbroadcastsd	120(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	248(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	376(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	504(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	632(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	760(%rdx), %ymm14
	vfmadd213pd	%ymm10, %ymm14, %ymm12
	vfmadd231pd	%ymm14, %ymm13, %ymm11
	vmovupd	%ymm0, 32(%rdi)
	vmovupd	%ymm1, (%rdi)
	vmovupd	%ymm2, 96(%rdi)
	vmovupd	%ymm3, 64(%rdi)
	vmovupd	%ymm4, 160(%rdi)
	vmovupd	%ymm5, 128(%rdi)
	vmovupd	%ymm6, 224(%rdi)
	vmovupd	%ymm7, 192(%rdi)
	vmovupd	%ymm8, 288(%rdi)
	vmovupd	%ymm9, 256(%rdi)
	vmovupd	%ymm11, 352(%rdi)
	vmovupd	%ymm12, 320(%rdi)
	vzeroupper
	ret
	.cfi_endproc
.LFE0:
	.size	jmulkernel, .-jmulkernel
    .ident	"GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0"
    .section	.note.GNU-stack,"",@progbits

I compiled this with gcc -shared -fPIC jmulkernel.s -o libjmulkernel.so, where jmulkernel.s is the filename.
Now,

julia> asmpath = "path/to/the/file"

julia> const jmullib = joinpath(asmpath, "libjmulkernel.so")
"/home/chris/Documents/progwork/fortran/libjmulkernel.so"

julia> function jmul!(D::MMatrix{8,6,Float64},A::MMatrix{8,16,Float64},X::MMatrix{16,6,Float64})
           ccall((:jmulkernel, jmullib), Cvoid, (Ptr{Float64},Ptr{Float64},Ptr{Float64}), D, A, X)
           D
       end
jmul! (generic function with 1 method)

julia> D .= 0;

julia> jmul!(D, A, X)
8×6 MArray{Tuple{8,6},Float64,2,48}:
  2.67373     3.78727     2.35039    0.520375   2.35468    3.25158 
  5.32136     1.59395    -6.38805    0.25491    4.62566    3.32113 
 -1.12869    -2.66418    -6.36596    2.30542    2.53305    3.01892 
 -2.1665      0.206013   -9.1473    -7.42947    0.693025  -4.36451 
 -2.19986     0.524508    9.07341    6.09982   -3.33561    1.10499 
 -4.90321     0.0642439   0.541991  -1.03617   -0.800158  -0.469398
 -0.720618    8.21422     6.17857    1.12147   -2.51622   -1.84389 
 -1.09624   -10.2427     -1.39919   -1.55714    4.05241    4.56794 

julia> @benchmark jmul!($D, $A, $X)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     65.779 ns (0.00% GC)
  median time:      66.399 ns (0.00% GC)
  mean time:        67.271 ns (0.00% GC)
  maximum time:     242.758 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     979

That is roughly one third faster!
Any idea why this may be?

This is important, because the above is almost good enough to match MKL if we scale it up:

julia> BLAS.vendor()
:mkl

julia> Dbig = randn(800,600);

julia> Abig = randn(800,1600);

julia> Xbig = randn(1600,600);

julia> BLAS.set_num_threads(1)

julia> @benchmark mul!($Dbig, $Abig, $Xbig)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     65.495 ms (0.00% GC)
  median time:      65.948 ms (0.00% GC)
  mean time:        66.375 ms (0.00% GC)
  maximum time:     72.479 ms (0.00% GC)
  --------------
  samples:          76
  evals/sample:     1

Is there any reason why simply running code should by about one third slower than taking the output of @code_native, compiling that assembly, and then using ccall?
Is there some way I can get that performance directly with Julia?

I hadn’t noticed this before, so I could try checking out other commits and seeing how reproducible this is.


We can write an optimized BLAS library in pure Julia (please skip OP and jump to post 4)