Julia one third slower than ccall-ing `@code_native` assembly compiled with gcc?

I’m working on a series of blog posts about trying to optimize matrix multiplication in Julia.
This is on:

julia> versioninfo()
Julia Version 1.0.0-DEV.0
Commit ce2aa22d47 (2018-08-02 23:11 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: Intel(R) Core(TM) i3-4010U CPU @ 1.70GHz
  WORD_SIZE: 64
  LIBM: libimf
  LLVM: libLLVM-6.0.0 (ORCJIT, haswell)
Environment:
  JULIA_NUM_THREADS = 4

Consider this function:

using SIMD, StaticArrays, Base.Cartesian, LinearAlgebra
"""
    genkernel!(D::MMatrix{M,P,Float64}, A::MMatrix{M,N,Float64}, X::MMatrix{N,P,Float64})

Overwrite `D` with the matrix product `A * X` and return `D`.

The body is generated per `(M, N, P)` and fully unrolled. Each column of `A` is loaded
as one `Vec{M,Float64}`, multiplied by the broadcast scalar `X[n,p]`, and accumulated
with `fma` into one vector per column of `D` (`Dx_1`, …, `Dx_P`). The accumulators are
written back to `D` only at the end.

`N` must be at least 1, since the accumulators are initialised from the first column
of `A`.
"""
@generated function genkernel!(D::MMatrix{M,P,Float64},
                A::MMatrix{M,N,Float64},
                X::MMatrix{N,P,Float64}) where {M,N,P}
    # Byte distance between consecutive columns of the column-major, M-row matrices
    # `A` and `D`. Deriving it from `M` keeps the pointer arithmetic correct for any
    # row count instead of only for M == 8 (the previously hard-coded 64 bytes).
    colbytes = M * sizeof(Float64)
    quote
        V = Vec{$M,Float64}
        # `vload`/`vstore` work on raw pointers, which do not root the arrays by
        # themselves; keep `D` and `A` alive for as long as the pointers are in use.
        GC.@preserve D A begin
            pD, pA = pointer(D), pointer(A)
            vA = vload(V, pA)
            @inbounds begin
                # First column of A initialises one accumulator per column of D.
                @nexprs $P p -> Dx_p = vA * V(X[1,p])
                # Remaining columns: n-th unrolled step uses column n+1 of A.
                @nexprs $(N-1) n -> begin
                    vA = vload(V, pA + $colbytes * n)
                    @nexprs $P p -> Dx_p = fma(vA, V(X[n+1,p]), Dx_p)
                end
            end
            # Column p of D starts (p-1) column strides past the base pointer.
            @nexprs $P p -> vstore(Dx_p, pD + $colbytes * (p-1))
        end
        D
    end
end

(It’s better to use the for loop “for n ∈ 1:N-1” than “@nexprs $(N-1) n -> begin”, but a for loop was a little complicated for me to get to compile correctly, because I had to switch the names of things like %rax and %rdx.)

This gives,

julia> D = @MMatrix randn(8,6);

julia> A = @MMatrix randn(8,16);

julia> X = @MMatrix randn(16,6);

julia> genkernel!(D, A, X)
8×6 MArray{Tuple{8,6},Float64,2,48}:
  2.67373     3.78727     2.35039    0.520375   2.35468    3.25158 
  5.32136     1.59395    -6.38805    0.25491    4.62566    3.32113 
 -1.12869    -2.66418    -6.36596    2.30542    2.53305    3.01892 
 -2.1665      0.206013   -9.1473    -7.42947    0.693025  -4.36451 
 -2.19986     0.524508    9.07341    6.09982   -3.33561    1.10499 
 -4.90321     0.0642439   0.541991  -1.03617   -0.800158  -0.469398
 -0.720618    8.21422     6.17857    1.12147   -2.51622   -1.84389 
 -1.09624   -10.2427     -1.39919   -1.55714    4.05241    4.56794 

julia> mul!(D, A, X)
8×6 MArray{Tuple{8,6},Float64,2,48}:
  2.67373     3.78727     2.35039    0.520375   2.35468    3.25158 
  5.32136     1.59395    -6.38805    0.25491    4.62566    3.32113 
 -1.12869    -2.66418    -6.36596    2.30542    2.53305    3.01892 
 -2.1665      0.206013   -9.1473    -7.42947    0.693025  -4.36451 
 -2.19986     0.524508    9.07341    6.09982   -3.33561    1.10499 
 -4.90321     0.0642439   0.541991  -1.03617   -0.800158  -0.469398
 -0.720618    8.21422     6.17857    1.12147   -2.51622   -1.84389 
 -1.09624   -10.2427     -1.39919   -1.55714    4.05241    4.56794 

julia> @benchmark genkernel!($D, $A, $X)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     87.963 ns (0.00% GC)
  median time:      87.978 ns (0.00% GC)
  mean time:        88.975 ns (0.00% GC)
  maximum time:     191.660 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     960

Now, using @code_native genkernel!(D, A, X), copy and pasting the code into an editor and cleaning it up (deleting all the lines starting with ;, renaming registers like rax and rdx to match the pattern of code compiled with gcc using -S, and also copy and pasting the end – which is why the end says “GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0”, so I can get it to compile correctly):

	.file	"jmulkernel.s"
	.text
	.p2align 4,,15
	.globl	jmulkernel
	.type	jmulkernel, @function
# jmulkernel: fully unrolled AVX2/FMA kernel, cleaned up from Julia's @code_native output.
#
# Register roles, as far as this listing shows:
#   %rdi  base address that the final twelve 32-byte stores write to
#   %rsi  base address of the 32-byte vector loads (vmovupd)
#   %rdx  base address of the scalar broadcasts (vbroadcastsd)
# NOTE(review): with the System V x86-64 calling convention and the ccall wrapper jmul!
# (arguments D, A, X) this presumably means %rdi = D (8 rows, 6 columns),
# %rsi = A (8 rows, 16 columns), %rdx = X (16 rows, 6 columns), all column-major
# Float64 -- confirm against the caller. The offsets match that layout: 64 bytes per
# column for %rsi/%rdi, 128 bytes per column and 8 bytes per row for %rdx.
#
# Accumulators: %ymm0..%ymm9 hold five pairs of 4-wide partial results (odd register =
# bytes 0..31 of an output column, even register = bytes 32..63). The sixth pair
# rotates through %ymm10..%ymm13, because vfmadd213pd writes its result into the
# register that held the freshly loaded vector. %ymm14 is the broadcast scratch.
jmulkernel:
.LFB0:
	.cfi_startproc
	# Step k = 1: load the first two 64-byte columns from %rsi, then initialise all
	# twelve accumulators with plain multiplies against the first element of each
	# 128-byte column of %rdx.
	vbroadcastsd	(%rdx), %ymm1
	vmovupd	(%rsi), %ymm13
	vmovupd	32(%rsi), %ymm12
	vmovupd	64(%rsi), %ymm10
	vmovupd	96(%rsi), %ymm11
	vmulpd	%ymm1, %ymm12, %ymm0
	vmulpd	%ymm1, %ymm13, %ymm1
	vbroadcastsd	128(%rdx), %ymm3
	vmulpd	%ymm3, %ymm12, %ymm2
	vmulpd	%ymm3, %ymm13, %ymm3
	vbroadcastsd	256(%rdx), %ymm5
	vmulpd	%ymm5, %ymm12, %ymm4
	vmulpd	%ymm5, %ymm13, %ymm5
	vbroadcastsd	384(%rdx), %ymm7
	vmulpd	%ymm7, %ymm12, %ymm6
	vmulpd	%ymm7, %ymm13, %ymm7
	vbroadcastsd	512(%rdx), %ymm9
	vmulpd	%ymm9, %ymm12, %ymm8
	vmulpd	%ymm9, %ymm13, %ymm9
	vbroadcastsd	640(%rdx), %ymm14
	vmulpd	%ymm14, %ymm12, %ymm12
	vmulpd	%ymm14, %ymm13, %ymm13
	# Step k = 2: fused multiply-add of the second %rsi column (already in
	# %ymm10/%ymm11) with element 2 of each %rdx column (offsets 8 + 128*j).
	vbroadcastsd	8(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm1
	vfmadd231pd	%ymm14, %ymm11, %ymm0
	vbroadcastsd	136(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm3
	vfmadd231pd	%ymm14, %ymm11, %ymm2
	vbroadcastsd	264(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm5
	vfmadd231pd	%ymm14, %ymm11, %ymm4
	vbroadcastsd	392(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm7
	vfmadd231pd	%ymm14, %ymm11, %ymm6
	vbroadcastsd	520(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm9
	vfmadd231pd	%ymm14, %ymm11, %ymm8
	vbroadcastsd	648(%rdx), %ymm14
	# Sixth pair: the 213 form overwrites %ymm10 (the loaded vector) with the updated
	# accumulator, which frees %ymm13 for the next load.
	vfmadd213pd	%ymm13, %ymm14, %ymm10
	vfmadd231pd	%ymm14, %ymm11, %ymm12
	# Step k = 3: %rsi column at byte offset 128, %rdx elements at 16 + 128*j.
	vmovupd	128(%rsi), %ymm13
	vmovupd	160(%rsi), %ymm11
	vbroadcastsd	16(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	144(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	272(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	400(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	528(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	656(%rdx), %ymm14
	vfmadd213pd	%ymm12, %ymm14, %ymm11
	vfmadd231pd	%ymm14, %ymm13, %ymm10
	# Step k = 4: %rsi column at byte offset 192, %rdx elements at 24 + 128*j.
	vmovupd	224(%rsi), %ymm13
	vmovupd	192(%rsi), %ymm12
	vbroadcastsd	24(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	152(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	280(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	408(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	536(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	664(%rdx), %ymm14
	vfmadd213pd	%ymm10, %ymm14, %ymm12
	vfmadd231pd	%ymm14, %ymm13, %ymm11
	# Step k = 5: %rsi column at byte offset 256, %rdx elements at 32 + 128*j.
	vmovupd	256(%rsi), %ymm13
	vmovupd	288(%rsi), %ymm10
	vbroadcastsd	32(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	160(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	288(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	416(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	544(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	672(%rdx), %ymm14
	vfmadd213pd	%ymm11, %ymm14, %ymm10
	vfmadd231pd	%ymm14, %ymm13, %ymm12
	# Step k = 6: %rsi column at byte offset 320, %rdx elements at 40 + 128*j.
	vmovupd	352(%rsi), %ymm13
	vmovupd	320(%rsi), %ymm11
	vbroadcastsd	40(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	168(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	296(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	424(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	552(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	680(%rdx), %ymm14
	vfmadd213pd	%ymm12, %ymm14, %ymm11
	vfmadd231pd	%ymm14, %ymm13, %ymm10
	# Step k = 7: %rsi column at byte offset 384, %rdx elements at 48 + 128*j.
	vmovupd	384(%rsi), %ymm13
	vmovupd	416(%rsi), %ymm12
	vbroadcastsd	48(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	176(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	304(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	432(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	560(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	688(%rdx), %ymm14
	vfmadd213pd	%ymm10, %ymm14, %ymm12
	vfmadd231pd	%ymm14, %ymm13, %ymm11
	# Step k = 8: %rsi column at byte offset 448, %rdx elements at 56 + 128*j.
	vmovupd	480(%rsi), %ymm13
	vmovupd	448(%rsi), %ymm10
	vbroadcastsd	56(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	184(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	312(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	440(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	568(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	696(%rdx), %ymm14
	vfmadd213pd	%ymm11, %ymm14, %ymm10
	vfmadd231pd	%ymm14, %ymm13, %ymm12
	# Step k = 9: %rsi column at byte offset 512, %rdx elements at 64 + 128*j.
	vmovupd	512(%rsi), %ymm13
	vmovupd	544(%rsi), %ymm11
	vbroadcastsd	64(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	192(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	320(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	448(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	576(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	704(%rdx), %ymm14
	vfmadd213pd	%ymm12, %ymm14, %ymm11
	vfmadd231pd	%ymm14, %ymm13, %ymm10
	# Step k = 10: %rsi column at byte offset 576, %rdx elements at 72 + 128*j.
	vmovupd	608(%rsi), %ymm13
	vmovupd	576(%rsi), %ymm12
	vbroadcastsd	72(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	200(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	328(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	456(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	584(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	712(%rdx), %ymm14
	vfmadd213pd	%ymm10, %ymm14, %ymm12
	vfmadd231pd	%ymm14, %ymm13, %ymm11
	# Step k = 11: %rsi column at byte offset 640, %rdx elements at 80 + 128*j.
	vmovupd	640(%rsi), %ymm13
	vmovupd	672(%rsi), %ymm10
	vbroadcastsd	80(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	208(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	336(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	464(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	592(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	720(%rdx), %ymm14
	vfmadd213pd	%ymm11, %ymm14, %ymm10
	vfmadd231pd	%ymm14, %ymm13, %ymm12
	# Step k = 12: %rsi column at byte offset 704, %rdx elements at 88 + 128*j.
	vmovupd	736(%rsi), %ymm13
	vmovupd	704(%rsi), %ymm11
	vbroadcastsd	88(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	216(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	344(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	472(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	600(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	728(%rdx), %ymm14
	vfmadd213pd	%ymm12, %ymm14, %ymm11
	vfmadd231pd	%ymm14, %ymm13, %ymm10
	# Step k = 13: %rsi column at byte offset 768, %rdx elements at 96 + 128*j.
	vmovupd	768(%rsi), %ymm13
	vmovupd	800(%rsi), %ymm12
	vbroadcastsd	96(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	224(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	352(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	480(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	608(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	736(%rdx), %ymm14
	vfmadd213pd	%ymm10, %ymm14, %ymm12
	vfmadd231pd	%ymm14, %ymm13, %ymm11
	# Step k = 14: %rsi column at byte offset 832, %rdx elements at 104 + 128*j.
	vmovupd	864(%rsi), %ymm13
	vmovupd	832(%rsi), %ymm10
	vbroadcastsd	104(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	232(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	360(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	488(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	616(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm10, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	744(%rdx), %ymm14
	vfmadd213pd	%ymm11, %ymm14, %ymm10
	vfmadd231pd	%ymm14, %ymm13, %ymm12
	# Step k = 15: %rsi column at byte offset 896, %rdx elements at 112 + 128*j.
	vmovupd	896(%rsi), %ymm13
	vmovupd	928(%rsi), %ymm11
	vbroadcastsd	112(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm0
	vfmadd231pd	%ymm14, %ymm13, %ymm1
	vbroadcastsd	240(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm2
	vfmadd231pd	%ymm14, %ymm13, %ymm3
	vbroadcastsd	368(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm4
	vfmadd231pd	%ymm14, %ymm13, %ymm5
	vbroadcastsd	496(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm6
	vfmadd231pd	%ymm14, %ymm13, %ymm7
	vbroadcastsd	624(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm11, %ymm8
	vfmadd231pd	%ymm14, %ymm13, %ymm9
	vbroadcastsd	752(%rdx), %ymm14
	vfmadd213pd	%ymm12, %ymm14, %ymm11
	vfmadd231pd	%ymm14, %ymm13, %ymm10
	# Step k = 16 (last): %rsi column at byte offset 960, %rdx elements at 120 + 128*j.
	vmovupd	992(%rsi), %ymm13
	vmovupd	960(%rsi), %ymm12
	vbroadcastsd	120(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm1
	vfmadd231pd	%ymm14, %ymm13, %ymm0
	vbroadcastsd	248(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm3
	vfmadd231pd	%ymm14, %ymm13, %ymm2
	vbroadcastsd	376(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm5
	vfmadd231pd	%ymm14, %ymm13, %ymm4
	vbroadcastsd	504(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm7
	vfmadd231pd	%ymm14, %ymm13, %ymm6
	vbroadcastsd	632(%rdx), %ymm14
	vfmadd231pd	%ymm14, %ymm12, %ymm9
	vfmadd231pd	%ymm14, %ymm13, %ymm8
	vbroadcastsd	760(%rdx), %ymm14
	vfmadd213pd	%ymm10, %ymm14, %ymm12
	vfmadd231pd	%ymm14, %ymm13, %ymm11
	# Write the twelve accumulators to the 384 bytes starting at %rdi. After the
	# rotation above, the sixth pair ends up in %ymm12 (low half) and %ymm11 (high half).
	vmovupd	%ymm0, 32(%rdi)
	vmovupd	%ymm1, (%rdi)
	vmovupd	%ymm2, 96(%rdi)
	vmovupd	%ymm3, 64(%rdi)
	vmovupd	%ymm4, 160(%rdi)
	vmovupd	%ymm5, 128(%rdi)
	vmovupd	%ymm6, 224(%rdi)
	vmovupd	%ymm7, 192(%rdi)
	vmovupd	%ymm8, 288(%rdi)
	vmovupd	%ymm9, 256(%rdi)
	vmovupd	%ymm11, 352(%rdi)
	vmovupd	%ymm12, 320(%rdi)
	# Clear the upper halves of the YMM registers before returning to the caller.
	vzeroupper
	ret
	.cfi_endproc
.LFE0:
	.size	jmulkernel, .-jmulkernel
    .ident	"GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0"
    .section	.note.GNU-stack,"",@progbits

I compiled this with gcc -shared -fPIC jmulkernel.s -o libjmulkernel.so, where jmulkernel.s is the filename.
Now,

julia> asmpath = "path/to/the/file"

julia> const jmullib = joinpath(asmpath, "libjmulkernel.so")
"/home/chris/Documents/progwork/fortran/libjmulkernel.so"

julia> function jmul!(D::MMatrix{8,6,Float64},A::MMatrix{8,16,Float64},X::MMatrix{16,6,Float64})
           # Thin wrapper around the hand-assembled kernel: calls the `jmulkernel` symbol
           # from the shared library at `jmullib`, passing the three matrices as
           # `Ptr{Float64}` arguments in the order D, A, X; the kernel returns nothing.
           # The matrix sizes are fixed in the signature; presumably this is because the
           # byte offsets in the assembly are hard-coded for exactly these dimensions.
           ccall((:jmulkernel, jmullib), Cvoid, (Ptr{Float64},Ptr{Float64},Ptr{Float64}), D, A, X)
           # Return the output matrix, as `genkernel!` does.
           D
       end
jmul! (generic function with 1 method)

julia> D .= 0;

julia> jmul!(D, A, X)
8×6 MArray{Tuple{8,6},Float64,2,48}:
  2.67373     3.78727     2.35039    0.520375   2.35468    3.25158 
  5.32136     1.59395    -6.38805    0.25491    4.62566    3.32113 
 -1.12869    -2.66418    -6.36596    2.30542    2.53305    3.01892 
 -2.1665      0.206013   -9.1473    -7.42947    0.693025  -4.36451 
 -2.19986     0.524508    9.07341    6.09982   -3.33561    1.10499 
 -4.90321     0.0642439   0.541991  -1.03617   -0.800158  -0.469398
 -0.720618    8.21422     6.17857    1.12147   -2.51622   -1.84389 
 -1.09624   -10.2427     -1.39919   -1.55714    4.05241    4.56794 

julia> @benchmark jmul!($D, $A, $X)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     65.779 ns (0.00% GC)
  median time:      66.399 ns (0.00% GC)
  mean time:        67.271 ns (0.00% GC)
  maximum time:     242.758 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     979

That is roughly one third faster!
Any idea why this may be?

This is important, because the above is almost good enough to match MKL if we scale it up:

julia> BLAS.vendor()
:mkl

julia> Dbig = randn(800,600);

julia> Abig = randn(800,1600);

julia> Xbig = randn(1600,600);

julia> BLAS.set_num_threads(1)

julia> @benchmark mul!($Dbig, $Abig, $Xbig)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     65.495 ms (0.00% GC)
  median time:      65.948 ms (0.00% GC)
  mean time:        66.375 ms (0.00% GC)
  maximum time:     72.479 ms (0.00% GC)
  --------------
  samples:          76
  evals/sample:     1

Is there any reason why simply running code should be about one third slower than taking the output of @code_native, compiling that assembly, and then using ccall?
Is there some way I can get that performance directly with Julia?

I hadn’t noticed this before, so I could try checking out other commits and seeing how reproducible this is.

7 Likes

I poked at this a bit today, but didn’t find the smoking gun… but one doesn’t need to roundtrip through gcc.

using LLVM
using LLVM.Interop
"""
    f(D, A, X)

Run the unrolled AVX2/FMA kernel on `D`, `A` and `X` as LLVM inline assembly via
`@asmcall`. Results are stored through `pointer(D)`; the function returns `nothing`.

The byte offsets in the assembly are hard-coded for column-major `Float64` data with
64-byte columns in `A` and `D` and 128-byte columns in `X` (8×16, 8×6 and 16×6).

The inline assembly declares its full contract to LLVM:

- the three pointers are bound to `%rdi`, `%rsi` and `%rdx` through input constraints,
  instead of relying on where the calling convention happens to leave them;
- memory and `ymm0`–`ymm15` are listed as clobbers (`vzeroupper` touches the upper half
  of every YMM register, not only the fifteen used for arithmetic);
- the call is marked as having side effects, because it stores to memory and has no
  outputs, so it must not be removed or reordered;
- the snippet contains no `ret` or `.align`: inline assembly is pasted into the
  middle of the enclosing function, so a `ret` would skip that function's epilogue
  once the call is inlined.
"""
function f(D, A, X)
    # `pointer` yields raw addresses that do not root the arrays; keep all three alive
    # for the duration of the call.
    GC.@preserve D A X @asmcall("""
vbroadcastsd	(%rdx), %ymm1
vmovupd	(%rsi), %ymm13
vmovupd	32(%rsi), %ymm12
vmovupd	64(%rsi), %ymm10
vmovupd	96(%rsi), %ymm11
vmulpd	%ymm1, %ymm12, %ymm0
vmulpd	%ymm1, %ymm13, %ymm1
vbroadcastsd	128(%rdx), %ymm3
vmulpd	%ymm3, %ymm12, %ymm2
vmulpd	%ymm3, %ymm13, %ymm3
vbroadcastsd	256(%rdx), %ymm5
vmulpd	%ymm5, %ymm12, %ymm4
vmulpd	%ymm5, %ymm13, %ymm5
vbroadcastsd	384(%rdx), %ymm7
vmulpd	%ymm7, %ymm12, %ymm6
vmulpd	%ymm7, %ymm13, %ymm7
vbroadcastsd	512(%rdx), %ymm9
vmulpd	%ymm9, %ymm12, %ymm8
vmulpd	%ymm9, %ymm13, %ymm9
vbroadcastsd	640(%rdx), %ymm14
vmulpd	%ymm14, %ymm12, %ymm12
vmulpd	%ymm14, %ymm13, %ymm13
vbroadcastsd	8(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm1
vfmadd231pd	%ymm14, %ymm11, %ymm0
vbroadcastsd	136(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm3
vfmadd231pd	%ymm14, %ymm11, %ymm2
vbroadcastsd	264(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm5
vfmadd231pd	%ymm14, %ymm11, %ymm4
vbroadcastsd	392(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm7
vfmadd231pd	%ymm14, %ymm11, %ymm6
vbroadcastsd	520(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm9
vfmadd231pd	%ymm14, %ymm11, %ymm8
vbroadcastsd	648(%rdx), %ymm14
vfmadd213pd	%ymm13, %ymm14, %ymm10
vfmadd231pd	%ymm14, %ymm11, %ymm12
vmovupd	128(%rsi), %ymm13
vmovupd	160(%rsi), %ymm11
vbroadcastsd	16(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm0
vfmadd231pd	%ymm14, %ymm13, %ymm1
vbroadcastsd	144(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm2
vfmadd231pd	%ymm14, %ymm13, %ymm3
vbroadcastsd	272(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm4
vfmadd231pd	%ymm14, %ymm13, %ymm5
vbroadcastsd	400(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm6
vfmadd231pd	%ymm14, %ymm13, %ymm7
vbroadcastsd	528(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm8
vfmadd231pd	%ymm14, %ymm13, %ymm9
vbroadcastsd	656(%rdx), %ymm14
vfmadd213pd	%ymm12, %ymm14, %ymm11
vfmadd231pd	%ymm14, %ymm13, %ymm10
vmovupd	224(%rsi), %ymm13
vmovupd	192(%rsi), %ymm12
vbroadcastsd	24(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm1
vfmadd231pd	%ymm14, %ymm13, %ymm0
vbroadcastsd	152(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm3
vfmadd231pd	%ymm14, %ymm13, %ymm2
vbroadcastsd	280(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm5
vfmadd231pd	%ymm14, %ymm13, %ymm4
vbroadcastsd	408(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm7
vfmadd231pd	%ymm14, %ymm13, %ymm6
vbroadcastsd	536(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm9
vfmadd231pd	%ymm14, %ymm13, %ymm8
vbroadcastsd	664(%rdx), %ymm14
vfmadd213pd	%ymm10, %ymm14, %ymm12
vfmadd231pd	%ymm14, %ymm13, %ymm11
vmovupd	256(%rsi), %ymm13
vmovupd	288(%rsi), %ymm10
vbroadcastsd	32(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm0
vfmadd231pd	%ymm14, %ymm13, %ymm1
vbroadcastsd	160(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm2
vfmadd231pd	%ymm14, %ymm13, %ymm3
vbroadcastsd	288(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm4
vfmadd231pd	%ymm14, %ymm13, %ymm5
vbroadcastsd	416(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm6
vfmadd231pd	%ymm14, %ymm13, %ymm7
vbroadcastsd	544(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm8
vfmadd231pd	%ymm14, %ymm13, %ymm9
vbroadcastsd	672(%rdx), %ymm14
vfmadd213pd	%ymm11, %ymm14, %ymm10
vfmadd231pd	%ymm14, %ymm13, %ymm12
vmovupd	352(%rsi), %ymm13
vmovupd	320(%rsi), %ymm11
vbroadcastsd	40(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm1
vfmadd231pd	%ymm14, %ymm13, %ymm0
vbroadcastsd	168(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm3
vfmadd231pd	%ymm14, %ymm13, %ymm2
vbroadcastsd	296(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm5
vfmadd231pd	%ymm14, %ymm13, %ymm4
vbroadcastsd	424(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm7
vfmadd231pd	%ymm14, %ymm13, %ymm6
vbroadcastsd	552(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm9
vfmadd231pd	%ymm14, %ymm13, %ymm8
vbroadcastsd	680(%rdx), %ymm14
vfmadd213pd	%ymm12, %ymm14, %ymm11
vfmadd231pd	%ymm14, %ymm13, %ymm10
vmovupd	384(%rsi), %ymm13
vmovupd	416(%rsi), %ymm12
vbroadcastsd	48(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm0
vfmadd231pd	%ymm14, %ymm13, %ymm1
vbroadcastsd	176(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm2
vfmadd231pd	%ymm14, %ymm13, %ymm3
vbroadcastsd	304(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm4
vfmadd231pd	%ymm14, %ymm13, %ymm5
vbroadcastsd	432(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm6
vfmadd231pd	%ymm14, %ymm13, %ymm7
vbroadcastsd	560(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm8
vfmadd231pd	%ymm14, %ymm13, %ymm9
vbroadcastsd	688(%rdx), %ymm14
vfmadd213pd	%ymm10, %ymm14, %ymm12
vfmadd231pd	%ymm14, %ymm13, %ymm11
vmovupd	480(%rsi), %ymm13
vmovupd	448(%rsi), %ymm10
vbroadcastsd	56(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm1
vfmadd231pd	%ymm14, %ymm13, %ymm0
vbroadcastsd	184(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm3
vfmadd231pd	%ymm14, %ymm13, %ymm2
vbroadcastsd	312(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm5
vfmadd231pd	%ymm14, %ymm13, %ymm4
vbroadcastsd	440(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm7
vfmadd231pd	%ymm14, %ymm13, %ymm6
vbroadcastsd	568(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm9
vfmadd231pd	%ymm14, %ymm13, %ymm8
vbroadcastsd	696(%rdx), %ymm14
vfmadd213pd	%ymm11, %ymm14, %ymm10
vfmadd231pd	%ymm14, %ymm13, %ymm12
vmovupd	512(%rsi), %ymm13
vmovupd	544(%rsi), %ymm11
vbroadcastsd	64(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm0
vfmadd231pd	%ymm14, %ymm13, %ymm1
vbroadcastsd	192(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm2
vfmadd231pd	%ymm14, %ymm13, %ymm3
vbroadcastsd	320(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm4
vfmadd231pd	%ymm14, %ymm13, %ymm5
vbroadcastsd	448(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm6
vfmadd231pd	%ymm14, %ymm13, %ymm7
vbroadcastsd	576(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm8
vfmadd231pd	%ymm14, %ymm13, %ymm9
vbroadcastsd	704(%rdx), %ymm14
vfmadd213pd	%ymm12, %ymm14, %ymm11
vfmadd231pd	%ymm14, %ymm13, %ymm10
vmovupd	608(%rsi), %ymm13
vmovupd	576(%rsi), %ymm12
vbroadcastsd	72(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm1
vfmadd231pd	%ymm14, %ymm13, %ymm0
vbroadcastsd	200(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm3
vfmadd231pd	%ymm14, %ymm13, %ymm2
vbroadcastsd	328(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm5
vfmadd231pd	%ymm14, %ymm13, %ymm4
vbroadcastsd	456(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm7
vfmadd231pd	%ymm14, %ymm13, %ymm6
vbroadcastsd	584(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm9
vfmadd231pd	%ymm14, %ymm13, %ymm8
vbroadcastsd	712(%rdx), %ymm14
vfmadd213pd	%ymm10, %ymm14, %ymm12
vfmadd231pd	%ymm14, %ymm13, %ymm11
vmovupd	640(%rsi), %ymm13
vmovupd	672(%rsi), %ymm10
vbroadcastsd	80(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm0
vfmadd231pd	%ymm14, %ymm13, %ymm1
vbroadcastsd	208(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm2
vfmadd231pd	%ymm14, %ymm13, %ymm3
vbroadcastsd	336(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm4
vfmadd231pd	%ymm14, %ymm13, %ymm5
vbroadcastsd	464(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm6
vfmadd231pd	%ymm14, %ymm13, %ymm7
vbroadcastsd	592(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm8
vfmadd231pd	%ymm14, %ymm13, %ymm9
vbroadcastsd	720(%rdx), %ymm14
vfmadd213pd	%ymm11, %ymm14, %ymm10
vfmadd231pd	%ymm14, %ymm13, %ymm12
vmovupd	736(%rsi), %ymm13
vmovupd	704(%rsi), %ymm11
vbroadcastsd	88(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm1
vfmadd231pd	%ymm14, %ymm13, %ymm0
vbroadcastsd	216(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm3
vfmadd231pd	%ymm14, %ymm13, %ymm2
vbroadcastsd	344(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm5
vfmadd231pd	%ymm14, %ymm13, %ymm4
vbroadcastsd	472(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm7
vfmadd231pd	%ymm14, %ymm13, %ymm6
vbroadcastsd	600(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm9
vfmadd231pd	%ymm14, %ymm13, %ymm8
vbroadcastsd	728(%rdx), %ymm14
vfmadd213pd	%ymm12, %ymm14, %ymm11
vfmadd231pd	%ymm14, %ymm13, %ymm10
vmovupd	768(%rsi), %ymm13
vmovupd	800(%rsi), %ymm12
vbroadcastsd	96(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm0
vfmadd231pd	%ymm14, %ymm13, %ymm1
vbroadcastsd	224(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm2
vfmadd231pd	%ymm14, %ymm13, %ymm3
vbroadcastsd	352(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm4
vfmadd231pd	%ymm14, %ymm13, %ymm5
vbroadcastsd	480(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm6
vfmadd231pd	%ymm14, %ymm13, %ymm7
vbroadcastsd	608(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm8
vfmadd231pd	%ymm14, %ymm13, %ymm9
vbroadcastsd	736(%rdx), %ymm14
vfmadd213pd	%ymm10, %ymm14, %ymm12
vfmadd231pd	%ymm14, %ymm13, %ymm11
vmovupd	864(%rsi), %ymm13
vmovupd	832(%rsi), %ymm10
vbroadcastsd	104(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm1
vfmadd231pd	%ymm14, %ymm13, %ymm0
vbroadcastsd	232(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm3
vfmadd231pd	%ymm14, %ymm13, %ymm2
vbroadcastsd	360(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm5
vfmadd231pd	%ymm14, %ymm13, %ymm4
vbroadcastsd	488(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm7
vfmadd231pd	%ymm14, %ymm13, %ymm6
vbroadcastsd	616(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm10, %ymm9
vfmadd231pd	%ymm14, %ymm13, %ymm8
vbroadcastsd	744(%rdx), %ymm14
vfmadd213pd	%ymm11, %ymm14, %ymm10
vfmadd231pd	%ymm14, %ymm13, %ymm12
vmovupd	896(%rsi), %ymm13
vmovupd	928(%rsi), %ymm11
vbroadcastsd	112(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm0
vfmadd231pd	%ymm14, %ymm13, %ymm1
vbroadcastsd	240(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm2
vfmadd231pd	%ymm14, %ymm13, %ymm3
vbroadcastsd	368(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm4
vfmadd231pd	%ymm14, %ymm13, %ymm5
vbroadcastsd	496(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm6
vfmadd231pd	%ymm14, %ymm13, %ymm7
vbroadcastsd	624(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm11, %ymm8
vfmadd231pd	%ymm14, %ymm13, %ymm9
vbroadcastsd	752(%rdx), %ymm14
vfmadd213pd	%ymm12, %ymm14, %ymm11
vfmadd231pd	%ymm14, %ymm13, %ymm10
vmovupd	992(%rsi), %ymm13
vmovupd	960(%rsi), %ymm12
vbroadcastsd	120(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm1
vfmadd231pd	%ymm14, %ymm13, %ymm0
vbroadcastsd	248(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm3
vfmadd231pd	%ymm14, %ymm13, %ymm2
vbroadcastsd	376(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm5
vfmadd231pd	%ymm14, %ymm13, %ymm4
vbroadcastsd	504(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm7
vfmadd231pd	%ymm14, %ymm13, %ymm6
vbroadcastsd	632(%rdx), %ymm14
vfmadd231pd	%ymm14, %ymm12, %ymm9
vfmadd231pd	%ymm14, %ymm13, %ymm8
vbroadcastsd	760(%rdx), %ymm14
vfmadd213pd	%ymm10, %ymm14, %ymm12
vfmadd231pd	%ymm14, %ymm13, %ymm11
vmovupd	%ymm0, 32(%rdi)
vmovupd	%ymm1, (%rdi)
vmovupd	%ymm2, 96(%rdi)
vmovupd	%ymm3, 64(%rdi)
vmovupd	%ymm4, 160(%rdi)
vmovupd	%ymm5, 128(%rdi)
vmovupd	%ymm6, 224(%rdi)
vmovupd	%ymm7, 192(%rdi)
vmovupd	%ymm8, 288(%rdi)
vmovupd	%ymm9, 256(%rdi)
vmovupd	%ymm11, 352(%rdi)
vmovupd	%ymm12, 320(%rdi)
vzeroupper
""", "{rdi},{rsi},{rdx},~{memory},~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15}", true, Nothing, Tuple{Ptr{Float64},Ptr{Float64},Ptr{Float64}}, pointer(D), pointer(A), pointer(X))
end

Inlining (via adding $(Expr(:meta, :inline)) to the quote), or defining the matrices as constants rather than interpolating them, helps.

julia> @benchmark genkernel!($D, $A, $X)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     60.377 ns (0.00% GC)
  median time:      61.173 ns (0.00% GC)
  mean time:        62.598 ns (0.00% GC)
  maximum time:     405.960 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     982

julia> @benchmark f($D, $A, $X)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     49.402 ns (0.00% GC)
  median time:      51.596 ns (0.00% GC)
  mean time:        52.376 ns (0.00% GC)
  maximum time:     100.278 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     987

julia> @benchmark inlinegenkernel!($D, $A, $X)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     49.991 ns (0.00% GC)
  median time:      51.657 ns (0.00% GC)
  mean time:        52.406 ns (0.00% GC)
  maximum time:     64.508 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     987

julia> const cD, cA, cX = D, A, X;

julia> @benchmark genkernel!(cD, cA, cX)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     51.201 ns (0.00% GC)
  median time:      52.481 ns (0.00% GC)
  mean time:        53.217 ns (0.00% GC)
  maximum time:     61.890 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     986

julia> @benchmark inlinegenkernel!(cD, cA, cX)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     47.508 ns (0.00% GC)
  median time:      49.262 ns (0.00% GC)
  mean time:        49.980 ns (0.00% GC)
  maximum time:     64.819 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     988

julia> @benchmark f(cD, cA, cX) #no segfault when running outside of benchmark

signal (11): Segmentation fault
in expression starting at no file:0
macro expansion at /home/chris/.julia/packages/LLVM/RyXLI/src/interop/base.jl:42 [inlined]
macro expansion at /home/chris/.julia/packages/LLVM/RyXLI/src/interop/asmcall.jl:7 [inlined]
_asmcall at /home/chris/.julia/packages/LLVM/RyXLI/src/interop/asmcall.jl:7 [inlined]
f at ./REPL[18]:1 [inlined]
##core#415 at /home/chris/.julia/packages/BenchmarkTools/dtwnm/src/execution.jl:293
##sample#416 at /home/chris/.julia/packages/BenchmarkTools/dtwnm/src/execution.jl:299
#_run#20 at /home/chris/.julia/packages/BenchmarkTools/dtwnm/src/execution.jl:327
unknown function (ip: 0x7faba5d2e45b)
jl_fptr_trampoline at /home/chris/Documents/prog/julia-dev2/src/gf.c:1843
jl_apply_generic at /home/chris/Documents/prog/julia-dev2/src/gf.c:2198

Not sure why the asm function segfaults in the benchmark, but not outside of it.

For anyone wanting to replicate, the asm is platform dependent and seems finicky (see above segfault). My earlier post was made on a Haswell i3, while with Ryzen I got segfaults until I reran @code_native and cleaned the output. It looked almost identical, except it led with:

	movq	%rsi, -8(%rsp)
	movq	16(%rsi), %rcx
	movq	8(%rsi), %rdx
	movq	(%rsi), %rax

Then it worked for the non-constant arguments, but still segfaulted with the constants. I don’t know how to write asm, so perhaps I’m missing something obvious.

Regarding speed differences, could something be going on with the dispatch?