How can I make Julia use AVX2 instructions for the following code?
kc = 2
Ac = rand(4*kc)
Bc = rand(kc*4)
AB = zeros(4*4)
function micro_ker!(AB, Ac, Bc, kc::Int, offsetA::Int, offsetB::Int)
MR = 4
NR = 4
fill!(AB, zero(eltype(AB)))
for k in 1:kc
for j in 1:NR
@inbounds @simd for i in 1:MR
AB[i + (j-1)*MR] += Ac[offsetA+i] * Bc[offsetB+j]
end
end
offsetA += MR
offsetB += NR
end
end
@code_native micro_ker!(AB, Ac, Bc, kc, 0, 0)
I got
julia> @code_native micro_ker!(AB, Ac, Bc, kc, 0, 0)
...
; Location: REPL[9]:8
; Function getindex; {
; Location: array.jl:731
vmovsd 40(%rax), %xmm0 # xmm0 = mem[0],zero
vmovsd 48(%rax), %xmm1 # xmm1 = mem[0],zero
vmovsd 56(%rax), %xmm2 # xmm2 = mem[0],zero
vmovsd 64(%rax), %xmm3 # xmm3 = mem[0],zero
vmovsd 72(%rax), %xmm4 # xmm4 = mem[0],zero
vmovsd 80(%rax), %xmm5 # xmm5 = mem[0],zero
vmovsd 88(%rax), %xmm6 # xmm6 = mem[0],zero
;}}}
...
julia> versioninfo()
Julia Version 1.0.0
Commit 5d4eaca0c9 (2018-08-08 20:58 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-6.0.0 (ORCJIT, skylake)
Environment:
JULIA_PKG3_PRECOMPILE = 1
julia>
But with a simple matmul!
julia> function matmul!(C,A,B)
m, n = size(C)
_m, k = size(A)
_k, _n = size(B)
@assert n == _n && _k == k && m == _m
@inbounds for j in 1:n, k in 1:k, i in 1:m
C[i, j] += A[i, k] * B[k, j]
end
C
end
matmul! (generic function with 1 method)
I got
L608:
vbroadcastsd (%r11), %ymm0
vmulpd -96(%r14,%r9,8), %ymm0, %ymm1
vmulpd -64(%r14,%r9,8), %ymm0, %ymm2
vmulpd -32(%r14,%r9,8), %ymm0, %ymm3
vmulpd (%r14,%r9,8), %ymm0, %ymm0
;}
; Function +; {
; Location: float.jl:395
vaddpd -96(%rsi,%r9,8), %ymm1, %ymm1
vaddpd -64(%rsi,%r9,8), %ymm2, %ymm2
vaddpd -32(%rsi,%r9,8), %ymm3, %ymm3
vaddpd (%rsi,%r9,8), %ymm0, %ymm0
;}
; Function setindex!; {
; Location: array.jl:771
vmovupd %ymm1, -96(%rsi,%r9,8)
vmovupd %ymm2, -64(%rsi,%r9,8)
vmovupd %ymm3, -32(%rsi,%r9,8)
vmovupd %ymm0, (%rsi,%r9,8)
addq $16, %r9
cmpq %r9, %rcx
jne L608