How to make Julia use AVX2 instructions?

How can I make Julia use AVX2 instructions for the following code?

kc = 2
Ac = rand(4*kc)
Bc = rand(kc*4)
AB = zeros(4*4)
function micro_ker!(AB, Ac, Bc, kc::Int, offsetA::Int, offsetB::Int)
    MR = 4
    NR = 4
    fill!(AB, zero(eltype(AB)))
    for k in 1:kc
        for j in 1:NR
            @inbounds @simd for i in 1:MR
                AB[i + (j-1)*MR] += Ac[offsetA+i] * Bc[offsetB+j]
            end
        end
        offsetA += MR
        offsetB += NR
    end
end
@code_native micro_ker!(AB, Ac, Bc, kc, 0, 0)

I got

julia> @code_native micro_ker!(AB, Ac, Bc, kc, 0, 0)
...
; Location: REPL[9]:8
; Function getindex; {
; Location: array.jl:731
        vmovsd  40(%rax), %xmm0         # xmm0 = mem[0],zero
        vmovsd  48(%rax), %xmm1         # xmm1 = mem[0],zero
        vmovsd  56(%rax), %xmm2         # xmm2 = mem[0],zero
        vmovsd  64(%rax), %xmm3         # xmm3 = mem[0],zero
        vmovsd  72(%rax), %xmm4         # xmm4 = mem[0],zero
        vmovsd  80(%rax), %xmm5         # xmm5 = mem[0],zero
        vmovsd  88(%rax), %xmm6         # xmm6 = mem[0],zero
;}}}
...

julia> versioninfo()
Julia Version 1.0.0
Commit 5d4eaca0c9 (2018-08-08 20:58 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-6.0.0 (ORCJIT, skylake)
Environment:
  JULIA_PKG3_PRECOMPILE = 1

julia>

But with a simple matmul!

julia> function matmul!(C,A,B)
           m, n = size(C)
           _m, k = size(A)
           _k, _n = size(B)
           @assert n == _n && _k == k && m == _m
           @inbounds for j in 1:n, k in 1:k, i in 1:m
               C[i, j] += A[i, k] * B[k, j]
           end
           C
       end
matmul! (generic function with 1 method)

I got

L608:
        vbroadcastsd    (%r11), %ymm0
        vmulpd  -96(%r14,%r9,8), %ymm0, %ymm1
        vmulpd  -64(%r14,%r9,8), %ymm0, %ymm2
        vmulpd  -32(%r14,%r9,8), %ymm0, %ymm3
        vmulpd  (%r14,%r9,8), %ymm0, %ymm0
;}
; Function +; {
; Location: float.jl:395
        vaddpd  -96(%rsi,%r9,8), %ymm1, %ymm1
        vaddpd  -64(%rsi,%r9,8), %ymm2, %ymm2
        vaddpd  -32(%rsi,%r9,8), %ymm3, %ymm3
        vaddpd  (%rsi,%r9,8), %ymm0, %ymm0
;}
; Function setindex!; {
; Location: array.jl:771
        vmovupd %ymm1, -96(%rsi,%r9,8)
        vmovupd %ymm2, -64(%rsi,%r9,8)
        vmovupd %ymm3, -32(%rsi,%r9,8)
        vmovupd %ymm0, (%rsi,%r9,8)
        addq    $16, %r9
        cmpq    %r9, %rcx
        jne     L608
1 Like

If you have anything specific in mind, explicit SIMD is easier than trying to seduce the compiler into behaving how you want.

I experimented with most of my usual tricks, but none seemed to work.
When you get to trying to unroll things with generated functions, that code really isn’t easier to write than just using SIMD, will probably be slower to compile, and is less reliable.

You also have to add @fastmath to get Julia to use vfmadd instructions.

2 Likes