How to make SIMD work?


#1

I’m trying follow this Demystifying Auto-Vectorization in Julia blog post. I couldn’t see the nice vaddpd instructions, however. Is there something that I need to do to enable SIMD? The E5-2699 v4 processor supports AVX2.

$ julia -O3

julia> versioninfo()
Julia Version 0.6.2
Commit d386e40c17 (2017-12-13 18:08 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
  LAPACK: libopenblas64_
  LIBM: libopenlibm
  LLVM: libLLVM-3.9.1 (ORCJIT, broadwell)

julia> function mysum(a::Vector)
           total = zero(eltype(a))
           @simd for x in a
               total += x
           end
           return total
       end
mysum (generic function with 1 method)

julia> @code_native mysum(rand(Float64 , 100000)) ;
        .text
Filename: REPL[8]
Source line: 67
        movq    8(%rdi), %rax
Source line: 64
        movq    24(%rdi), %rdx
        xorl    %ecx, %ecx
Source line: 79
        testq   %rdx, %rdx
        cmovnsq %rdx, %rcx
Source line: 68
        testq   %rax, %rax
        jle     L206
Source line: 79
        leaq    -1(%rcx), %rsi
        cmpq    %rdx, %rsi
        jae     L210
Source line: 50
        movq    (%rdi), %r9
Source line: 66
        leaq    16(%r9), %r8
        movq    %rax, %r10
        andq    $-4, %r10
        pxor    %xmm0, %xmm0
        xorl    %edi, %edi
        jmp     L192
Source line: 71
L64:
        testq   %rax, %rax
        jle     L192
Source line: 50
        cmpq    $4, %rax
        jae     L79
        xorl    %edx, %edx
        jmp     L165
L79:
        movq    %rax, %rdx
        andq    $-4, %rdx
        je      L163
        movq    %xmm0, %xmm0            # xmm0 = xmm0[0],zero
        xorpd   %xmm1, %xmm1
Source line: 74
        movq    %r10, %rsi
        movq    %r8, %rcx
        nopw    %cs:(%rax,%rax)
Source line: 50
L112:
        movupd  -16(%rcx), %xmm2
        movupd  (%rcx), %xmm3
Source line: 4
        addpd   %xmm2, %xmm0
        addpd   %xmm3, %xmm1
Source line: 50
        addq    $32, %rcx
        addq    $-4, %rsi
        jne     L112
Source line: 4
        addpd   %xmm0, %xmm1
        movapd  %xmm1, %xmm0
        shufpd  $1, %xmm0, %xmm0        # xmm0 = xmm0[1,0]
        addpd   %xmm1, %xmm0
        cmpq    %rdx, %rax
        je      L192
        jmp     L165
L163:
        xorl    %edx, %edx
Source line: 50
L165:
        movq    %rax, %rcx
        subq    %rdx, %rcx
        leaq    (%r9,%rdx,8), %rdx
        nop
Source line: 4
L176:
        addsd   (%rdx), %xmm0
Source line: 71
        addq    $8, %rdx
        decq    %rcx
        jne     L176
        nopl    (%rax)
Source line: 66
L192:
        incq    %rdi
        cmpq    $2, %rdi
        jne     L64
Source line: 6
        retq
L206:
        xorps   %xmm0, %xmm0
        retq
L210:
        pushq   %rbp
        movq    %rsp, %rbp
Source line: 79
        movq    %rsp, %rax
        leaq    -16(%rax), %rsi
        movq    %rsi, %rsp
        movq    %rcx, -16(%rax)
        movabsq $jl_bounds_error_ints, %rax
        movl    $1, %edx
        callq   *%rax
        nopw    %cs:(%rax,%rax)

#2

Have you rebuilt the system image for your machine (gotcha #7 here)?


#4

Gotcha #7 got me :slight_smile:
Thanks


#5

A graphical representation of the @simd + AVX2 effect

using BenchmarkTools
using Plots

pyplot()

function mysum_orig(a::Vector)
    total = zero(eltype(a))
    @simd for x in a
        total += x
    end
    return total
end
function mysum_nosimd(a::Vector)
    total = zero(eltype(a))
    for x in a
        total += x
    end
    return total
end
function mysum_nosimd_index(a::Vector)
    total = zero(eltype(a))
    n=length(a)
    for i=1:n
        @inbounds total += a[i]
    end
    return total
end
function mysum_native(a::Vector)
    return sum(a)
end

function test_myfunction(myfunction, T::Type,n::Int64)
    a=rand(T , n)
    t=@belapsed s=$myfunction($a)
    gflops=n/(t*1.e9)
    print("GFlops=",gflops," (",string(myfunction),")\n")
    return gflops
end

# for MyFloat in [Float32,Float64]
const MyFloat=Float64
for f in [mysum_orig;mysum_nosimd;mysum_nosimd_index;mysum_native]
    s=10
    gflops=[]
    sizes=[]
    for n=1:21
        push!(gflops,test_myfunction(f,MyFloat,s))
        push!(sizes,s)
        s*=2
    end
    plot!(sizes,gflops,xscale = :log10,linewidth=4,ylabel="GFLOPS",xlabel="arraySize",label=string(string(f)," ",string(MyFloat)))
end

savefig("gflops.pdf")
@code_native(mysum_orig(rand(Float64,1000)))ype or paste code here

The index based version looks a bit different than the range version…
gflops