Division isn't auto-vectorized when the function is inlined

I’m attempting to rewrite the benchmarks game spectralnorm implementation without relying on llvmcall for vectorization. I got close to the original’s performance, but only by awkwardly doing the division in a helper function annotated with @noinline. I’m having trouble reading the generated LLVM IR/assembly, but it looks to me like the division isn’t vectorized to fdiv <2 x double> without the @noinline annotation.

Concretely, for the following program where main is called with “5500”, why is it roughly half the speed if the @noinline macro is removed from vecdiv, and what can I do about it?

A(i, j)::Float64 = (i + j) * (i + j + 1) ÷ 2 + i + 1
At(i, j) = A(j, i)

@noinline vecdiv(a, b, c, d) = (a / b, c / d)

function Av!(f, v, out)
    n = length(v)
    for i=1:n
        x = (0.0, 0.0)
        @inbounds for j=1:2:n
            x = x .+ vecdiv(v[j], f(i-1, j-1), v[j+1], f(i-1, j))
        end
        @inbounds out[i] = sum(x)
    end
end

function main(n)
    # This program is not compatible with odd values of n
    isodd(n) && (n += 1)

    u = ones(Float64, n)
    v = Vector{Float64}(undef, n)
    # temporary working vector w
    w = Vector{Float64}(undef, n)

    for _=1:10
        Av!(A, u, w)
        Av!(At, w, v)
        Av!(A, v, w)
        Av!(At, w, u)
    end

    uv = vv = 0.0
    @inbounds for i=1:n
        uv += u[i] * v[i]
        vv += v[i] * v[i]
    end
    sqrt(uv / vv)
end

(Speed also drops by a factor of 2 if the type annotation is removed from A.)
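
For reference, this is roughly how I’ve been checking whether the division gets packed (just a sketch; the exact output depends on the machine and Julia version):

using InteractiveUtils  # for code_llvm outside the REPL

# Dump the optimized IR for Av! with the integer A above:
code_llvm(stdout, Av!, (typeof(A), Vector{Float64}, Vector{Float64}); debuginfo=:none)
# With @noinline on vecdiv the IR contains fdiv <2 x double>; without it
# I only find scalar fdiv double, as far as I can tell.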

Running your version…

julia> @btime main(1000)
  46.669 ms (3 allocations: 23.81 KiB)
1.274224148129483

julia> vecdiv(a, b, c, d) = (a / b, c / d)
vecdiv (generic function with 1 method)

julia> @btime main(1000)
  39.060 ms (3 allocations: 23.81 KiB)
1.274224148129483

Redefining a couple more things (doing the obvious thing of adding @simd to get SIMD):

A(i, j) = ((i + j) * (i + j + 1)) >>> 1 + i + 1.0
function Av!(f, v, out)
    n = length(v)
    for i=1:n
        x = (0.0, 0.0)
        @inbounds @simd for k=1:(n+1)>>>1;
            j = 2k - 1
            x = x .+ vecdiv(v[j], f(i-1, j-1), v[j+1], f(i-1, j))
        end
        @inbounds out[i] = sum(x)
    end
end

This yields:

julia> @btime main(1000)
  19.227 ms (3 allocations: 23.81 KiB)
1.274224148129483

Anyway, I should point out that AVX512 (which this computer has) can do SIMD Int64 → Float64 conversion, but AVX2 cannot.
Therefore, vectorizing the above code requires AVX512.

AVX can do SIMD Int32 → Float64, so I would recommend trying to replace the 64-bit integers with 32-bit ones.
The loop body:

L336:
        vpaddq  %zmm10, %zmm10, %zmm1
        vpaddq  %zmm24, %zmm1, %zmm14
        vporq   (%r15){1to8}, %zmm1, %zmm15
        vpaddq  (%r12){1to8}, %zmm1, %zmm16
        vmovq   %xmm1, %rax
        vmovq   %xmm14, %rcx
        vmovupd (%rsi,%rax,8), %zmm17
        vmovupd 64(%rsi,%rax,8), %zmm18
        vmovupd (%rsi,%rcx,8), %zmm19
        vmovupd 64(%rsi,%rcx,8), %zmm20
        vmovapd %zmm17, %zmm21
        vpermt2pd       %zmm18, %zmm25, %zmm21
        vmovapd %zmm19, %zmm22
        vpermt2pd       %zmm20, %zmm25, %zmm22
        vpermt2pd       %zmm18, %zmm4, %zmm17
        vpermt2pd       %zmm20, %zmm4, %zmm19
        vpaddq  %zmm8, %zmm1, %zmm18
        vpaddq  %zmm8, %zmm14, %zmm20
        vpaddq  %zmm9, %zmm1, %zmm1
        vpmullq %zmm1, %zmm18, %zmm1
        vpaddq  %zmm9, %zmm14, %zmm14
        vpmullq %zmm14, %zmm20, %zmm14
        vpsrlq  $1, %zmm1, %zmm1
        vpsrlq  $1, %zmm14, %zmm14
        vpaddq  %zmm8, %zmm1, %zmm1
        vpaddq  %zmm8, %zmm14, %zmm14
        vcvtqq2pd       %zmm1, %zmm1
        vcvtqq2pd       %zmm14, %zmm14
        vaddpd  %zmm5, %zmm1, %zmm1
        vdivpd  %zmm1, %zmm21, %zmm1
        vaddpd  %zmm1, %zmm11, %zmm11
        vaddpd  %zmm5, %zmm14, %zmm1
        vdivpd  %zmm1, %zmm22, %zmm1
        vaddpd  %zmm1, %zmm12, %zmm12
        vpaddq  %zmm8, %zmm15, %zmm1
        vpaddq  %zmm8, %zmm16, %zmm14
        vpaddq  %zmm9, %zmm15, %zmm15
        vpmullq %zmm15, %zmm1, %zmm1
        vpaddq  %zmm9, %zmm16, %zmm15
        vpmullq %zmm15, %zmm14, %zmm14
        vpsrlq  $1, %zmm1, %zmm1
        vpsrlq  $1, %zmm14, %zmm14
        vpaddq  %zmm8, %zmm1, %zmm1
        vpaddq  %zmm8, %zmm14, %zmm14
        vcvtqq2pd       %zmm1, %zmm1
        vcvtqq2pd       %zmm14, %zmm14
        vaddpd  %zmm5, %zmm1, %zmm1
        vdivpd  %zmm1, %zmm17, %zmm1
        vaddpd  %zmm1, %zmm6, %zmm6
        vaddpd  %zmm5, %zmm14, %zmm1
        vdivpd  %zmm1, %zmm19, %zmm1
        vaddpd  %zmm1, %zmm7, %zmm7
        vpaddq  %zmm24, %zmm10, %zmm10
        addq    $-16, %rbx
        jne     L336
        vaddpd  %zmm11, %zmm12, %zmm1
        vextractf64x4   $1, %zmm1, %ymm8
        vaddpd  %zmm8, %zmm1, %zmm1
        vextractf128    $1, %ymm1, %xmm8
        vaddpd  %zmm6, %zmm7, %zmm6
        vextractf64x4   $1, %zmm6, %ymm7
        vaddpd  %zmm7, %zmm6, %zmm6
        vextractf128    $1, %ymm6, %xmm7
        vaddpd  %xmm7, %xmm6, %xmm6
        vaddpd  %xmm8, %xmm1, %xmm1
        vhaddpd %xmm6, %xmm1, %xmm6
        cmpq    %rdx, %r11
        je      L208

Note the vcvtqq2pd instructions which require AVX512.
To make this fast on AVX as well (at the cost of a much smaller upper limit on the allowed size of n: 2^32, i.e. a little over 4 billion):

A(i, j) = ((i + j) * (i + j + one(UInt32))) >>> 1 + i + 1.0
function Av!(f, v, out)
    n = length(v)
    for i=one(UInt32):(n % UInt32)
        x = (0.0, 0.0)
        @inbounds @simd for k=one(UInt32):((n%UInt32)+one(UInt32))>>>1;
            j = k+k - one(UInt32)
            x = x .+ vecdiv(v[j], f(i-one(UInt32), j-one(UInt32)), v[j+one(UInt32)], f(i-one(UInt32), j))
        end
        @inbounds out[i] = sum(x)
    end
end

Now it’s about the same speed with AVX512, but should be much faster on machines without it:

julia> @btime main(1000)
  19.585 ms (3 allocations: 23.81 KiB)
1.274224148129483

because it now uses vcvtdq2pd, which is available with SSE and AVX.

L336:
        vpaddd  %ymm13, %ymm13, %ymm2
        vpaddd  %ymm28, %ymm2, %ymm15
        vpord   (%r14){1to8}, %ymm2, %ymm16
        vpaddd  (%r15){1to8}, %ymm2, %ymm17
        vmovd   %xmm16, %ecx
        vmovd   %xmm17, %eax
        vmovupd -8(%rsi,%rcx,8), %zmm18
        vmovupd 56(%rsi,%rcx,8), %zmm19
        vmovupd -8(%rsi,%rax,8), %zmm20
        vmovupd 56(%rsi,%rax,8), %zmm21
        vmovapd %zmm18, %zmm22
        vpermt2pd       %zmm19, %zmm4, %zmm22
        vmovapd %zmm20, %zmm23
        vpermt2pd       %zmm21, %zmm4, %zmm23
        vpermt2pd       %zmm19, %zmm5, %zmm18
        vpermt2pd       %zmm21, %zmm5, %zmm20
        vpmovqd %zmm10, %ymm19
        vpaddd  %ymm0, %ymm19, %ymm21
        vpaddd  %ymm21, %ymm2, %ymm24
        vpaddd  %ymm21, %ymm15, %ymm25
        vpaddd  %ymm19, %ymm2, %ymm2
        vpmulld %ymm2, %ymm24, %ymm2
        vpaddd  %ymm19, %ymm15, %ymm15
        vpmulld %ymm15, %ymm25, %ymm15
        vpsrld  $1, %ymm2, %ymm2
        vpsrld  $1, %ymm15, %ymm15
        vpaddd  %ymm21, %ymm2, %ymm2
        vpaddd  %ymm21, %ymm15, %ymm15
        vcvtudq2pd      %ymm2, %zmm2
        vcvtudq2pd      %ymm15, %zmm15
        vaddpd  %zmm7, %zmm2, %zmm2
        vdivpd  %zmm2, %zmm22, %zmm2
        vaddpd  %zmm2, %zmm11, %zmm11
        vaddpd  %zmm7, %zmm15, %zmm2
        vdivpd  %zmm2, %zmm23, %zmm2
        vaddpd  %zmm2, %zmm12, %zmm12
        vpaddd  %ymm21, %ymm16, %ymm2
        vpaddd  %ymm19, %ymm16, %ymm15
        vpmulld %ymm15, %ymm2, %ymm2
        vpaddd  %ymm21, %ymm17, %ymm15
        vpaddd  %ymm19, %ymm17, %ymm16
        vpmulld %ymm16, %ymm15, %ymm15
        vpsrld  $1, %ymm2, %ymm2
        vpsrld  $1, %ymm15, %ymm15
        vpaddd  %ymm21, %ymm2, %ymm2
        vpaddd  %ymm21, %ymm15, %ymm15
        vcvtudq2pd      %ymm2, %zmm2
        vcvtudq2pd      %ymm15, %zmm15
        vaddpd  %zmm7, %zmm2, %zmm2
        vdivpd  %zmm2, %zmm18, %zmm2
        vaddpd  %zmm2, %zmm8, %zmm8
        vaddpd  %zmm7, %zmm15, %zmm2
        vdivpd  %zmm2, %zmm20, %zmm2
        vaddpd  %zmm2, %zmm9, %zmm9
        vpaddd  %ymm28, %ymm13, %ymm13
        addq    $-16, %rbx
        jne     L336

This is odd. On my computer:

julia> @btime main($1000)
  79.046 ms (3 allocations: 23.81 KiB)
1.274224148129483

julia> vecdiv(a, b, c, d) = (a / b, c / d)
vecdiv (generic function with 1 method)

julia> @btime main($1000)
  156.331 ms (3 allocations: 23.81 KiB)
1.274224148129483

julia> versioninfo()
Julia Version 1.3.1
Commit 2d5741174c (2019-12-30 21:36 UTC)
Platform Info:
  OS: Linux (x86_64-unknown-linux-gnu)
  CPU: Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-6.0.1 (ORCJIT, ivybridge)

I thought at first it might have something to do with the fact that I’ve been setting --cpu-target=core2, but performance is identical without that flag.

Ah. Reading more of your post, I see that it’s taking advantage of AVX512 on your computer. Good tip on using Int32. I’ll see if that helps.

I should note that because core2 has only the XMM registers, the @simd macro has no effect as long as the auto-vectorizer is able to notice the two identical side-by-side arithmetic operations in the inner loop.

Ah. With wider registers, @simd lets it evaluate multiple loop iterations at a time instead: rather than holding the side-by-side numbers from within a single loop iteration, a register holds one of those numbers from each of several iterations.
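
To make that concrete with a toy example (not the benchmark code): in a plain reduction like the one below, @simd tells LLVM it may reassociate the sum, so it can keep several partial sums in the lanes of one wide register, one loop iteration per lane, and combine them after the loop.

function simdsum(v)
    s = 0.0
    @inbounds @simd for i in eachindex(v)
        s += v[i]   # accumulates in vector lanes across iterations when vectorized
    end
    s
end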

Ideally of course, I wouldn’t have to explicitly do 2 operations per loop iteration and could just write it naively with @simd and get the simd + reduction across all n iterations, but I haven’t been able to get that to work at all.

With this version, testing with and without the @noinline, I get:

julia> @btime main($1000)
  82.468 ms (3 allocations: 23.81 KiB)
1.274224148129483

julia> vecdiv(a, b, c, d) = (a / b, c / d)
vecdiv (generic function with 1 method)

julia> @btime main($1000)
  156.155 ms (3 allocations: 23.81 KiB)
1.274224148129483

Looking at the code_native, I see some divsd %xmm4, %xmm3, but my understanding is that most of the time the simd registers will be used even for unvectorized fp math, right? So I need to look more closely.

Ha ha. Somehow, I didn’t see that. Updated version of my code:

A(i, j) = ((i + j) * (i + j + one(UInt32))) >>> 1 + i + 1.0
function Av!(f, v, out)
    n = length(v)
    for i=one(UInt32):(n % UInt32)
        x = 0.0
        @inbounds @simd for j=one(UInt32):(n%UInt32);
            x += v[j] / f(i-one(UInt32), j-one(UInt32))
        end
        @inbounds out[i] = x
    end
end

Yielding the same time:

julia> @btime main(1000)
  19.350 ms (3 allocations: 23.81 KiB)
1.2742241481294834

Mind sharing the @code_native debuginfo=:none output? It’s unfortunate that it did not work.

sd means “scalar double” (a single value); pd means “packed double” (a vector of values).
Thus, (v)divpd is what you’d be looking for.
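
If scanning the dump by eye gets tedious, something like this (just a sketch) makes the check mechanical:

using InteractiveUtils

buf = IOBuffer()
code_native(buf, Av!, (typeof(A), Vector{Float64}, Vector{Float64}); debuginfo=:none)
asm = String(take!(buf))
occursin("divpd", asm)   # true if a packed double division made it into the loop
occursin("divsd", asm)   # are scalar double divisions still present?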

With this version:

julia> @btime main($1000)
  156.168 ms (3 allocations: 23.81 KiB)
1.2742241481294836

Ahhh. Thanks. I still need a lot of practice reading native code.

@code_native in text-dump below:

julia> code_native(Av!, (typeof(A), Vector{Float64}, Vector{Float64}), debuginfo=:none)
	.text
	movq	%rsi, -8(%rsp)
	movq	8(%rsi), %rcx
	movq	8(%rcx), %rax
	testl	%eax, %eax
	je	L144
	movl	%eax, %r9d
	movq	16(%rsi), %rax
	movq	(%rax), %rax
	testq	%r9, %r9
	je	L160
	movq	(%rcx), %r8
	movl	$1, %r10d
	movabsq	$140044753166208, %rcx  # imm = 0x7F5EB5C3DF80
	movsd	(%rcx), %xmm0           # xmm0 = mem[0],zero
	nopl	(%rax,%rax)
L64:
	xorpd	%xmm1, %xmm1
	movl	%r10d, %edi
	movq	%r9, %rcx
	movq	%r8, %rdx
	nopl	(%rax)
L80:
	movsd	(%rdx), %xmm2           # xmm2 = mem[0],zero
	leal	-1(%rdi), %esi
	imull	%edi, %esi
	shrl	%esi
	leal	-1(%rsi,%r10), %esi
	xorps	%xmm3, %xmm3
	cvtsi2sdq	%rsi, %xmm3
	addsd	%xmm0, %xmm3
	divsd	%xmm3, %xmm2
	addsd	%xmm2, %xmm1
	addq	$8, %rdx
	incl	%edi
	decq	%rcx
	jne	L80
	movsd	%xmm1, -8(%rax,%r10,8)
	cmpq	%r9, %r10
	leaq	1(%r10), %r10
	jne	L64
L144:
	movabsq	$jl_system_image_data, %rax
	retq
	nopl	(%rax,%rax)
L160:
	movq	$0, (%rax)
	addq	$8, %rax
	jmp	L160
	nopl	(%rax)

EDIT: you should be able to see these same results and play with them on your own computer by using the --cpu-target=core2 flag when starting Julia, right?

No SIMD at all. I also noticed

cvtsi2sdq	%rsi, %xmm3

I don’t recall which architectures were first able to hold vectors of integers in floating point registers.
Maybe that’s the problem?

Anyway, why use integers at all?

A(i, j) = @fastmath 0.5 * ((i + j) * (i + j + 1.0)) + i + 1.0
function Av!(f, v, out)
    n = length(v)
    ifloat = 0.0
    @fastmath for i=one(UInt32):(n % UInt32)
        x = 0.0; jfloat = 0.0
        @inbounds @simd for j=one(UInt32):(n%UInt32);
            x += v[j] / f(ifloat, jfloat)
            jfloat += 1.0
        end
        ifloat += 1.0
        @inbounds out[i] = x
    end
end

It’s slower on this computer, but perhaps does better on yours?

julia> @btime main(1000)
  20.216 ms (3 allocations: 23.81 KiB)
1.2742241481294834

Good idea! This seems to do very marginally better than the original with explicit SIMD via llvmcall on my computer:

julia> @btime main($1000)
  78.511 ms (3 allocations: 23.81 KiB)
1.2742241481294827

julia> @btime Original.main($1000)
  78.822 ms (363 allocations: 56.31 KiB)
1.274224148129483

The only question is whether it will translate to the actual core2 CPU that the benchmarks run on. I’ll tinker with this some more and then submit it. I’m not sure the fragments we’re talking about here are really copyrightable, but just in case, will you release your work above under the benchmarks game license?

Sure, feel free to use the code.
Also, I’d replace the remaining UInt32 stuff with regular integers for readability. It shouldn’t make a difference to performance now that they’re not being converted to floating point (although I’d benchmark again to double check).
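
A sketch of what I mean (not re-benchmarked here): the same loops as my floating-point version above, just with plain Int ranges, since the indices are now only used for indexing:

A(i, j) = @fastmath 0.5 * ((i + j) * (i + j + 1.0)) + i + 1.0
function Av!(f, v, out)
    n = length(v)
    ifloat = 0.0
    @fastmath for i = 1:n            # plain Int range instead of UInt32
        x = 0.0; jfloat = 0.0
        @inbounds @simd for j = 1:n
            x += v[j] / f(ifloat, jfloat)
            jfloat += 1.0
        end
        ifloat += 1.0
        @inbounds out[i] = x
    end
end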

Why does Original.main allocate so much more? It may be faster if it didn’t.

It has the outer loop wrapped in Threads.@threads, which is actually what I’m attempting to do right now with your version above. But you’re right. I should be comparing them both without the Threads.@threads overhead even if JULIA_NUM_THREADS is set to 1.

Unfortunately, using an all-floating-point A function comes in just a tiny bit slower than the original (by less than 0.1%).

julia> @benchmark Original.main($5500)
BenchmarkTools.Trial: 
  memory estimate:  250.86 KiB
  allocs estimate:  1230
  --------------
  minimum time:     626.770 ms (0.00% GC)
  median time:      626.805 ms (0.00% GC)
  mean time:        626.826 ms (0.00% GC)
  maximum time:     626.945 ms (0.00% GC)
  --------------
  samples:          8
  evals/sample:     1

julia> @benchmark FloatA.main($5500)
BenchmarkTools.Trial: 
  memory estimate:  250.67 KiB
  allocs estimate:  1218
  --------------
  minimum time:     626.286 ms (0.00% GC)
  median time:      626.382 ms (0.00% GC)
  mean time:        626.403 ms (0.00% GC)
  maximum time:     626.552 ms (0.00% GC)
  --------------
  samples:          8
  evals/sample:     1

Usually this would be insignificant, but the goal here is to have an idiomatic Julia solution (without llvmcall) show up as the fastest Julia implementation on the site, so even tiny differences like this matter. I’m going to submit it anyway, so that it’s there for people who do more than just look at the fastest numbers.

But for the goal of having the fastest pure Julia implementation be free of llvmcall, I’m back to the drawing board of figuring out how to get a Float64 division to vectorize (with --cpu-target=core2) in the presence of Int arithmetic A without imposing the overhead of a (non-inlined) call.

Here’s the cleaned up version with threading for anyone interested:

using Printf

A(i, j) = 0.5((i + j) * (i + j + 1.0)) + i + 1.0
At(i, j) = A(j, i)

function Av!(f, v, out, n=length(v))
    Threads.@threads for i=1:n
        x = 0.0
        @simd for j=1:n
            x += @fastmath @inbounds v[j] / f(i - 1.0, j - 1.0)
        end
        @inbounds out[i] = x
    end
end

function main(n)
    u = ones(Float64, n)
    v = Vector{Float64}(undef, n)
    # temporary working vector w
    w = Vector{Float64}(undef, n)

    for _=1:10
        Av!(A, u, w, n)
        Av!(At, w, v, n)
        Av!(A, v, w, n)
        Av!(At, w, u, n)
    end

    uv = vv = 0.0
    @inbounds for i=1:n
        uv += u[i] * v[i]
        vv += v[i] * v[i]
    end
    sqrt(uv / vv)
end