Running your version…
julia> @btime main(1000)
46.669 ms (3 allocations: 23.81 KiB)
1.274224148129483
julia> vecdiv(a, b, c, d) = (a / b, c / d)
vecdiv (generic function with 1 method)
julia> @btime main(1000)
39.060 ms (3 allocations: 23.81 KiB)
1.274224148129483
Redefining a couple more things (doing the obvious of @simd
to get simd):
A(i, j) = ((i + j) * (i + j + 1)) >>> 1 + i + 1.0
function Av!(f, v, out)
n = length(v)
for i=1:n
x = (0.0, 0.0)
@inbounds @simd for k=1:(n+1)>>>1;
j = 2k - 1
x = x .+ vecdiv(v[j], f(i-1, j-1), v[j+1], f(i-1, j))
end
@inbounds out[i] = sum(x)
end
end
This yields:
julia> @btime main(1000)
19.227 ms (3 allocations: 23.81 KiB)
1.274224148129483
Anyway, I should point out that AVX512 (which this computer has) can do SIMD Int64 → Float64 conversion, but AVX2 cannot.
Therefore, vectorizing the above code requires AVX512.
AVX can do SIMD Int32 → Float64, so I would recommend you try replacing the 64 bit integers with 32 bit.
The loop body:
L336:
vpaddq %zmm10, %zmm10, %zmm1
vpaddq %zmm24, %zmm1, %zmm14
vporq (%r15){1to8}, %zmm1, %zmm15
vpaddq (%r12){1to8}, %zmm1, %zmm16
vmovq %xmm1, %rax
vmovq %xmm14, %rcx
vmovupd (%rsi,%rax,8), %zmm17
vmovupd 64(%rsi,%rax,8), %zmm18
vmovupd (%rsi,%rcx,8), %zmm19
vmovupd 64(%rsi,%rcx,8), %zmm20
vmovapd %zmm17, %zmm21
vpermt2pd %zmm18, %zmm25, %zmm21
vmovapd %zmm19, %zmm22
vpermt2pd %zmm20, %zmm25, %zmm22
vpermt2pd %zmm18, %zmm4, %zmm17
vpermt2pd %zmm20, %zmm4, %zmm19
vpaddq %zmm8, %zmm1, %zmm18
vpaddq %zmm8, %zmm14, %zmm20
vpaddq %zmm9, %zmm1, %zmm1
vpmullq %zmm1, %zmm18, %zmm1
vpaddq %zmm9, %zmm14, %zmm14
vpmullq %zmm14, %zmm20, %zmm14
vpsrlq $1, %zmm1, %zmm1
vpsrlq $1, %zmm14, %zmm14
vpaddq %zmm8, %zmm1, %zmm1
vpaddq %zmm8, %zmm14, %zmm14
vcvtqq2pd %zmm1, %zmm1
vcvtqq2pd %zmm14, %zmm14
vaddpd %zmm5, %zmm1, %zmm1
vdivpd %zmm1, %zmm21, %zmm1
vaddpd %zmm1, %zmm11, %zmm11
vaddpd %zmm5, %zmm14, %zmm1
vdivpd %zmm1, %zmm22, %zmm1
vaddpd %zmm1, %zmm12, %zmm12
vpaddq %zmm8, %zmm15, %zmm1
vpaddq %zmm8, %zmm16, %zmm14
vpaddq %zmm9, %zmm15, %zmm15
vpmullq %zmm15, %zmm1, %zmm1
vpaddq %zmm9, %zmm16, %zmm15
vpmullq %zmm15, %zmm14, %zmm14
vpsrlq $1, %zmm1, %zmm1
vpsrlq $1, %zmm14, %zmm14
vpaddq %zmm8, %zmm1, %zmm1
vpaddq %zmm8, %zmm14, %zmm14
vcvtqq2pd %zmm1, %zmm1
vcvtqq2pd %zmm14, %zmm14
vaddpd %zmm5, %zmm1, %zmm1
vdivpd %zmm1, %zmm17, %zmm1
vaddpd %zmm1, %zmm6, %zmm6
vaddpd %zmm5, %zmm14, %zmm1
vdivpd %zmm1, %zmm19, %zmm1
vaddpd %zmm1, %zmm7, %zmm7
vpaddq %zmm24, %zmm10, %zmm10
addq $-16, %rbx
jne L336
vaddpd %zmm11, %zmm12, %zmm1
vextractf64x4 $1, %zmm1, %ymm8
vaddpd %zmm8, %zmm1, %zmm1
vextractf128 $1, %ymm1, %xmm8
vaddpd %zmm6, %zmm7, %zmm6
vextractf64x4 $1, %zmm6, %ymm7
vaddpd %zmm7, %zmm6, %zmm6
vextractf128 $1, %ymm6, %xmm7
vaddpd %xmm7, %xmm6, %xmm6
vaddpd %xmm8, %xmm1, %xmm1
vhaddpd %xmm6, %xmm1, %xmm6
cmpq %rdx, %r11
je L208
Note the vcvtqq2pd
instructions which require AVX512.
To make this fast on AVX as well (but placing a much smaller upper limit on allowed size of n
, at 2^32, aka a little over 4 billion):
A(i, j) = ((i + j) * (i + j + one(UInt32))) >>> 1 + i + 1.0
function Av!(f, v, out)
n = length(v)
for i=one(UInt32):(n % UInt32)
x = (0.0, 0.0)
@inbounds @simd for k=one(UInt32):((n%UInt32)+one(UInt32))>>>1;
j = k+k - one(UInt32)
x = x .+ vecdiv(v[j], f(i-one(UInt32), j-one(UInt32)), v[j+one(UInt32)], f(i-one(UInt32), j))
end
@inbounds out[i] = sum(x)
end
end
Now it’s about the same fast with AVX512, but should be much faster without it:
julia> @btime main(1000)
19.585 ms (3 allocations: 23.81 KiB)
1.274224148129483
because it now uses vcvtdq2pd
, which is available with SSE and AVX.
L336:
vpaddd %ymm13, %ymm13, %ymm2
vpaddd %ymm28, %ymm2, %ymm15
vpord (%r14){1to8}, %ymm2, %ymm16
vpaddd (%r15){1to8}, %ymm2, %ymm17
vmovd %xmm16, %ecx
vmovd %xmm17, %eax
vmovupd -8(%rsi,%rcx,8), %zmm18
vmovupd 56(%rsi,%rcx,8), %zmm19
vmovupd -8(%rsi,%rax,8), %zmm20
vmovupd 56(%rsi,%rax,8), %zmm21
vmovapd %zmm18, %zmm22
vpermt2pd %zmm19, %zmm4, %zmm22
vmovapd %zmm20, %zmm23
vpermt2pd %zmm21, %zmm4, %zmm23
vpermt2pd %zmm19, %zmm5, %zmm18
vpermt2pd %zmm21, %zmm5, %zmm20
vpmovqd %zmm10, %ymm19
vpaddd %ymm0, %ymm19, %ymm21
vpaddd %ymm21, %ymm2, %ymm24
vpaddd %ymm21, %ymm15, %ymm25
vpaddd %ymm19, %ymm2, %ymm2
vpmulld %ymm2, %ymm24, %ymm2
vpaddd %ymm19, %ymm15, %ymm15
vpmulld %ymm15, %ymm25, %ymm15
vpsrld $1, %ymm2, %ymm2
vpsrld $1, %ymm15, %ymm15
vpaddd %ymm21, %ymm2, %ymm2
vpaddd %ymm21, %ymm15, %ymm15
vcvtudq2pd %ymm2, %zmm2
vcvtudq2pd %ymm15, %zmm15
vaddpd %zmm7, %zmm2, %zmm2
vdivpd %zmm2, %zmm22, %zmm2
vaddpd %zmm2, %zmm11, %zmm11
vaddpd %zmm7, %zmm15, %zmm2
vdivpd %zmm2, %zmm23, %zmm2
vaddpd %zmm2, %zmm12, %zmm12
vpaddd %ymm21, %ymm16, %ymm2
vpaddd %ymm19, %ymm16, %ymm15
vpmulld %ymm15, %ymm2, %ymm2
vpaddd %ymm21, %ymm17, %ymm15
vpaddd %ymm19, %ymm17, %ymm16
vpmulld %ymm16, %ymm15, %ymm15
vpsrld $1, %ymm2, %ymm2
vpsrld $1, %ymm15, %ymm15
vpaddd %ymm21, %ymm2, %ymm2
vpaddd %ymm21, %ymm15, %ymm15
vcvtudq2pd %ymm2, %zmm2
vcvtudq2pd %ymm15, %zmm15
vaddpd %zmm7, %zmm2, %zmm2
vdivpd %zmm2, %zmm18, %zmm2
vaddpd %zmm2, %zmm8, %zmm8
vaddpd %zmm7, %zmm15, %zmm2
vdivpd %zmm2, %zmm20, %zmm2
vaddpd %zmm2, %zmm9, %zmm9
vpaddd %ymm28, %ymm13, %ymm13
addq $-16, %rbx
jne L336