Hmm. @code_native shows that my code actually failed to vectorize the way I intended. I’m trying to improve a macro to get that to work.
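(For reference, the way I check is to look at the generated code; a sketch with a hypothetical function f and its arguments:)

# Hypothetical f/args: packed AVX-512 code shows zmm registers and
# vfmadd...pd instructions in @code_native, and <8 x double> vectors
# in @code_llvm; a scalar fallback shows only xmm and ...sd.
@code_native f(args...)
@code_llvm f(args...)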
My @vectorize macro needs a lot of work, but with a little finagling:
using SIMDPirates, SLEEFwrap, BenchmarkTools, Base.Cartesian
@generated function bar_tv2!(result::AbstractArray{T}, array::AbstractArray{T}, ::Val{NCOL}) where {T,NCOL}
    quote
        @nexprs $NCOL j -> p_j = @inbounds array[1,j]
        @vectorize $T for i ∈ 1:size(array,1)
            dp = zero(T)
            @nexprs $NCOL j -> dp += p_j * array[i,j]
            @nexprs $NCOL j -> result[i,j] = dp * p_j + array[i,j]
        end
    end
end
function bar2!(result::AbstractArray, array::AbstractArray)
    m, n = size(array)
    @views p1 = array[:,1]
    @inbounds for i in 1:n
        dp = zero(eltype(array))
        @simd for j = 1:m
            dp += p1[j] * array[j,i]
        end
        @simd for j = 1:m
            result[j,i] = dp * p1[j] + array[j,i]
        end
    end
end
N = 1000
array = randn(3, N);
result = zeros(3, N);
array_t = copy(array');
result_t = zeros(N, 3);
This yields:
julia> bar_tv2!(result_t, array_t,Val(3)); result_t'
3×1000 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
-2.22506 -3.39461 -0.89401 1.37359 -1.27377 2.85605 -1.22525 -0.387945 3.23087 -0.718255 … 1.60807 -2.39425 0.113308 1.24666 -1.2275 -0.713399 -0.601649 -0.664115 0.665728
1.60194 3.84549 -1.26712 -1.59311 -1.31202 -2.01284 1.95004 -0.276039 -4.76663 -0.187859 0.880181 2.02346 -1.44376 0.426184 0.570249 -1.27561 0.865626 0.9036 -0.543402
-1.68125 -1.77578 -1.45913 0.29928 -1.20614 2.13511 -1.24661 -1.1421 2.60744 -0.425533 -0.961413 0.0212073 0.423453 -0.270527 -1.72279 0.281566 -1.08591 1.90826 0.918416
julia> bar2!(result, array); result
3×1000 Array{Float64,2}:
-2.22506 -3.39461 -0.89401 1.37359 -1.27377 2.85605 -1.22525 -0.387945 3.23087 -0.718255 … 1.60807 -2.39425 0.113308 1.24666 -1.2275 -0.713399 -0.601649 -0.664115 0.665728
1.60194 3.84549 -1.26712 -1.59311 -1.31202 -2.01284 1.95004 -0.276039 -4.76663 -0.187859 0.880181 2.02346 -1.44376 0.426184 0.570249 -1.27561 0.865626 0.9036 -0.543402
-1.68125 -1.77578 -1.45913 0.29928 -1.20614 2.13511 -1.24661 -1.1421 2.60744 -0.425533 -0.961413 0.0212073 0.423453 -0.270527 -1.72279 0.281566 -1.08591 1.90826 0.918416
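Rather than eyeballing the printed matrices, the agreement can be checked directly (a quick sketch, not part of the original transcript):

# The row-major (Nx3) and column-major (3xN) versions should agree elementwise.
result ≈ result_t'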
julia> @benchmark bar_tv2!($result_t, $array_t,Val(3))
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 433.854 ns (0.00% GC)
median time: 435.020 ns (0.00% GC)
mean time: 455.177 ns (0.00% GC)
maximum time: 668.051 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 198
julia> @benchmark bar2!($result, $array)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 3.025 μs (0.00% GC)
median time: 3.029 μs (0.00% GC)
mean time: 3.158 μs (0.00% GC)
maximum time: 7.316 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 8
Now, 100x faster than the original!
Note, I’m benchmarking on a computer with AVX-512, so most people aren’t likely to see the same benefit.
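A quick way to check what LLVM thinks your CPU is (a sketch; names vary by machine):

Sys.CPU_NAME  # AVX-512 chips report target names like "skylake-avx512"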
EDIT:
Cleaned up assembly (Discourse’s syntax highlighting doesn’t agree with the auto-inserted comments):
julia> @code_native bar_tv2!(result_t, array_t,Val(3))
.text
pushq %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $56, %rsp
movq %rsi, 48(%rsp)
movq (%rsi), %r14
movq 8(%rsi), %rax
movq (%rax), %r12
movq 24(%rax), %rbx
vmovsd (%r12), %xmm0 # xmm0 = mem[0],zero
vmovaps %xmm0, 32(%rsp)
vmovsd (%r12,%rbx,8), %xmm0 # xmm0 = mem[0],zero
vmovaps %xmm0, 16(%rsp)
movq %rbx, %rbp
shlq $4, %rbp
vmovsd (%r12,%rbp), %xmm0 # xmm0 = mem[0],zero
vmovapd %xmm0, (%rsp)
movq %rbx, %r13
sarq $63, %r13
shrq $61, %r13
addq %rbx, %r13
andq $-8, %r13
movq (%r14), %r15
movabsq $steprange_last, %rax
movl $1, %edi
movl $8, %esi
movq %r13, %rdx
callq *%rax
vmovapd 32(%rsp), %xmm10
vmovapd (%rsp), %xmm9
vmovapd 16(%rsp), %xmm8
testq %rax, %rax
jle L288
vbroadcastsd %xmm10, %zmm0
vbroadcastsd %xmm8, %zmm1
vbroadcastsd %xmm9, %zmm2
movq 24(%r14), %rcx
movq %rcx, %rdx
shlq $4, %rdx
addq $7, %rax
vxorpd %xmm3, %xmm3, %xmm3
movq %r12, %rsi
movq %r15, %rdi
nop
L192:
vmovupd (%rsi), %zmm4
vmovupd (%rsi,%rbx,8), %zmm5
vmovupd (%rsi,%rbp), %zmm6
vmovapd %zmm4, %zmm7
vfmadd213pd %zmm3, %zmm0, %zmm7
vfmadd231pd %zmm5, %zmm1, %zmm7
vfmadd231pd %zmm6, %zmm2, %zmm7
vfmadd231pd %zmm0, %zmm7, %zmm4
vmovupd %zmm4, (%rdi)
vfmadd231pd %zmm1, %zmm7, %zmm5
vmovupd %zmm5, (%rdi,%rcx,8)
vfmadd213pd %zmm6, %zmm2, %zmm7
vmovupd %zmm7, (%rdi,%rdx)
addq $64, %rdi
addq $64, %rsi
addq $-8, %rax
jne L192
L288:
movq %rbx, %rcx
subq %r13, %rcx
testq %rcx, %rcx
jle L495
vpbroadcastq %rcx, %zmm0
movabsq $140535612763840, %rax # imm = 0x7FD0FF46EEC0
vpaddq (%rax), %zmm0, %zmm0
movq %rbx, %rax
subq %rcx, %rax
vpternlogd $255, %zmm1, %zmm1, %zmm1
vpcmpgtq %zmm1, %zmm0, %k1
vmovupd (%r12,%rax,8), %zmm0 {%k1} {z}
vpxor %xmm1, %xmm1, %xmm1
leaq (%rbx,%rbx), %rcx
leaq (%rbx,%rax), %rdx
vmovupd (%r12,%rdx,8), %zmm2 {%k1} {z}
addq %rax, %rcx
vmovupd (%r12,%rcx,8), %zmm3 {%k1} {z}
vbroadcastsd %xmm10, %zmm4
vfmadd231pd %zmm0, %zmm4, %zmm1
vbroadcastsd %xmm8, %zmm5
vfmadd231pd %zmm2, %zmm5, %zmm1
vbroadcastsd %xmm9, %zmm6
vfmadd231pd %zmm3, %zmm6, %zmm1
vfmadd213pd %zmm0, %zmm1, %zmm4
vmovupd %zmm4, (%r15,%rax,8) {%k1}
vfmadd213pd %zmm2, %zmm1, %zmm5
movq 24(%r14), %rcx
leaq (%rcx,%rax), %rdx
vmovupd %zmm5, (%r15,%rdx,8) {%k1}
vfmadd213pd %zmm3, %zmm6, %zmm1
leaq (%rax,%rcx,2), %rax
vmovupd %zmm1, (%r15,%rax,8) {%k1}
movabsq $140535856324616, %rax # imm = 0x7FD10DCB6008
addq $56, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
vzeroupper
retq
L495:
movabsq $140535856324616, %rax # imm = 0x7FD10DCB6008
addq $56, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
vzeroupper
retq
A big pile of zmm (512-bit) registers and vfmadd instructions.
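Each of those vfmadd...pd instructions is a fused multiply-add on eight packed Float64 lanes. The scalar Julia analogue (a sketch for illustration) is fma or muladd:

# Scalar analogues of vfmadd: a*b + c computed with a single rounding.
fma(2.0, 3.0, 1.0)      # 7.0; always fused
muladd(2.0, 3.0, 1.0)   # 7.0; lets the compiler fuse when profitable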
EDIT:
@Seif_Shebl, did you try Intel Fortran on the version with transposed array and result (i.e., Nx3 instead of 3xN)? In earlier experimenting, both ifort and flang sometimes managed to correctly vectorize such for loops. Sometimes gfortran can do it too, but that often involves playing with flags like turning -fdisable-tree-cunrolli on or off. I haven’t tried a large number of loops, but it surprised me that the LLVM-based flang vectorized an example loop I tried earlier while Julia (without @vectorize) did not.
On that note, I’ve been meaning to find the time to play with Intel’s SPMD (Single Program, Multiple Data) Program Compiler, ispc, which is also LLVM-based. Here is a fascinating series of blog posts about it by its creator. From the sound of it, this is the sort of thing it should be well optimized for, although the author wishes some bigger project would work to better support it.
My success with Julia’s auto-vectorization has actually been pretty low, but the ability to write complicated transformative macros means it’s actually the most robustly capable of loop vectorization in practice (for me).
And SIMD intrinsics are so much easier in Julia, even without macros.
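For example, here is roughly what an explicit 8-wide kernel looks like using the SIMD.jl package (a sketch of my own, not the SIMDPirates code above; saxpy8! is a made-up name, and it assumes length(x) is a multiple of 8):

using SIMD

function saxpy8!(y::Vector{Float64}, a::Float64, x::Vector{Float64})
    va = Vec{8,Float64}(a)                # broadcast the scalar into all 8 lanes
    @inbounds for i in 1:8:length(x)
        vx = vload(Vec{8,Float64}, x, i)  # packed load of 8 doubles
        vy = vload(Vec{8,Float64}, y, i)
        vstore(muladd(va, vx, vy), y, i)  # packed fused multiply-add and store
    end
    y
end

On AVX-512, each Vec{8,Float64} fills one zmm register, so the loop body should compile down to the same vfmadd/vmovupd pattern as above.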
I really need to refactor the code base behind my @vectorize macro to make it easier to work with and extend. It is fairly limited at the moment; with a different code organization, it would be easier to add features and support more types of loops.
EDIT:
Actually, both Flang and gfortran do a great job on this problem without any tricks.
The Fortran implementation:
module vectorization_test
implicit none
contains

subroutine bar(result, array, N)
    integer(8), intent(in)  :: N
    real(8), dimension(N,3), intent(in)  :: array
    real(8), dimension(N,3), intent(out) :: result
    integer(8) :: i
    real(8)    :: p1(3), dp

    p1 = array(1,:)
    do concurrent (i = 1:N)
        dp = sum(p1 * array(i,:))
        result(i,:) = dp * p1 + array(i,:)
    end do
end subroutine bar

end module vectorization_test
and compiler versions (latest releases):
$ gfortran --version
GNU Fortran (GCC) 8.2.1 20181215 (Red Hat 8.2.1-6)
Copyright (C) 2018 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
$ flang --version
clang version 7.0.1 (https://github.com/flang-compiler/flang-driver.git 24bd54da5c41af04838bbe7b68f830840d47fc03) (https://github.com/flang-compiler/llvm.git 84335cded2f2cf9de26a77194835094fc75f7bb7)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin
Compiling with
$ gfortran -Ofast -march=native -mprefer-vector-width=512 -shared -fPIC vectorization_test.f90 -o libgfortran_vectorization_test.so
$ flang -Ofast -march=native -mprefer-vector-width=512 -shared -fPIC vectorization_test.f90 -o libflang_vectorization_test.so
julia> const test_dir = "/home/chriselrod/Documents/progwork/fortran"
"/home/chriselrod/Documents/progwork/fortran"
julia> const flang_lib = joinpath(test_dir, "libflang_vectorization_test.so")
"/home/chriselrod/Documents/progwork/fortran/libflang_vectorization_test.so"
julia> const gfort_lib = joinpath(test_dir, "libgfortran_vectorization_test.so")
"/home/chriselrod/Documents/progwork/fortran/libgfortran_vectorization_test.so"
julia> function flang_test(results, array, N)
           ccall(
               (:vectorization_test_bar_, flang_lib),    # function and library
               Cvoid,                                    # return type
               (Ptr{Float64}, Ptr{Float64}, Ptr{Int64}), # argument types
               results, array, N                         # arguments
           )
       end
flang_test (generic function with 1 method)
julia> function gfortran_test(results, array, N)
           ccall(
               (:__vectorization_test_MOD_bar, gfort_lib), # function and library
               Cvoid,                                      # return type
               (Ptr{Float64}, Ptr{Float64}, Ptr{Int64}),   # argument types
               results, array, N                           # arguments
           )
       end
gfortran_test (generic function with 1 method)
julia> Nr = Ref(N)
Base.RefValue{Int64}(1000)
julia> fill!(result_t, 0);
julia> flang_test(result_t, array_t, Nr); result_t' # matches the earlier Julia results
3×1000 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
-2.22506 -3.39461 -0.89401 1.37359 -1.27377 2.85605 -1.22525 -0.387945 3.23087 -0.718255 … 1.60807 -2.39425 0.113308 1.24666 -1.2275 -0.713399 -0.601649 -0.664115 0.665728
1.60194 3.84549 -1.26712 -1.59311 -1.31202 -2.01284 1.95004 -0.276039 -4.76663 -0.187859 0.880181 2.02346 -1.44376 0.426184 0.570249 -1.27561 0.865626 0.9036 -0.543402
-1.68125 -1.77578 -1.45913 0.29928 -1.20614 2.13511 -1.24661 -1.1421 2.60744 -0.425533 -0.961413 0.0212073 0.423453 -0.270527 -1.72279 0.281566 -1.08591 1.90826 0.918416
julia> fill!(result_t, 0);
julia> gfortran_test(result_t, array_t, Nr); result_t'
3×1000 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
-2.22506 -3.39461 -0.89401 1.37359 -1.27377 2.85605 -1.22525 -0.387945 3.23087 -0.718255 … 1.60807 -2.39425 0.113308 1.24666 -1.2275 -0.713399 -0.601649 -0.664115 0.665728
1.60194 3.84549 -1.26712 -1.59311 -1.31202 -2.01284 1.95004 -0.276039 -4.76663 -0.187859 0.880181 2.02346 -1.44376 0.426184 0.570249 -1.27561 0.865626 0.9036 -0.543402
-1.68125 -1.77578 -1.45913 0.29928 -1.20614 2.13511 -1.24661 -1.1421 2.60744 -0.425533 -0.961413 0.0212073 0.423453 -0.270527 -1.72279 0.281566 -1.08591 1.90826 0.918416
These get the same answer, yet are very fast:
julia> @benchmark gfortran_test($result_t, $array_t, $Nr)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 420.372 ns (0.00% GC)
median time: 421.322 ns (0.00% GC)
mean time: 436.942 ns (0.00% GC)
maximum time: 647.241 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 199
julia> @benchmark flang_test($result_t, $array_t, $Nr)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 419.518 ns (0.00% GC)
median time: 420.523 ns (0.00% GC)
mean time: 438.599 ns (0.00% GC)
maximum time: 668.719 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 199
My Flang is using LLVM-7. I don’t know what Flang’s front end is doing, but it seems like Julia ought to achieve the same performance and ease of writing code, without any fancy macros.
EDIT:
I’m less familiar with ifort, but it gets the same performance too:
julia> const ifort_lib = joinpath(test_dir, "libifort_vectorization_test.so")
"/home/chriselrod/Documents/progwork/fortran/libifort_vectorization_test.so"
julia> function ifort_test(results, array, N)
           ccall(
               (:vectorization_test_mp_bar_, ifort_lib), # function and library
               Cvoid,                                    # return type
               (Ptr{Float64}, Ptr{Float64}, Ptr{Int64}), # argument types
               results, array, N                         # arguments
           )
       end
ifort_test (generic function with 1 method)
julia> fill!(result_t, 0);
julia> ifort_test(result_t, array_t, Nr); result_t'
3×1000 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
-2.22506 -3.39461 -0.89401 1.37359 -1.27377 2.85605 -1.22525 -0.387945 3.23087 -0.718255 … 1.60807 -2.39425 0.113308 1.24666 -1.2275 -0.713399 -0.601649 -0.664115 0.665728
1.60194 3.84549 -1.26712 -1.59311 -1.31202 -2.01284 1.95004 -0.276039 -4.76663 -0.187859 0.880181 2.02346 -1.44376 0.426184 0.570249 -1.27561 0.865626 0.9036 -0.543402
-1.68125 -1.77578 -1.45913 0.29928 -1.20614 2.13511 -1.24661 -1.1421 2.60744 -0.425533 -0.961413 0.0212073 0.423453 -0.270527 -1.72279 0.281566 -1.08591 1.90826 0.918416
julia> @benchmark ifort_test($result_t, $array_t, $Nr)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 421.477 ns (0.00% GC)
median time: 422.523 ns (0.00% GC)
mean time: 443.197 ns (0.00% GC)
maximum time: 721.663 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 199
I compiled with
$ ifort -fast -qopt-zmm-usage=high -shared -fPIC vectorization_test.f90 -o libifort_vectorization_test.so
$ ifort --version
ifort (IFORT) 19.0.1.144 20181018
Copyright (C) 1985-2018 Intel Corporation. All rights reserved.
All three Fortran compilers produce code that takes just over 420 nanoseconds, while my best Julia version took about 435 nanoseconds. So that’s pretty good, but it needed a fancy code-transforming macro, while the Fortran code was written in a (comparatively) simple and naive manner.
Even if I go “all out” with macros from base Julia, I don’t get that performance:
julia> @generated function bar_tv3!(result::AbstractArray{T}, array::AbstractArray{T}, ::Val{NCOL}) where {T,NCOL}
           quote
               @nexprs $NCOL j -> p_j = @inbounds array[1,j]
               @inbounds @fastmath @simd ivdep for i ∈ 1:size(array,1)
                   dp = zero(T)
                   @nexprs $NCOL j -> dp += p_j * array[i,j]
                   @nexprs $NCOL j -> result[i,j] = dp * p_j + array[i,j]
               end
           end
       end
bar_tv3! (generic function with 1 method)
julia> @benchmark bar_tv3!($result_t, $array_t,Val(3))
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 1.091 μs (0.00% GC)
median time: 1.101 μs (0.00% GC)
mean time: 1.142 μs (0.00% GC)
maximum time: 2.841 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 10
because it fails to vectorize across loop iterations. So, I’d call it a win for Fortran at the moment.
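One way to confirm that (a sketch): look at the native code and check for scalar xmm operations where the fast version had packed zmm vectors.

# If vectorization failed, the hot loop uses xmm registers and vfmadd...sd
# (scalar) instructions instead of zmm and vfmadd...pd (packed).
@code_native bar_tv3!(result_t, array_t, Val(3))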