Optimizing iteration over slices of multiple matrices

Hmm.
@code_native shows that my code actually failed to vectorize the way I intended. I’m trying to improve a macro to get that to work.

My @vectorize macro needs a lot of work, but with a little finagling:

using SIMDPirates, SLEEFwrap, BenchmarkTools, Base.Cartesian

@generated function bar_tv2!(result::AbstractArray{T}, array::AbstractArray{T},::Val{NCOL}) where {T,NCOL}
    quote
        @nexprs $NCOL j -> p_j = @inbounds array[1,j]
        @vectorize $T for i ∈ 1:size(array,1)
            dp = zero(T)
            @nexprs $NCOL j -> dp += p_j * array[i,j]
            @nexprs $NCOL j -> result[i,j] = dp * p_j + array[i,j]
        end
    end
end
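
For reference, with NCOL = 3 the @nexprs calls unroll into plain scalar code before @vectorize ever sees it; the loop body is roughly the following (a sketch of the Base.Cartesian expansion only, not of what @vectorize ultimately emits):

# expansion of the @nexprs 3 j -> ... calls for NCOL = 3 (illustration only)
p_1 = @inbounds array[1, 1]
p_2 = @inbounds array[1, 2]
p_3 = @inbounds array[1, 3]
# and then, for each i:
dp = zero(T)
dp += p_1 * array[i, 1]
dp += p_2 * array[i, 2]
dp += p_3 * array[i, 3]
result[i, 1] = dp * p_1 + array[i, 1]
result[i, 2] = dp * p_2 + array[i, 2]
result[i, 3] = dp * p_3 + array[i, 3]

The plain @simd version below, operating on the original 3×N layout, serves as the baseline for comparison: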

function bar2!(result::AbstractArray, array::AbstractArray)
    m, n = size(array)
    @views p1 = array[:, 1]      # first column, without copying
    @inbounds for i in 1:n
        dp = zero(eltype(array))
        @simd for j = 1:m        # dot product of column i with p1
            dp += p1[j] * array[j, i]
        end
        @simd for j = 1:m
            result[j, i] = dp * p1[j] + array[j, i]
        end
    end
end


N = 1000
array = randn(3, N);
result = zeros(3, N);

array_t = copy(array');
result_t = zeros(N, 3);

This yields:

julia> bar_tv2!(result_t, array_t,Val(3)); result_t'
3×1000 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
 -2.22506  -3.39461  -0.89401   1.37359  -1.27377   2.85605  -1.22525  -0.387945   3.23087  -0.718255  …   1.60807   -2.39425     0.113308   1.24666   -1.2275    -0.713399  -0.601649  -0.664115   0.665728
  1.60194   3.84549  -1.26712  -1.59311  -1.31202  -2.01284   1.95004  -0.276039  -4.76663  -0.187859      0.880181   2.02346    -1.44376    0.426184   0.570249  -1.27561    0.865626   0.9036    -0.543402
 -1.68125  -1.77578  -1.45913   0.29928  -1.20614   2.13511  -1.24661  -1.1421     2.60744  -0.425533     -0.961413   0.0212073   0.423453  -0.270527  -1.72279    0.281566  -1.08591    1.90826    0.918416

julia> bar2!(result, array); result
3×1000 Array{Float64,2}:
 -2.22506  -3.39461  -0.89401   1.37359  -1.27377   2.85605  -1.22525  -0.387945   3.23087  -0.718255  …   1.60807   -2.39425     0.113308   1.24666   -1.2275    -0.713399  -0.601649  -0.664115   0.665728
  1.60194   3.84549  -1.26712  -1.59311  -1.31202  -2.01284   1.95004  -0.276039  -4.76663  -0.187859      0.880181   2.02346    -1.44376    0.426184   0.570249  -1.27561    0.865626   0.9036    -0.543402
 -1.68125  -1.77578  -1.45913   0.29928  -1.20614   2.13511  -1.24661  -1.1421     2.60744  -0.425533     -0.961413   0.0212073   0.423453  -0.270527  -1.72279    0.281566  -1.08591    1.90826    0.918416
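
The printed values agree; a quick programmatic check of the same thing (isapprox allows for floating-point rounding differences, and should return true here):

result ≈ result_t'   # sanity check that the two implementations agree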


julia> @benchmark bar_tv2!($result_t, $array_t,Val(3))
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     433.854 ns (0.00% GC)
  median time:      435.020 ns (0.00% GC)
  mean time:        455.177 ns (0.00% GC)
  maximum time:     668.051 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     198

julia> @benchmark bar2!($result, $array)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     3.025 μs (0.00% GC)
  median time:      3.029 μs (0.00% GC)
  mean time:        3.158 μs (0.00% GC)
  maximum time:     7.316 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     8

Now, 100x faster than the original (and about 7x faster than the straightforward bar2! above)!
Note that I’m benchmarking on a computer with AVX-512, so most people aren’t likely to see the same benefit.
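
If you’re unsure whether your machine has AVX-512, one rough check from Julia is the LLVM CPU name (a heuristic rather than an official feature query):

Sys.CPU_NAME   # e.g. "skylake-avx512" on many AVX-512 CPUs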

EDIT:
Cleaned-up assembly (Discourse’s syntax highlighting doesn’t agree with the auto-inserted comments):

julia> @code_native bar_tv2!(result_t, array_t,Val(3))
	.text
	pushq	%rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	subq	$56, %rsp
	movq	%rsi, 48(%rsp)
	movq	(%rsi), %r14
	movq	8(%rsi), %rax
	movq	(%rax), %r12
	movq	24(%rax), %rbx
	vmovsd	(%r12), %xmm0           # xmm0 = mem[0],zero
	vmovaps	%xmm0, 32(%rsp)
	vmovsd	(%r12,%rbx,8), %xmm0    # xmm0 = mem[0],zero
	vmovaps	%xmm0, 16(%rsp)
	movq	%rbx, %rbp
	shlq	$4, %rbp
	vmovsd	(%r12,%rbp), %xmm0      # xmm0 = mem[0],zero
	vmovapd	%xmm0, (%rsp)
	movq	%rbx, %r13
	sarq	$63, %r13
	shrq	$61, %r13
	addq	%rbx, %r13
	andq	$-8, %r13
	movq	(%r14), %r15
	movabsq	$steprange_last, %rax
	movl	$1, %edi
	movl	$8, %esi
	movq	%r13, %rdx
	callq	*%rax
	vmovapd	32(%rsp), %xmm10
	vmovapd	(%rsp), %xmm9
	vmovapd	16(%rsp), %xmm8
	testq	%rax, %rax
	jle	L288
	vbroadcastsd	%xmm10, %zmm0
	vbroadcastsd	%xmm8, %zmm1
	vbroadcastsd	%xmm9, %zmm2
	movq	24(%r14), %rcx
	movq	%rcx, %rdx
	shlq	$4, %rdx
	addq	$7, %rax
	vxorpd	%xmm3, %xmm3, %xmm3
	movq	%r12, %rsi
	movq	%r15, %rdi
	nop
L192:
	vmovupd	(%rsi), %zmm4
	vmovupd	(%rsi,%rbx,8), %zmm5
	vmovupd	(%rsi,%rbp), %zmm6
	vmovapd	%zmm4, %zmm7
	vfmadd213pd	%zmm3, %zmm0, %zmm7
	vfmadd231pd	%zmm5, %zmm1, %zmm7
	vfmadd231pd	%zmm6, %zmm2, %zmm7
	vfmadd231pd	%zmm0, %zmm7, %zmm4
	vmovupd	%zmm4, (%rdi)
	vfmadd231pd	%zmm1, %zmm7, %zmm5
	vmovupd	%zmm5, (%rdi,%rcx,8)
	vfmadd213pd	%zmm6, %zmm2, %zmm7
	vmovupd	%zmm7, (%rdi,%rdx)
	addq	$64, %rdi
	addq	$64, %rsi
	addq	$-8, %rax
	jne	L192
L288:
	movq	%rbx, %rcx
	subq	%r13, %rcx
	testq	%rcx, %rcx
	jle	L495
	vpbroadcastq	%rcx, %zmm0
	movabsq	$140535612763840, %rax  # imm = 0x7FD0FF46EEC0
	vpaddq	(%rax), %zmm0, %zmm0
	movq	%rbx, %rax
	subq	%rcx, %rax
	vpternlogd	$255, %zmm1, %zmm1, %zmm1
	vpcmpgtq	%zmm1, %zmm0, %k1
	vmovupd	(%r12,%rax,8), %zmm0 {%k1} {z}
	vpxor	%xmm1, %xmm1, %xmm1
	leaq	(%rbx,%rbx), %rcx
	leaq	(%rbx,%rax), %rdx
	vmovupd	(%r12,%rdx,8), %zmm2 {%k1} {z}
	addq	%rax, %rcx
	vmovupd	(%r12,%rcx,8), %zmm3 {%k1} {z}
	vbroadcastsd	%xmm10, %zmm4
	vfmadd231pd	%zmm0, %zmm4, %zmm1
	vbroadcastsd	%xmm8, %zmm5
	vfmadd231pd	%zmm2, %zmm5, %zmm1
	vbroadcastsd	%xmm9, %zmm6
	vfmadd231pd	%zmm3, %zmm6, %zmm1
	vfmadd213pd	%zmm0, %zmm1, %zmm4
	vmovupd	%zmm4, (%r15,%rax,8) {%k1}
	vfmadd213pd	%zmm2, %zmm1, %zmm5
	movq	24(%r14), %rcx
	leaq	(%rcx,%rax), %rdx
	vmovupd	%zmm5, (%r15,%rdx,8) {%k1}
	vfmadd213pd	%zmm3, %zmm6, %zmm1
	leaq	(%rax,%rcx,2), %rax
	vmovupd	%zmm1, (%r15,%rax,8) {%k1}
	movabsq	$140535856324616, %rax  # imm = 0x7FD10DCB6008
	addq	$56, %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	vzeroupper
	retq
L495:
	movabsq	$140535856324616, %rax  # imm = 0x7FD10DCB6008
	addq	$56, %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	vzeroupper
	retq

That’s a big pile of zmm (512-bit) registers and vfmadd instructions.
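
A quick way to check for that without reading the whole listing is to dump the native code to a buffer and search it for zmm (a rough heuristic rather than an official API; uses_zmm is just a throwaway helper):

using InteractiveUtils

# true if the compiled method's native code mentions 512-bit (zmm) registers
function uses_zmm(f, argtypes...)
    io = IOBuffer()
    code_native(io, f, Tuple{argtypes...})
    occursin("zmm", String(take!(io)))
end

uses_zmm(bar_tv2!, typeof(result_t), typeof(array_t), Val{3})   # true here, per the listing above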

EDIT:
@Seif_Shebl, did you try Intel Fortran on the version with transposed array and result (i.e., N×3 instead of 3×N)?
In earlier experiments, both ifort and Flang sometimes managed to vectorize such loops correctly. Sometimes gfortran can do it too, but that often involves playing with flags like turning -fdisable-tree-cunrolli on or off. I haven’t tried a huge number of loops, but it surprised me that the LLVM-based Flang handled an example loop I tried earlier while Julia (without @vectorize) did not.
On that note, I’ve been meaning to find the time to play with Intel’s SPMD (Single Program, Multiple Data) Program Compiler, ispc, which is also LLVM-based. Here is a fascinating series of blog posts about it by its creator. From the sound of it, this is exactly the sort of problem it should handle well, although the author wishes some bigger project would work to better support it.

My success rate getting Julia to auto-vectorize loops like this has actually been pretty low, but the ability to write complicated code-transforming macros means it’s actually the most robustly capable of loop vectorization in practice (for me).

And SIMD intrinsics are so much easier in Julia, even without macros.

I really need to refactor the code base behind my @vectorize macro to make it easier to work with and extend. It is fairly limited at the moment. It would be easier to add features / support more types of loops with a different code organization.

EDIT:

Actually, both Flang and gfortran do a great job on this problem without any tricks.
The Fortran implementation:

module vectorization_test

implicit none

contains

subroutine bar(result, array, N)
    integer(8),                 intent(in)  ::  N
    real(8),  dimension(N,3),   intent(in)  ::  array
    real(8),  dimension(N,3),   intent(out) ::  result

    integer(8)                              ::  i
    real(8)                                 ::  p1(3), dp

    p1 = array(1,:)

    do concurrent (i = 1:N)
        dp = sum(p1 * array(i,:))
        result(i,:) = dp * p1 + array(i,:)
    end do

end subroutine bar

end module vectorization_test

and compiler versions (latest releases):

$ gfortran --version
GNU Fortran (GCC) 8.2.1 20181215 (Red Hat 8.2.1-6)
Copyright (C) 2018 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

$ flang --version
clang version 7.0.1 (https://github.com/flang-compiler/flang-driver.git 24bd54da5c41af04838bbe7b68f830840d47fc03) (https://github.com/flang-compiler/llvm.git 84335cded2f2cf9de26a77194835094fc75f7bb7)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin

Compiling with

$ gfortran -Ofast -march=native -mprefer-vector-width=512 -shared -fPIC vectorization_test.f90 -o libgfortran_vectorization_test.so
$ flang -Ofast -march=native -mprefer-vector-width=512 -shared -fPIC vectorization_test.f90 -o libflang_vectorization_test.so

Then, wrapping and calling the shared libraries from Julia:

julia> const test_dir = "/home/chriselrod/Documents/progwork/fortran"
"/home/chriselrod/Documents/progwork/fortran"

julia> const flang_lib = joinpath(test_dir, "libflang_vectorization_test.so")
"/home/chriselrod/Documents/progwork/fortran/libflang_vectorization_test.so"

julia> const gfort_lib = joinpath(test_dir, "libgfortran_vectorization_test.so")
"/home/chriselrod/Documents/progwork/fortran/libgfortran_vectorization_test.so"

julia> function flang_test(results, array, N)
           ccall(
               (:vectorization_test_bar_, flang_lib), # function and library
               Cvoid, # return type
               (Ptr{Float64},Ptr{Float64},Ptr{Int64}), # argument types
               results, array, N # arguments
           )
       end
flang_test (generic function with 1 method)

julia> function gfortran_test(results, array, N)
           ccall(
               (:__vectorization_test_MOD_bar, gfort_lib), # function and library
               Cvoid, # return type
               (Ptr{Float64},Ptr{Float64},Ptr{Int64}), # argument types
               results, array, N # arguments
           )
       end
gfortran_test (generic function with 1 method)
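
Side note: if you’re ever unsure of the mangled symbol names, you can list each library’s dynamic symbols, e.g. by shelling out from Julia (gfortran mangles module procedures as __modulename_MOD_procedurename, while Flang uses modulename_procedurename_):

run(pipeline(`nm -D $gfort_lib`, `grep -i bar`))   # lists the gfortran-mangled name
run(pipeline(`nm -D $flang_lib`, `grep -i bar`))   # lists the Flang-mangled name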

julia> Nr = Ref(N)
Base.RefValue{Int64}(1000)

julia> fill!(result_t, 0);

julia> flang_test(result_t, array_t, Nr); result_t' # matches the earlier Julia results
3×1000 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
 -2.22506  -3.39461  -0.89401   1.37359  -1.27377   2.85605  -1.22525  -0.387945   3.23087  -0.718255  …   1.60807   -2.39425     0.113308   1.24666   -1.2275    -0.713399  -0.601649  -0.664115   0.665728
  1.60194   3.84549  -1.26712  -1.59311  -1.31202  -2.01284   1.95004  -0.276039  -4.76663  -0.187859      0.880181   2.02346    -1.44376    0.426184   0.570249  -1.27561    0.865626   0.9036    -0.543402
 -1.68125  -1.77578  -1.45913   0.29928  -1.20614   2.13511  -1.24661  -1.1421     2.60744  -0.425533     -0.961413   0.0212073   0.423453  -0.270527  -1.72279    0.281566  -1.08591    1.90826    0.918416

julia> fill!(result_t, 0);

julia> gfortran_test(result_t, array_t, Nr); result_t'
3×1000 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
 -2.22506  -3.39461  -0.89401   1.37359  -1.27377   2.85605  -1.22525  -0.387945   3.23087  -0.718255  …   1.60807   -2.39425     0.113308   1.24666   -1.2275    -0.713399  -0.601649  -0.664115   0.665728
  1.60194   3.84549  -1.26712  -1.59311  -1.31202  -2.01284   1.95004  -0.276039  -4.76663  -0.187859      0.880181   2.02346    -1.44376    0.426184   0.570249  -1.27561    0.865626   0.9036    -0.543402
 -1.68125  -1.77578  -1.45913   0.29928  -1.20614   2.13511  -1.24661  -1.1421     2.60744  -0.425533     -0.961413   0.0212073   0.423453  -0.270527  -1.72279    0.281566  -1.08591    1.90826    0.918416

These give the same answer as the Julia versions and are very fast:

julia> @benchmark gfortran_test($result_t, $array_t, $Nr)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     420.372 ns (0.00% GC)
  median time:      421.322 ns (0.00% GC)
  mean time:        436.942 ns (0.00% GC)
  maximum time:     647.241 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     199

julia> @benchmark flang_test($result_t, $array_t, $Nr)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     419.518 ns (0.00% GC)
  median time:      420.523 ns (0.00% GC)
  mean time:        438.599 ns (0.00% GC)
  maximum time:     668.719 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     199

My Flang build is using LLVM 7. I don’t know what Flang’s front end is doing, but it seems like Julia ought to be able to achieve the same performance and ease of writing code, without any fancy macros.

EDIT:
I’m less familiar with ifort, but it gets the same performance too:

julia> const ifort_lib = joinpath(test_dir, "libifort_vectorization_test.so")
"/home/chriselrod/Documents/progwork/fortran/libifort_vectorization_test.so"

julia> function ifort_test(results, array, N)
           ccall(
               (:vectorization_test_mp_bar_, ifort_lib), # function and library
               Cvoid, # return type
               (Ptr{Float64},Ptr{Float64},Ptr{Int64}), # argument types
               results, array, N # arguments
           )
       end
ifort_test (generic function with 1 method)

julia> fill!(result_t, 0);

julia> ifort_test(result_t, array_t, Nr); result_t'
3×1000 LinearAlgebra.Adjoint{Float64,Array{Float64,2}}:
 -2.22506  -3.39461  -0.89401   1.37359  -1.27377   2.85605  -1.22525  -0.387945   3.23087  -0.718255  …   1.60807   -2.39425     0.113308   1.24666   -1.2275    -0.713399  -0.601649  -0.664115   0.665728
  1.60194   3.84549  -1.26712  -1.59311  -1.31202  -2.01284   1.95004  -0.276039  -4.76663  -0.187859      0.880181   2.02346    -1.44376    0.426184   0.570249  -1.27561    0.865626   0.9036    -0.543402
 -1.68125  -1.77578  -1.45913   0.29928  -1.20614   2.13511  -1.24661  -1.1421     2.60744  -0.425533     -0.961413   0.0212073   0.423453  -0.270527  -1.72279    0.281566  -1.08591    1.90826    0.918416

julia> @benchmark ifort_test($result_t, $array_t, $Nr)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     421.477 ns (0.00% GC)
  median time:      422.523 ns (0.00% GC)
  mean time:        443.197 ns (0.00% GC)
  maximum time:     721.663 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     199

I compiled with

$ ifort -fast -qopt-zmm-usage=high -shared -fPIC vectorization_test.f90 -o libifort_vectorization_test.so
$ ifort --version
ifort (IFORT) 19.0.1.144 20181018
Copyright (C) 1985-2018 Intel Corporation.  All rights reserved.

All three Fortran compilers produce code that takes just over 420 nanoseconds, while my best Julia version took about 435 nanoseconds. So that’s pretty good, but it needed a fancy code-transforming macro, while the Fortran code was written in a (comparatively) simple and naive manner.
Even if I go “all out” with macros from base Julia, I don’t get that performance:

julia> @generated function bar_tv3!(result::AbstractArray{T}, array::AbstractArray{T},::Val{NCOL}) where {T,NCOL}
           quote
               @nexprs $NCOL j -> p_j = @inbounds array[1,j]
               @inbounds @fastmath @simd ivdep for i ∈ 1:size(array,1)
                   dp = zero(T)
                   @nexprs $NCOL j -> dp += p_j * array[i,j]
                   @nexprs $NCOL j -> result[i,j] = dp * p_j + array[i,j]
               end
           end
       end
bar_tv3! (generic function with 1 method)

julia> @benchmark bar_tv3!($result_t, $array_t,Val(3))
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.091 μs (0.00% GC)
  median time:      1.101 μs (0.00% GC)
  mean time:        1.142 μs (0.00% GC)
  maximum time:     2.841 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

because it fails to vectorize across loop iterations. So, I’d call it a win for Fortran at the moment.
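
To make “fails to vectorize across loop iterations” concrete: the fast versions handle a whole register’s worth of consecutive rows per step, with the p_j values broadcast once, conceptually like the chunked scalar sketch below (bar_chunked! is a made-up name for illustration, assuming a width of 8 doubles for AVX-512; writing the loop this way does not by itself make LLVM emit SIMD code):

# Conceptual sketch only: split the N×3 problem into chunks of W = 8 rows,
# where each row in a chunk corresponds to one SIMD lane in the fast versions.
function bar_chunked!(result, array)
    N = size(array, 1)
    W = 8                                  # assumed SIMD width (8 × Float64 = 512 bits)
    p1, p2, p3 = array[1, 1], array[1, 2], array[1, 3]
    @inbounds for i0 in 1:W:N-W+1          # main chunks of W rows
        for i in i0:i0+W-1                 # one SIMD lane each in the vectorized code
            dp = p1 * array[i, 1] + p2 * array[i, 2] + p3 * array[i, 3]
            result[i, 1] = dp * p1 + array[i, 1]
            result[i, 2] = dp * p2 + array[i, 2]
            result[i, 3] = dp * p3 + array[i, 3]
        end
    end
    @inbounds for i in (N - N % W + 1):N   # remainder rows not covered by a full chunk
        dp = p1 * array[i, 1] + p2 * array[i, 2] + p3 * array[i, 3]
        result[i, 1] = dp * p1 + array[i, 1]
        result[i, 2] = dp * p2 + array[i, 2]
        result[i, 3] = dp * p3 + array[i, 3]
    end
    return result
end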
