In-place multiplication of sub-matrices without allocations

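A view is slower than a copy when the slice is strided (non-contiguous in memory), but faster when the slice is contiguous. Note that of the variants below, only `foo4!` wraps the destination in a view: indexing such as `C[:,:,dim]` without `@view` creates a copy, so `mul!` writes into a temporary and `C` itself is left unchanged.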

julia> function foo!(A,B,C,dim)
           # Benchmark variant: plain range indexing for both destination and A.
           # For standard Arrays, indexing with ranges (no @view) returns a copy, so
           # C[dim,1:10,1:10] and A[dim,1:10,1:10] are each freshly allocated here.
           # mul! writes the product A_slice * B into the temporary copy of C's
           # slice; C itself is therefore NOT modified by this call.
           mul!(C[dim,1:10,1:10], A[dim,1:10,1:10], B)
           # Return nothing so the (temporary) result array is not returned.
           nothing
       end
foo! (generic function with 1 method)

julia> function foo2!(A,B,C,dim)
           # Benchmark variant: views for A and B, but the destination is still a
           # copy (C[dim,1:10,1:10] has no @view), so for standard Arrays the product
           # is written into a temporary and C is NOT modified.
           # @view(A[dim,1:10,1:10]) fixes the FIRST index; for a column-major Array
           # the elements of that view are generally not contiguous in memory
           # (strided view).
           # NOTE(review): unlike the other variants, B is restricted to
           # B[1:10,1:10] here; the others pass B whole.
           mul!(C[dim,1:10,1:10], @view(A[dim,1:10,1:10]), @view(B[1:10,1:10]))
           nothing
       end
foo2! (generic function with 1 method)

julia> function foo3!(A,B,C,dim)
           # Benchmark variant: view for A only. Here `dim` indexes the LAST axis,
           # so for a column-major Array @view(A[:,:,dim]) is a contiguous slice.
           # The destination C[:,:,dim] is indexed without @view and is thus a copy
           # (for standard Arrays): mul! fills a temporary and C is NOT modified.
           mul!(C[:,:,dim], @view(A[:,:,dim]), B)
           nothing
       end
foo3! (generic function with 1 method)

julia> function foo4!(A,B,C,dim)
           # Benchmark variant: views for both the destination and A.
           # Because the destination is @view(C[:,:,dim]), mul! writes the product
           # A[:,:,dim] * B directly into the dim-th slice of C; no slice is copied.
           # This is the only variant in this listing that actually updates C.
           mul!(@view(C[:,:,dim]), @view(A[:,:,dim]), B)
           nothing
       end
foo4! (generic function with 1 method)

julia> function foo5!(A,B,C,dim)
           # Benchmark variant: no views at all. For standard Arrays both
           # C[:,:,dim] and A[:,:,dim] are copies, so two temporaries are created
           # and the product is written into the copy of C's slice; C is NOT modified.
           mul!(C[:,:,dim], A[:,:,dim], B)
           nothing
       end
foo5! (generic function with 1 method)

julia> @btime foo!(A,B,C,1)  # NOTE(review): A, B, C are not `$`-interpolated; if they are non-const globals the timing may include global-access overhead
  1.140 μs (2 allocations: 1.75 KiB)

julia> @btime foo2!(A,B,C,1)  # destination copy plus further allocations, presumably inside mul! for the strided view (TODO confirm)
  2.022 μs (3 allocations: 21.38 KiB)

julia> @btime foo3!(A,B,C,1)  # one allocation, presumably the copy made by C[:,:,dim]
  450.289 ns (1 allocation: 400 bytes)

julia> @btime foo4!(A,B,C,1)  # views on both destination and A: no allocations reported
  324.885 ns (0 allocations: 0 bytes)

julia> @btime foo5!(A,B,C,1)  # two allocations, presumably the copies of C[:,:,dim] and A[:,:,dim]
  526.380 ns (2 allocations: 800 bytes)
2 Likes