FWIW, both inplace_cumprod1!
and inplace_cumprod2!
are a good chunk faster on master on my computer.
julia> versioninfo()
Julia Version 0.6.2
Commit d386e40* (2017-12-13 18:08 UTC)
Platform Info:
OS: Linux (x86_64-pc-linux-gnu)
CPU: AMD Ryzen Threadripper 1950X 16-Core Processor
WORD_SIZE: 64
BLAS: libopenblas (USE64BITINT ZEN)
LAPACK: libopenblas
LIBM: libopenlibm
LLVM: libLLVM-3.9.1 (ORCJIT, generic)
julia> using BenchmarkTools
INFO: Recompiling stale cache file /home/chris/.julia/lib/v0.6/BenchmarkTools.ji for module BenchmarkTools.
julia> X = Array{Complex128,3}(10,10,1000);
julia> U = rand(Complex128,10,10,1000);
julia> U2 = [U[:,:,i] for i=1:1000];
julia> X2 = [X[:,:,i] for i=1:1000];
julia> function inplace_cumprod1!(X,U)
n = size(U,3)
n == size(X,3) || throw(DimensionMismatch("not the same number of submatrices!"))
@views copy!(X[:,:,1],U[:,:,1])
@inbounds for i = 2:n
@views A_mul_B!(X[:,:,i],U[:,:,i],X[:,:,i-1])
end
end
inplace_cumprod1! (generic function with 1 method)
julia> function inplace_cumprod2!(X,U)
n = length(U)
n == length(X) || throw(DimensionMismatch("not the same number of submatrices!"))
copy!(X[1],U[1])
@inbounds for i = 2:n
A_mul_B!(X[i],U[i],X[i-1])
end
end
inplace_cumprod2! (generic function with 1 method)
julia> @benchmark inplace_cumprod1!($X,$U)
BenchmarkTools.Trial:
memory estimate: 187.44 KiB
allocs estimate: 2999
--------------
minimum time: 874.181 μs (0.00% GC)
median time: 887.376 μs (0.00% GC)
mean time: 901.472 μs (0.63% GC)
maximum time: 1.721 ms (40.30% GC)
--------------
samples: 5542
evals/sample: 1
julia> @benchmark inplace_cumprod2!($X2,$U2)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 795.044 μs (0.00% GC)
median time: 811.104 μs (0.00% GC)
mean time: 817.069 μs (0.00% GC)
maximum time: 1.646 ms (0.00% GC)
--------------
samples: 6114
evals/sample: 1
julia> function allocate_matrices1(N,n)
return [Matrix{Complex128}(N,N) for i=1:n]
end
allocate_matrices1 (generic function with 1 method)
julia> function allocate_matrices3(N,n)
A = Array{Complex128,3}(N,N,n)
return [view(A,:,:,i) for i=1:n]
end
allocate_matrices3 (generic function with 1 method)
julia> N=10;n=4000;
julia> A = allocate_matrices3(N,n); B = allocate_matrices3(N,n);
julia> @benchmark inplace_cumprod2!($A,$B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 3.473 ms (0.00% GC)
median time: 3.512 ms (0.00% GC)
mean time: 3.544 ms (0.00% GC)
maximum time: 3.958 ms (0.00% GC)
--------------
samples: 1411
evals/sample: 1
julia> A = allocate_matrices1(N,n); B = allocate_matrices1(N,n);
julia> @benchmark inplace_cumprod2!($A,$B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 3.614 ms (0.00% GC)
median time: 3.660 ms (0.00% GC)
mean time: 3.690 ms (0.00% GC)
maximum time: 4.402 ms (0.00% GC)
--------------
samples: 1352
evals/sample: 1
julia> versioninfo()
Julia Version 0.7.0-DEV.4020
Commit 1bbd518* (2018-02-19 22:10 UTC)
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: AMD Ryzen Threadripper 1950X 16-Core Processor
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-3.9.1 (ORCJIT, znver1)
Environment:
julia> using BenchmarkTools, LinearAlgebra
julia> X = Array{Complex128,3}(10,10,1000);
WARNING: Base.Complex128 is deprecated, use ComplexF64 instead.
in module Main
┌ Warning: `Array{T, 3}(m::Int, n::Int, o::Int) where T` is deprecated, use `Array{T, 3}(uninitialized, m, n, o)` instead.
│ caller = top-level scope
└ @ Core :0
julia> U = rand(Complex128,10,10,1000);
WARNING: Base.Complex128 is deprecated, use ComplexF64 instead.
in module Main
julia> U2 = [U[:,:,i] for i=1:1000];
julia> X2 = [X[:,:,i] for i=1:1000];
julia> function inplace_cumprod1!(X,U)
n = size(U,3)
n == size(X,3) || throw(DimensionMismatch("not the same number of submatrices!"))
@views copyto!(X[:,:,1],U[:,:,1])
@inbounds for i = 2:n
@views mul!(X[:,:,i],U[:,:,i],X[:,:,i-1])
end
end
inplace_cumprod1! (generic function with 1 method)
julia> function inplace_cumprod2!(X,U)
n = length(U)
n == length(X) || throw(DimensionMismatch("not the same number of submatrices!"))
copyto!(X[1],U[1])
@inbounds for i = 2:n
mul!(X[i],U[i],X[i-1])
end
end
inplace_cumprod2! (generic function with 1 method)
julia> @benchmark inplace_cumprod1!($X,$U)
┌ Warning: `indmin` is deprecated, use `argmin` instead.
│ caller = minimum at trials.jl:112 [inlined]
└ @ Core trials.jl:112
┌ Warning: `indmax` is deprecated, use `argmax` instead.
│ caller = maximum at trials.jl:117 [inlined]
└ @ Core trials.jl:117
BenchmarkTools.Trial:
memory estimate: 187.38 KiB
allocs estimate: 2998
--------------
minimum time: 720.254 μs (0.00% GC)
median time: 763.515 μs (0.00% GC)
mean time: 791.029 μs (1.42% GC)
maximum time: 33.777 ms (97.57% GC)
--------------
samples: 6312
evals/sample: 1
julia> @benchmark inplace_cumprod2!($X2,$U2)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 685.600 μs (0.00% GC)
median time: 696.730 μs (0.00% GC)
mean time: 698.834 μs (0.00% GC)
maximum time: 864.746 μs (0.00% GC)
--------------
samples: 7147
evals/sample: 1
julia> function allocate_matrices1(N,n)
return [Matrix{Complex128}(N,N) for i=1:n]
end
allocate_matrices1 (generic function with 1 method)
julia> function allocate_matrices3(N,n)
A = Array{Complex128,3}(N,N,n)
return [view(A,:,:,i) for i=1:n]
end
allocate_matrices3 (generic function with 1 method)
julia> N=10;n=4000;
julia> A = allocate_matrices3(N,n); B = allocate_matrices3(N,n);
WARNING: Base.Complex128 is deprecated, use ComplexF64 instead.
in module Main
WARNING: Base.Complex128 is deprecated, use ComplexF64 instead.
in module Main
┌ Warning: `Array{T, 3}(m::Int, n::Int, o::Int) where T` is deprecated, use `Array{T, 3}(uninitialized, m, n, o)` instead.
│ caller = allocate_matrices3(::Int64, ::Int64) at REPL[12]:2
└ @ Main REPL[12]:2
julia> @benchmark inplace_cumprod2!($A,$B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 2.875 ms (0.00% GC)
median time: 2.897 ms (0.00% GC)
mean time: 2.953 ms (0.00% GC)
maximum time: 3.398 ms (0.00% GC)
--------------
samples: 1692
evals/sample: 1
julia> A = allocate_matrices1(N,n); B = allocate_matrices1(N,n);
WARNING: Base.Complex128 is deprecated, use ComplexF64 instead.
in module Main
WARNING: Base.Complex128 is deprecated, use ComplexF64 instead.
in module Main
┌ Warning: `Array{T, 2}(m::Int, n::Int) where T` is deprecated, use `Array{T, 2}(uninitialized, m, n)` instead.
│ caller = #7 at <missing>:0 [inlined]
└ @ Core <missing>:0
┌ Warning: `Array{T, 2}(m::Int, n::Int) where T` is deprecated, use `Array{T, 2}(uninitialized, m, n)` instead.
│ caller = #7 at <missing>:0 [inlined]
└ @ Core <missing>:0
julia> @benchmark inplace_cumprod2!($A,$B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 3.113 ms (0.00% GC)
median time: 3.148 ms (0.00% GC)
mean time: 3.174 ms (0.00% GC)
maximum time: 3.541 ms (0.00% GC)
--------------
samples: 1571
evals/sample: 1
Can’t say I know what’s going on, but because I’d guess that the vast majority of time is spent on repeated BLAS calls, the percent difference in speed looks really large/impressive to me. Wouldn’t have thought there’s that much overhead to trim off.