I have been using TensorOperations.jl for fast N-dimensional Array operations; its kernels are written using generated functions that produce N nested for loops (with @simd applied to the innermost loop). I discovered that while the runtime of these functions is very fast (TensorOperations.tensorcopy! is much faster than the equivalent Base.permutedims! for computing tensor transposes of large arrays), the compile time was very long for arrays of large rank (N >= 15). While arrays of rank >= 15 are somewhat niche due to their large memory requirements, 2x2x2x…x2 arrays are useful in numerical quantum physics, with N as large as 30.
For comparison, here are the compile times and runtimes of a transpose! function (the full code is given in the example below), which is a drop-in replacement for Base.permutedims!. In the following, the arrays have dimension N and size 2x2x2x…x2.
N=16
- Base.permutedims! compile time: 0.65 sec
- Base.permutedims! run time: 2.20 sec
- transpose! compile time: 29.1 sec
- transpose! run time: 0.00069 sec
N=17
- Base.permutedims! compile time: 0.5 sec
- Base.permutedims! run time: 4.8 sec
- transpose! compile time: 56.8 sec
- transpose! run time: 0.00107 sec
The compile time of transpose! is independent of the size of the arrays and approximately doubles with each increment of N by 1. The time to compute the transpose is approximately linear in the number of elements of the array, so the runtime of transpose! for a 2x2x2x…x2 array of dimension N also approximately doubles with each increment of N – but, as the numbers above show, the compile time is vastly larger than the runtime.
Scaling this up to N=30, transpose! would take over 5 days to compile but about 9 seconds to run, while permutedims! would run in approximately 11 hours.
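Those N=30 figures are a back-of-the-envelope extrapolation from the N=17 measurements above, assuming each time keeps doubling with every increment of N by 1:

```julia
# Extrapolate the measured N = 17 times to N = 30, assuming each time
# doubles when N increases by 1 (a factor of 2^13 = 8192 overall).
scale = 2.0^(30 - 17)
println("transpose! compile: ", 56.8    * scale / 86400, " days")   # ≈ 5.4 days
println("transpose! run:     ", 0.00107 * scale,         " sec")    # ≈ 8.8 sec
println("permutedims! run:   ", 4.8     * scale / 3600,  " hours")  # ≈ 10.9 hours
```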
In order to get the benefit of the fast @simd loops, it would be nice to avoid such long compile times. To my surprise, I discovered that a simple one-line change makes the long compile times go away, and I’d like to understand why.
A simplified working example with the long compile times is here:
```julia
macro stridedloops(N, dims, args...)
    esc(_stridedloops(N, dims, args...))
end

function _stridedloops(N::Int, dims::Symbol, args...)
    mod(length(args), 2) == 1 || error("Wrong number of arguments")
    argiter = 1:2:length(args)-1
    body = args[end]
    # innermost level: bind each index variable to its level-0 counter
    pre = [Expr(:(=), args[i], Symbol(args[i], 0)) for i in argiter]
    ex = Expr(:block, pre..., body)
    for d = 1:N
        # wrap the current body in a loop over dimension d, threading a
        # running linear index per level, incremented by the stride
        pre = [Expr(:(=), Symbol(args[i], d-1), Symbol(args[i], d)) for i in argiter]
        post = [Expr(:(+=), Symbol(args[i], d), Expr(:ref, args[i+1], d)) for i in argiter]
        ex = Expr(:block, pre..., ex, post...)
        rangeex = Expr(:(:), 1, Expr(:ref, dims, d))
        forex = Expr(:(=), gensym(), rangeex)
        ex = Expr(:for, forex, ex)
        if d == 1
            # @simd goes on the innermost loop only
            ex = Expr(:macrocall, Symbol("@simd"), ex)
        end
    end
    pre = [Expr(:(=), Symbol(args[i], N), 1) for i in argiter]
    ex = Expr(:block, pre..., ex)
end

immutable StridedData{N,T}
    data::Vector{T}
    strides::NTuple{N,Int}
end
Base.getindex(a::StridedData, i) = a.data[i]
Base.setindex!(a::StridedData, v, i) = (@inbounds a.data[i] = v)

@generated function transpose_micro!{N}(C::StridedData{N}, A::StridedData{N}, dims::NTuple{N,Int})
    quote
        stridesA = A.strides
        stridesC = C.strides
        @stridedloops($N, dims, indA, stridesA, indC, stridesC, @inbounds C[indC]=A[indA])
        return C
    end
end

function transpose!(C::StridedArray, A::StridedArray, perm)
    N = ndims(A)
    stridesA = ntuple(i -> stride(A, i), N)
    stridesC = ntuple(i -> stride(C, i), N)
    stridesA = stridesA[perm]
    # order the loops by increasing minimal stride for better memory locality
    minstrides = ntuple(i -> min(stridesA[i], stridesC[i]), N)
    p = sortperm(collect(minstrides))
    dims = size(C)
    dims = dims[p]
    stridesA = stridesA[p]
    stridesC = stridesC[p]
    minstrides = minstrides[p]
    dataA = StridedData(vec(A), stridesA)
    dataC = StridedData(vec(C), stridesC)
    transpose_micro!(dataC, dataA, dims)
    return C
end

for N in 10:17
    A = rand(Complex128, fill(2, N)...)
    C1 = zeros(Complex128, fill(2, N)...)
    C2 = zeros(Complex128, fill(2, N)...)
    perm = randperm(N)
    println("N=$N")
    println("transpose!")
    @time transpose!(C1, A, perm)  # first call: compile + run
    @time transpose!(C1, A, perm)  # second call: run only
    println("permutedims!")
    @time permutedims!(C2, A, perm)
    @time permutedims!(C2, A, perm)
    @assert C1 ≈ C2
end
```
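For reference, the loop nest the macro builds can be inspected directly. A minimal sketch, assuming the definitions above and the `macroexpand(expr)` signature of the Julia 0.5/0.6 era this code targets:

```julia
# Print the (gensym-heavy) loop nest that @stridedloops generates for N = 2:
# an outer for loop over 1:dims[2] around an inner @simd for loop over
# 1:dims[1], with the running indices incremented by stridesA[d]/stridesC[d].
ex = macroexpand(:(@stridedloops(2, dims, indA, stridesA, indC, stridesC,
                                 @inbounds C[indC]=A[indA])))
println(ex)
```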
Oddly, the compile times are drastically improved by changing the body of the @generated function from
```julia
@generated function transpose_micro!{N}(C::StridedData{N}, A::StridedData{N}, dims::NTuple{N,Int})
    quote
        stridesA = A.strides
        stridesC = C.strides
        @stridedloops($N, dims, indA, stridesA, indC, stridesC, @inbounds C[indC]=A[indA])
        return C
    end
end
```
to
```julia
@generated function transpose_micro!{N}(C::StridedData{N}, A::StridedData{N}, dims::NTuple{N,Int})
    quote
        @stridedloops($N, dims, indA, A.strides, indC, C.strides, @inbounds C[indC]=A[indA])
        return C
    end
end
```
which results in the following times:
N=17
- transpose! compile time: 0.56 sec
- transpose! run time: 0.00198 sec.
Additionally, the compile time now scales linearly in N instead of exponentially, so the operation becomes fast even for larger N.
Looking at the bodies of these generated functions, there isn’t an obvious indicator of why the compile times should be large in one version but not the other: the difference is a simple variable substitution, with the instances of A.strides in the body replaced by stridesA, preceded by the line stridesA = A.strides (and likewise for C.strides).
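For concreteness, here is a hand-expanded sketch of what the first version's body boils down to for N = 2 (slightly simplified: loop variables are renamed for readability and the redundant level-0 index copies are collapsed; the real expansion uses gensyms):

```julia
stridesA = A.strides        # the one-line difference: the second version omits
stridesC = C.strides        # these and uses A.strides/C.strides directly below
indA2 = 1
indC2 = 1
for j = 1:dims[2]
    indA1 = indA2
    indC1 = indC2
    @simd for i = 1:dims[1]
        indA = indA1
        indC = indC1
        @inbounds C[indC] = A[indA]
        indA1 += stridesA[1]    # second version: A.strides[1]
        indC1 += stridesC[1]    # second version: C.strides[1]
    end
    indA2 += stridesA[2]        # second version: A.strides[2]
    indC2 += stridesC[2]        # second version: C.strides[2]
end
return C
```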
I’m wondering if there is a simple way to understand and improve the compile-time performance of these generated functions, or if this is a performance bug of some sort.