Help with type instabilities


#1

I’m having issues removing type instabilities from the function below, and would appreciate some tips.

A = rand(36,36);
out = [2]; sysdims = [2,9,2];
@code_warntype ptrace2(A,out,sysdims)
function ptrace2{T<:Number}(A::Matrix{T},out,sysdims)
    nsys = length(sysdims)
    keep = setdiff(1:nsys,out)
    keepdims = sysdims[keep]
    outdims = sysdims[out]::Vector{Int}
    N = prod(keepdims)
    R = Base.product(range.(1,keepdims)...)
    S = Base.product(range.(1,outdims)...)
    B = zeros(T,N,N)
    inds_ii = Vector{Int}(nsys)
    inds_jj = Vector{Int}(nsys)
    for r in R, q in R
        i, j = tindex(q,keepdims), tindex(r,keepdims)
        for l=1:length(keep)
            inds_ii[keep[l]] = q[l]
            inds_jj[keep[l]] = r[l]
        end
        for s in S
            for l=1:length(out)
                inds_ii[out[l]] = s[l]
                inds_jj[out[l]] = s[l]
            end
            ii = tindex(inds_ii,sysdims)
            jj = tindex(inds_jj,sysdims)
            B[i,j] += A[ii,jj]
        end
    end
    return B
end

function tindex(inds,sysdims)
    N = length(inds)
    i = inds[N]
    d = 1
    for n = N-1:-1:1
        d *= sysdims[n+1]
        i += d * (inds[n]-1)
    end
    return i
end

#2

In this case a function barrier can help. Here is my attempt (no change of the code except splitting the function into two parts):

function ptrace2{T<:Number}(A::Matrix{T},out,sysdims)
    nsys = length(sysdims)
    keep = setdiff(1:nsys,out)
    keepdims = sysdims[keep]
    outdims = sysdims[out]::Vector{Int}
    N = prod(keepdims)
    R = Base.product(range.(1,keepdims)...)
    S = Base.product(range.(1,outdims)...)
    B = zeros(T,N,N)
    _ptrace2!(B,A,R,S,nsys,sysdims,keep,keepdims,out)
end

function _ptrace2!(B,A,R,S,nsys,sysdims,keep,keepdims,out)
    inds_ii = Vector{Int}(nsys)
    inds_jj = Vector{Int}(nsys)
    for r in R, q in R
        i, j = tindex(q,keepdims), tindex(r,keepdims)
        for l=1:length(keep)
            inds_ii[keep[l]] = q[l]
            inds_jj[keep[l]] = r[l]
        end
        for s in S
            for l=1:length(out)
                inds_ii[out[l]] = s[l]
                inds_jj[out[l]] = s[l]
            end
            ii = tindex(inds_ii,sysdims)
            jj = tindex(inds_jj,sysdims)
            B[i,j] += A[ii,jj]
        end
    end
    return B
end

Using your example data, I get (Julia 0.6.0-rc2) with the barrier:

julia> @btime ptrace2($A, $out, $sysdims);
5.268 μs (25 allocations: 2.09 KiB)

while without the barrier (original code):

julia> @btime ptrace2($A, $out, $sysdims);
57.671 μs (810 allocations: 18.64 KiB)


#3

It seems to me that R = Base.product(range.(1,keepdims)...) is not inferred since the type of Base.product(range.(1,keepdims)...) depends on the expansion, in your casethe type inferred is Base.Iterators.Prod2{UnitRange{Int64},UnitRange{Int64}}.

I think splitting the function is ok so you have a dispatch on the type of R. Maybe someone can give a better solution with a more generic iterator?


#4

Thanks!


#5

Just as an aside, instead of Base.product(range.(1, keepdims)...) you might consider using a CartesianRange(keepdims...) which is designed for the kind of multidimensional indexing that you’re doing.

Unfortunately, CartesianRange also relies on knowing the number of dimensions for type-inference, so it doesn’t solve your immediate issue. But it might at least make your code more understandable and perhaps more performant.


#6

I tried using CartesianRange, but it actually made the function slower…

In any case, this is what I have now, after some more clever permutation of indices, if anyone wants to take a stab at making it faster:

julia> A = rand(36,36); out = (2,); sysdims = (2,9,2);
julia> @btime ptrace3($A,$out,$sysdims)
  4.221 μs (35 allocations: 2.63 KiB)
function ptrace3{T,M,nsub}(A::Matrix{T}, out::NTuple{M,Int}, sysdims::NTuple{nsub,Int})
    rkeep = (nsub + 1) .- reverse(setdiff(1:nsub,out))
    rout  = (nsub + 1) .- reverse([out...])
    rsysdims  = reverse(sysdims)
    rkeepdims = rsysdims[rkeep]::NTuple{nsub-M,Int}
    routdims  = rsysdims[rout]::NTuple{M,Int}
    N = prod(rkeepdims)::Int
    R = product(range.(1,rkeepdims)...)
    S = product(range.(1,routdims)...)
    B = zeros(T,N,N)
    _ptrace3!(B,A,R,S,nsub,rsysdims,rkeepdims,rkeep,rout)
end

function _ptrace3!(B,A,R,S,nsub,rsysdims,rkeepdims,rkeep,rout)
    K = length(rkeepdims)
    O = nsub - K
    rinds_ii = Vector{Int}(nsub)
    rinds_jj = Vector{Int}(nsub)
    for r in R, q in R
        i, j = tindex2(q,rkeepdims), tindex2(r,rkeepdims)
        #println("i: $i, j: $j")
        for k=1:K
            rinds_ii[rkeep[k]] = q[k]
            rinds_jj[rkeep[k]] = r[k]
        end
        for s in S
            for o=1:O
                rinds_ii[rout[o]] = s[o]
                rinds_jj[rout[o]] = s[o]
            end
            ii, jj = tindex2(rinds_ii,rsysdims), tindex2(rinds_jj,rsysdims)
            #println("ii: $ii, jj: $jj")
            B[i,j] += A[ii,jj]
        end
    end
    return B
end

function tindex2{N}(rinds,rsysdims::NTuple{N,Int})
    i = rinds[1]
    d = 1
    @inbounds for n = 2:N
        d *= rsysdims[n-1]
        i += d * (rinds[n]-1)
    end
    return i
end

#7

There is a package called TensorOperations which does this kind of calculation and many more. It is optimized and given the simplicity of code, it would probably save a lot of time down the line and allow someone else to worry about further optimizations.

The code for ptrace would look like this:

using BenchmarkTools
using TensorOperations

A = rand(2,9,2,2,9,2);

julia> @btime tensortrace(A,[:x1,:t,:z1,:x2,:t,:z2])
  5.668 μs (65 allocations: 5.20 KiB)
2×2×2×2 Array{Float64,4}:
[:, :, 1, 1] =
 5.1795   5.58526
 4.18046  6.27735

[:, :, 2, 1] =
 5.70867  5.40797
 6.11036  4.07155

[:, :, 1, 2] =
 5.26413  5.49429
 4.90246  3.99036

[:, :, 2, 2] =
 5.26287  3.62941
 6.27622  4.9152 

If a 2D matrix is really necessary (not really), then a helper functions would do the job:

function mat(A::Array)
    s,d = size(A),ndims(A)
    s[1:d>>>1] != s[(1:d>>>1) + (d>>>1)] && error("tensor size cannot be matrixfied: $s")
    n = prod(s[1:d>>>1])
    return reshape(A,n,n)
end

julia> @btime mat(tensortrace(A,[:x1,:t,:z1,:x2,:t,:z2]))
  5.924 μs (70 allocations: 5.39 KiB)
4×4 Array{Float64,2}:
 5.1795   5.70867  5.26413  5.26287
 4.18046  6.11036  4.90246  6.27622
 5.58526  5.40797  5.49429  3.62941
 6.27735  4.07155  3.99036  4.9152 

Note the timings/allocations are very similar.


#8

Hey thanks for the reply. I had indeed looked at TensorOperations, but I do need everything in 2D matrix form. I improved the code further (with some help) and managed to get the partial trace even faster:

julia> using Schrodinger, BenchmarkTools
julia> A = rand(36,36); out = (2,); sysdims = (2,9,2);
julia> @benchmark ptrace($A,$out,$sysdims)
BenchmarkTools.Trial:
  memory estimate:  480 bytes
  allocs estimate:  5
  --------------
  minimum time:     1.275 μs (0.00% GC)
  median time:      1.431 μs (0.00% GC)
  mean time:        1.520 μs (2.17% GC)
  maximum time:     181.442 μs (97.48% GC)
  --------------
  samples:          10000
  evals/sample:     10

The code is now available here (the package itself is not fully ready for prime time) for those interested.


#9

Depending on the size of the matrices involved, it might be better to go with TensorOperations. Small tests can obscure the coefficients of the running-time surface. In particular, we have the following for TensorOperations vs. Schrodinger implementation:

A comparable TensorOperations implementation with wrapping functions to make interface like ptrace's:

using TensorOperations
using BenchmarkTools

A = rand(4000,4000); out = (2,); sysdims = (10,40,10);

function tens(A::Matrix,sysdims)
    size(A)==(prod(sysdims),prod(sysdims)) || error("sysdims product must equal row and col number")
    return reshape(A,sysdims...,sysdims...)
end

function mat(A::Array)
    s,d = size(A),ndims(A)
    s[1:d>>>1] != s[(1:d>>>1) + (d>>>1)] && error("tensor size cannot be matrixfied: $s")
    n = prod(s[1:d>>>1])
    return reshape(A,n,n)
end

function traceout(A,out)
    inds = collect(1:ndims(A))
    inds[out+(ndims(A)>>>1)] = inds[out]
    return mat(tensortrace(A,inds))
end

ptrace(A,out,sysdims) = traceout(reshape(A,sysdims...,sysdims...),collect(out))

With benchmark:

julia> @benchmark ptrace($A,$out,$sysdims)
BenchmarkTools.Trial: 
  memory estimate:  83.69 KiB
  allocs estimate:  72
  --------------
  minimum time:     2.559 ms (0.00% GC)
  median time:      2.712 ms (0.00% GC)
  mean time:        2.756 ms (0.11% GC)
  maximum time:     5.178 ms (17.54% GC)
  --------------
  samples:          1810
  evals/sample:     1

Compared to ptrace from Schrodinger.jl:

A = rand(4000,4000); out = (2,); sysdims = (10,40,10);

julia> @benchmark ptrace(A,out,sysdims)
BenchmarkTools.Trial: 
  memory estimate:  78.47 KiB
  allocs estimate:  6
  --------------
  minimum time:     4.812 ms (0.00% GC)
  median time:      5.410 ms (0.00% GC)
  mean time:        5.461 ms (0.03% GC)
  maximum time:     10.425 ms (0.00% GC)
  --------------
  samples:          914
  evals/sample:     1

#10

If the helper functions are confusing, here is a more compact version (with less parameter checking):

using TensorOperations
using BenchmarkTools

A = rand(4000,4000); out = (2,); sysdims = (10,40,10);

mat(A::Array) = ( n = prod(size(A)[1:ndims(A)>>>1]) ; reshape(A,n,n) )

tens(M::Matrix,sysdims) = reshape(M,sysdims...,sysdims...)

traceout(A,out) = ( n = ndims(A)>>>1 ; mat(tensortrace(A,[i-n in out ? i-n : i for i=1:2n])) )

ptrace(M,out,sysdims) = traceout(tens(M,sysdims),out)

@benchmark ptrace($A,$out,$sysdims)