Help with type instabilities

I’m having issues removing type instabilities from the function below, and would appreciate some tips.

A = rand(36,36);
out = [2]; sysdims = [2,9,2];
@code_warntype ptrace2(A,out,sysdims)
function ptrace2{T<:Number}(A::Matrix{T},out,sysdims)
    nsys = length(sysdims)
    keep = setdiff(1:nsys,out)
    keepdims = sysdims[keep]
    outdims = sysdims[out]::Vector{Int}
    N = prod(keepdims)
    R = Base.product(range.(1,keepdims)...)
    S = Base.product(range.(1,outdims)...)
    B = zeros(T,N,N)
    inds_ii = Vector{Int}(nsys)
    inds_jj = Vector{Int}(nsys)
    for r in R, q in R
        i, j = tindex(q,keepdims), tindex(r,keepdims)
        for l=1:length(keep)
            inds_ii[keep[l]] = q[l]
            inds_jj[keep[l]] = r[l]
        end
        for s in S
            for l=1:length(out)
                inds_ii[out[l]] = s[l]
                inds_jj[out[l]] = s[l]
            end
            ii = tindex(inds_ii,sysdims)
            jj = tindex(inds_jj,sysdims)
            B[i,j] += A[ii,jj]
        end
    end
    return B
end

function tindex(inds,sysdims)
    N = length(inds)
    i = inds[N]
    d = 1
    for n = N-1:-1:1
        d *= sysdims[n+1]
        i += d * (inds[n]-1)
    end
    return i
end

In this case a function barrier can help. Here is my attempt (no change of the code except splitting the function into two parts):

function ptrace2{T<:Number}(A::Matrix{T},out,sysdims)
    nsys = length(sysdims)
    keep = setdiff(1:nsys,out)
    keepdims = sysdims[keep]
    outdims = sysdims[out]::Vector{Int}
    N = prod(keepdims)
    R = Base.product(range.(1,keepdims)...)
    S = Base.product(range.(1,outdims)...)
    B = zeros(T,N,N)
    _ptrace2!(B,A,R,S,nsys,sysdims,keep,keepdims,out)
end

function _ptrace2!(B,A,R,S,nsys,sysdims,keep,keepdims,out)
    inds_ii = Vector{Int}(nsys)
    inds_jj = Vector{Int}(nsys)
    for r in R, q in R
        i, j = tindex(q,keepdims), tindex(r,keepdims)
        for l=1:length(keep)
            inds_ii[keep[l]] = q[l]
            inds_jj[keep[l]] = r[l]
        end
        for s in S
            for l=1:length(out)
                inds_ii[out[l]] = s[l]
                inds_jj[out[l]] = s[l]
            end
            ii = tindex(inds_ii,sysdims)
            jj = tindex(inds_jj,sysdims)
            B[i,j] += A[ii,jj]
        end
    end
    return B
end

Using your example data, I get (Julia 0.6.0-rc2) with the barrier:

julia> @btime ptrace2($A, $out, $sysdims);
5.268 μs (25 allocations: 2.09 KiB)

while without the barrier (original code):

julia> @btime ptrace2($A, $out, $sysdims);
57.671 μs (810 allocations: 18.64 KiB)

2 Likes

It seems to me that R = Base.product(range.(1,keepdims)...) is not inferred since the type of Base.product(range.(1,keepdims)...) depends on the expansion, in your casethe type inferred is Base.Iterators.Prod2{UnitRange{Int64},UnitRange{Int64}}.

I think splitting the function is ok so you have a dispatch on the type of R. Maybe someone can give a better solution with a more generic iterator?

Thanks!

Just as an aside, instead of Base.product(range.(1, keepdims)...) you might consider using a CartesianRange(keepdims...) which is designed for the kind of multidimensional indexing that you’re doing.

Unfortunately, CartesianRange also relies on knowing the number of dimensions for type-inference, so it doesn’t solve your immediate issue. But it might at least make your code more understandable and perhaps more performant.

I tried using CartesianRange, but it actually made the function slower…

In any case, this is what I have now, after some more clever permutation of indices, if anyone wants to take a stab at making it faster:

julia> A = rand(36,36); out = (2,); sysdims = (2,9,2);
julia> @btime ptrace3($A,$out,$sysdims)
  4.221 μs (35 allocations: 2.63 KiB)
function ptrace3{T,M,nsub}(A::Matrix{T}, out::NTuple{M,Int}, sysdims::NTuple{nsub,Int})
    rkeep = (nsub + 1) .- reverse(setdiff(1:nsub,out))
    rout  = (nsub + 1) .- reverse([out...])
    rsysdims  = reverse(sysdims)
    rkeepdims = rsysdims[rkeep]::NTuple{nsub-M,Int}
    routdims  = rsysdims[rout]::NTuple{M,Int}
    N = prod(rkeepdims)::Int
    R = product(range.(1,rkeepdims)...)
    S = product(range.(1,routdims)...)
    B = zeros(T,N,N)
    _ptrace3!(B,A,R,S,nsub,rsysdims,rkeepdims,rkeep,rout)
end

function _ptrace3!(B,A,R,S,nsub,rsysdims,rkeepdims,rkeep,rout)
    K = length(rkeepdims)
    O = nsub - K
    rinds_ii = Vector{Int}(nsub)
    rinds_jj = Vector{Int}(nsub)
    for r in R, q in R
        i, j = tindex2(q,rkeepdims), tindex2(r,rkeepdims)
        #println("i: $i, j: $j")
        for k=1:K
            rinds_ii[rkeep[k]] = q[k]
            rinds_jj[rkeep[k]] = r[k]
        end
        for s in S
            for o=1:O
                rinds_ii[rout[o]] = s[o]
                rinds_jj[rout[o]] = s[o]
            end
            ii, jj = tindex2(rinds_ii,rsysdims), tindex2(rinds_jj,rsysdims)
            #println("ii: $ii, jj: $jj")
            B[i,j] += A[ii,jj]
        end
    end
    return B
end

function tindex2{N}(rinds,rsysdims::NTuple{N,Int})
    i = rinds[1]
    d = 1
    @inbounds for n = 2:N
        d *= rsysdims[n-1]
        i += d * (rinds[n]-1)
    end
    return i
end

There is a package called TensorOperations which does this kind of calculation and many more. It is optimized and given the simplicity of code, it would probably save a lot of time down the line and allow someone else to worry about further optimizations.

The code for ptrace would look like this:

using BenchmarkTools
using TensorOperations

A = rand(2,9,2,2,9,2);

julia> @btime tensortrace(A,[:x1,:t,:z1,:x2,:t,:z2])
  5.668 μs (65 allocations: 5.20 KiB)
2×2×2×2 Array{Float64,4}:
[:, :, 1, 1] =
 5.1795   5.58526
 4.18046  6.27735

[:, :, 2, 1] =
 5.70867  5.40797
 6.11036  4.07155

[:, :, 1, 2] =
 5.26413  5.49429
 4.90246  3.99036

[:, :, 2, 2] =
 5.26287  3.62941
 6.27622  4.9152 

If a 2D matrix is really necessary (not really), then a helper functions would do the job:

function mat(A::Array)
    s,d = size(A),ndims(A)
    s[1:d>>>1] != s[(1:d>>>1) + (d>>>1)] && error("tensor size cannot be matrixfied: $s")
    n = prod(s[1:d>>>1])
    return reshape(A,n,n)
end

julia> @btime mat(tensortrace(A,[:x1,:t,:z1,:x2,:t,:z2]))
  5.924 μs (70 allocations: 5.39 KiB)
4×4 Array{Float64,2}:
 5.1795   5.70867  5.26413  5.26287
 4.18046  6.11036  4.90246  6.27622
 5.58526  5.40797  5.49429  3.62941
 6.27735  4.07155  3.99036  4.9152 

Note the timings/allocations are very similar.

Hey thanks for the reply. I had indeed looked at TensorOperations, but I do need everything in 2D matrix form. I improved the code further (with some help) and managed to get the partial trace even faster:

julia> using Schrodinger, BenchmarkTools
julia> A = rand(36,36); out = (2,); sysdims = (2,9,2);
julia> @benchmark ptrace($A,$out,$sysdims)
BenchmarkTools.Trial:
  memory estimate:  480 bytes
  allocs estimate:  5
  --------------
  minimum time:     1.275 μs (0.00% GC)
  median time:      1.431 μs (0.00% GC)
  mean time:        1.520 μs (2.17% GC)
  maximum time:     181.442 μs (97.48% GC)
  --------------
  samples:          10000
  evals/sample:     10

The code is now available here (the package itself is not fully ready for prime time) for those interested.

Depending on the size of the matrices involved, it might be better to go with TensorOperations. Small tests can obscure the coefficients of the running-time surface. In particular, we have the following for TensorOperations vs. Schrodinger implementation:

A comparable TensorOperations implementation with wrapping functions to make interface like ptrace's:

using TensorOperations
using BenchmarkTools

A = rand(4000,4000); out = (2,); sysdims = (10,40,10);

function tens(A::Matrix,sysdims)
    size(A)==(prod(sysdims),prod(sysdims)) || error("sysdims product must equal row and col number")
    return reshape(A,sysdims...,sysdims...)
end

function mat(A::Array)
    s,d = size(A),ndims(A)
    s[1:d>>>1] != s[(1:d>>>1) + (d>>>1)] && error("tensor size cannot be matrixfied: $s")
    n = prod(s[1:d>>>1])
    return reshape(A,n,n)
end

function traceout(A,out)
    inds = collect(1:ndims(A))
    inds[out+(ndims(A)>>>1)] = inds[out]
    return mat(tensortrace(A,inds))
end

ptrace(A,out,sysdims) = traceout(reshape(A,sysdims...,sysdims...),collect(out))

With benchmark:

julia> @benchmark ptrace($A,$out,$sysdims)
BenchmarkTools.Trial: 
  memory estimate:  83.69 KiB
  allocs estimate:  72
  --------------
  minimum time:     2.559 ms (0.00% GC)
  median time:      2.712 ms (0.00% GC)
  mean time:        2.756 ms (0.11% GC)
  maximum time:     5.178 ms (17.54% GC)
  --------------
  samples:          1810
  evals/sample:     1

Compared to ptrace from Schrodinger.jl:

A = rand(4000,4000); out = (2,); sysdims = (10,40,10);

julia> @benchmark ptrace(A,out,sysdims)
BenchmarkTools.Trial: 
  memory estimate:  78.47 KiB
  allocs estimate:  6
  --------------
  minimum time:     4.812 ms (0.00% GC)
  median time:      5.410 ms (0.00% GC)
  mean time:        5.461 ms (0.03% GC)
  maximum time:     10.425 ms (0.00% GC)
  --------------
  samples:          914
  evals/sample:     1

If the helper functions are confusing, here is a more compact version (with less parameter checking):

using TensorOperations
using BenchmarkTools

A = rand(4000,4000); out = (2,); sysdims = (10,40,10);

mat(A::Array) = ( n = prod(size(A)[1:ndims(A)>>>1]) ; reshape(A,n,n) )

tens(M::Matrix,sysdims) = reshape(M,sysdims...,sysdims...)

traceout(A,out) = ( n = ndims(A)>>>1 ; mat(tensortrace(A,[i-n in out ? i-n : i for i=1:2n])) )

ptrace(M,out,sysdims) = traceout(tens(M,sysdims),out)

@benchmark ptrace($A,$out,$sysdims)