Help with type instabilities

jebej · May 24, 2017, 3:21am

I’m having issues removing type instabilities from the function below, and would appreciate some tips.

A = rand(36,36);
out = [2]; sysdims = [2,9,2];
@code_warntype ptrace2(A,out,sysdims)

function ptrace2{T<:Number}(A::Matrix{T},out,sysdims)
    nsys = length(sysdims)
    keep = setdiff(1:nsys,out)
    keepdims = sysdims[keep]
    outdims = sysdims[out]::Vector{Int}
    N = prod(keepdims)
    R = Base.product(range.(1,keepdims)...)
    S = Base.product(range.(1,outdims)...)
    B = zeros(T,N,N)
    inds_ii = Vector{Int}(nsys)
    inds_jj = Vector{Int}(nsys)
    for r in R, q in R
        i, j = tindex(q,keepdims), tindex(r,keepdims)
        for l=1:length(keep)
            inds_ii[keep[l]] = q[l]
            inds_jj[keep[l]] = r[l]
        end
        for s in S
            for l=1:length(out)
                inds_ii[out[l]] = s[l]
                inds_jj[out[l]] = s[l]
            end
            ii = tindex(inds_ii,sysdims)
            jj = tindex(inds_jj,sysdims)
            B[i,j] += A[ii,jj]
        end
    end
    return B
end

function tindex(inds,sysdims)
    N = length(inds)
    i = inds[N]
    d = 1
    for n = N-1:-1:1
        d *= sysdims[n+1]
        i += d * (inds[n]-1)
    end
    return i
end

traktofon · May 24, 2017, 4:25am

In this case a function barrier can help. Here is my attempt (no change of the code except splitting the function into two parts):

function ptrace2{T<:Number}(A::Matrix{T},out,sysdims)
    nsys = length(sysdims)
    keep = setdiff(1:nsys,out)
    keepdims = sysdims[keep]
    outdims = sysdims[out]::Vector{Int}
    N = prod(keepdims)
    R = Base.product(range.(1,keepdims)...)
    S = Base.product(range.(1,outdims)...)
    B = zeros(T,N,N)
    _ptrace2!(B,A,R,S,nsys,sysdims,keep,keepdims,out)
end

function _ptrace2!(B,A,R,S,nsys,sysdims,keep,keepdims,out)
    inds_ii = Vector{Int}(nsys)
    inds_jj = Vector{Int}(nsys)
    for r in R, q in R
        i, j = tindex(q,keepdims), tindex(r,keepdims)
        for l=1:length(keep)
            inds_ii[keep[l]] = q[l]
            inds_jj[keep[l]] = r[l]
        end
        for s in S
            for l=1:length(out)
                inds_ii[out[l]] = s[l]
                inds_jj[out[l]] = s[l]
            end
            ii = tindex(inds_ii,sysdims)
            jj = tindex(inds_jj,sysdims)
            B[i,j] += A[ii,jj]
        end
    end
    return B
end

Using your example data, I get (Julia 0.6.0-rc2) with the barrier:

julia> @btime ptrace2($A, $out, $sysdims);
5.268 μs (25 allocations: 2.09 KiB)

while without the barrier (original code):

julia> @btime ptrace2($A, $out, $sysdims);
57.671 μs (810 allocations: 18.64 KiB)

smldis · May 24, 2017, 6:31am

It seems to me that R = Base.product(range.(1,keepdims)...) is not inferred since the type of Base.product(range.(1,keepdims)...) depends on the expansion, in your casethe type inferred is Base.Iterators.Prod2{UnitRange{Int64},UnitRange{Int64}}.

I think splitting the function is ok so you have a dispatch on the type of R. Maybe someone can give a better solution with a more generic iterator?

jebej · May 24, 2017, 4:29pm

Thanks!

rdeits · May 24, 2017, 6:03pm

Just as an aside, instead of Base.product(range.(1, keepdims)...) you might consider using a CartesianRange(keepdims...) which is designed for the kind of multidimensional indexing that you’re doing.

Unfortunately, CartesianRange also relies on knowing the number of dimensions for type-inference, so it doesn’t solve your immediate issue. But it might at least make your code more understandable and perhaps more performant.

jebej · May 24, 2017, 8:31pm

I tried using CartesianRange, but it actually made the function slower…

In any case, this is what I have now, after some more clever permutation of indices, if anyone wants to take a stab at making it faster:

julia> A = rand(36,36); out = (2,); sysdims = (2,9,2);
julia> @btime ptrace3($A,$out,$sysdims)
  4.221 μs (35 allocations: 2.63 KiB)

function ptrace3{T,M,nsub}(A::Matrix{T}, out::NTuple{M,Int}, sysdims::NTuple{nsub,Int})
    rkeep = (nsub + 1) .- reverse(setdiff(1:nsub,out))
    rout  = (nsub + 1) .- reverse([out...])
    rsysdims  = reverse(sysdims)
    rkeepdims = rsysdims[rkeep]::NTuple{nsub-M,Int}
    routdims  = rsysdims[rout]::NTuple{M,Int}
    N = prod(rkeepdims)::Int
    R = product(range.(1,rkeepdims)...)
    S = product(range.(1,routdims)...)
    B = zeros(T,N,N)
    _ptrace3!(B,A,R,S,nsub,rsysdims,rkeepdims,rkeep,rout)
end

function _ptrace3!(B,A,R,S,nsub,rsysdims,rkeepdims,rkeep,rout)
    K = length(rkeepdims)
    O = nsub - K
    rinds_ii = Vector{Int}(nsub)
    rinds_jj = Vector{Int}(nsub)
    for r in R, q in R
        i, j = tindex2(q,rkeepdims), tindex2(r,rkeepdims)
        #println("i: $i, j: $j")
        for k=1:K
            rinds_ii[rkeep[k]] = q[k]
            rinds_jj[rkeep[k]] = r[k]
        end
        for s in S
            for o=1:O
                rinds_ii[rout[o]] = s[o]
                rinds_jj[rout[o]] = s[o]
            end
            ii, jj = tindex2(rinds_ii,rsysdims), tindex2(rinds_jj,rsysdims)
            #println("ii: $ii, jj: $jj")
            B[i,j] += A[ii,jj]
        end
    end
    return B
end

function tindex2{N}(rinds,rsysdims::NTuple{N,Int})
    i = rinds[1]
    d = 1
    @inbounds for n = 2:N
        d *= rsysdims[n-1]
        i += d * (rinds[n]-1)
    end
    return i
end

Dan · May 27, 2017, 1:45pm

There is a package called TensorOperations which does this kind of calculation and many more. It is optimized and given the simplicity of code, it would probably save a lot of time down the line and allow someone else to worry about further optimizations.

The code for ptrace would look like this:

using BenchmarkTools
using TensorOperations

A = rand(2,9,2,2,9,2);

julia> @btime tensortrace(A,[:x1,:t,:z1,:x2,:t,:z2])
  5.668 μs (65 allocations: 5.20 KiB)
2×2×2×2 Array{Float64,4}:
[:, :, 1, 1] =
 5.1795   5.58526
 4.18046  6.27735

[:, :, 2, 1] =
 5.70867  5.40797
 6.11036  4.07155

[:, :, 1, 2] =
 5.26413  5.49429
 4.90246  3.99036

[:, :, 2, 2] =
 5.26287  3.62941
 6.27622  4.9152

If a 2D matrix is really necessary (not really), then a helper functions would do the job:

function mat(A::Array)
    s,d = size(A),ndims(A)
    s[1:d>>>1] != s[(1:d>>>1) + (d>>>1)] && error("tensor size cannot be matrixfied: $s")
    n = prod(s[1:d>>>1])
    return reshape(A,n,n)
end

julia> @btime mat(tensortrace(A,[:x1,:t,:z1,:x2,:t,:z2]))
  5.924 μs (70 allocations: 5.39 KiB)
4×4 Array{Float64,2}:
 5.1795   5.70867  5.26413  5.26287
 4.18046  6.11036  4.90246  6.27622
 5.58526  5.40797  5.49429  3.62941
 6.27735  4.07155  3.99036  4.9152

Note the timings/allocations are very similar.

jebej · May 27, 2017, 1:54pm

Hey thanks for the reply. I had indeed looked at TensorOperations, but I do need everything in 2D matrix form. I improved the code further (with some help) and managed to get the partial trace even faster:

julia> using Schrodinger, BenchmarkTools
julia> A = rand(36,36); out = (2,); sysdims = (2,9,2);
julia> @benchmark ptrace($A,$out,$sysdims)
BenchmarkTools.Trial:
  memory estimate:  480 bytes
  allocs estimate:  5
  --------------
  minimum time:     1.275 μs (0.00% GC)
  median time:      1.431 μs (0.00% GC)
  mean time:        1.520 μs (2.17% GC)
  maximum time:     181.442 μs (97.48% GC)
  --------------
  samples:          10000
  evals/sample:     10

The code is now available here (the package itself is not fully ready for prime time) for those interested.

Dan · May 27, 2017, 3:05pm

Depending on the size of the matrices involved, it might be better to go with TensorOperations. Small tests can obscure the coefficients of the running-time surface. In particular, we have the following for TensorOperations vs. Schrodinger implementation:

A comparable TensorOperations implementation with wrapping functions to make interface like ptrace’s:

using TensorOperations
using BenchmarkTools

A = rand(4000,4000); out = (2,); sysdims = (10,40,10);

function tens(A::Matrix,sysdims)
    size(A)==(prod(sysdims),prod(sysdims)) || error("sysdims product must equal row and col number")
    return reshape(A,sysdims...,sysdims...)
end

function mat(A::Array)
    s,d = size(A),ndims(A)
    s[1:d>>>1] != s[(1:d>>>1) + (d>>>1)] && error("tensor size cannot be matrixfied: $s")
    n = prod(s[1:d>>>1])
    return reshape(A,n,n)
end

function traceout(A,out)
    inds = collect(1:ndims(A))
    inds[out+(ndims(A)>>>1)] = inds[out]
    return mat(tensortrace(A,inds))
end

ptrace(A,out,sysdims) = traceout(reshape(A,sysdims...,sysdims...),collect(out))

With benchmark:

julia> @benchmark ptrace($A,$out,$sysdims)
BenchmarkTools.Trial: 
  memory estimate:  83.69 KiB
  allocs estimate:  72
  --------------
  minimum time:     2.559 ms (0.00% GC)
  median time:      2.712 ms (0.00% GC)
  mean time:        2.756 ms (0.11% GC)
  maximum time:     5.178 ms (17.54% GC)
  --------------
  samples:          1810
  evals/sample:     1

Compared to ptrace from Schrodinger.jl:

A = rand(4000,4000); out = (2,); sysdims = (10,40,10);

julia> @benchmark ptrace(A,out,sysdims)
BenchmarkTools.Trial: 
  memory estimate:  78.47 KiB
  allocs estimate:  6
  --------------
  minimum time:     4.812 ms (0.00% GC)
  median time:      5.410 ms (0.00% GC)
  mean time:        5.461 ms (0.03% GC)
  maximum time:     10.425 ms (0.00% GC)
  --------------
  samples:          914
  evals/sample:     1

Dan · May 27, 2017, 3:18pm

If the helper functions are confusing, here is a more compact version (with less parameter checking):

using TensorOperations
using BenchmarkTools

A = rand(4000,4000); out = (2,); sysdims = (10,40,10);

mat(A::Array) = ( n = prod(size(A)[1:ndims(A)>>>1]) ; reshape(A,n,n) )

tens(M::Matrix,sysdims) = reshape(M,sysdims...,sysdims...)

traceout(A,out) = ( n = ndims(A)>>>1 ; mat(tensortrace(A,[i-n in out ? i-n : i for i=1:2n])) )

ptrace(M,out,sysdims) = traceout(tens(M,sysdims),out)

@benchmark ptrace($A,$out,$sysdims)

Topic		Replies	Views
Best way to take trace of matrix product in Julia? Performance	17	959	August 3, 2023
Type Instability in performance critical function Performance	17	695	March 15, 2021
Partial trace of multidimensional arrays General Usage	6	2707	January 21, 2017
How far does type instability propagate? General Usage performance , compilation , dispatch	10	450	July 4, 2025
Improving performance elegantly - type stability General Usage type-stability	20	1405	January 13, 2019

Help with type instabilities

Related topics