Partial trace of multidimensional arrays

I’m trying to improve the implementation of partial traces in QuantumOptics.jl that I’ve written some time ago. At that time Base.Cartesian was the recommended approach. However, it seems that now one could use iteration over a CartesianRange and instead. According to Tim Holy’s blog post this should make the code simpler while being for many things just as fast as before. After trying it out it indeed seems to lead to much shorter and nicer code. However, I just can’t get anywhere close to the speed I managed to obtain with the Base.Cartesian version. It would be nice if anyone could give me some pointers at how this implementation can be improved.

First of all let me define the problem as concise as I can. The objects that we work with are of the form x::Array{Complex128, 2s} and have the constraint that size(x) = (N1,…Ns,N1,…,Ns). The partial trace over the index k is then y[:,:,…,:] = \sum_{ik=1}^{Nk} x[:,…,:,ik,:,…,:,ik,:…,:] where the index ik is here filled in at position k and (k+s).

For my tests I simplified the problem so that the array has always rank 6 and only the first index together with the 4th index are traced out. My two approaches using CartesianRange iteration look like the following

  • Version 1 (Iterate over everything but only add term if the first and 4th index are the same):
function ptrace_cartesian(x::Array{Complex128, 6})
    n1, n2, n3 = size(x)
    y = zeros(Complex128, 1, n2, n3, 1, n2, n3)
    ymax = CartesianIndex(size(y))
    for I in CartesianRange(size(x))
        if I.I[1] != I.I[4]
            continue
        end
        y[min(ymax, I)] += x[I]
    end
    reshape(y, n2, n3, n2, n3)
end
  • Version 2 (Iterate only over necessary indices)
function ptrace_cartesian2(x::Array{Complex128, 6})
    n1, n2, n3 = size(x)
    y = zeros(Complex128, 1, n2, n3, 1, n2, n3)
    for I in CartesianRange(size(y))
        for k in CartesianRange((n1, 1, 1))
            delta = CartesianIndex([[k.I...]; [k.I...]]...)
            y[I] += x[I+delta-1]
        end
    end
    reshape(y, n2, n3, n2, n3)
end

For me version 1 is nearly twice as fast as version 2 but both of them are two orders of magnitude slower than the Base.Cartesian implementation. The script below can be used to replicate these results and provides a few other implementations:

using Base.Cartesian

N1 = 23
N2 = 17
N3 = 19

srand(1)
x = rand(Complex128, N1, N2, N3, N1, N2, N3)


function ptrace_forloops(x)
    n1, n2, n3 = size(x)
    y = zeros(Complex128, n2, n3, n2, n3)
    for i5=1:n3
        for i4=1:n2
            for i3=1:n3
                for i2=1:n2
                    for i1=1:n1
                        y[i2,i3,i4,i5] += x[i1,i2,i3,i1,i4,i5]
                    end
                end
            end
        end
    end
    y
end

function ptrace_slicing(x::Array{Complex128, 6})
    n1, n2, n3 = size(x)
    y = zeros(Complex128, n2, n3, n2, n3)
    for i1=1:n1
        y += x[i1,:,:,i1,:,:]
    end
    y
end

function ptrace_cartesian(x::Array{Complex128, 6})
    n1, n2, n3 = size(x)
    y = zeros(Complex128, 1, n2, n3, 1, n2, n3)
    ymax = CartesianIndex(size(y))
    for I in CartesianRange(size(x))
        if I.I[1] != I.I[4]
            continue
        end
        y[min(ymax, I)] += x[I]
    end
    reshape(y, n2, n3, n2, n3)
end

function ptrace_cartesian2(x::Array{Complex128, 6})
    n1, n2, n3 = size(x)
    y = zeros(Complex128, 1, n2, n3, 1, n2, n3)
    for I in CartesianRange(size(y))
        for k in CartesianRange((n1, 1, 1))
            delta = CartesianIndex(k, k)
            y[I] += x[I+delta-1]
        end
    end
    reshape(y, n2, n3, n2, n3)
end

# Partial trace for dense operators.
function _strides(shape::Vector{Int})
    N = length(shape)
    S = zeros(Int, N)
    S[N] = 1
    for m=N-1:-1:1
        S[m] = S[m+1]*shape[m+1]
    end
    return S
end

@generated function _ptrace(a::Matrix{Complex128},
                                  shape_l::Vector{Int}, shape_r::Vector{Int},
                                  indices::Vector{Int})
    return quote
        a_strides_l = _strides(shape_l)
        result_shape_l = deepcopy(shape_l)
        result_shape_l[indices] = 1
        result_strides_l = _strides(result_shape_l)
        a_strides_r = _strides(shape_r)
        result_shape_r = deepcopy(shape_r)
        result_shape_r[indices] = 1
        result_strides_r = _strides(result_shape_r)
        N_result_l = prod(result_shape_l)
        N_result_r = prod(result_shape_r)
        result = zeros(Complex128, N_result_l, N_result_r)
        @nexprs 1 (d->(Jr_{3}=1;Ir_{3}=1))
        @nloops 3 ir (d->1:shape_r[d]) (d->(Ir_{d-1}=Ir_d; Jr_{d-1}=Jr_d)) (d->(Ir_d+=a_strides_r[d]; if !(d in indices) Jr_d+=result_strides_r[d] end)) begin
            @nexprs 1 (d->(Jl_{3}=1;Il_{3}=1))
            @nloops 3 il (k->1:shape_l[k]) (k->(Il_{k-1}=Il_k; Jl_{k-1}=Jl_k; if (k in indices && il_k!=ir_k) Il_k+=a_strides_l[k]; continue end)) (k->(Il_k+=a_strides_l[k]; if !(k in indices) Jl_k+=result_strides_l[k] end)) begin
                #println("Jl_0: ", Jl_0, "; Jr_0: ", Jr_0, "; Il_0: ", Il_0, "; Ir_0: ", Ir_0)
                result[Jl_0, Jr_0] += a[Il_0, Ir_0]
            end
        end
        return result
    end
end

function ptrace_nloop(x)
    n1, n2, n3 = size(x)
    n = n1*n2*n3
    x = reshape(x, n, n)
    y = _ptrace(x, [n3,n2,n1], [n3,n2,n1], [3])
    reshape(y, n2, n3, n2, n3)
end

dist(x,y) = sum(abs(x-y))
result = ptrace_forloops(x)

println(dist(result, ptrace_slicing(x)))
println(dist(result, ptrace_cartesian(x)))
println(dist(result, ptrace_cartesian2(x)))
println(dist(result, ptrace_nloop(x)))


println("Explicit loops")
@time ptrace_forloops(x)
@time ptrace_forloops(x)

println("Slicing")
@time ptrace_slicing(x)
@time ptrace_slicing(x)

println("Cartesian Index")
@time ptrace_cartesian(x)
@time ptrace_cartesian(x)

println("Cartesian Index 2")
@time ptrace_cartesian2(x)
@time ptrace_cartesian2(x)

println("nloop")
@time ptrace_nloop(x)
@time ptrace_nloop(x)
1 Like

I think you’re running into a couple of issues. The first looks like it might be this one: https://github.com/JuliaLang/julia/issues/18774 which causes indexing arrays with > 5 dimensions to be very slow. Your nloops code is computing linear indices into the arrays, so it doesn’t trigger that issue. Fortunately, this should be much better in julia 0.6. Have you tried this on the 0.6 nightly builds?

Also, I think that one problem with your second CartesianIndex implementation is that you are internally doing: [[k.I...]; [k.I...]]... which has to create a (actually several) brand-new arrays at every loop operation. That’s going to waste a lot of both time and memory. Fortunately, it’s easy to avoid by just doing something like y[I] += x[k, I.I[1], I.I[2], k, I.I[3], I.I[4]]. An @generated function can help write that syntax out for an arbitrary number of dimensions (so you won’t need different code for each array size). I can help with that if you end up going that direction.

Splatting those arrays is likely the biggest problem. Even better than building a generated function yourself, you can just pass multiple CartesianIndexes to the CartesianIndex constructor directly:

julia> CartesianIndex(CartesianIndex(1,2), 3, CartesianIndex(4,5,6))
CartesianIndex{6}((1,2,3,4,5,6))

Thank you both for the helpful comments. Changing the second CartesianIndex implementation to use the very nice composition syntax does indeed improve the speed! It is now as fast as the first CartesianIndex version but still much slower then the generated function with nloop. I will install julia 0.6 tomorrow to see if this really has to do something with the bug mentioned by rdeits, which definitely looks plausible.

Here is the changed version:

function ptrace_cartesian2(x::Array{Complex128, 6})
    n1, n2, n3 = size(x)
    y = zeros(Complex128, 1, n2, n3, 1, n2, n3)
    for I in CartesianRange(size(y))
        for k in CartesianRange((n1, 1, 1))
            delta = CartesianIndex(k, k)
            y[I] += x[I+delta-1]
        end
    end
    reshape(y, n2, n3, n2, n3)
end

The measured times are now:

Explicit loops
  0.119383 seconds (266.94 k allocations: 6.883 MB)
  0.126240 seconds (266.81 k allocations: 6.876 MB)
Slicing
  0.126794 seconds (266.84 k allocations: 7.782 MB)
  0.134372 seconds (266.84 k allocations: 7.782 MB, 8.53% gc time)
Cartesian Index
  0.369049 seconds (830.09 k allocations: 21.804 MB, 1.91% gc time)
  0.366026 seconds (830.09 k allocations: 21.804 MB, 1.38% gc time)
Cartesian Index 2
  0.370643 seconds (830.09 k allocations: 21.804 MB, 1.03% gc time)
  0.372281 seconds (830.09 k allocations: 21.804 MB, 1.10% gc time)
nloop
  0.001896 seconds (23 allocations: 94.953 KB)
  0.001701 seconds (23 allocations: 94.953 KB)

Just a note, your _ptrace implementation does not need to be a generated function. You can just write this as a normal function as is (remove the return quote).

Just FYI, on 0.6 I see millisecond-scale timing from ptrace_cartesian2.

Using the development branch indeed leads to amazing improvements!

From

Explicit loops
  4.105260 seconds (21.60 M allocations: 550.817 MB, 0.72% gc time)
  4.223228 seconds (21.60 M allocations: 550.810 MB, 0.87% gc time)
Slicing
  4.077188 seconds (21.60 M allocations: 624.046 MB, 2.23% gc time)
  3.985951 seconds (21.60 M allocations: 624.046 MB, 0.93% gc time)
Cartesian Index
 12.394154 seconds (67.19 M allocations: 1.718 GB, 1.22% gc time)
 12.428420 seconds (67.19 M allocations: 1.718 GB, 0.81% gc time)
Cartesian Index 2
 12.448843 seconds (67.19 M allocations: 1.718 GB, 0.81% gc time)
 12.365005 seconds (67.19 M allocations: 1.718 GB, 0.82% gc time)
nloop
  0.105678 seconds (23 allocations: 1.594 MB)
  0.109795 seconds (23 allocations: 1.594 MB, 1.47% gc time)

to

Explicit loops
  0.041595 seconds (108 allocations: 1.599 MB)
  0.042890 seconds (7 allocations: 1.592 MB)
Slicing
  0.052692 seconds (283 allocations: 74.832 MB, 20.73% gc time)
  0.042853 seconds (283 allocations: 74.832 MB, 2.01% gc time)
Cartesian Index
  0.258004 seconds (28 allocations: 1.593 MB)
  0.257404 seconds (28 allocations: 1.593 MB)
Cartesian Index 2
  0.209153 seconds (28 allocations: 1.593 MB)
  0.207769 seconds (28 allocations: 1.593 MB)
nloop
  0.103190 seconds (23 allocations: 1.594 MB)
  0.104168 seconds (23 allocations: 1.594 MB)

Finally the hard coded loops are faster then the generic Basic.Cartesian version. The cartesian index function is still the slowest but now only by a factor of 2 compared to a factor of 120 before. I guess I would be content to trade the speed for the much easier implementation. Maybe anyone has some other ideas for further improvements?