Performance difference in permuting Arrays

Hi guys,

I am trying to permute/shuffle a Vector of Vectors given a certain index. Some vectors can occur several times, so I need a buffer container for no allocations. MWE:

vals = [ [1, 2, 3], [4, 5, 6], [7, 8, 9] ]
buffer = [ [0,0,0], [0,0,0], [0,0,0] ]
index = [2, 1, 1]

@inline function _shuffle!(vals, buffer, index)
        # First assign new vector order in buffer
        @inbounds @simd for iter in eachindex(vals)
                buffer[iter] .= vals[index[iter]]
        end
        # Then input in vals
        @inbounds @simd for iter in eachindex(vals)
                vals[iter] .= buffer[iter]
        end
        return nothing
end
_shuffle!(vals, buffer, index)
vals #now has form [[4, 5, 6], [1, 2, 3], [1, 2, 3]]

Now, I need th .= copy syntax as otherwise I get issues with referencing, and I figured I could just write out the loop completely to get rid of that. MWE:

@inline function _shuffle2!(vals, buffer, index)
        # First sort new order in buffer
        MaxIter = length(vals[1])
        @inbounds @simd for iter in eachindex(vals)
                index_current = index[iter]
                @inbounds @simd for idx in Base.OneTo(MaxIter)
                        buffer[iter][idx] = vals[index_current][idx]
                end
        end
        # Then input in vals
        @inbounds @simd for iter in eachindex(vals)
                @inbounds @simd for idx in Base.OneTo(MaxIter)
                        vals[iter][idx] = buffer[iter][idx]
                end
        end
        return nothing
end
vals = [ [1, 2, 3], [4, 5, 6], [7, 8, 9] ]
buffer = [ [0,0,0], [0,0,0], [0,0,0] ]
index = [2, 1, 1]
_shuffle2!(vals, buffer, index)
vals #now has form [[4, 5, 6], [1, 2, 3], [1, 2, 3]]

Now, I assumed that _shuffle2! has to be much faster in this case, but its considerably slower:

using BenchmarkTools
Ncols = 1000
Nrows = 1000
vals = [ rand(1:10, Ncols) for _ in Base.OneTo(Nrows)]
buffer = [ rand(1:10, Ncols) for _ in Base.OneTo(Nrows)]
index = rand(1:Ncols, Nrows)

@btime _shuffle!($vals, $buffer, $index)  # 693.400 μs (0 allocations: 0 bytes)
@btime _shuffle2!($vals, $buffer, $index) # 1.218 ms (0 allocations: 0 bytes)

Now my question is, why? Is there a way to make the unenrolled loop faster?

Here is a modified MWE (simplified, because I didn’t want to look into the intricacies of SIMD):

function _shuffle0!(vals, buffer, index)
    # First assign new vector order in buffer
    for iter in eachindex(vals)
        buffer[iter] = vals[index[iter]]
    end
    # Then input in vals
    for iter in eachindex(vals)
        vals[iter] = buffer[iter]
    end
    return nothing
end

function _shuffle1!(vals, buffer, index)
    # First assign new vector order in buffer
    for iter in eachindex(vals)
        buffer[iter] .= vals[index[iter]]
    end
    # Then input in vals
    for iter in eachindex(vals)
        vals[iter] .= buffer[iter]
    end
    return nothing
end

function _shuffle2!(vals, buffer, index)
    # First sort new order in buffer
    MaxIter = length(vals[1])
    for iter in eachindex(vals)
        index_current = index[iter]
        for idx in Base.OneTo(MaxIter)
            buffer[iter][idx] = vals[index_current][idx]
        end
    end
    # Then input in vals
    for iter in eachindex(vals)
        for idx in Base.OneTo(MaxIter)
            vals[iter][idx] = buffer[iter][idx]
        end
    end
    return nothing
end

function example(f!)
    vals = [ [1, 2, 3], [4, 5, 6], [7, 8, 9] ]
    buffer = [ [0,0,0], [0,0,0], [0,0,0] ]
    index = [2, 1, 1]
    f!(vals, buffer, index)
    println(vals)
end

example(_shuffle0!)
example(_shuffle0!)
example(_shuffle0!)

using BenchmarkTools

@btime _shuffle0!(vals, buffer, index) setup=(
    Ncols = 1000;
    Nrows = 1000;
    vals = [ rand(1:10, Ncols) for _ in Base.OneTo(Nrows)];
    buffer = [ rand(1:10, Ncols) for _ in Base.OneTo(Nrows)];
    index = rand(1:Ncols, Nrows)
)

@btime _shuffle1!(vals, buffer, index) setup=(
    Ncols = 1000;
    Nrows = 1000;
    vals = [ rand(1:10, Ncols) for _ in Base.OneTo(Nrows)];
    buffer = [ rand(1:10, Ncols) for _ in Base.OneTo(Nrows)];
    index = rand(1:Ncols, Nrows)
)

@btime _shuffle2!(vals, buffer, index) setup=(
    Ncols = 1000;
    Nrows = 1000;
    vals = [ rand(1:10, Ncols) for _ in Base.OneTo(Nrows)];
    buffer = [ rand(1:10, Ncols) for _ in Base.OneTo(Nrows)];
    index = rand(1:Ncols, Nrows)
)

Console output:

[[4, 5, 6], [1, 2, 3], [1, 2, 3]]
[[4, 5, 6], [1, 2, 3], [1, 2, 3]]
[[4, 5, 6], [1, 2, 3], [1, 2, 3]]
  1.970 μs (0 allocations: 0 bytes)
  1.448 ms (0 allocations: 0 bytes)
  3.059 ms (0 allocations: 0 bytes)

Profiling with code like

Ncols = 1000;
Nrows = 1000;
vals = [ rand(1:10, Ncols) for _ in Base.OneTo(Nrows)];
buffer = [ rand(1:10, Ncols) for _ in Base.OneTo(Nrows)];
index = rand(1:Ncols, Nrows)

using Profile
Profile.clear()
_shuffle1!(vals, buffer, index)
vals, buffer = (buffer, vals)
@profile (for i in 1:100; global vals; global buffer; _shuffle1!(vals, buffer, index); vals, buffer = (buffer, vals); end)
Juno.profiler()

shows the majority of the runtime of _shuffle2! is spent in iterate, getindex and setindex!, whereas _shuffle1! is dominated by broadcast.jl's materialize

@inline function materialize!(dest, bc::Broadcasted{Style}) where {Style}
    return materialize!(combine_styles(dest, bc), dest, bc)
end

which later resolves to a memmove.

What prevents you from using something naive like _shuffle0?

1 Like

Thank you for the input! The _shuffle0 version creates referencing/pointer issues for my actualy code, so I have to either copy .= or actually change every single element in the array.

It is failing to hoist the vector loads out of the loop. Doing so manually fixes the performance problem:

@inline function _shuffle3!(vals, buffer, index)
    # First sort new order in buffer
    MaxIter = length(vals[1])
    @inbounds for iter in eachindex(vals)
        vi = vals[index[iter]]
        bi = buffer[iter]
        @simd for idx in Base.OneTo(MaxIter)
            bi[idx] = vi[idx]
        end
    end
    # Then input in vals
    @inbounds for iter in eachindex(vals)
        vi = vals[iter]
        bi = buffer[iter]
        @simd for idx in Base.OneTo(MaxIter)
            vi[idx] = bi[idx]
        end
    end
    return nothing
end
1 Like

Thank you! I would not have been able to solve this myself.