Performance vs notation tradeoff: DataFrame vs AxisArray vs StaticArray vs?

For reference, let me first put the timings I get for your MWE here:

julia> @btime updateDF(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
  5.683 μs (101 allocations: 7.12 KiB)

julia> @btime updateAA(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
  1.150 μs (16 allocations: 1.73 KiB)

julia> @btime updateSA(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
  5.683 μs (68 allocations: 4.70 KiB)

julia> @btime updateSA2(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
  1.100 μs (16 allocations: 2.17 KiB)

I don’t see any theoretical reason why not. In the end this is just syntax. If you’re okay with dat1."col1" and type piracy, you could use

# `am."name"` yields a view (not a copy) of the column called `name`, so that assigning
# through the result writes into `am` itself.
Base.getproperty(am::AxisMatrix, s::String) = view(am, :, s)

"""
    updateAA2(mat1, id1, colnames1, mat2, id2, colnames2)

Wrap `mat1` and `mat2` in `AxisArray`s (rows indexed by `eachindex(id)`, columns by
name) and update the first one from the second, addressing columns with the
`dat."colname"` syntax.

Rows are matched through `indexin(id2, id1)`. The columns named by entries 1 and 3 of the
first array's column axis are incremented by the same-named columns of the second array,
`"col2"` is incremented wherever the second array's `"col2"` is positive, and `"col4"`
is overwritten. Returns the `AxisArray` built from `mat1`.
"""
function updateAA2(mat1, id1, colnames1, mat2, id2, colnames2)
    target = AxisArray(mat1, Axis{:row}(eachindex(id1)), Axis{:col}(colnames1))
    source = AxisArray(mat2, Axis{:row}(eachindex(id2)), Axis{:col}(colnames2))

    # Position in `id1` of every id listed in `id2`.
    rows = indexin(id2, id1)

    # Additive columns, looked up by name in both arrays.
    for name in target.axes[2][[1, 3]]
        target[rows, name] += @view source[:, name]
    end

    target."col2"[rows] += source."col2" .> 0
    target."col4"[rows] = source."col4"

    return target
end

@btime updateAA2(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
#  1.390 μs (25 allocations: 2.16 KiB)
#  ( 1.200 μs (21 allocations: 2.30 KiB) without the `@view` in `getproperty` but yields incorrect results )

which has only a small performance impact. Be warned that type piracy should be avoided, though. If this is just for a personal project, and you’re fine with the potential (future) risks, then this should be okay. But if, e.g., a newer version of AxisArrays were to implement getproperty(..., ::String) itself, your code and AxisArrays itself would likely break. A safer option would be to create your own wrapper struct, but of course this requires more effort on your part.

[ EDIT: The AxisArray documentation states that “Indexing returns a view into the original data.”, but that does not actually seem to be the case. So we explicitly need to add views. ]

Below, I give some ways to get rid of the quotes, i.e. just write dat1.col2, but this gets a bit more complicated.

dat1.col2 without quotes

You could use the same approach of implementing getproperty, but with a Symbol to use dat1.col2 instead of dat1."col2".

getproperty, without metaprogramming

Without metaprogramming, I could not get to the same level of performance:

# `am.data` and `am.axes` keep returning the real fields; every other property name is
# treated as a column name and returns a view of that column.
function Base.getproperty(am::AxisMatrix, symb::Symbol)
    symb === :data && return getfield(am, :data)
    symb === :axes && return getfield(am, :axes)

    # A view (rather than plain indexing) is required so that writes reach `am`.
    # You could also first check if string(symb) in getfield(am, :axes)[2].val
    # and return getfield(am, symb) if not. This will be a bit slower, though.
    return view(am, :, string(symb))
end

"""
    updateAA3(mat1, id1, colnames1, mat2, id2, colnames2)

Same update as the `dat."colname"` variant, but columns are addressed as `dat.colname`,
which relies on a `Base.getproperty(::AxisMatrix, ::Symbol)` method that maps property
names to columns.

Rows are matched through `indexin(id2, id1)`. The columns named by entries 1 and 3 of the
first array's column axis are incremented by the same-named columns of the second array,
`col2` is incremented wherever the second array's `col2` is positive, and `col4` is
overwritten. Returns the `AxisArray` built from `mat1`.
"""
function updateAA3(mat1, id1, colnames1, mat2, id2, colnames2)
    target = AxisArray(mat1, Axis{:row}(eachindex(id1)), Axis{:col}(colnames1))
    source = AxisArray(mat2, Axis{:row}(eachindex(id2)), Axis{:col}(colnames2))

    # Position in `id1` of every id listed in `id2`.
    rows = indexin(id2, id1)

    # Additive columns, looked up by name in both arrays.
    for name in target.axes[2][[1, 3]]
        target[rows, name] += @view source[:, name]
    end

    target.col2[rows] += source.col2 .> 0
    target.col4[rows] = source.col4

    return target
end

@btime updateAA3(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
  5.367 μs (81 allocations: 3.79 KiB)

Note that this is even more dangerous type piracy, as we are now explicitly relying on an AxisMatrix having precisely the fields data and axes. A slightly safer option would be to use

# Property access that prefers column names: if `symb` names a column, return a view of
# that column; otherwise fall back to ordinary field access.
function Base.getproperty(am::AxisMatrix, symb::Symbol)
    colname = string(symb)
    colnames = getfield(am, :axes)[2].val

    # Not a column: behave like the default `getproperty`.
    colname in colnames || return getfield(am, symb)

    return view(am, :, colname)
end

@btime updateAA(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
#  16.700 μs (218 allocations: 7.36 KiB)

@btime updateAA3(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
#  19.900 μs (269 allocations: 8.76 KiB)

but this will slow down every getproperty call, also the normal ones (e.g. am.data).

The problem in the first approach seems to be that the conversion from a Symbol to a String takes too long.

getproperty, with metaprogramming

We can basically cache the string(:col1) etc. results. As long as you don’t have too many columns, and use them sufficiently often (to offset the longer compilation time), this should be fine. Though metaprogramming also comes with a bit of a warning. But in contrast to type piracy, there’s definitely a time and place for metaprogramming (e.g. AxisArrays relies on it).

# Lift the property name into the type domain so that the `Val`-based methods can be
# selected by dispatch.
Base.getproperty(am::AxisMatrix, symb::Symbol) = Base.getproperty(am, Val(symb))

# The two real fields are read directly with `getfield`, bypassing the column lookup.
function Base.getproperty(am::AxisMatrix, ::Val{:axes})
    return getfield(am, :axes)
end

function Base.getproperty(am::AxisMatrix, ::Val{:data})
    return getfield(am, :data)
end

# For any other property name, generate a body with the column name already converted to
# a string literal, so `string(symb)` runs at code-generation time instead of per call.
@generated function Base.getproperty(am::AxisMatrix, ::Val{symb}) where {symb}
    colname = string(symb)
    return quote
        view(am, :, $colname)
    end
end

@btime updateAA3(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
  1.340 μs (25 allocations: 2.16 KiB)
Using a macro

Since we basically just want to rewrite dat1.col2[4] to dat1[4, "col2"], we can use a macro for this. This way, we don’t have to commit any type piracy.

# True when `x` is a field-access expression `lhs.name` (a `:.` node whose second
# argument is a quoted symbol). Broadcast calls such as `f.(x)` do not match.
function _is_dot_field(x)
    return x isa Expr &&
           x.head === :. &&
           length(x.args) == 2 &&
           x.args[2] isa QuoteNode &&
           x.args[2].value isa Symbol
end

"""
    di!(ex)

Rewrite a "dot indexing" expression in place and return `ex`.

- `am.col1[i...]` becomes `am[i..., "col1"]`
- `am.col1` becomes `am[:, "col1"]` (only when `am` is a plain symbol)

Any other input, including non-`Expr` values such as a bare symbol or a literal, and
ordinary indexing like `a[1]`, is returned untouched instead of raising an error.
"""
function di!(ex)  # di from dot indexing
    # Symbols and literals have no `head`/`args`; there is nothing to rewrite.
    ex isa Expr || return ex

    if ex.head === :ref && !isempty(ex.args) && _is_dot_field(ex.args[1])
        # am.col1[4] -> am[4, "col1"]
        dot = ex.args[1]
        symbol_string = string(dot.args[2].value)

        ex.args[1] = dot.args[1]
        push!(ex.args, symbol_string)

    elseif _is_dot_field(ex) && ex.args[1] isa Symbol
        # am.col1 -> am[:, "col1"]
        symbol_string = string(ex.args[2].value)

        ex.head = :ref
        ex.args[2] = Symbol(":")
        push!(ex.args, symbol_string)
    end

    return ex
end

# `@di(expr)`: apply the `di!` rewrite to `expr` (modified in place) and hand the result
# back escaped, so that the names in it resolve in the caller's scope.
macro di(expr)
    di!(expr)
    escaped = esc(expr)
    return escaped
end

"""
    updateAA4(mat1, id1, colnames1, mat2, id2, colnames2)

Same update as the other `AxisArray` variants, with the `dat.colname` accesses written
through the `@di` macro, so they are rewritten to name-based indexing at macro-expansion
time.

Rows are matched through `indexin(id2, id1)`. The columns named by entries 1 and 3 of the
first array's column axis are incremented by the same-named columns of the second array,
`col2` is incremented wherever the second array's `col2` is positive, and `col4` is
overwritten. Returns the `AxisArray` built from `mat1`.
"""
function updateAA4(mat1, id1, colnames1, mat2, id2, colnames2)
    target = AxisArray(mat1, Axis{:row}(eachindex(id1)), Axis{:col}(colnames1))
    source = AxisArray(mat2, Axis{:row}(eachindex(id2)), Axis{:col}(colnames2))

    # Position in `id1` of every id listed in `id2`.
    rows = indexin(id2, id1)

    # Additive columns, looked up by name in both arrays.
    for name in target.axes[2][[1, 3]]
        target[rows, name] += @view source[:, name]
    end

    @di(target.col2[rows]) += @di(source.col2) .> 0
    @di(target.col4[rows]) = @di(source.col4)

    return target
end

@btime updateAA4(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
  1.080 μs (18 allocations: 1.92 KiB)

I restarted Julia before this @btime, i.e. the previous getproperty implementation no longer applies. You could probably write di! in a more elegant fashion (I’m not claiming to be an expert in any of this).

[ EDIT: no view is needed here, as we are just rewriting the code (no function returns involved). ]

Seeing as the point of a StaticArray is that the size is known to the compiler (i.e. before runtime), the short answer is no. If there are only a few possible sizes, it could make sense to use Val, though.

For the same reason, I would again say no. If you don’t value the .col1 or [:, "col1"] indexing too much (like in updateSA; or if you are prepared to use something like a Dict to translate from column_name::String to column_index::Int), I’d just use a simple Matrix.

"""
    updateM(mat1, id1, colnames1, mat2, id2, colnames2)

Update `mat1` in place from `mat2` and return `mat1`.

Row `k` of `mat2` is applied to the row of `mat1` whose position in `id1` holds `id2[k]`.
Columns are paired by name: column `j` of `mat1` (named `colnames1[j]`) is paired with
the column of `mat2` that has the same name in `colnames2`.

- columns 1 and 3 of `mat1`: the paired `mat2` column is added;
- column 2: incremented by one wherever the paired `mat2` column is positive;
- column 4: overwritten with the paired `mat2` column.

# Throws
- `ArgumentError` if an id in `id2` does not occur in `id1`, or if one of the first four
  names in `colnames1` does not occur in `colnames2`. Both conditions are checked before
  `mat1` is written to, so these failures leave `mat1` unchanged.
"""
function updateM(mat1, id1, colnames1, mat2, id2, colnames2)  # (Note: better to write updateM!)
    # `indexin` yields `nothing` for entries without a match; reject those up front
    # instead of failing inside an indexing call after part of `mat1` was updated.
    idxr = indexin(id2, id1)
    any(isnothing, idxr) &&
        throw(ArgumentError("every id in `id2` must also occur in `id1`"))

    idxcol2 = indexin(colnames1, colnames2)
    any(isnothing, view(idxcol2, 1:4)) &&
        throw(ArgumentError("the first four names in `colnames1` must occur in `colnames2`"))

    # Additive columns.
    for c in (1, 3)
        mat1[idxr, c] += @view mat2[:, idxcol2[c]]
    end

    mat1[idxr, 2] += @view(mat2[:, idxcol2[2]]) .> 0
    mat1[idxr, 4] = @view mat2[:, idxcol2[4]]

    return mat1
end

@btime updateM(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
#  985.714 ns (20 allocations: 2.28 KiB)

[ EDIT: added @views for better performance. ]

2 Likes