For reference, let me first put the timings I get for your MWE here:
julia> @btime updateDF(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
5.683 μs (101 allocations: 7.12 KiB)
julia> @btime updateAA(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
1.150 μs (16 allocations: 1.73 KiB)
julia> @btime updateSA(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
5.683 μs (68 allocations: 4.70 KiB)
julia> @btime updateSA2(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
1.100 μs (16 allocations: 2.17 KiB)
I don’t see any theoretical reason why not. In the end this is just syntax. If you’re okay with dat1."col1" and type piracy, you could use
# Enables `am."colname"`: string property access selects the column with that
# name. NOTE: this is type piracy (neither `getproperty` nor `AxisMatrix` is
# ours). A view is returned rather than the result of plain indexing so that
# `am."col"[rows] = ...` writes through to `am`.
Base.getproperty(am::AxisMatrix, s::String) = view(am, :, s)
"""
    updateAA2(mat1, id1, colnames1, mat2, id2, colnames2)

Update the rows of `mat1` that correspond to the ids in `id2`, using the values
in `mat2`, with columns addressed by name through `AxisArray` wrappers and the
`dat."colname"` property syntax.

Rows are matched with `indexin(id2, id1)`. For the matched rows:
- the columns named by entries 1 and 3 of the column axis get the same-named
  column of `mat2` added to them;
- `"col2"` is incremented by one wherever the `"col2"` value in `mat2` is positive;
- `"col4"` is overwritten with the `"col4"` column of `mat2`.

Returns the `AxisArray` built around `mat1`.

Requires the `getproperty(::AxisMatrix, ::String)` method to be defined.
NOTE(review): the benchmark passes `copy(mat1)`, which suggests the wrapper
shares memory with `mat1` and the update writes through to it — confirm.
NOTE(review): ids in `id2` that are missing from `id1` make `indexin` return
`nothing`; that case is presumably unsupported.
"""
function updateAA2(mat1, id1, colnames1, mat2, id2, colnames2)
    target = AxisArray(mat1, Axis{:row}(eachindex(id1)), Axis{:col}(colnames1))
    source = AxisArray(mat2, Axis{:row}(eachindex(id2)), Axis{:col}(colnames2))

    # Position in `id1` of every id listed in `id2`.
    rows = indexin(id2, id1)

    # Additive columns, looked up by name so that `mat2` may order them differently.
    for colname in target.axes[2][[1, 3]]
        target[rows, colname] += view(source, :, colname)
    end

    target."col2"[rows] += source."col2" .> 0
    target."col4"[rows] = source."col4"
    return target
end
@btime updateAA2(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
# 1.390 μs (25 allocations: 2.16 KiB)
# ( 1.200 μs (21 allocations: 2.30 KiB) without the `@view` in `getproperty` but yields incorrect results )
which has only a small performance impact. Be warned that type piracy should be avoided, though. If this is just for a personal project, and you’re fine with the potential (future) risks, then this should be okay. But if, e.g., a newer version of AxisArrays were to implement getproperty(..., ::String) itself, your code and AxisArrays itself would likely break. A safer option would be to create your own wrapper struct, but of course this requires more effort on your part.
[ EDIT: The AxisArray documentation states that “Indexing returns a view into the original data.”, but that does not actually seem to be the case. So we explicitly need to add views. ]
Below, I give some ways to get rid of the quotes, i.e. just write dat1.col2, but this gets a bit more complicated.
dat1.col2 without quotes
You could use the same approach of implementing getproperty, but with a Symbol to use dat1.col2 instead of dat1."col2".
getproperty, without metaprogramming
Without metaprogramming, I could not get to the same level of performance:
# Enables `am.colname` (no quotes). The two field names used here, `data` and
# `axes`, are served first via `getfield`; any other symbol is interpreted as a
# column name. NOTE: type piracy, and it hard-codes the field names `data` and
# `axes`.
function Base.getproperty(am::AxisMatrix, symb::Symbol)
    symb === :data && return getfield(am, :data)
    symb === :axes && return getfield(am, :axes)
    # A view (not plain indexing) so that `am.col[rows] = ...` writes through.
    # You could also first check if string(symb) in getfield(am, :axes)[2].val
    # and return getfield(am, symb) if not. This will be a bit slower, though.
    return view(am, :, string(symb))
end
"""
    updateAA3(mat1, id1, colnames1, mat2, id2, colnames2)

Same update as the other `updateAA*` variants, written with the quote-free
`dat.colname` property syntax.

Rows of `mat1` are matched to `id2` with `indexin(id2, id1)`. For those rows,
the columns named by entries 1 and 3 of the column axis get the same-named
column of `mat2` added, `col2` is incremented by one wherever `mat2`'s `col2`
is positive, and `col4` is overwritten with `mat2`'s `col4`.

Returns the `AxisArray` built around `mat1`.

Requires a `getproperty(::AxisMatrix, ::Symbol)` method that maps column names
to column views while still serving `.axes`.
"""
function updateAA3(mat1, id1, colnames1, mat2, id2, colnames2)
    dst = AxisArray(mat1, Axis{:row}(eachindex(id1)), Axis{:col}(colnames1))
    src = AxisArray(mat2, Axis{:row}(eachindex(id2)), Axis{:col}(colnames2))

    # Position in `id1` of every id listed in `id2`.
    rows = indexin(id2, id1)

    # Additive columns, addressed by name.
    for colname in dst.axes[2][[1, 3]]
        dst[rows, colname] += view(src, :, colname)
    end

    dst.col2[rows] += src.col2 .> 0
    dst.col4[rows] = src.col4
    return dst
end
@btime updateAA3(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
5.367 μs (81 allocations: 3.79 KiB)
Note this is even more dangerous type piracy, as we are now explicitly relying on an AxisMatrix having precisely the fields data and axes. A slightly safer option would be to use
# Enables `am.colname` without hard-coding the struct's field names: a symbol is
# treated as a column only if it appears among the values of the second axis;
# otherwise the lookup falls back to `getfield`. NOTE: still type piracy, and
# the membership test runs on every property access, including `am.data`.
function Base.getproperty(am::AxisMatrix, symb::Symbol)
    name = string(symb)
    columns = getfield(am, :axes)[2].val
    name in columns || return getfield(am, symb)
    # A view (not plain indexing) so that `am.col[rows] = ...` writes through.
    return view(am, :, name)
end
@btime updateAA(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
# 16.700 μs (218 allocations: 7.36 KiB)
@btime updateAA3(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
# 19.900 μs (269 allocations: 8.76 KiB)
but this will slow down every getproperty call, also the normal ones (e.g. am.data).
The problem in the first approach seems to be that the conversion from a Symbol to a String takes too long.
getproperty, with metaprogramming
We can basically cache the string(:col1) etc. results. As long as you don’t have too many columns, and use them sufficiently often (to offset the longer compilation time), this should be fine. Though metaprogramming also comes with a bit of a warning. But in contrast to type piracy, there’s definitely a time and place for metaprogramming (e.g. AxisArrays relies on it).
# Entry point for `am.name`: lift the symbol into the type domain so that the
# `Val`-specialised `getproperty` methods are selected by dispatch.
# NOTE: type piracy on `Base.getproperty` for a type owned by AxisArrays.
Base.getproperty(am::AxisMatrix, symb::Symbol) = Base.getproperty(am, Val(symb))
# `axes` and `data` are served straight from the struct via `getfield`, so
# these two names are never interpreted as column names.
function Base.getproperty(am::AxisMatrix, ::Val{:axes})
    return getfield(am, :axes)
end

function Base.getproperty(am::AxisMatrix, ::Val{:data})
    return getfield(am, :data)
end
# Fallback for every other `Val{symb}`: treat `symb` as a column name.
# Because this is a generated function, `string(symb)` runs while the method
# body is being generated, and the resulting String is spliced into the body as
# a literal instead of being recomputed on every call.
@generated function Base.getproperty(am::AxisMatrix, ::Val{symb}) where {symb}
    # `view` rather than plain indexing, so that `am.col[rows] = ...` writes through.
    return :(view(am, :, $(string(symb))))
end
@btime updateAA3(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
1.340 μs (25 allocations: 2.16 KiB)
Using a macro
Since we basically just want to rewrite dat1.col2[4] to dat1[4, "col2"], we can use a macro for this. In this manner, we won’t have to commit any type piracy.
"""
    di!(ex)

Rewrite a "dot indexing" expression in place into ordinary bracket indexing
with the property name appended as a `String` index ("di" = dot indexing):

- `am.col1[4]` becomes `am[4, "col1"]`
- `am.col1` becomes `am[:, "col1"]`

Only the outermost expression is inspected; nested sub-expressions are not
visited. Anything that does not match one of the two shapes above is left
untouched. That includes input that is not an `Expr` at all (a bare symbol or
a literal) and plain indexing such as `a[1]`.

Returns `ex` (the same object, possibly mutated).
"""
function di!(ex)  # di from dot indexing
    # Symbols, literals, QuoteNodes, ... have no `head`/`args`: nothing to rewrite.
    ex isa Expr || return ex

    if ex.head === :ref
        indexed = ex.args[1]
        # Guard with `isa Expr`: for plain `a[1]` the indexed object is a Symbol,
        # and reading `.head` on it would throw.
        if (indexed isa Expr
            && indexed.head === :.
            && length(indexed.args) == 2
            && indexed.args[2] isa QuoteNode
            && indexed.args[2].value isa Symbol)
            # am.col1[4] -> am[4, "col1"]
            symbol_string = string(indexed.args[2].value)
            ex.args[1] = indexed.args[1]
            push!(ex.args, symbol_string)
        end
    elseif (ex.head === :.
            && length(ex.args) == 2
            && ex.args[1] isa Symbol
            && ex.args[2] isa QuoteNode
            && ex.args[2].value isa Symbol)
        # am.col1 -> am[:, "col1"]
        symbol_string = string(ex.args[2].value)
        ex.head = :ref
        ex.args[2] = Symbol(":")
        push!(ex.args, symbol_string)
    end
    return ex
end
"""
    @di expr

Apply `di!` to `expr` at macro-expansion time and splice the rewritten
expression into the caller's code.
"""
macro di(ex)
    # `di!` mutates `ex` in place; its return value is deliberately not used.
    di!(ex)
    # Escape the whole expression so its names resolve in the caller's scope.
    return esc(ex)
end
"""
    updateAA4(mat1, id1, colnames1, mat2, id2, colnames2)

Same update as the other `updateAA*` variants, but the `dat.colname` syntax is
rewritten into ordinary `AxisArray` indexing by the `@di` macro, so no
`getproperty` method has to be added.

Rows of `mat1` are matched to `id2` with `indexin(id2, id1)`. For those rows,
the columns named by entries 1 and 3 of the column axis get the same-named
column of `mat2` added, `col2` is incremented by one wherever `mat2`'s `col2`
is positive, and `col4` is overwritten with `mat2`'s `col4`.

Returns the `AxisArray` built around `mat1`.
"""
function updateAA4(mat1, id1, colnames1, mat2, id2, colnames2)
    dst = AxisArray(mat1, Axis{:row}(eachindex(id1)), Axis{:col}(colnames1))
    src = AxisArray(mat2, Axis{:row}(eachindex(id2)), Axis{:col}(colnames2))

    # Position in `id1` of every id listed in `id2`.
    rows = indexin(id2, id1)

    # Additive columns, addressed by name.
    for colname in dst.axes[2][[1, 3]]
        dst[rows, colname] += view(src, :, colname)
    end

    @di(dst.col2[rows]) += @di(src.col2) .> 0
    @di(dst.col4[rows]) = @di(src.col4)
    return dst
end
@btime updateAA4(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
1.080 μs (18 allocations: 1.92 KiB)
I restarted Julia before this @btime, i.e. the previous getproperty implementation no longer applies. You could probably write di! in a more elegant fashion (I’m not claiming to be an expert in any of this).
[ EDIT: no view is needed here, as we are just rewriting the code (no function returns involved). ]
Seeing as the point of a StaticArray is that the size is known to the compiler (i.e. before runtime), the short answer is no. If there are only a few possible sizes, it could make sense to use Val, though.
Because of the previous reason, I would again say no. If you don’t value the .col1 or [:, "col1"] indexing too much (like in updateSA; or if you are prepared to use something like a Dict to translate from column_name::String to column_index::Int), I’d just use a simple Matrix.
"""
    updateM(mat1, id1, colnames1, mat2, id2, colnames2)

Plain-`Matrix` version of the update: modifies `mat1` in place and returns it
(so `updateM!` would be the better name).

Rows are matched with `indexin(id2, id1)`. Columns of `mat2` are located by
name with `indexin(colnames1, colnames2)`, so `mat2` may order its columns
differently from `mat1`. For the matched rows:
- columns 1 and 3 of `mat1` get the corresponding `mat2` column added;
- column 2 is incremented by one wherever the corresponding `mat2` value is positive;
- column 4 is overwritten with the corresponding `mat2` column.
"""
function updateM(mat1, id1, colnames1, mat2, id2, colnames2) # (Note: better to write updateM!)
    rows = indexin(id2, id1)
    # colmap[j] is the column of `mat2` that holds `colnames1[j]`.
    colmap = indexin(colnames1, colnames2)

    for col in (1, 3)
        mat1[rows, col] += view(mat2, :, colmap[col])
    end

    mat1[rows, 2] += view(mat2, :, colmap[2]) .> 0
    mat1[rows, 4] = view(mat2, :, colmap[4])
    return mat1
end
@btime updateM(copy($mat1), $id1, $colnames1, $mat2, $id2, $colnames2);
# 985.714 ns (20 allocations: 2.28 KiB)
[ EDIT: added @views for better performance. ]