My understanding is that in Julia macros are meant to be used for expressiveness and to make DSLs, not for performance hacks. But in this bit-fiddling SIMD loop, a macro is much faster than the corresponding inline function. Could you help me understand why the inline function is so much slower and how to write this in idiomatic julia?
Problem statement: For a bit-matrix where the bits are packed in UInt64, I want to exchange the positions of two bits in each row. I calculate the index in the UInt64 matrix, then calculate the appropriate bitmask, then extract the bits (bit=matrix[row,column]&mask
), reset them (matrix[row,column]&=~mask
), and set them to the desired new bit (matrix[row,column]|=bit
). I will be doing this in many different pieces of code, so I abstract away these bit operations in functions or macros called getbit
and setbit
. The macros are much faster.
Here is the code and benchmarks:
@inline getmask(col::Int) = UInt64(0x1) << ((col-1) & 63)
@inline getbigindex(col::Int) = (col-1)>>6+1
macro getxbit(s, r, cbig, mask)
:( $(esc(s))[$(esc(r)),$(esc(cbig))] & $(esc(mask)) )
end
macro getzbit(matrix, r, cbig, mask)
:( $(esc(matrix))[$(esc(r)),endĂ·2+$(esc(cbig))] & $(esc(mask)) )
end
macro setxbit(matrix, r, cbig, mask, x)
quote
$(esc(matrix))[$(esc(r)),$(esc(cbig))] &= ~$(esc(mask))
$(esc(matrix))[$(esc(r)),$(esc(cbig))] |= $(esc(x))
end
end
macro setzbit(matrix, r, cbig, mask, z)
quote
$(esc(matrix))[$(esc(r)),endĂ·2+$(esc(cbig))] &= ~$(esc(mask))
$(esc(matrix))[$(esc(r)),endĂ·2+$(esc(cbig))] |= $(esc(z))
end
end
@inline getxbit(matrix, r, cbig, mask) = @getxbit(matrix, r, cbig, mask)
@inline getzbit(matrix, r, cbig, mask) = @getzbit(matrix, r, cbig, mask)
@inline setxbit(matrix, r, cbig, mask, x) = @setxbit(matrix, r, cbig, mask, x)
@inline setzbit(matrix, r, cbig, mask, z) = @setzbit(matrix, r, cbig, mask, z)
function switch_bits_withmacro(matrix::AbstractMatrix{UInt64}, column)
mask = getmask(column)
cbig = getbigindex(column)
@inbounds @simd for r in 1:size(matrix, 1)
x = @getxbit(matrix, r, cbig, mask)
z = @getzbit(matrix, r, cbig, mask)
@setxbit(matrix, r, cbig, mask, z)
@setzbit(matrix, r, cbig, mask, x)
end
matrix
end
function switch_bits_withinline(matrix::AbstractMatrix{UInt64}, column)
mask = getmask(column)
cbig = getbigindex(column)
@inbounds @simd for r in 1:size(matrix, 1)
x = getxbit(matrix, r, cbig, mask)
z = getzbit(matrix, r, cbig, mask)
setxbit(matrix, r, cbig, mask, z)
setzbit(matrix, r, cbig, mask, x)
end
matrix
end
And here are the benchmarks:
julia> m = rand(UInt64, 50, 50);
julia> @btime switch_bits_withinline($m, 6);
74.877 ns (0 allocations: 0 bytes)
julia> @btime switch_bits_withmacro($m, 6);
28.273 ns (0 allocations: 0 bytes)
julia> switch_bits_withinline(copy(m),6) == switch_bits_withmacro(copy(m),6)
true
The inline functions are much slower than the macros… Am I doing something stupid?