Using a lookup table is often the way to go for localized bit operations like this. It’s dumb and not very satisfying, but fast.
# 256-entry byte lookup table. Entry `b + 1` (one-based) is the result for byte value `b`:
# each 4-bit nibble of `b` is replaced by `(1 << count_ones(nibble)) - 1`, i.e. a run of
# as many low set bits as the nibble had set bits. The high nibble's run is shifted back
# into the high half of the byte. Generated rather than spelled out; the values are the
# same 256 `UInt8` constants.
const bytes = @SVector [
    UInt8((((1 << count_ones(b >> 4)) - 1) << 4) | ((1 << count_ones(b & 0x0f)) - 1))
    for b in 0:255
]
"""
    packnibble6(x::UInt64) -> UInt64

Transform `x` one byte at a time through the `bytes` lookup table and reassemble the
results in their original byte positions.

Per the table, every 4-bit nibble of `x` is replaced by a run of `count_ones(nibble)` low
set bits, e.g. the nibble `0b0101` becomes `0b0011`.
"""
function packnibble6(x::UInt64)
    # Each table entry is a `UInt8`. It must be widened to `UInt64` *before* the shift:
    # `UInt8 << 8` (or more) is `0x00`, which would silently drop every lane except the
    # lowest and make the whole expression a `UInt8`.
    # The index is always in 1:256 (one-based, byte value + 1), so `@inbounds` is safe.
    return @inbounds (
        UInt64(bytes[1 + (x & 0xff)]) |
        UInt64(bytes[1 + ((x >> 8) & 0xff)]) << 8 |
        UInt64(bytes[1 + ((x >> 16) & 0xff)]) << 16 |
        UInt64(bytes[1 + ((x >> 24) & 0xff)]) << 24 |
        UInt64(bytes[1 + ((x >> 32) & 0xff)]) << 32 |
        UInt64(bytes[1 + ((x >> 40) & 0xff)]) << 40 |
        UInt64(bytes[1 + ((x >> 48) & 0xff)]) << 48 |
        UInt64(bytes[1 + ((x >> 56) & 0xff)]) << 56
    )
end
# Time both implementations. `$(...)` interpolates the argument so the benchmark measures
# only the call itself; note that each line draws its own random `UInt64`.
# `packnibble5` is not defined in this snippet — presumably the previous version from
# earlier in the discussion (verify).
@btime packnibble5($(rand(UInt64)))
@btime packnibble6($(rand(UInt64)))
This gives me:
2.073 ns (0 allocations: 0 bytes)
1.300 ns (0 allocations: 0 bytes)
Edit: Fixed #$%@ one-based indexing.