I didn’t actually expect LoopVectorization
to make the code any faster since it isn’t very conducive to SIMD, as you said. It has a large stride between array elements. Here’s the code I used to test it, similar to what I posted before:
using BenchmarkTools
using LoopVectorization
# Auxillary functions
begin
const _uint_bit_length = sizeof(UInt) * 8
const _div_uint_size_shift = Int(log2(_uint_bit_length))
@inline _mul2(i::Integer) = i << 1
@inline _div2(i::Integer) = i >> 1
@inline _map_to_index(i::Integer) = _div2(i - 1)
@inline _map_to_factor(i::Integer) = _mul2(i) + 1
@inline _mod_uint_size(i::Integer) = i & (_uint_bit_length - 1)
@inline _div_uint_size(i::Integer) = i >> _div_uint_size_shift
@inline _get_chunk_index(i::Integer) = _div_uint_size(i + (_uint_bit_length - 1))
@inline _get_bit_index_mask(i::Integer) = UInt(1) << _mod_uint_size(i - 1)
end
# Main code
function clear_factors_while!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
factor = _map_to_factor(factor_index)
index = _div2(factor * factor)
while index <= max_index
@inbounds arr[_get_chunk_index(index)] |= _get_bit_index_mask(index)
index += factor
end
return arr
end
function clear_factors_simd!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
factor = _map_to_factor(factor_index)
@simd for index in _div2(factor * factor):factor:max_index
@inbounds arr[_get_chunk_index(index)] |= _get_bit_index_mask(index)
end
return arr
end
function clear_factors_turbo!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
factor = _map_to_factor(factor_index)
factor < _uint_bit_length && error("Factor must be greater than UInt bit length to avoid memory dependencies")
start_index = _div2(factor * factor)
iterations = cld(max_index - start_index, factor) - 1
@turbo for i in 0:iterations
index = start_index + (factor * i)
@inbounds arr[(index + 63) >> _div_uint_size_shift] |= 1 << ((index - 1) & 63)
end
return arr
end
println(
clear_factors_while!(zeros(UInt, cld(500_000, _uint_bit_length)), 202, 500_000) ==
clear_factors_simd!(zeros(UInt, cld(500_000, _uint_bit_length)), 202, 500_000) ==
clear_factors_turbo!(zeros(UInt, cld(500_000, _uint_bit_length)), 202, 500_000) ==
)
const x = zeros(UInt, cld(500_000, sizeof(UInt) * 8))
@benchmark clear_factors_while!(x, 202, 500_000)
@benchmark clear_factors_simd!(x, 202, 500_000)
@benchmark clear_factors_turbo!(x, 202, 500_000)