[YouTube/GitHub] What is the FASTEST Computer Language? 45 Languages Tested

I didn’t actually expect LoopVectorization to make the code any faster since it isn’t very conducive to SIMD, as you said. It has a large stride between array elements. Here’s the code I used to test it, similar to what I posted before:

using BenchmarkTools
using LoopVectorization

# Auxillary functions
begin
const _uint_bit_length = sizeof(UInt) * 8
const _div_uint_size_shift = Int(log2(_uint_bit_length))
@inline _mul2(i::Integer) = i << 1
@inline _div2(i::Integer) = i >> 1
@inline _map_to_index(i::Integer) = _div2(i - 1)
@inline _map_to_factor(i::Integer) = _mul2(i) + 1
@inline _mod_uint_size(i::Integer) = i & (_uint_bit_length - 1)
@inline _div_uint_size(i::Integer) = i >> _div_uint_size_shift
@inline _get_chunk_index(i::Integer) = _div_uint_size(i + (_uint_bit_length - 1))
@inline _get_bit_index_mask(i::Integer) = UInt(1) << _mod_uint_size(i - 1)
end

# Main code
function clear_factors_while!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
    factor = _map_to_factor(factor_index)
    index = _div2(factor * factor)
    while index <= max_index
        @inbounds arr[_get_chunk_index(index)] |= _get_bit_index_mask(index)
        index += factor
    end
    return arr
end

function clear_factors_simd!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
    factor = _map_to_factor(factor_index)
    @simd for index in _div2(factor * factor):factor:max_index
        @inbounds arr[_get_chunk_index(index)] |= _get_bit_index_mask(index)
    end
    return arr
end

function clear_factors_turbo!(arr::Vector{UInt}, factor_index::Integer, max_index::Integer)
    factor = _map_to_factor(factor_index)
    factor < _uint_bit_length && error("Factor must be greater than UInt bit length to avoid memory dependencies")
    start_index = _div2(factor * factor)
    iterations = cld(max_index - start_index, factor) - 1
    @turbo for i in 0:iterations
        index = start_index + (factor * i)
        @inbounds arr[(index + 63) >> _div_uint_size_shift] |= 1 << ((index - 1) & 63)
    end
    return arr
end

println(
    clear_factors_while!(zeros(UInt, cld(500_000, _uint_bit_length)), 202, 500_000) ==
    clear_factors_simd!(zeros(UInt, cld(500_000, _uint_bit_length)), 202, 500_000) ==
    clear_factors_turbo!(zeros(UInt, cld(500_000, _uint_bit_length)), 202, 500_000) ==
)

const x = zeros(UInt, cld(500_000, sizeof(UInt) * 8))
@benchmark clear_factors_while!(x, 202, 500_000)
@benchmark clear_factors_simd!(x, 202, 500_000)
@benchmark clear_factors_turbo!(x, 202, 500_000)