How to reduce memory allocations in the Sieve of Eratosthenes?

vini-fda · October 31, 2021, 6:20am

Hello there!

So I was trying to implement a cache-friendly version of the famous Sieve of Erathostenes in julia, and came up with this:

function sieve_cache_optim(limit::UInt32; block_size = 2^15)
    max_primes = isqrt(limit)
    primes::Vector{UInt32} = [2]
    sizehint!(primes, max_primes)
    is_prime = repeat([true], limit)
    is_prime[1] = false;

    for block in Iterators.partition(1:limit, block_size)
        min_blk = minimum(block)
        max_blk = maximum(block)
        max_prime = isqrt(maximum(block))
        p = 2 # prime base
        n = 1 # prime index
        while (p <= max_prime)
            # m₀ is the first multiple of p within [min_blk, max_blk]
            # and is also bigger than or equal to p²
            # Therefore, m₀ is such that
            # m₀ >= p² and
            # m₀ >= min_blk
            m₀ = p * max(p, div(min_blk, p, RoundUp))
            # Mark multiples of the prime p
            for m in m₀:p:max_blk
                is_prime[m] = false
            end
            # Find next prime
            n += 1
            if n <= length(primes)
                p = primes[n]
            else
                # If we don't already have the next prime,
                # we've got to find and append it to the primes vector
                while true
                    p += 1
                    if is_prime[p]
                        if p > last(primes)
                            push!(primes, p)
                        end
                        break
                    end
                    (p <= max_blk) || break
                end
            end
        end
    end

    # Add final primes
    p = last(primes) + 1
    while p <= limit
        if is_prime[p]
            push!(primes, p)
        end
        p += 1
    end

    primes
end

Compared to a naive approach, I got a speed-up of about 2 times(from ~12s to ~6s), which is good, but nothing surprising. The @time macro also reports the following:

@time sieve_cache_optim(convert(UInt32, 1_000_000_000), block_size = 2^18)

6.466829 seconds (17 allocations: 1.174 GiB, 0.29% gc time)

And, after running julia with --track-allocation=user, I got the following result:

        - function sieve_cache_optim(limit::UInt32; block_size = 2^18)
        -     # 2^18 = 262144 is usually the size(in bytes) of the L2 cache in modern CPUs
       96     primes::Vector{UInt32} = [2]
        0     sizehint!(primes, max_primes)
       96     is_prime = repeat([true], limit)
        0     is_prime[1] = false;
        - 
        0     for block in Iterators.partition(1:limit, block_size)
        0         min_blk = minimum(block)
        0         max_blk = maximum(block)
        0         max_prime = isqrt(maximum(block))
        -         p = 2 # prime base
        -         n = 1 # prime index
        0         while (p <= max_prime)
        -             # m₀ is the first multiple of p within [min_blk, max_blk]
        -             # and is also bigger than or equal to p²
        -             # Therefore, m₀ is such that
        -             # m₀ >= p² and
        -             # m₀ >= min_blk
        0             m₀ = p * max(p, div(min_blk, p, RoundUp))
        -             # Mark multiples of the prime p
        0             for m in m₀:p:max_blk
        0                 is_prime[m] = false
        -             end
        -             # Find next prime
        0             n += 1
        0             if n <= length(primes)
        0                 p = primes[n]
        -             else
        -                 # If we don't already have the next prime,
        -                 # we've got to find and append it to the primes vector
        0                 while true
        0                     p += 1
        0                     if is_prime[p]
        0                         if p > last(primes)
    33184                             push!(primes, p)
        -                         end
        -                         break
        -                     end
        0                     (p <= max_blk) || break
        -                 end
        -             end
        -         end
        -     end
        - 
        -     # Add final primes
        0     p = last(primes) + 1
        0     while p <= limit
        0         if is_prime[p]
269451584             push!(primes, p)
        -         end
        0         p += 1
        -     end
        - 
        0     primes
        - end

Which sounds pretty suspicious. Is there anything I can do to reduce these allocations when pushing values into the primes vector?

Edit: as a side note, I’ve implemented this same blocking strategy for cache optimization in C, and got the following results on a Xeon E5-2695v2 (L1 size 2^15, L2 size 2^18), always with an array of 1 billion numbers:

gustaphe · October 31, 2021, 6:58am

push! is expensive because it changes the length of the array, which means it needs to expand into surrounding memory or, from time to time, move entirely. You can get around this by preallocating: use the prime number theorem to get an upper bound for how long your array needs to be, create an undef array that long and put your values in there instead.

https://github.com/tpapp/PushVectors.jl is an implementation specifically for this problem (but they don’t seem very confident)

gustaphe · October 31, 2021, 9:29am

(and yes, one could have expected sizehint! to take care of that, but it seems like that’s not guaranteed)

kristoffer.carlsson · October 31, 2021, 10:15am

push! grows the array by a fixed factor (2x) every time it grows so the cost of growing it is constant (see Amortized analysis - Wikipedia). So I don’t think preallocating will give a very big improvement, but always worth measuring.

healyp · October 31, 2021, 1:48pm

When I looked at this code earlier this morning that was my thought “why isn’t sizehint! taking care of this?” From its doc:

For types that support sizehint!,…

Do I take it that booleans ~~are not supported~~ do not support sizehint!()?

mschauer · October 31, 2021, 4:42pm

Just note that this is perhaps not 269451584 allocations but one or several big allocations.

vini-fda · November 1, 2021, 5:27pm

So I’ve preallocated the vector with an upper bound for the primes, and removed the push!es in the code:

# Upper bound on primes from [1,n]
function Π(n)
    if n > 568
        ceil(UInt32, (n / log(n))*(1 + 1.2762/log(n)))
    else
        div(n, 2, RoundUp)
    end
end

function sieve_cache_optim(limit::UInt32; block_size = 2^15)
    max_primes = Π(limit)
    primes = Vector{UInt32}(undef, max_primes)
    primes[1] = 2
    primes_size = 1
    is_prime = repeat([true], limit)
    is_prime[1] = false;

    for block in Iterators.partition(1:limit, block_size)
        min_blk = minimum(block)
        max_blk = maximum(block)
        max_prime = isqrt(maximum(block))
        p = 2 # prime base
        n = 1 # prime index
        while (p <= max_prime)
            # m₀ is the first multiple of p within [min_blk, max_blk]
            # and is also bigger than or equal to p²
            # Therefore, m₀ is such that
            # m₀ >= p² and
            # m₀ >= min_blk
            m₀ = p * max(p, div(min_blk, p, RoundUp))
            # Mark multiples of the prime p
            for m in m₀:p:max_blk
                is_prime[m] = false
            end
            # Find next prime
            n += 1
            if n <= primes_size
                p = primes[n]
            else
                # If we don't already have the next prime,
                # we've got to find and append it to the primes vector
                while true
                    p += 1
                    if is_prime[p]
                        if p > last(primes)
                            primes[n] = p
                            primes_size += 1
                        end
                        break
                    end
                    (p <= max_blk) || break
                end
            end
        end
    end

    # Add final primes
    p = primes[primes_size] + 1
    n = primes_size + 1
    while p <= limit
        if is_prime[p]
            primes[n] = p
            primes_size += 1
            n += 1
        end
        p += 1
    end

    primes[1:primes_size]
end

Now, the code performs about the same as before, and the allocated memory is pretty much unchanged(even though the amount of allocations was reduced to 7):

@time sieve_cache_optim(convert(UInt32, 1_000_000_000), block_size = 2^18)

6.311668 seconds (7 allocations: 1.312 GiB, 0.65% gc time)

mschauer · November 2, 2021, 9:24am

Yeah, looks like @kristoffer.carlsson was right.

goerch · November 5, 2021, 2:57pm

It should but my 1.6.3 Base shows

function push!(a::Array{T,1}, item) where T
    # convert first so we don't grow the array if the assignment won't work
    itemT = convert(T, item)
    _growend!(a, 1)
    a[end] = itemT
    return a
end

Do I misinterpret _growend? It looks like

_growend!(a::Vector, delta::Integer) =
    ccall(:jl_array_grow_end, Cvoid, (Any, UInt), a, delta)

So the magic is happening in jl_array_grow_end?

Nevermind, just found it, yes.

Topic		Replies	Views
Sieve of Eratosthenes iterator New to Julia python , generator , iterators , julia	1	149	October 20, 2024
Help to reduce number of allocations General Usage	23	5487	January 6, 2019
Help to reduce memory allocations in a function Performance	2	230	July 10, 2023
Mysterious allocations in nested scalar loops Performance memory-allocation	5	460	November 17, 2022
Track memory allocation not working correctly General Usage performance	8	1928	July 26, 2021

How to reduce memory allocations in the Sieve of Eratosthenes?

Related topics