How to reduce memory allocations in the Sieve of Eratosthenes?

Hello there!

So I was trying to implement a cache-friendly version of the famous Sieve of Eratosthenes in Julia, and came up with this:

"""
    sieve_cache_optim(limit::UInt32; block_size = 2^15)

Blocked (cache-friendly) Sieve of Eratosthenes: return a `Vector{UInt32}`
of all primes in `[2, limit]`, sieving one `block_size`-wide window of the
`is_prime` table at a time so the working set stays cache-resident.
"""
function sieve_cache_optim(limit::UInt32; block_size = 2^15)
    # No primes below 2. Also prevents `is_prime[1]` on a 0-length array
    # and the spurious `[2]` result the unguarded version returns for limit == 1.
    limit < 2 && return UInt32[]

    max_primes = isqrt(limit)       # count hint: only primes ≤ √limit drive sieving
    primes::Vector{UInt32} = [2]
    sizehint!(primes, max_primes)
    is_prime = fill(true, limit)    # Vector{Bool}; index i represents the number i
    is_prime[1] = false

    for block in Iterators.partition(1:limit, block_size)
        # `block` is a range, so the endpoints are O(1) reads.
        min_blk = first(block)
        max_blk = last(block)
        max_prime = isqrt(max_blk)  # only primes ≤ √max_blk can mark inside this block
        p = 2 # prime base
        n = 1 # prime index
        while p <= max_prime
            # m₀ is the first multiple of p within [min_blk, max_blk]
            # and is also bigger than or equal to p²
            # (smaller multiples were already marked by smaller primes):
            # m₀ >= p² and m₀ >= min_blk
            m₀ = p * max(p, div(min_blk, p, RoundUp))
            # Mark multiples of the prime p inside the current block
            for m in m₀:p:max_blk
                is_prime[m] = false
            end
            # Find next prime
            n += 1
            if n <= length(primes)
                p = primes[n]
            else
                # If we don't already have the next prime, scan forward for it
                # and append it. Bounds-check BEFORE indexing: the original
                # tested `p <= max_blk` only after reading `is_prime[p]`, which
                # can touch index max_blk + 1 (i.e. limit + 1 on the last block).
                while true
                    p += 1
                    p <= max_blk || break
                    if is_prime[p]
                        if p > last(primes)
                            push!(primes, p)
                        end
                        break
                    end
                end
            end
        end
    end

    # Collect the remaining primes (those > √limit) from the sieve
    p = last(primes) + 1
    while p <= limit
        if is_prime[p]
            push!(primes, p)
        end
        p += 1
    end

    primes
end

Compared to a naive approach, I got a speed-up of about 2 times (from ~12s to ~6s), which is good, but nothing surprising. The @time macro also reports the following:

@time sieve_cache_optim(convert(UInt32, 1_000_000_000), block_size = 2^18)

6.466829 seconds (17 allocations: 1.174 GiB, 0.29% gc time)

And, after running julia with --track-allocation=user, I got the following result:

        - function sieve_cache_optim(limit::UInt32; block_size = 2^18)
        -     # 2^18 = 262144 is usually the size(in bytes) of the L2 cache in modern CPUs
       96     primes::Vector{UInt32} = [2]
        0     sizehint!(primes, max_primes)
       96     is_prime = repeat([true], limit)
        0     is_prime[1] = false;
        - 
        0     for block in Iterators.partition(1:limit, block_size)
        0         min_blk = minimum(block)
        0         max_blk = maximum(block)
        0         max_prime = isqrt(maximum(block))
        -         p = 2 # prime base
        -         n = 1 # prime index
        0         while (p <= max_prime)
        -             # m₀ is the first multiple of p within [min_blk, max_blk]
        -             # and is also bigger than or equal to p²
        -             # Therefore, m₀ is such that
        -             # m₀ >= p² and
        -             # m₀ >= min_blk
        0             m₀ = p * max(p, div(min_blk, p, RoundUp))
        -             # Mark multiples of the prime p
        0             for m in m₀:p:max_blk
        0                 is_prime[m] = false
        -             end
        -             # Find next prime
        0             n += 1
        0             if n <= length(primes)
        0                 p = primes[n]
        -             else
        -                 # If we don't already have the next prime,
        -                 # we've got to find and append it to the primes vector
        0                 while true
        0                     p += 1
        0                     if is_prime[p]
        0                         if p > last(primes)
    33184                             push!(primes, p)
        -                         end
        -                         break
        -                     end
        0                     (p <= max_blk) || break
        -                 end
        -             end
        -         end
        -     end
        - 
        -     # Add final primes
        0     p = last(primes) + 1
        0     while p <= limit
        0         if is_prime[p]
269451584             push!(primes, p)
        -         end
        0         p += 1
        -     end
        - 
        0     primes
        - end

Which sounds pretty suspicious. Is there anything I can do to reduce these allocations when pushing values into the primes vector?

Edit: as a side note, I’ve implemented this same blocking strategy for cache optimization in C, and got the following results on a Xeon E5-2695v2 (L1 size 2^15, L2 size 2^18), always with an array of 1 billion numbers:

push! is expensive because it changes the length of the array, which means it needs to expand into surrounding memory or, from time to time, move entirely. You can get around this by preallocating: use the prime number theorem to get an upper bound for how long your array needs to be, create an undef array that long and put your values in there instead.

GitHub - tpapp/PushVectors.jl: workaround for julia#24909 is an implementation specifically for this problem (but they don’t seem very confident)

6 Likes

(and yes, one could have expected sizehint! to take care of that, but it seems like that’s not guaranteed)

1 Like

push! grows the array by a fixed factor (2x) every time it grows so the cost of growing it is constant (see Amortized analysis - Wikipedia). So I don’t think preallocating will give a very big improvement, but always worth measuring.

1 Like

When I looked at this code earlier this morning that was my thought “why isn’t sizehint! taking care of this?” From its doc:

For types that support sizehint!,…

Do I take it that these vectors do not support sizehint!()?

Just note that this is perhaps not 269451584 allocations but one or several big allocations.

1 Like

So I’ve preallocated the vector with an upper bound for the primes, and removed the push!es in the code:

"""
    Π(n)

Upper bound on the prime-counting function π(n), used to preallocate the
output vector. For n > 568 this is Dusart's bound
π(n) ≤ (n/ln n)(1 + 1.2762/ln n); for small n, ⌈n/2⌉ suffices (every prime
except 2 is odd). Both branches now return `UInt32` — the original returned
`typeof(n)` from the small-n branch, which is type-unstable for non-UInt32
arguments (unchanged behavior for the `UInt32` caller).
"""
function Π(n)
    if n > 568
        ceil(UInt32, (n / log(n)) * (1 + 1.2762 / log(n)))
    else
        UInt32(div(n, 2, RoundUp))
    end
end

"""
    sieve_cache_optim(limit::UInt32; block_size = 2^15)

Blocked Sieve of Eratosthenes with a preallocated output vector: return a
`Vector{UInt32}` of all primes in `[2, limit]`. The output is sized up front
via the `Π(limit)` upper bound so no `push!` reallocation occurs, and is
shrunk in place at the end instead of being copied.
"""
function sieve_cache_optim(limit::UInt32; block_size = 2^15)
    # No primes below 2; also avoids a zero-length `primes` from Π(0)/Π(1)
    # followed by the unconditional `primes[1] = 2` write.
    limit < 2 && return UInt32[]

    max_primes = Π(limit)            # upper bound on π(limit)
    primes = Vector{UInt32}(undef, max_primes)
    primes[1] = 2
    primes_size = 1                  # number of valid entries in `primes`
    is_prime = fill(true, limit)     # index i represents the number i
    is_prime[1] = false

    for block in Iterators.partition(1:limit, block_size)
        min_blk = first(block)
        max_blk = last(block)
        max_prime = isqrt(max_blk)   # only primes ≤ √max_blk can mark here
        p = 2 # prime base
        n = 1 # prime index
        while p <= max_prime
            # m₀ is the first multiple of p within [min_blk, max_blk]
            # and is also bigger than or equal to p²
            # (smaller multiples were already marked by smaller primes):
            # m₀ >= p² and m₀ >= min_blk
            m₀ = p * max(p, div(min_blk, p, RoundUp))
            # Mark multiples of the prime p inside the current block
            for m in m₀:p:max_blk
                is_prime[m] = false
            end
            # Find next prime
            n += 1
            if n <= primes_size
                p = primes[n]
            else
                # We don't have the next prime stored yet: scan forward.
                # Bounds-check before indexing (the original read is_prime[p]
                # before testing p <= max_blk, risking an out-of-bounds read
                # at max_blk + 1).
                while true
                    p += 1
                    p <= max_blk || break
                    if is_prime[p]
                        # BUG FIX: compare against the last *stored* prime,
                        # primes[primes_size]. The original used `last(primes)`,
                        # which reads uninitialized memory past primes_size in
                        # the undef-preallocated vector — if that garbage value
                        # happens to be ≥ p, the prime is silently dropped.
                        if p > primes[primes_size]
                            primes_size += 1
                            primes[primes_size] = p
                        end
                        break
                    end
                end
            end
        end
    end

    # Append the remaining primes (those > √limit) straight from the sieve
    p = primes[primes_size] + 1
    while p <= limit
        if is_prime[p]
            primes_size += 1
            primes[primes_size] = p
        end
        p += 1
    end

    # Shrink in place: `primes[1:primes_size]` would copy the entire result —
    # the very ~200 MiB allocation this rewrite set out to avoid.
    resize!(primes, primes_size)
    return primes
end

Now, the code performs about the same as before, and the allocated memory is pretty much unchanged (even though the number of allocations was reduced to 7):

@time sieve_cache_optim(convert(UInt32, 1_000_000_000), block_size = 2^18)

6.311668 seconds (7 allocations: 1.312 GiB, 0.65% gc time)

Yeah, looks like @kristoffer.carlsson was right.

It should but my 1.6.3 Base shows

function push!(a::Array{T,1}, item) where T
    # convert first so we don't grow the array if the assignment won't work
    itemT = convert(T, item)
    _growend!(a, 1)
    a[end] = itemT
    return a
end

Do I misinterpret _growend? It looks like

_growend!(a::Vector, delta::Integer) =
    ccall(:jl_array_grow_end, Cvoid, (Any, UInt), a, delta)

So the magic is happening in jl_array_grow_end?

Nevermind, just found it, yes.