Hello there!
So I was trying to implement a cache-friendly version of the famous Sieve of Erathostenes in julia, and came up with this:
function sieve_cache_optim(limit::UInt32; block_size = 2^15)
max_primes = isqrt(limit)
primes::Vector{UInt32} = [2]
sizehint!(primes, max_primes)
is_prime = repeat([true], limit)
is_prime[1] = false;
for block in Iterators.partition(1:limit, block_size)
min_blk = minimum(block)
max_blk = maximum(block)
max_prime = isqrt(maximum(block))
p = 2 # prime base
n = 1 # prime index
while (p <= max_prime)
# mâ‚€ is the first multiple of p within [min_blk, max_blk]
# and is also bigger than or equal to p²
# Therefore, mâ‚€ is such that
# m₀ >= p² and
# mâ‚€ >= min_blk
mâ‚€ = p * max(p, div(min_blk, p, RoundUp))
# Mark multiples of the prime p
for m in mâ‚€:p:max_blk
is_prime[m] = false
end
# Find next prime
n += 1
if n <= length(primes)
p = primes[n]
else
# If we don't already have the next prime,
# we've got to find and append it to the primes vector
while true
p += 1
if is_prime[p]
if p > last(primes)
push!(primes, p)
end
break
end
(p <= max_blk) || break
end
end
end
end
# Add final primes
p = last(primes) + 1
while p <= limit
if is_prime[p]
push!(primes, p)
end
p += 1
end
primes
end
Compared to a naive approach, I got a speed-up of about 2 times(from ~12s to ~6s), which is good, but nothing surprising. The @time
macro also reports the following:
@time sieve_cache_optim(convert(UInt32, 1_000_000_000), block_size = 2^18)
6.466829 seconds (17 allocations: 1.174 GiB, 0.29% gc time)
And, after running julia with --track-allocation=user
, I got the following result:
- function sieve_cache_optim(limit::UInt32; block_size = 2^18)
- # 2^18 = 262144 is usually the size(in bytes) of the L2 cache in modern CPUs
96 primes::Vector{UInt32} = [2]
0 sizehint!(primes, max_primes)
96 is_prime = repeat([true], limit)
0 is_prime[1] = false;
-
0 for block in Iterators.partition(1:limit, block_size)
0 min_blk = minimum(block)
0 max_blk = maximum(block)
0 max_prime = isqrt(maximum(block))
- p = 2 # prime base
- n = 1 # prime index
0 while (p <= max_prime)
- # mâ‚€ is the first multiple of p within [min_blk, max_blk]
- # and is also bigger than or equal to p²
- # Therefore, mâ‚€ is such that
- # m₀ >= p² and
- # mâ‚€ >= min_blk
0 mâ‚€ = p * max(p, div(min_blk, p, RoundUp))
- # Mark multiples of the prime p
0 for m in mâ‚€:p:max_blk
0 is_prime[m] = false
- end
- # Find next prime
0 n += 1
0 if n <= length(primes)
0 p = primes[n]
- else
- # If we don't already have the next prime,
- # we've got to find and append it to the primes vector
0 while true
0 p += 1
0 if is_prime[p]
0 if p > last(primes)
33184 push!(primes, p)
- end
- break
- end
0 (p <= max_blk) || break
- end
- end
- end
- end
-
- # Add final primes
0 p = last(primes) + 1
0 while p <= limit
0 if is_prime[p]
269451584 push!(primes, p)
- end
0 p += 1
- end
-
0 primes
- end
Which sounds pretty suspicious. Is there anything I can do to reduce these allocations when pushing values into the primes
vector?
Edit: as a side note, I’ve implemented this same blocking strategy for cache optimization in C, and got the following results on a Xeon E5-2695v2 (L1 size 2^15, L2 size 2^18), always with an array of 1 billion numbers: