Hello there!
So I was trying to implement a cachefriendly version of the famous Sieve of Erathostenes in julia, and came up with this:
function sieve_cache_optim(limit::UInt32; block_size = 2^15)
max_primes = isqrt(limit)
primes::Vector{UInt32} = [2]
sizehint!(primes, max_primes)
is_prime = repeat([true], limit)
is_prime[1] = false;
for block in Iterators.partition(1:limit, block_size)
min_blk = minimum(block)
max_blk = maximum(block)
max_prime = isqrt(maximum(block))
p = 2 # prime base
n = 1 # prime index
while (p <= max_prime)
# mâ‚€ is the first multiple of p within [min_blk, max_blk]
# and is also bigger than or equal to pÂ˛
# Therefore, mâ‚€ is such that
# mâ‚€ >= pÂ˛ and
# mâ‚€ >= min_blk
mâ‚€ = p * max(p, div(min_blk, p, RoundUp))
# Mark multiples of the prime p
for m in mâ‚€:p:max_blk
is_prime[m] = false
end
# Find next prime
n += 1
if n <= length(primes)
p = primes[n]
else
# If we don't already have the next prime,
# we've got to find and append it to the primes vector
while true
p += 1
if is_prime[p]
if p > last(primes)
push!(primes, p)
end
break
end
(p <= max_blk)  break
end
end
end
end
# Add final primes
p = last(primes) + 1
while p <= limit
if is_prime[p]
push!(primes, p)
end
p += 1
end
primes
end
Compared to a naive approach, I got a speedup of about 2 times(from ~12s to ~6s), which is good, but nothing surprising. The @time
macro also reports the following:
@time sieve_cache_optim(convert(UInt32, 1_000_000_000), block_size = 2^18)
6.466829 seconds (17 allocations: 1.174 GiB, 0.29% gc time)
And, after running julia with trackallocation=user
, I got the following result:
 function sieve_cache_optim(limit::UInt32; block_size = 2^18)
 # 2^18 = 262144 is usually the size(in bytes) of the L2 cache in modern CPUs
96 primes::Vector{UInt32} = [2]
0 sizehint!(primes, max_primes)
96 is_prime = repeat([true], limit)
0 is_prime[1] = false;

0 for block in Iterators.partition(1:limit, block_size)
0 min_blk = minimum(block)
0 max_blk = maximum(block)
0 max_prime = isqrt(maximum(block))
 p = 2 # prime base
 n = 1 # prime index
0 while (p <= max_prime)
 # mâ‚€ is the first multiple of p within [min_blk, max_blk]
 # and is also bigger than or equal to pÂ˛
 # Therefore, mâ‚€ is such that
 # mâ‚€ >= pÂ˛ and
 # mâ‚€ >= min_blk
0 mâ‚€ = p * max(p, div(min_blk, p, RoundUp))
 # Mark multiples of the prime p
0 for m in mâ‚€:p:max_blk
0 is_prime[m] = false
 end
 # Find next prime
0 n += 1
0 if n <= length(primes)
0 p = primes[n]
 else
 # If we don't already have the next prime,
 # we've got to find and append it to the primes vector
0 while true
0 p += 1
0 if is_prime[p]
0 if p > last(primes)
33184 push!(primes, p)
 end
 break
 end
0 (p <= max_blk)  break
 end
 end
 end
 end

 # Add final primes
0 p = last(primes) + 1
0 while p <= limit
0 if is_prime[p]
269451584 push!(primes, p)
 end
0 p += 1
 end

0 primes
 end
Which sounds pretty suspicious. Is there anything I can do to reduce these allocations when pushing values into the primes
vector?
Edit: as a side note, Iâ€™ve implemented this same blocking strategy for cache optimization in C, and got the following results on a Xeon E52695v2 (L1 size 2^15, L2 size 2^18), always with an array of 1 billion numbers: