Mysterious allocations in nested scalar loops

gtgt · November 17, 2022, 3:33am

I am writing some code to solve Project Euler #60. Given an upper bound $N$, the program uses 5 nested for loops to find the smallest sum of a quintuple $(p_1, p_2, p_3, p_4, p_5)$ of prime numbers with $p_i \leqslant N$ such that concatenating any two of them in any order produces a prime. What surprises me is that there is only scalar code in the loops, yet benchmarking shows a bazillion allocations.

julia> @btime euler60(10_000)
  712.060 ms (2325284 allocations: 35.49 MiB)
26033

Full code

using Primes 

"""
    num_digits(n)

Count the decimal digits of `n` by repeatedly dividing it by 10 until nothing is left.
Returns `0` when `n` is not positive.
"""
function num_digits(n)
    count = 0
    remaining = n
    while remaining > 0
        remaining = remaining ÷ 10
        count += 1
    end
    return count
end

"""
    concatenate(a, b)

Return `a` shifted left by as many decimal places as `num_digits(b)` reports, plus `b`.
For positive `b` this is the number written as the digits of `a` followed by the
digits of `b`, e.g. `concatenate(12, 345) == 12345`.
"""
function concatenate(a, b)
    shift = 10^num_digits(b)
    return shift * a + b
end

"""
    is_admissible(values::Tuple, p)

Decide whether the prime `p` can extend the tuple `values` of already accepted primes.

Returns `true` when, for every `q` in `values`, both `concatenate(p, q)` and
`concatenate(q, p)` pass `isprime`; evaluation stops at the first `q` that fails.
An empty tuple is always admissible.
"""
function is_admissible(values::Tuple, p)
    return all(values) do q
        isprime(concatenate(p, q)) && isprime(concatenate(q, p))
    end
end

"""
    euler60(upper_bound)

Search the primes returned by `Primes.primes(3, upper_bound)` for five primes
`p1 < p2 < p3 < p4 < p5` such that every candidate passes `is_admissible` against the
primes chosen before it, and return the smallest sum `p1 + p2 + p3 + p4 + p5` found.

Returns `typemax(Int)` when no such quintuple exists within the bound.
"""
function euler60(upper_bound)
    prime_list = Primes.primes(3, upper_bound)
    n = length(prime_list)
    min_sum = typemax(Int)

    # `prime_list` is ascending, so at every depth the primes still to be chosen are
    # larger than the current one. Each `break` test below is therefore a lower bound on
    # any sum reachable from the current candidate; once it reaches `min_sum`, no later
    # candidate at that depth can improve the result either.
    @inbounds for i1 in 1:(n - 4)
        p1 = prime_list[i1]
        5p1 > min_sum && break
        for i2 in (i1 + 1):(n - 3)
            p2 = prime_list[i2]
            p1 + 4p2 >= min_sum && break
            # `(p1,)` with the trailing comma is a 1-tuple; a bare `(p1)` is just the
            # integer `p1` and would not match `is_admissible(::Tuple, p)`.
            is_admissible((p1,), p2) || continue

            for i3 in (i2 + 1):(n - 2)
                p3 = prime_list[i3]
                p1 + p2 + 3p3 >= min_sum && break
                is_admissible((p1, p2), p3) || continue

                for i4 in (i3 + 1):(n - 1)
                    p4 = prime_list[i4]
                    p1 + p2 + p3 + 2p4 >= min_sum && break
                    is_admissible((p1, p2, p3), p4) || continue

                    for i5 in (i4 + 1):n
                        p5 = prime_list[i5]
                        s = p1 + p2 + p3 + p4 + p5
                        s >= min_sum && break
                        is_admissible((p1, p2, p3, p4), p5) || continue

                        # println("primes: $p1 $p2 $p3 $p4 $p5, sum = $s") 
                        min_sum = s
                    end
                end
            end
        end
    end
    return min_sum
end

There don’t seem to be any type instabilities either.

@code_warntype output

julia> @code_warntype euler60(10_000)
MethodInstance for euler60(::Int64)
  from euler60(upper_bound) @ Main ~/project-euler/euler60.jl:72
Arguments
  #self#::Core.Const(euler60)
  upper_bound::Int64
Locals
  @_3::Union{Nothing, Tuple{Int64, Int64}}
  val::Nothing
  min_sum::Int64
  n::Int64
  prime_list::Vector{Int64}
  @_8::Union{Nothing, Tuple{Int64, Int64}}
  i1::Int64
  p1::Int64
  @_11::Union{Nothing, Tuple{Int64, Int64}}
  i2::Int64
  p2::Int64
  @_14::Union{Nothing, Tuple{Int64, Int64}}
  i3::Int64
  p3::Int64
  @_17::Union{Nothing, Tuple{Int64, Int64}}
  i4::Int64
  p4::Int64
  i5::Int64
  s::Int64
  p5::Int64
Body::Int64
1 ──        Core.NewvarNode(:(val))
│    %2   = Primes.primes::Core.Const(Primes.primes)
│           (prime_list = (%2)(3, upper_bound))
│           (n = Main.length(prime_list))
│           (min_sum = Main.typemax(Main.Int))
│           nothing
│    %7   = (n - 4)::Int64
│    %8   = (1:%7)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
│           (@_3 = Base.iterate(%8))
│    %10  = (@_3 === nothing)::Bool
│    %11  = Base.not_int(%10)::Bool
└───        goto #39 if not %11
2 ┄─        Core.NewvarNode(:(@_8))
│    %14  = @_3::Tuple{Int64, Int64}
│           (i1 = Core.getfield(%14, 1))
│    %16  = Core.getfield(%14, 2)::Int64
│           (p1 = Base.getindex(prime_list, i1))
│    %18  = (5 * p1)::Int64
│    %19  = (%18 > min_sum)::Bool
└───        goto #4 if not %19
3 ──        goto #39
4 ── %22  = (i1 + 1)::Int64
│    %23  = (n - 3)::Int64
│    %24  = (%22:%23)::UnitRange{Int64}
│           (@_8 = Base.iterate(%24))
│    %26  = (@_8 === nothing)::Bool
│    %27  = Base.not_int(%26)::Bool
└───        goto #37 if not %27
5 ┄─        Core.NewvarNode(:(@_11))
│    %30  = @_8::Tuple{Int64, Int64}
│           (i2 = Core.getfield(%30, 1))
│    %32  = Core.getfield(%30, 2)::Int64
│           (p2 = Base.getindex(prime_list, i2))
│    %34  = p1::Int64
│    %35  = (4 * p2)::Int64
│    %36  = (%34 + %35)::Int64
│    %37  = (%36 >= min_sum)::Bool
└───        goto #7 if not %37
6 ──        goto #37
7 ── %40  = Main.is_admissible(p1, p2)::Bool
└───        goto #9 if not %40
8 ──        goto #10
9 ──        goto #35
10 ─ %44  = (i2 + 1)::Int64
│    %45  = (n - 2)::Int64
│    %46  = (%44:%45)::UnitRange{Int64}
│           (@_11 = Base.iterate(%46))
│    %48  = (@_11 === nothing)::Bool
│    %49  = Base.not_int(%48)::Bool
└───        goto #35 if not %49
11 ┄        Core.NewvarNode(:(@_14))
│    %52  = @_11::Tuple{Int64, Int64}
│           (i3 = Core.getfield(%52, 1))
│    %54  = Core.getfield(%52, 2)::Int64
│           (p3 = Base.getindex(prime_list, i3))
│    %56  = p1::Int64
│    %57  = p2::Int64
│    %58  = (3 * p3)::Int64
│    %59  = (%56 + %57 + %58)::Int64
│    %60  = (%59 >= min_sum)::Bool
└───        goto #13 if not %60
12 ─        goto #35
13 ─ %63  = Core.tuple(p1, p2)::Tuple{Int64, Int64}
│    %64  = Main.is_admissible(%63, p3)::Bool
└───        goto #15 if not %64
14 ─        goto #16
15 ─        goto #33
16 ─ %68  = (i3 + 1)::Int64
│    %69  = (n - 1)::Int64
│    %70  = (%68:%69)::UnitRange{Int64}
│           (@_14 = Base.iterate(%70))
│    %72  = (@_14 === nothing)::Bool
│    %73  = Base.not_int(%72)::Bool
└───        goto #33 if not %73
17 ┄        Core.NewvarNode(:(@_17))
│    %76  = @_14::Tuple{Int64, Int64}
│           (i4 = Core.getfield(%76, 1))
│    %78  = Core.getfield(%76, 2)::Int64
│           (p4 = Base.getindex(prime_list, i4))
│    %80  = p1::Int64
│    %81  = p2::Int64
│    %82  = p3::Int64
│    %83  = (2 * p4)::Int64
│    %84  = (%80 + %81 + %82 + %83)::Int64
│    %85  = (%84 >= min_sum)::Bool
└───        goto #19 if not %85
18 ─        goto #33
19 ─ %88  = Core.tuple(p1, p2, p3)::Tuple{Int64, Int64, Int64}
│    %89  = Main.is_admissible(%88, p4)::Bool
└───        goto #21 if not %89
20 ─        goto #22
21 ─        goto #31
22 ─ %93  = (i4 + 1)::Int64
│    %94  = (%93:n)::UnitRange{Int64}
│           (@_17 = Base.iterate(%94))
│    %96  = (@_17 === nothing)::Bool
│    %97  = Base.not_int(%96)::Bool
└───        goto #31 if not %97
23 ┄ %99  = @_17::Tuple{Int64, Int64}
│           (i5 = Core.getfield(%99, 1))
│    %101 = Core.getfield(%99, 2)::Int64
│           (p5 = Base.getindex(prime_list, i5))
│           (s = p1 + p2 + p3 + p4 + p5)
│    %104 = (s >= min_sum)::Bool
└───        goto #25 if not %104
24 ─        goto #31
25 ─ %107 = Core.tuple(p1, p2, p3, p4)::NTuple{4, Int64}
│    %108 = Main.is_admissible(%107, p5)::Bool
└───        goto #27 if not %108
26 ─        goto #28
27 ─        goto #29
28 ─ %112 = Base.string("primes: ", p1, " ", p2, " ", p3, " ", p4, " ", p5, ", sum = ", s)::String
│           Main.println(%112)
└───        (min_sum = s)
29 ┄        (@_17 = Base.iterate(%94, %101))
│    %116 = (@_17 === nothing)::Bool
│    %117 = Base.not_int(%116)::Bool
└───        goto #31 if not %117
30 ─        goto #23
31 ┄        (@_14 = Base.iterate(%70, %78))
│    %121 = (@_14 === nothing)::Bool
│    %122 = Base.not_int(%121)::Bool
└───        goto #33 if not %122
32 ─        goto #17
33 ┄        (@_11 = Base.iterate(%46, %54))
│    %126 = (@_11 === nothing)::Bool
│    %127 = Base.not_int(%126)::Bool
└───        goto #35 if not %127
34 ─        goto #11
35 ┄        (@_8 = Base.iterate(%24, %32))
│    %131 = (@_8 === nothing)::Bool
│    %132 = Base.not_int(%131)::Bool
└───        goto #37 if not %132
36 ─        goto #5
37 ┄        (@_3 = Base.iterate(%8, %16))
│    %136 = (@_3 === nothing)::Bool
│    %137 = Base.not_int(%136)::Bool
└───        goto #39 if not %137
38 ─        goto #2
39 ┄        (val = nothing)
│           nothing
│           val
└───        return min_sum

System information

julia> versioninfo()
Julia Version 1.10.0-DEV.15
Commit 3e7d796c17 (2022-11-16 23:01 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: 16 × 12th Gen Intel(R) Core(TM) i5-12500H
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, alderlake)
  Threads: 16 on 16 virtual cores

DNF · November 17, 2022, 8:52am

When I run your code I get:

julia> euler60(10_000)
ERROR: MethodError: no method matching is_admissible(::Int64, ::Int64)

So you may have some stray method definition lying around.

The reason for the error is in this line:

is_admissible((p1), p2) || continue

which should be

is_admissible((p1,), p2) || continue  # note extra comma to get a tuple

After fixing this, I still get the same amount of allocations, though.

Another note: this seems to beg for a recursive solution. One loop depth per tuple length seems untenable. What will you do for an octuple, or a quinvigintuple?

sgaure · November 17, 2022, 10:33am

It’s probably the isprime function. It allocates for some large integers.

julia> @btime isprime(65521)
3.268 ns (0 allocations: 0 bytes)
true

julia> @btime isprime(65537)
212.412 ns (1 allocation: 16 bytes)
true

julia> @btime isprime(6553765601)
4.250 μs (0 allocations: 0 bytes)
true

julia> @btime isprime(1000723)
474.679 ns (1 allocation: 16 bytes)
true

gtgt · November 17, 2022, 12:23pm

I did some backtracking at first but don’t know how to efficiently prune the search tree (for example if 5p1 > min_sum I want to stop immediately instead of unwinding the recursion stack).

The main difficulty here is that you don’t know how large the largest number can get. It could hypothetically be the case that 3, 7, 109, 673, 19xxx produces a smaller sum. So to be absolutely sure you need to check with the upper bound 26,000, which is much faster with the nested for loops approach than naive recursion.

gtgt · November 17, 2022, 12:26pm

Thanks, that seems to be the culprit. I wonder if those allocations negatively affect performance too much.

stevengj · November 17, 2022, 12:59pm

Seems like this could be fixed with some refactoring, if someone wants to make a PR to Primes.jl:

github.com/JuliaMath/Primes.jl

isprime(n) allocates

opened 12:58PM - 17 Nov 22 UTC

closed 12:02PM - 03 Jul 23 UTC

stevengj

See [this discourse comment](https://discourse.julialang.org/t/mysterious-allocations-in-nested-scalar-loops/90380/3?u=stevengj). It seems like this could be fixed with a little refactoring. Instead of having [`witnesses(n)`](https://github.com/JuliaMath/Primes.jl/blob/fde566c12c3c1d5418d6d2e67087ed9cb2a11db0/src/Primes.jl#L237-L247) return a variably sized tuple (which is not type stable), it would be better to inline this into `isprime` and do something like:

```jl
return n < 4294967296 ? checkwitnesses(n, _witnesses(UInt64(n))) :
       n < 2152302898747 ? checkwitnesses(n, (2, 3, 5, 7, 11)) :
       n < 3474749660383 ? checkwitnesses(n, (2, 3, 5, 7, 11, 13)) :
       checkwitnesses(n, (2, 325, 9375, 28178, 450775, 9780504, 1795265022))
```

where `checkwitnesses(n, witnesses)` is simply [this loop](https://github.com/JuliaMath/Primes.jl/blob/fde566c12c3c1d5418d6d2e67087ed9cb2a11db0/src/Primes.jl#L178-L190) refactored into a function.

Topic		Replies	Views
Problem with huge number of memory allocations in for loops New to Julia question , performance , memory-allocation	11	1212	February 21, 2021
Unclear allocation behaviour with built-in sum() Performance question , memory-allocation	4	188	February 2, 2025
Too many allocations for this couple of small functions Performance	4	364	October 4, 2022
Unexpected allocations in looped vs broadcasted functions on tuples of arrays Performance	4	505	March 12, 2020
Why mem allocation may surge when function reads single scalar global variable? Performance benchmarktools , loops	4	142	May 2, 2025

Mysterious allocations in nested scalar loops

Related topics