Massive memory allocation on iterating algorithm `8.789115 seconds (26.65 M allocations: 7.826 GiB, 25.07% gc time)`

Hey everyone, I’ve written my first major algorithm in Julia which uses a Genetic Algorithm approach to finding a solution to the Travelling Salesman Problem.

It works fine but next week I will be pitching it aginst other algorithms to see who gets the best solution. Most guys are using existing libraries for GAMS or Python. I’ll be the only representing Julia in the competition.

Because of this competition of course I want to increase the performance. I have followed the suggsted steps in the Performance Tips page and have tried to follow conventions on organizing code.

My code runs comparably lighting-fast when compared to other Optimization software and algorithms. However my Memory allocation is huge
8.789115 seconds (26.65 M allocations: 7.826 GiB, 25.07% gc time)

Can some of you guys with a lot more experience take a quick look and let me know what you think can be done?

This is the output for --track-allocation=user , hope you can take a quick look and let me know what would you do to fix this.

Thanks

        - using Random, CSV
        - const file = "21_Points.csv"
        - 
        - const pop_size = 20000
        - const elites = 200
        - const survivors = 9900
        - const mutts = 2000
        - 
        - const gens = 2
        - 
        - function generate_distance_matrix(file = "21_Points.csv")
        -     #Importar datos de Excel
        -     nodes_import = CSV.read(file, header = true)
        - 
        0     #Definir que hay en cada columna del excel y cuantos nodos hay en total
        -     num_nodes = size(nodes_import,1)
        -     idcol = 1
        0     Xcol = 2
        -     Ycol = 3
        - 
        -     #Generar la Matriz de distancias a partir del numero de nodos
        -     distance_matrix = Array{Float64}(undef, (num_nodes, num_nodes))
        -     #Llenar la matriz de distancias entre todos los puntos usando Pitagoras
        0     for n in 1:num_nodes
        -         for s in 1:num_nodes
        0             distance_matrix[n,s] = sqrt((nodes_import[n,Ycol] - nodes_import[s,Ycol])^2 +
        0             (nodes_import[n,Xcol] - nodes_import[s,Xcol])^2)
        0         end
        -     end
        -     #println(pretty_table(nodes_import))
        -     #println(pretty_table(distance_matrix))
        -     return distance_matrix
        - end
        0 
        - const distance_matrix = generate_distance_matrix("21_Points.csv")
        - const num_nodes = size(distance_matrix, 1)
        - 
        - function fitness_function(sequence)
        -     arcs::Array{Bool,2} = zeros(Bool, num_nodes, num_nodes)
        -     i = sequence[1]
323299200     for s in 2:length(sequence)
        0         j = sequence[s]
        0         arcs[i,j] = 1
        0         i = j
        0     end
        0     arcs[i,sequence[1]] = 1
        -     return sum(arcs.*distance_matrix)
        0 end
2206041600 
        - struct Individual
        -     generation::Int
        -     genes::Vector{Int}
        -     fitness_score::Float64
        - end
        - Individual(gen, genes) = Individual(gen, genes, fitness_function(genes))
        - 
        - function init_poulation(pop_size)
        -     population = Array{Individual}(undef,0)
        -     for i in 1:pop_size
       80         push!(population, Individual(1,randperm(num_nodes)))
        0     end
  6924928     return population
        - end
        0 
        - function new_generation!(population, mating_pool, gen)
        - 
        -     for p in 1:survivors
        -         parent1 = mating_pool[p]
        0         parent2 = mating_pool[rand(1:survivors)]
        0 
        0         children = Array{Individual,1}(undef, 2)
        -         cross_parents!(children, parent1.genes, parent2.genes, gen)
 18057600 
        0         population[elites+p] = children[1]
        -         population[elites+survivors+p] = children[2]
        0     end
        0 
        -     for m in 1:(mutts÷2)
        -         mutt_1 = population[rand(elites:pop_size)]
        0         mutt_2 = population[rand(elites:pop_size)]
        0         mutate_type1!(mutt_1)
        0         mutate_type2!(mutt_2)
        0     end
        0 end
        - 
        - function cross_parents!(children, parent1_genes, parent2_genes, gen)
        - 
        -     cross_point = rand(1:num_nodes-1)
        -     child1_genes = parent1_genes[1:cross_point]
        0     child2_genes = parent2_genes[1:cross_point]
 35441328 
 35441328     parent1_left = parent1_genes[cross_point+1:end]
        -     parent2_left = parent2_genes[cross_point+1:end]
 35342624 
 35342624     ch1_pending = Array{Tuple{Int64,Int64}}(undef,0)
        -     foreach(l -> push!(ch1_pending,
 15048000                 (l, findfirst(isequal(l), parent2_genes))),
193167360                 parent1_left)
        -     sort!(ch1_pending, by = e -> e[:][2])
        -     foreach(p -> push!(child1_genes, p[1]), ch1_pending)
 21243648 
 84618992     ch2_pending = Array{Tuple{Int64,Int64}}(undef,0)
        -     foreach(l -> push!(ch2_pending,
 15048000                 (l, findfirst(isequal(l), parent1_genes))),
193167360                 parent2_left)
        -     sort!(ch2_pending, by = e -> e[:][2])
        -     foreach(p -> push!(child2_genes, p[1]), ch2_pending)
 21243648 
 84618992     children[1] = Individual(gen,child1_genes)
        -     children[2] = Individual(gen,child2_genes)
  6019200 end
  6019200 
        - function mutate_type1!(mutt_1::Individual)
        -     mutt_gene = rand(1:num_nodes-1)
        -     i = findfirst(isequal(mutt_gene), mutt_1.genes)
        0     j = findfirst(isequal(mutt_gene+1), mutt_1.genes)
        0     mutt_1.genes[i] = mutt_gene+1
        0     mutt_1.genes[j] = mutt_gene
        0 end
        0 
        - function mutate_type2!(mutt_2::Individual)
        -     mutt_gene = rand(1:num_nodes-2)
        -     i = findfirst(isequal(mutt_gene), mutt_2.genes)
        0     j = findfirst(isequal(mutt_gene+2), mutt_2.genes)
        0     mutt_2.genes[i] = mutt_gene+2
        0     mutt_2.genes[j] = mutt_gene
        0 end
        0 
        - function GA()
        -     population = init_poulation(pop_size)
        -     mating_pool = Array{Individual}(undef, survivors)
     2848 
    79312     for g = 2:gens
        - 
        0         sort!(population, by = p -> p.fitness_score)
        -         mating_pool = population[1:survivors]
  1522736 
  1506928         new_generation!(population, mating_pool, g)
        - 
        0         if (population[1].fitness_score == population[elites].fitness_score) || ((g - population[1].generation) > 6)
        -             print("Convergencia en la GEN")
        0             println(gens)
        0             break
        0         end
        -         println(g)
        -     end
     3680 
        -     sort!(population, by = p -> p.fitness_score)
        -     return population[1]
    80144 end
        0 
        - @time winner = GA()
        - print(winner)
        - 

Have you tried profiling this? ProfileView is a really useful package for problems like this.

If it runs fast, it might not actually matter much. You have lots of small (I think?) calls to rand. Those are going to allocate. If you do that 1M times, it adds up. But because it is lots of small calls, it doesn’t really slow things down the same way that allocating a larger matrix over and over again would. It also seems unavoidable in this case.

I just updated the code in the post to show the output for --track-allocation=user. What do you think now? @tbeason @Oscar_Smith

Thanks for the help by the way!

if num_nodes is known and const ,then reuse the array arc::Array{Bool,2} in fitness_function will be helpful, because this function is called by Individual several times.
That is, use a global array and fitness_function reuses this array (this works as long as you don’t use multithreading)

3 Likes
        -         cross_parents!(children, parent1.genes, parent2.genes, gen)
 18057600 
        0         population[elites+p] = children[1]
        -         population[elites+survivors+p] = children[2]

just let cross_parents! return a Tuple{Individual,Individual} and unpacked into population, the two-element array children is not necessary here.
most memory allocation happens in cross_parents!, some guesses here: findfirst will return Union{Nothing,Int}, it seems that you don’t handle the nothing case ? Add a type assertion to the return value of findfirst, so compiler can infer the types well.

3 Likes

Some notes in addition to what others wrote:

  • children in new_generation! is recreated in every loop iteration - that’s unnecessary, just return a tuple and unpack that.
  • cross_parents! grows a lot of arrays dynamically via push! - assuming the final size will stay the same, preallocate the final child1_genes at the start and simply mutate the array. Allocating one large chunk of memory is faster than growing a small array to the same size.

e.g.

child1_genes = similar(parent1_genes)
child1_genes[1:cross_point] .= @view parent1_genes[1:cross_point]
child1_genes[cross_point+1:end] .= @view parent1_genes[cross_point+1:end]
partialsort!(child1_genes, cross_point+1:lastindex(child1_genes), by=x -> findfirst(==(x), parent2_genes))

# same for child2

return (child1, child2)

and similarly for child2_genes.

  • findfirst can return nothing, your code is written with the assumption that there will be a match in the other parents genes.
2 Likes

New @time = 4.434048 seconds (10.43 M allocations: 4.885 GiB, 24.69% gc time)

That is like a 40% reduction in both time and memory-allocation.

Thanks everyone (so far) I have implemented the changes suggested along with some other more:

  1. Declare arcs once as global
  2. Changing !push functions to mutation at index i in fixed sized array
  3. Add type assertion on findfirst()
  4. Removing 2-element children tuple

Here is the most recent code with --track-allocation=user in case anyone else wants to contribute something else.

        - using Random, CSV, DataFrames
        - const file = "21_Points.csv"
        - 
        - const pop_size = 20000
        - const elites = 200
        - const survivors = 9900
        - const mutts = 2000
        - const gens = 50
        - #=
        - const pop_size = 10
        - const elites = 2
        - const survivors = 4
        - const mutts = 2
        - const gens = 3
        - =#
        - function generate_distance_matrix(file = "21_Points.csv")
        -     #Importar datos de Excel
        0     nodes_import = CSV.read(file, header = true)
        - 
        -     #Definir que hay en cada columna del excel y cuantos nodos hay en total
        0     num_nodes = size(nodes_import,1)
        -     idcol = 1
        -     Xcol = 2
        -     Ycol = 3
        - 
        -     #Generar la Matriz de distancias a partir del numero de nodos
        0     distance_matrix = Array{Float64}(undef, (num_nodes, num_nodes))
        -     #Llenar la matriz de distancias entre todos los puntos usando Pitagoras
        0     for n in 1:num_nodes
        0         for s in 1:num_nodes
        0             distance_matrix[n,s] = sqrt((nodes_import[n,Ycol] - nodes_import[s,Ycol])^2 +
        -             (nodes_import[n,Xcol] - nodes_import[s,Xcol])^2)
        -         end
        -     end
        -     #println(pretty_table(nodes_import))
        -     #println(pretty_table(distance_matrix))
        0     return distance_matrix
        - end
        - 
        - const distance_matrix = generate_distance_matrix("21_Points.csv")
        - const num_nodes = size(distance_matrix, 1)
        - 
        - arcs = Array{Bool,2}(undef, (num_nodes, num_nodes))
        - 
        - function fitness_function(sequence)
        0     arcs::Array{Bool,2}
        0     fill!(arcs, 0)
        0     i = sequence[1]
253491200     for s in sequence[2:end]
        -         j = s
        0         arcs[i,j] = 1
        0         i = j
        -     end
        0     arcs[i,sequence[1]] = 1
4436096000     return sum(arcs.*distance_matrix)
        - end
        - 
        - struct Individual
        -     generation::Int
        -     genes::Vector{Int}
        -     fitness_score::Float64
        - end
 31686400 Individual(gen, genes) = Individual(gen, genes, fitness_function(genes))
        - 
        - function init_poulation(pop_size)
       80     population = Array{Individual}(undef,0)
        0     for i in 1:pop_size
  5964928         push!(population, Individual(1,randperm(num_nodes)))
        -     end
        0     return population
        - end
        - 
        - function new_generation!(population, mating_pool, gen)
        - 
        0     for p in 1:survivors
        0         parent1 = mating_pool[p]
        0         parent2 = mating_pool[rand(1:survivors)]
        - 
        0         for (c,I) in enumerate(cross_parents(parent1.genes, parent2.genes, gen))
        0             if c == 1
        0                 population[elites+p] = I
        0             elseif c == 2
        0                 population[elites+survivors+p] = I
        -             else
        0                 print("Error")
        -             end
        -         end
        -     end
        - 
        0     for m in 1:(mutts÷2)
        0         mutt_1 = population[rand(elites:pop_size)]
        0         mutt_2 = population[rand(elites:pop_size)]
        0         mutate_type1!(mutt_1)
        0         mutate_type2!(mutt_2)
        -     end
        - end
        - 
        - function cross_parents(parent1_genes, parent2_genes, gen)
        - 
        0     cross_point = rand(1:num_nodes-1)
 85414816     parent1_left = parent1_genes[cross_point+1:end]
 85414816     parent2_left = parent2_genes[cross_point+1:end]
        - 
131947200     child1_genes = Array{Int64,1}(undef, num_nodes)
131947200     child2_genes = Array{Int64,1}(undef, num_nodes)
        -     child1_genes::Array{Int64,1}
        -     child2_genes::Array{Int64,1}
 85340384     child1_genes[1:cross_point] = parent1_genes[1:cross_point]
 85340384     child2_genes[1:cross_point] = parent2_genes[1:cross_point]
        - 
130259680     ch2_pending = Array{Tuple{Int64,Int64}}(undef, size(parent2_left,1))
130259680     ch1_pending = Array{Tuple{Int64,Int64}}(undef, size(parent1_left,1))
        - 
        0     for (index, value) in enumerate(parent1_left)
        0         ch1_pending[index] = (value, findfirst(isequal(value), parent2_genes)::Int64)
        -     end
 43067712     sort!(ch1_pending, by = e -> e[:][2])
        0     for (index, value) in enumerate(ch1_pending)
        0         child1_genes[cross_point+index] = value[1]
        -     end
        - 
        0     for (i, v) in enumerate(parent2_left)
        0         ch2_pending[i] = (v, findfirst(isequal(v), parent1_genes)::Int64)
        -     end
 43067712     sort!(ch2_pending, by = e -> e[:][2])
        0     for (i,v) in enumerate(ch2_pending)
        0         child2_genes[cross_point+i] = v[1]
        -     end
        - 
 15523200     return (Individual(gen,child1_genes), Individual(gen,child2_genes))
        - end
        - 
        - function mutate_type1!(mutt_1::Individual)
        0     mutt_gene = rand(1:num_nodes-1)
        0     i = findfirst(isequal(mutt_gene), mutt_1.genes)
        0     j = findfirst(isequal(mutt_gene+1), mutt_1.genes)
        0     mutt_1.genes[i] = mutt_gene+1
        0     mutt_1.genes[j] = mutt_gene
        - end
        - 
        - function mutate_type2!(mutt_2::Individual)
        0     mutt_gene = rand(1:num_nodes-2)
        0     i = findfirst(isequal(mutt_gene), mutt_2.genes)
        0     j = findfirst(isequal(mutt_gene+2), mutt_2.genes)
        0     mutt_2.genes[i] = mutt_gene+2
        0     mutt_2.genes[j] = mutt_gene
        - end
        - 
        - function GA()
     2560     population = init_poulation(pop_size)
    79312     mating_pool = Array{Individual}(undef, survivors)
        - 
        0     for g = 2:gens
        - 
  3927056         sort!(population, by = p -> p.fitness_score)
  3886288         mating_pool = population[1:survivors]
        - 
        0         new_generation!(population, mating_pool, g)
        - 
        0         if (population[1].fitness_score == population[elites].fitness_score) || ((g - population[1].generation) > 6)
        0             print("Convergencia en la GEN")
        0             println(g)
        0             print("TODOS ELITES IGUALES")
        -             break
        -         end
        0         if ((g - population[1].generation) > gens/10)
        0             print("Convergencia en la GEN")
        0             println(g)
        0             print("SIN CAMBIOS en x GENS x = ")
        0             println(gens/10)
        -             break
        -         end
     9968         println(g)
        0     end
        0 
    80144     sort!(population, by = p -> p.fitness_score)
        0     return population[1]
        0 end
        - 
        0 @time winner = GA()
        0 print(winner)
        - 

Tips to optimize fitness_function:

  • a[i:j] creates a copy in Julia, use @view a[i:j] for non-allocating slices.
  • sum(x .* y) also allocates a temporary array. LinearAlgebra.dot(x, y) seems to work for matrices.
3 Likes

You may give length directly (pop_size) with array allocation and just assign elements instead of push!. Or use sizehint! on population. Also, be careful about return type of CSV.read, it may give Union type. I know the Base's readdlm returns Union. Therefore, these two variables may be doing something unexpected.

2 Likes

A better idea for fitness_function is to stop micro-optimizing it and look at what it computes. The key observation is that arcs doesn’t actually do anything useful. This is untested but as far as I can tell the whole function can be reduced to

function fitness_function(sequence)
    return distance_matrix[sequence[end], sequence[1]] +
        sum(distance_matrix[sequence[i - 1], sequence[i]] for i = 2:num_nodes)
end

which should be way faster and not allocate at all.

5 Likes

@Vasily_Pisarev proposal helped solve the memory-allocation issues just fine but your proposed approach definitely did the job about 2x faster!

1 Like

This won’t change performance, but I suggest that you get rid of the type assertions, such as child2_genes::Array{Int64,1}. They do absolutely nothing, except cause visual noise and confusion. You have already specified the types when you write child2_genes = Array{Int64,1}(undef, num_nodes), asserting it a second time has no useful effect.

In fact, you should almost never use assertions, except when you have strong reason to believe that the compiler cannot figure out a type, which is definitely not the case here.

1 Like

I am trying to get those Array declarations out of there to a “less iterated” function, again to reduce memory allocation. That was part of an attempt to declare them as global variables that so far hasn’t worked…

I’m not sure what that means, are you saying that you are in the process of moving that code around?

In any case, type assertions are almost never needed, and do cause visual noise. As currently written, I think they should be deleted.

1 Like

Even though you are declaring them as const, globals are a bit problematic. Even if it doesn’t impact performance, I would get rid of them as much as possible. For example, I think that fitness_function should take distance_matrix and sequence as inputs (and of course, ditch arcs as I think you already have.) It’s better programming practice, and makes your code more robust and easier to understand, especially if it grows.

I also suggest, in terms of being idiomatic, not including the types of variables in their names. So, for example, distances instead of distance_matrix, and fitness instead of fitness_function. The practice of annotating variables with type info is less useful in Julia than in other dynamic languages, and mostly avoided. (As a somewhat banal example, I wouldn’t rename cos into cos_function. It’s a function, yes, but no need to put “function” in the name.)

2 Likes

Some more tips on style:

population = Array{Individual}(undef,0)

is more commonly written as

population = Individual[]

Array{T,1} has a nice alias Vector{T}.


sort!(ch1_pending, by = e -> e[:][2]) can be written as sort!(ch1_pending, by = last) (as e is a 2-tuple).


for (index, value) in enumerate(ch1_pending)
    child1_genes[cross_point+index] = value[1]
end

can be re-written as

for (index, (value, _)) in enumerate(ch1_pending)
    child1_genes[cross_point+index] = value
end

sort!(population, by = p -> p.fitness_score)
mating_pool = population[1:survivors]

Looks like only survivors elements are needed, so it’s a good place to use partialsort! and in-place assignment mating_pool .= @view population[1:survivors].


ch2_pending = Array{Tuple{Int64,Int64}}(undef, size(parent2_left,1))

can be written as

ch2_pending = similar(parent2_left, Tuple{Int, Int})
4 Likes

I haven’t tested it, but they way cross_parents works still has a large amount of unnecessary allocations and loops, like the various pending arrays. Is this code still up to date? If so, the code I posted earlier should achieve the exact same result with minimal allocations (just the new genes for the child, no intermediaries needed).

For example, the pending arrays aren’t necessary because you can sort the relevant parts in-place without affecting the rest of the genes.

This should be pretty optimal:

function cross_parents(parent1_genes, parent2_genes, gen)
    cross_point = rand(1:num_nodes-1)

    child1_genes = similar(parent1_genes) 
    child1_genes[1:cross_point] .= @view parent1_genes[1:cross_point]
    child1_genes[cross_point+1:end] .= @view parent1_genes[cross_point+1:end]
    partialsort!(child1_genes, cross_point+1:lastindex(child1_genes), by=x -> findfirst(==(x), parent2_genes))

    child2_genes = similar(parent2_genes)
    child2_genes[1:cross_point] .= @view parent2_genes[1:cross_point]
    child2_genes[cross_point+1:end] .= @view parent2_genes[cross_point+1:end]
    partialsort!(child2_genes, cross_point+1:lastindex(child2_genes), by=x -> findfirst(==(x), parent1_genes))

    # possibly gen+1 here instead
    return (Individual(gen, child1_genes), Individual(gen, child2_genes))
end
2 Likes

Add type assertion to findfirst might be helpful…But I am not sure.

1 Like

How to handle findfirst returning nothing is really up to the original poster to handle - a type assertion would only make the failure case throw a different error (at conversion to the asserted type instead of during a comparison operation during sorting).

1 Like