I’ve followed the performance tips to implement a function guard for setting up objects to be accessed in the inner loop of an algorithm, as follows:
function fast_marching!(grid::Matrix{T}, h::Real, maxdist::Real=1) where {T <: AbstractFloat}
states = zeros(UInt8, size(grid))::Matrix{UInt8}
fast_marching!(states, grid, h, maxdist)
end
where the last line of the function calls the “real” algorithm that reads and writes to the preallocated states array along with the other inputs. The inner function seems to run fine, with just a few bytes of memory usage (6 allocs):
using BenchmarkTools
grid = ones(200,200)
grid[100,100] = 0
states = zeros(UInt8, size(grid))
@benchmark fast_marching!($states, $grid, 0.025, 4)
BenchmarkTools.Trial:
memory estimate: 848 bytes
allocs estimate: 6
--------------
minimum time: 433.333 μs (0.00% GC)
median time: 467.555 μs (0.00% GC)
mean time: 489.823 μs (0.00% GC)
maximum time: 2.914 ms (0.00% GC)
--------------
samples: 10000
evals/sample: 1
But the outer function seems to run a totally different version of the algorithm:
@benchmark fast_marching!($grid, 0.025, 4)
BenchmarkTools.Trial:
memory estimate: 49.10 MiB
allocs estimate: 1788630
--------------
minimum time: 114.365 ms (2.80% GC)
median time: 118.648 ms (2.74% GC)
mean time: 118.945 ms (3.07% GC)
maximum time: 127.768 ms (2.38% GC)
--------------
samples: 43
evals/sample: 1
The only difference between these functions is that the fist one pre-allocates an array before calling the second. The number of allocations suggests that this is happening in the inner loop. Are they compiling differently for some reason? I thought the function guard is supposed to mitigate exactly that problem by compiling the “core” part of the algorithm separately.
(As an aside, should I be using a function guard here? If not, my question about why these differ so drastically still stands).
And here is the code for the actual algorithm being executed:
using DataStructures
function fast_marching!(states::Matrix{UInt8}, grid::Matrix{T}, h::Real, maxdist::Real) where {T <: AbstractFloat}
L = PriorityQueue{CartesianIndex{2}, T}()
for i in eachindex(grid)
if grid[i] <= 0
states[i] = 2
else
grid[i] = maxdist
end
end
R = CartesianIndices(grid)
Ifirst, Ilast = first(R), last(R)
Iv = CartesianIndex(1, 0)
Ih = CartesianIndex(0, 1)
for I in R
if states[I] == 0 &&
(states[max(Ifirst, I-Iv)] == 2 ||
states[max(Ifirst, I-Ih)] == 2 ||
states[min(Ilast, I+Ih)] == 2 ||
states[min(Ilast, I+Iv)] == 2)
states[I] = 1
dist = compute_distance(grid, I, Ifirst, Ilast, h, maxdist)
grid[I] = dist
enqueue!(L, I, dist)
end
end
#main loop
while !isempty(L)
I = dequeue!(L)
states[I] = 2
for Ii = (Iv, Ih)
for J = (min(I+Ii, Ilast), max(I-Ii, Ifirst))
if states[J] != 2
dist = compute_distance(grid, J, Ifirst, Ilast, h, maxdist)
if dist < grid[J]
grid[J] = dist
if states[J] == 1
L[J] = dist
else
states[J] = 1
enqueue!(L, J, dist)
end
end
end
end
end
end
return nothing
end
function compute_distance(grid::Matrix{T}, I::CartesianIndex{2}, Ifirst::CartesianIndex{2}, Ilast::CartesianIndex{2}, h::Real, maxdist::Real) where {T <: AbstractFloat}
Iv = CartesianIndex(1, 0)
Ih = CartesianIndex(0, 1)
Uh = Uv = maxdist
function m(Ii)
Jfirst = max(I-Ii, Ifirst)
Ufirst = Jfirst == I ? maxdist : grid[Jfirst]
Jlast = min(I+Ii, Ilast)
Ulast = Jlast == I ? maxdist : grid[Jlast]
Umin = min(Ufirst, Ulast)
end
Uh, Uv = m(Ih), m(Iv)
if Uh < maxdist && Uv < maxdist
disc = 2*h^2-(Uh-Uv)^2
if disc >= 0
return min(0.5*(Uh+Uv)+0.5*sqrt(disc), maxdist)
end
end
Umin = min(Uv, Uh)
return min(h+Umin, maxdist)
end