I have found that using a small nested function to handle repeated code has resulted in unwanted allocations. I tried to simplify the below function:
function compute_distance(grid::Matrix{T}, I::CartesianIndex{2}, Ifirst::CartesianIndex{2}, Ilast::CartesianIndex{2}, h::Real, maxdist::Real) where {T <: AbstractFloat}
Iv = CartesianIndex(1, 0)
Ih = CartesianIndex(0, 1)
Jfirst = max(I-Iv, Ifirst)
Ufirst = Jfirst == I ? maxdist : grid[Jfirst]
Jlast = min(I+Iv, Ilast)
Ulast = Jlast == I ? maxdist : grid[Jlast]
Uv = min(Ufirst, Ulast)
Jfirst = max(I-Ih, Ifirst)
Ufirst = Jfirst == I ? maxdist : grid[Jfirst]
Jlast = min(I+Ih, Ilast)
Ulast = Jlast == I ? maxdist : grid[Jlast]
Uh = min(Ufirst, Ulast)
if Uh < maxdist && Uv < maxdist
disc = 2*h^2-(Uh-Uv)^2
if disc >= 0
return min(0.5*(Uh+Uv)+0.5*sqrt(disc), maxdist)
end
end
Umin = min(Uv, Uh)
return min(h+Umin, maxdist)
end
using the function m():
function compute_distance(grid::Matrix{T}, I::CartesianIndex{2}, Ifirst::CartesianIndex{2}, Ilast::CartesianIndex{2}, h::Real, maxdist::Real) where {T <: AbstractFloat}
Iv = CartesianIndex(1, 0)
Ih = CartesianIndex(0, 1)
function m(Ii)
Jfirst = max(I-Ii, Ifirst)
Ufirst = Jfirst == I ? maxdist : grid[Jfirst]
Jlast = min(I+Ii, Ilast)
Ulast = Jlast == I ? maxdist : grid[Jlast]
Umin = min(Ufirst, Ulast)
end
Uh, Uv = m(Ih), m(Iv)
if Uh < maxdist && Uv < maxdist
disc = 2*h^2-(Uh-Uv)^2
if disc >= 0
return min(0.5*(Uh+Uv)+0.5*sqrt(disc), maxdist)
end
end
Umin = min(Uv, Uh)
return min(h+Umin, maxdist)
end
When I benchmark these two versions with
grid = zeros(60,60);
R = CartesianIndices(grid)
Ifirst, Ilast = first(R), last(R)
@btime compute_distance($(grid), CartesianIndex(50, 20), $Ifirst, $Ilast, 0.1, 3.0)
The first version gives
19.555 ns (0 allocations: 0 bytes)
The second version gives
28.444 ns (1 allocation: 16 bytes)
I thought maybe it was the captured variables creating a βboxβ as described in the performance tips, so I made m() a separate function which accepts Ifirst, Ilast, I, and grid as arguments, but this proved to be even worse:
285.022 ns (18 allocations: 288 bytes)
Whatβs the cause of this? When can I not trust functions to handle small tasks within my code without worsening performance?