I was trying to understand the cause of some unexpected memory allocations and found the program has the following strange behaviors. I am really confused and hope someone can help me figure it out, thanks!

I tested on `1.8.5`

and `1.9.0-rc1`

and the results are the same.

### 1. 2.

```
using BenchmarkTools, StaticArrays
using LinearAlgebra: norm_sqr
const Vector3 = SVector{3, Float64}
struct Foo
ϵ::Float64
f::Bool
end
get_calc_dx(foo::Foo) = if foo.f
(x1, x2) -> x2 - x1
else
(x1, x2) -> x1 - x2
end
```

```
f1(dx::Vector3, ϵ) = ϵ * 1.0 / norm_sqr(dx)
f1(x1, x2; ϵ::Float64, calc_dx::F) where F = f1(calc_dx(x1, x2), ϵ)
function test1(foo, xs)
calc_dx = get_calc_dx(foo)
ϵ = foo.ϵ
N = length(xs)
s = 0.0
@inbounds for i = 1:N-1
s += f1(xs[i], xs[i+1]; ϵ, calc_dx)
end
s
end
function test2(foo, xs)
calc_dx = get_calc_dx(foo)
ϵ = foo.ϵ
N = length(xs)
s = 0.0
@inbounds for i = 1:N-1
dx = calc_dx(xs[i], xs[i+1])
s += f1(dx, ϵ)
end
s
end
```

What `test1`

/`test2`

do is basically calculate the sum of some function over an array (\sum_i{\epsilon/(x_i-x_{i+1})^2}). `calc_dx`

is a function determined by `foo.f`

, so its type is determined during runtime.

`test1`

calls the function `f1`

with keyword arguments and `test2`

calls it after calculating `dx`

.

When I tried to benchmark the two functions:

```
foo = Foo(1.0, true)
xs = rand(Vector3, 200000)
@benchmark test1($foo, $xs)
```

```
BenchmarkTools.Trial: 110 samples with 1 evaluation.
Range (min … max): 43.860 ms … 52.788 ms ┊ GC (min … max): 0.00% … 3.64%
Time (median): 45.268 ms ┊ GC (median): 0.00%
Time (mean ± σ): 45.611 ms ± 1.410 ms ┊ GC (mean ± σ): 0.88% ± 1.41%
▂ ▃▃▃█ ▃ ▂
▃▆▅█▃▃████▆▇▆▆█▆█▆▇▇▅▃█▅▅▁▃▁▁▃▁▁▃▁▁▁▁▁▁▃▃▃▁▃▁▁▃▃▁▁▃▁▁▁▁▁▁▁▃ ▃
43.9 ms Histogram: frequency by time 50.4 ms <
Memory estimate: 12.21 MiB, allocs estimate: 799996.
```

```
@benchmark test2($foo, $xs)
```

```
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 235.816 μs … 410.115 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 242.694 μs ┊ GC (median): 0.00%
Time (mean ± σ): 244.038 μs ± 5.493 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▂█
▂▂▂▂▁▂▁▁▂▂▆▅▂██▇▅█▆▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂ ▃
236 μs Histogram: frequency by time 262 μs <
Memory estimate: 0 bytes, allocs estimate: 0.
```

The first one is surprisingly and significantly slower than the second one, probably due to the large number of allocs.

`julia --track-allocation=user ./test.jl`

tells me that all the allocations happen at the line in the for loop: `f1(xs[i], xs[i+1]; ϵ=foo.ϵ, calc_dx)`

Note that I already explicitly specialize the type of `calc_dx`

in `f1`

since it’s a function.

### 3.

If I am not using keyword arguments, the allocations don’t exist:

```
f1(x1, x2, ϵ, calc_dx::F) where F = f1(calc_dx(x1, x2), ϵ)
function test3(foo, xs)
calc_dx = get_calc_dx(foo)
N = length(xs)
s = 0.0
@inbounds for i = 1:N-1
s += f1(xs[i], xs[i+1], foo.ϵ, calc_dx)
end
s
end
@benchmark test3($foo, $xs)
```

```
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 235.816 μs … 410.115 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 242.694 μs ┊ GC (median): 0.00%
Time (mean ± σ): 244.038 μs ± 5.493 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▂█
▂▂▂▂▁▂▁▁▂▂▆▅▂██▇▅█▆▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂ ▃
236 μs Histogram: frequency by time 262 μs <
Memory estimate: 0 bytes, allocs estimate: 0.
```

### 4.

I also found some other strange behaviors, for example, if we use an inner function to make it more nested, the allocations return and the number even increases:

```
function test4_inner(xs, ϵ, calc_dx::F) where F
N = length(xs)
s = 0.0
@inbounds for i = 1:N-1
s += f1(xs[i], xs[i+1], foo.ϵ, calc_dx)
end
s
end
function test4(foo, xs)
calc_dx = get_calc_dx(foo)
test4_inner(xs, foo.ϵ, calc_dx)
end
@benchmark test4($foo, $xs)
```

```
BenchmarkTools.Trial: 258 samples with 1 evaluation.
Range (min … max): 17.429 ms … 26.226 ms ┊ GC (min … max): 0.00% … 7.78%
Time (median): 19.729 ms ┊ GC (median): 10.10%
Time (mean ± σ): 19.380 ms ± 1.146 ms ┊ GC (mean ± σ): 6.66% ± 5.01%
▄▁ █ ▄ ▃▅▃
▃▃▃▅▅▅████▅▅▅▆▃▅▁▃▃▄▃▁▃▁▁▁▁▁▃▃▅█▇███▆███▇▇▅▄▃▄▄▃▃▃▁▃▁▃▃▁▁▃▃ ▃
17.4 ms Histogram: frequency by time 21.5 ms <
Memory estimate: 27.47 MiB, allocs estimate: 1199994.
```

### 5.

This might be unrelated, but I just wanted to also leave it here. If we define `calc_dx`

in the function instead of getting it from another function (`get_calc_dx`

), and we pass `f1`

in a test function with an explicit `@nospecialize`

, the number of allocations will also be very strange:

```
f2(dx::Vector3, ϵ) = ϵ / norm_sqr(dx)
f2(x1, x2; ϵ = 1.0, calc_dx) = f2(calc_dx(x1, x2), ϵ)
function test5(@nospecialize(f), xs, ϵ)
N = length(xs)
s = 0.0
calc_dx(x1, x2) = x1 - x2
@inbounds for i = 1:N-1
s += f(xs[i], xs[i+1]; ϵ, calc_dx)
end
s
end
ϵ = foo.ϵ
@benchmark test5($f1, $xs, $ϵ)
```

```
BenchmarkTools.Trial: 271 samples with 1 evaluation.
Range (min … max): 16.736 ms … 23.466 ms ┊ GC (min … max): 0.00% … 13.19%
Time (median): 18.749 ms ┊ GC (median): 0.00%
Time (mean ± σ): 18.436 ms ± 1.172 ms ┊ GC (mean ± σ): 5.52% ± 5.39%
▁▅▄▄ ▃ ▃ ▇█▁ ▆▃ ▂▁ ▁ ▇▄▅▁▁
▃▆▆████▇█▆▅██████▆▁▁▁▁▁▁▁▁▁▃▁▁▁▃▃▄▃██▇███▇███████▅▃▁▁▁▁▁▁▄▃ ▄
16.7 ms Histogram: frequency by time 20.5 ms <
Memory estimate: 21.36 MiB, allocs estimate: 999995.
```

```
@benchmark test5($f2, $xs, $ϵ)
```

```
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
Range (min … max): 238.661 μs … 364.500 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 243.351 μs ┊ GC (median): 0.00%
Time (mean ± σ): 245.839 μs ± 8.833 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▃▄█▇▇▆▅▄▄▄▃▃▃▂▂▁▁▁ ▂
▄██████████████████████▇▇▇▇▆▅▆▆▆▅▅▄▆▆▄▄▄▆▅▄▅▄▅▅▄▃▅▅▄▅▂▅▅▄▅▄▅▄ █
239 μs Histogram: log(frequency) by time 292 μs <
Memory estimate: 0 bytes, allocs estimate: 0.
```

Note that the only difference between `f1(dx, ϵ)`

and `f2(dx, ϵ)`

is that there is an unneccesary `* 1.0`

in `f1`

. By the way, the results of `@code_llvm`

and `@code_native`

are exactly the same for `test5(f1, xs, ϵ)`

and `test5(f2, xs, ϵ)`

.

If there is anything unclear, please leave a comment to let me know