@abraemer, thank you for your kind suggestion. However, as long as the preparation step is required on every call, my issue remains unresolved.

I’ll provide an example to illustrate. The first version below uses DifferentiationInterface.jl with StaticArrays.jl:

```
using StaticArrays
using LinearAlgebra
using DifferentiationInterface
using ForwardDiff # for AutoForwardDiff()
# Version 1 driver: repeatedly updates `vals` over a fixed time horizon.
# Preparation happens inside every call, which is intentionally slow.
function main()
    T = 10.0           # total simulated time
    dt = 1.0           # time step
    vals = [rand(SVector{3}) for _ in 1:100_000]
    t = 0.0
    while t < T
        calculate!(vals)
        t += dt
    end
    return nothing
end
# Replace every element of `vals` with `func1` applied to it, in place.
function calculate!(vals)
    map!(func1, vals, vals)
    return nothing
end
# Value and gradient of g(v) = 2 v⋅v at `x`; preparation is rebuilt on
# every call (this is the slow pattern the post is demonstrating).
function func1(x::SVector)
    g = v -> 2v ⋅ v
    backend = AutoForwardDiff()
    prep = prepare_gradient(g, backend, x)
    val, grad = value_and_gradient(g, prep, backend, x)
    return func2(val, grad)
end
# Hessian of `norm` at `dydx`, again preparing on every call.
function func2(y::Real, dydx::SVector)
    backend = AutoForwardDiff()
    prep = prepare_hessian(norm, backend, dydx)
    H = hessian(norm, prep, backend, dydx)
    return func3(2y, H * dydx)
end
# Jacobian of the scaling map x ↦ z*x at `v`, prepared on every call.
function func3(z::Real, v::SVector)
    scale = x -> z * x
    backend = AutoForwardDiff()
    prep = prepare_jacobian(scale, backend, v)
    J = jacobian(scale, prep, backend, v)
    return z .+ J * v
end
@time main()
2.589184 seconds (20.00 M allocations: 3.370 GiB, 3.47% gc time)
```

This code is slow as expected because preparation is performed each time. To prepare only once, the code can be modified as follows:

```
using StaticArrays
using LinearAlgebra
using DifferentiationInterface
using ForwardDiff
# Version 2 driver: preparation is done once, up front, and the prep
# objects are threaded through the call chain.
function main()
    t = 0.0
    T = 10.0
    dt = 1.0
    vals = map(i -> rand(SVector{3}), 1:100_000)
    # Preparation only depends on the *structure* of the function and the
    # input, so a placeholder scalar stands in for the runtime value of `z`.
    # (The original wrote `x -> z * x` here, but `z` is undefined in this
    # scope and would raise an UndefVarError during preparation.)
    z0 = one(eltype(vals[1]))
    prep1 = prepare_gradient(x -> 2x ⋅ x, AutoForwardDiff(), vals[1])
    prep2 = prepare_hessian(norm, AutoForwardDiff(), vals[1])
    prep3 = prepare_jacobian(x -> z0 * x, AutoForwardDiff(), vals[1])
    while t < T
        calculate!(vals, prep1, prep2, prep3)
        t += dt
    end
    return nothing
end
# In-place update of `vals`, forwarding the pre-built preparation objects.
function calculate!(vals, prep1, prep2, prep3)
    map!(x -> func1(x, prep1, prep2, prep3), vals, vals)
    return nothing
end
# Same computation as version 1's func1, but reusing `prep1` instead of
# preparing on every call.
function func1(x::SVector, prep1, prep2, prep3)
    g = v -> 2v ⋅ v
    val, grad = value_and_gradient(g, prep1, AutoForwardDiff(), x)
    return func2(val, grad, prep2, prep3)
end
# Hessian of `norm` at `dydx`, reusing the pre-built `prep2`.
function func2(y::Real, dydx::SVector, prep2, prep3)
    backend = AutoForwardDiff()
    H = hessian(norm, prep2, backend, dydx)
    return func3(2y, H * dydx, prep3)
end
# Jacobian of the scaling map x ↦ z*x at `v`, reusing the pre-built `prep3`.
function func3(z::Real, v::SVector, prep3)
    scale = x -> z * x
    J = jacobian(scale, prep3, AutoForwardDiff(), v)
    return z .+ J * v
end
@time main()
0.086852 seconds (22 allocations: 2.292 MiB)
```

This approach is much faster but requires separate preparation steps for each function, which can be cumbersome. With Tensorial.jl, however, the code can be simplified as follows:

```
using Tensorial
# Tensorial.jl driver: same time-stepping loop, no preparation objects
# needed at all.
function main()
    T = 10.0           # total simulated time
    dt = 1.0           # time step
    vals = [rand(Vec{3}) for _ in 1:100_000]
    t = 0.0
    while t < T
        calculate!(vals)
        t += dt
    end
    return nothing
end
# Replace every element of `vals` with `func1` applied to it, in place.
function calculate!(vals)
    map!(func1, vals, vals)
    return nothing
end
# Value and gradient of g(v) = 2 v⋅v at `x` via Tensorial's `gradient`;
# `:all` returns the gradient together with the primal value.
function func1(x::Vec)
    g = v -> 2v ⋅ v
    grad, val = gradient(g, x, :all)
    return func2(val, grad)
end
# Hessian of `norm` at `dydx` via Tensorial's `hessian` — no preparation.
function func2(y::Real, dydx::Vec)
    H = hessian(norm, dydx)
    Hd = H * dydx
    return func3(2y, Hd)
end
function func3(z::Real, v::Vec)
f = x -> z * x
J = gradient(f, v) # this becomes a Jacobian automatically in Tensorial.jl if the output of the function is a vector
z .+ J * v
end
@time main()
0.079240 seconds (3 allocations: 2.289 MiB)
```

The execution time is comparable to the second example, but the code is significantly simpler. This is possible because, when using ForwardDiff.jl, the preparation object can be a bits type, so preparation incurs almost no cost in practice.