Allocations in Zygote for a simple derivative

Hello everyone, I am somewhat new to Julia and very new to Zygote. I want to use Zygote’s AD functionality to compute derivatives of some simple functions, as in the following code:

using Zygote
using Random
using BenchmarkTools

"""
    SomeStruct{T<:AbstractFloat}

Immutable parameter set whose five fields `λ`, `σ`, `a`, `b` and `eps` all share
one floating-point type `T`.
"""
struct SomeStruct{T<:AbstractFloat}
    λ::T
    σ::T
    a::T
    b::T
    eps::T
end

"""
    SomeStruct(λ::T, eps::T) where {T<:AbstractFloat}

Build a `SomeStruct{T}` from `λ` and `eps` alone. The remaining fields are
derived: `σ = 1`, `b = λ / (λ - 1)` and `a = λ * b^(λ - 1)`.
"""
function SomeStruct(λ::T, eps::T) where {T<:AbstractFloat}
    # `λ - 1` appears both as the divisor of `b` and as the exponent in `a`
    λm1 = λ - 1.0
    ratio = λ / λm1
    prefactor = λ * ratio^λm1
    return SomeStruct{T}(λ, 1.0, prefactor, ratio, eps)
end

"""
    ffunc(p::SomeStruct, x::AbstractFloat)

Evaluate `(p.a / p.eps) * (x^-p.λ - x^-(p.λ - 1)) + 1 / p.eps` at `x`.
"""
function ffunc(p::SomeStruct, x::AbstractFloat)
    # The prefactor gets its own closed parenthesis group: the previous line
    # left one `(` unbalanced, which is a parse error. The grouping matches
    # `gfunc`, which also multiplies by `p.a / p.eps`.
    total = (p.a / p.eps) * (x^-p.λ - x^-(p.λ - 1.0))
    return total + (1.0 / p.eps)
end

"""
    gfunc(p::SomeStruct, x::AbstractFloat)

Hand-written derivative with respect to `x` of the expression evaluated by
`ffunc`, `(p.a / p.eps) * (x^-p.λ - x^-(p.λ - 1)) + 1 / p.eps`.
"""
function gfunc(p::SomeStruct, x::AbstractFloat)
    # d/dx x^-λ     = -λ * x^-(λ + 1)
    # d/dx x^-(λ-1) = -(λ - 1) * x^-λ
    # so d/dx (x^-λ - x^-(λ-1)) = (λ - 1) * x^-λ - λ * x^-(λ + 1).
    # The earlier version had the two terms swapped and therefore returned the
    # negative of the derivative.
    result = (p.λ - 1.0) * x^-p.λ - p.λ * x^-(p.λ + 1.0)
    return result * (p.a / p.eps)
end

"""
    handtest(a, f, g, c, d)

For every index of `a`, store `f(a[i])` in `c[i]` and `g(a[i])` in `d[i]`.
`c` and `d` are overwritten in place; the function returns `nothing`.
"""
function handtest(
    a::AbstractArray, f::Function, g::Function, c::AbstractArray, d::AbstractArray
)
    for k in eachindex(a)
        c[k] = f(a[k])
        d[k] = g(a[k])
    end
    return nothing
end

"""
    gradtest(a, f, c, d)

For every index of `a`, store `f(a[i])` in `c[i]` and the first component of
`Zygote.gradient(f, a[i])` in `d[i]`. `c` and `d` are overwritten in place; the
function returns `nothing`.
"""
function gradtest(
    a::AbstractArray, f::Function, c::AbstractArray, d::AbstractArray
)
    for k in eachindex(a)
        c[k] = f(a[k])
        # Only the first entry of the returned gradient is kept
        d[k] = first(Zygote.gradient(f, a[k]))
    end
    return nothing
end

"""
    bzygote()

Benchmark `handtest` (hand-written derivative) against `gradtest` (Zygote
gradient) on the same 1000×3 array of random inputs, printing one `@btime`
result for each.
"""
function bzygote()
    params = SomeStruct(50.0, 1.4737)
    # Random inputs shifted by 0.5
    xs = 0.5 .+ rand(1000, 3)
    # Output buffers, allocated once and reused by both benchmarks
    vals = zeros(size(xs))
    derivs = zeros(size(xs))
    # Single-argument closures over the fixed parameter set
    f = x -> ffunc(params, x)
    g = x -> gfunc(params, x)
    @btime handtest($xs, $f, $g, $vals, $derivs)
    @btime gradtest($xs, $f, $vals, $derivs)
end

# Run the benchmark; each `@btime` call inside `bzygote` prints its own timing line.
bzygote()

When I run it, I get the following output:

206.491 μs (0 allocations: 0 bytes)
3.591 ms (84000 allocations: 3.02 MiB)

From reading the Zygote paper, my understanding is that Zygote generates Julia code, so both versions should be comparable in memory allocation and speed. What can I do to improve performance?