Accelerating an N^2 nested loop

I generally don’t expect to be faster, but identical. Yet, in this particular case, the SVector version is turning out to be slower (~30%), and I don’t understand why. Do you see the difference? Here is a copy/paste runnable code of both versions. Just run main().

I get:

julia> main()
  66.126 μs (0 allocations: 0 bytes)
nothing
  101.776 μs (0 allocations: 0 bytes)
nothing
import Pkg
Pkg.activate(;temp=true)
Pkg.add("StaticArrays")
using Test
using StaticArrays

# With simple 1D arrays
function XYDOT2!(X,Y,X_DOT,Y_DOT,EPS)
    fill!(X_DOT,0.0); fill!(Y_DOT,0.0)
    @inbounds for i in eachindex(X,Y,X_DOT,Y_DOT)
        x_i, y_i = X[i], Y[i]
        acc_x, acc_y = 0.0, 0.0
        @fastmath for j=(i+1):lastindex(X)
            x_j, y_j = X[j], Y[j]
            diff_x = x_i - x_j; diff_y = y_i - y_j
            inv_d = inv(diff_x^2 + diff_y^2 + EPS)
            dxdt = diff_x * inv_d; dydt = diff_y * inv_d
            acc_x += dxdt; acc_y += dydt
            X_DOT[j] -= dxdt; Y_DOT[j] -= dydt
        end
        X_DOT[i] += acc_x; Y_DOT[i] += acc_y
    end
    return nothing
end

# with vectors of SVectors
function XYDOT_SV!(P::Vector{T}, XYDOT::Vector{T}, EPS) where T<:SVector
    fill!(XYDOT, zero(T))
    @inbounds for i in eachindex(P,XYDOT)
        p_i = P[i] 
        acc = zero(T)
        @fastmath for j=(i+1):lastindex(P)
            diff = p_i - P[j]
            invd = inv(diff[1]^2 + diff[2]^2 + EPS)
            dpdt = diff * invd
            acc += dpdt
            XYDOT[j] -= dpdt
        end
        XYDOT[i] += acc
    end
    return nothing
end

function main()
    N = 512
    EPS = 0.1
    P = rand(SVector{2,Float64},N)
    XYDOT = similar(P)
    X = [ x[1] for x in P ];
    Y = [ x[2] for x in P ];
    X_DOT = similar(X)
    Y_DOT = similar(Y)

    XYDOT2!(X,Y,X_DOT,Y_DOT,EPS)   
    XYDOT_SV!(P,XYDOT,EPS)
    @test all(isapprox.([SVector(x,y) for (x,y) in zip(X_DOT,Y_DOT)], XYDOT))
     
    display(@btime XYDOT2!($X,$Y,$X_DOT,$Y_DOT,$EPS))
    display(@btime XYDOT_SV!($P,$XYDOT,$EPS))

    return nothing
end