Hi, I am trying to convert some Python+numpy subroutines to Julia. These subroutines are most important part of my main progam and take much time. In the current version these codes are run faster in non-optimized Python than Julia.

I have tried to improve the Julia codes to the best of my capabilities. `@code_warntype`

does not show a type instability. For the sake of minimal working example, I have simplified the codes as much as possible and gave them in below. I will appreciate if someone can help on improving these codes.

```
using SparseArrays
using Printf
using BenchmarkTools
using LoopVectorization
using NearestNeighbors
using LinearAlgebra
function K(r::Matrix,od::Int64,γ::Float64,e::Float64)::Array{Float64,2}
phi = similar(r)
@avx for i in eachindex(r)
phi[i] = r[i]^od*γ + exp(-(r[i]*e)^2)
end
return phi
end
function D(r::Float64,rx::Float64,n::Int64,γ::Float64,e::Float64)
dp = zero(r);
if n == 1
dp = 7.0*γ*rx*r^5 - 2.0*e^2*rx*exp(-e^2*(r^2))#
elseif n == 2
dp = 4.0*(e^2)^2*rx^2 *exp(-e^2*(r^2)) - 2.0*e^2*exp(-e^2*(r^2)) + 7.0*γ*r^5 + (35.0*γ*rx^2)*r^3
else
println("enough")
end
return dp
end
function H(i::Int64,n::Int64,X::Array{Float64,1},Y::Array{Float64,1},p::Array{Int64,2},γ::Float64,sh::Float64)::NTuple{2,Array{Float64,2}}
h1,h2 = zeros(size(p)),zeros(size(p))
@inbounds for j in 1:n
a = (X[i]-X[p[j]]); b=(Y[i]-Y[p[j]]); c=sqrt(a^2 + b^2);
h1[j] = D(c, a, 2,γ,sh)
h2[j] = D(c, b, 2,γ,sh)
end
return h1,h2
end
function R(n::Int64,X::Array{Float64,1},Y::Array{Float64,1},p::Array{Int64,2})::Array{Float64,2}
r =zeros(n,n)
@inbounds for j in 1:n, k in 1:n
r[k,j] = sqrt((X[p[k]]-X[p[j]])^2 + (Y[p[k]]-Y[p[j]])^2)
end
return r
end
function main(st::Array{Int64,2},X::Array{Float64,1},Y::Array{Float64,1},ip::Array{Int64,1},od::Int64,γ::Float64,sh::Float64,n::Int64)
N = length(X)
D = spzeros(N,N)
p = zeros(Int,n,1);
for i in ip
@inbounds for j in 1:n
p[j] = st[i,j];
end
r = R(n,X,Y,p)
h1,h2 = H(i,n,X,Y,p,γ,sh)
B = K(r,od,γ,sh)
Dl = h1+h2;
D[i,p] = ldiv!(lu!(B), Dl);# ; #instead of (B\Dl)
end
return D
end
const x1, x2, y1, y2 ,sh, od, γ, N, n = -64, 64, -64, 64, 1.0, 7, 2.0, 200, 50
a,b = range(x1,stop=x2,length=N),range(y1,stop=y2,length=N);
x,y = repeat(a,1,length(a)),repeat(b',length(b),1);
PT = reduce(vcat, [x y] for y in b for x in a) #
ip = filter(i -> PT[i,1]>x1 && PT[i,1]<x2 && PT[i,2]>y1 && PT[i,2]<y2, axes(PT, 1));
X,Y = PT[:,1], PT[:,2]; data = [X';Y'];
kd = KDTree(data)
SS, _ = knn(kd, data, n, true);
SS1 = Int.(vcat(SS'...));
```

Here are timings:

```
@benchmark main(SS1,X,Y,ip,od,γ,sh,n)
BenchmarkTools.Trial:
memory estimate: 1.55 GiB
allocs estimate: 392087
--------------
minimum time: 16.661 s (0.18% GC)
median time: 16.661 s (0.18% GC)
mean time: 16.661 s (0.18% GC)
maximum time: 16.661 s (0.18% GC)
--------------
samples: 1
evals/sample: 1
```

As I said previously, the Python codes are faster and if I increase `N`

and `n`

than the speed of Python become more dominant than speed of Julia.