How to use parallel computing or GPU to make calculations fasterwhen the M=10000

using CuArrays,CUDAnative,CUDAdrv,BenchmarkTools
N = 3
k = ones(N,N)*5
M = 100

rho = zeros(N,N)
aa = range(-5,5,length=N)
K = zeros(N,N)
A = repeat(-aa’,N,1)
B = repeat(aa,1,N)
C=ones(N,N)-10
phi = repeat(range(0,2
pi,length=M),1,M)
theta = repeat(range(0,pi/2,length=M),1,M)’

Rho = zeros(M,M)
Ave_Energy = ones(N,N)*pi/N^2
junyundu = 0
f = zeros(N,N)
function getnum(rho,K,f,A,B,C,phi,theta)
while junyundu < 0.9
goal_E = ones(N,N)/N^2
number_E = zeros(N,N)
E = zeros(N,N)

    Threads.@threads for i = 1:M
        Threads.@threads for j = 1:M
                f .= @views (A[:, :].^2 .+ B[:,:].^2 .+ C[:, :] .^2).^0.5
                K .= @views 2 .* k[:, :] .+ f[:,:]
                rho .= @views (K[:, :] .^2 .- f[:, :].^2) ./ (2 .* (K[:, :] .- A[:, :] .* sin(theta[i, j]) .* cos(phi[i, j]) .-
                            B[:, :] .*sin(theta[i, j]) .* sin(phi[i, j]) .- C[:, :] .* cos(theta[i, j])))

        Rho[i, j] = maximum(maximum(rho[:]))
        FF = findall(x->x .== maximum(maximum(rho)),rho)
        number_E[FF] = number_E[FF] .+ 1/length(FF)
        end
    end
    
E = number_E/(M*M)*pi
Var = goal_E - E
da = findall(x->x > 0, Var)
xiao = findall(x->x < 0, Var)
Var = Ave_Energy - E
fangcha = sum(sum(Var.^2))

if Var == 0
    k .= @views  k[:, :] .+ 0
else
    k .= @views  k[:, :] .+ Var[:, :] .* 0.1
end

qa = maximum(maximum(E))
qb = minimum(minimum(E))
qc = sum(sum(E))/(N*N)
global junyundu = 1-(qa-qb)/qc
println(meannum)
end

end

Can you please place your code between triple backticks (``` my code goes here ```), so that it’s easier to read your posted code?

Also, what exactly is your question? Do you have benchmarks of your code that show that it isn’t performing as fast as you’d expect? Do you have a hot loop that you need help optimizing?