How to speed up calculating large array?

Hi, I’m wondering if I could decrease the computation time for calculating large size array.

Because I was new to Julia, I almost copied the format of ‘advection_shared!(q,u)’ in https://docs.julialang.org/en/latest/manual/parallel-computing like following

@everywhere const Hist_x = collect(5000:10:15000)
@everywhere const Hist_y = collect(5000:10:15000)
@everywhere function div(X::SharedArray,q::Array)
          idx = indexpids(X)
          nchunks = length(procs(X))
          splits = [round(Int, s) for s in linspace(0,length(q),nchunks+1)]
          splits[idx]+1:splits[idx+1]
end
@everywhere function Xs!(X_ray::SharedArray{Float64,2},ρ::Array{Float64,1},Pos::Array{Float64,2},T::Array{Float64,1},Width::Array{Float64,1},M::Array{Float64,1},X_lim::UnitRange{Int64},Y_lim::UnitRange{Int64})
  One = ones(length(ρ))
  L_X = X_ray_Luminosity(M,ρ,T) #Function that returns Float64 array
  for i in X_lim, j in Y_lim
    Inter_X = (min.(Pos[1,:]+Width./2.,(Hist_x[i]+Hist_bin/2.)*One)-max.(Pos[1,:]-Width./2.,(Hist_x[i]-Hist_bin/2.)*One))./Width
    Inter_Y = (min.(Pos[2,:]+Width./2.,(Hist_y[j]+Hist_bin/2.)*One)-max.(Pos[2,:]-Width./2.,(Hist_y[j]-Hist_bin/2.)*One))./Width
    Inter_X[Inter_X.<0] = 0
    Inter_Y[Inter_Y.<0] = 0
    X_ray[i,j] = sum(L_X .* (Inter_Y .* Inter_X))/(Hist_bin^2.)
  end
  X_ray
end
@everywhere Xk!(X_ray::SharedArray{Float64,2},Params::Array{Array{Float64,N} where N})=Xs!(X_ray,Params[1],Params[2],Params[3],Params[4],Params[5],div(X_ray,Hist_x),1:length(Hist_y))
function Xx(GPos::Array{Float64,2},M::Array{Float64,1},ρ::Array{Float64,1},T::Array{Float64,1},Width::Array{Float64,1}) 
  # Length for other array = 10^6
  X_ray = SharedArray{Float64}(length(Hist_x),length(Hist_y)) #1000x1000 array
  @sync begin
    for p in procs(X_ray)
        @async remotecall_wait(Xk!,p,X_ray,[ρ,GPos,T,Width,M])
      end
  end
  X_ray
end

and the computation time for this code was extremely large(>1 days with 7 processors)
Although the number of calculation was large(10^6 * 1000 * 1000), I think it’s … well too long.

Are there any ways to reduce the time for this?

Have you profiled this part of your code without running it distributed?
Once you have that running at top-speed on a single processor I would then worry about running it distributed.

3 Likes

Thanks for replying!

Yes, previous code of this was written for single processor.(because this code was originally written in Python 3)
I just tested the code by putting Hist_bin = 1000, 100 (which makes 1010 and 100100 array) and result for @time was
30.413630 seconds (677.29 k allocations: 30.436 GiB, 58.71% gc time)
(… Still waiting but because it is O(N^2), I guess it will take…40 min? )

To make sure I’ll attach code that I ran for single processor

const Hist_bin = 0.01
const Hist_x = collect(0:Hist_bin:1)
const Hist_y = collect(0:Hist_bin:1)
function X_ray_Dist(Pos::Array{Float64,2},M::Array{Float64,1},ρ::Array{Float64,1},T::Array{Float64,1}) ::Array{Float64,2}
  Num_Dens = ρ./1.5
  Width = ((M./ρ).^(1./3.))
  L_X = M.*ρ#Lets simply say, X_ray_Luminosity(M,ρ,T)
  X_ray = Array{Float64,2}(length(Hist_x),length(Hist_y))
  One = ones(length(ρ))
  for i = 1:length(Hist_x)
    for j = 1:length(Hist_y)
      Inter_X = (min.(Pos[1,:]+Width./2.,(Hist_x[i]+Hist_bin/2.)*One)-max.(Pos[1,:]-Width./2.,(Hist_x[i]-Hist_bin/2.)*One))./Width
      Inter_Y = (min.(Pos[2,:]+Width./2.,(Hist_y[j]+Hist_bin/2.)*One)-max.(Pos[2,:]-Width./2.,(Hist_y[j]-Hist_bin/2.)*One))./Width
      Inter_X[Inter_X.<0] = 0
      Inter_Y[Inter_Y.<0] = 0
      X_ray[i,j] = sum(L_X .* (Inter_Y .* Inter_X))/(Hist_bin^2.)
    end
  end
  return X_ray
end
Pos = rand(Float64,(3,10^6))
M  = rand(Float64,(10^6)).+1
ρ  = rand(Float64,(10^6)).+1
T =  rand(Float64,(10^6)).+1
X_dist= @time X_ray_Dist(Pos,M,ρ,T)

Please post the input arguments you use to the functions so that we can run it by just copy-pasting the code into the REPL.

2 Likes

You want to get that number of allocation down :wink:

Use the profiler and the memory allocation tracker to figure out where they are coming from

Sorry, I just edit the code so it can be tested right away

Your code is very matlaby, you’re likely to get a very large speedup by devectorizing the code (ie replace your array operations on Inter_X/Y by a loop over the size). I shave 20% just by removing “*One”, so there’s a lot of room there :slight_smile:

I’m not sure devectorizing is really needed too much here, but the vectorizations should be inplace. Make use of @. so that way the full statements are inplace, like @. x = y*z + a*b. If you restructure it like this it’ll be easy to read yet you’ll get the allocations down like @vchuravy suggested. You may need to pre-allocate a few caches.

OK, here’s what I have, with devectorization, a “continue”, @inbounds, and moving some floating point divisions around (each of those had a real impact on performance). On a size 1e4 this is about 4 times as fast as the original version, I’m actually surprised it isn’t more…

const Hist_bin = 0.01
const Hist_x = collect(0:Hist_bin:1)
const Hist_y = collect(0:Hist_bin:1)
function X_ray_Dist(Pos::Array{Float64,2},M::Array{Float64,1},ρ::Array{Float64,1},T::Array{Float64,1}) ::Array{Float64,2}
    Num_Dens = ρ./1.5
    Width = ((M./ρ).^(1./3.))
    L_X = M.*ρ#Lets simply say, X_ray_Luminosity(M,ρ,T)
    X_ray = zeros(length(Hist_x),length(Hist_y))
    One = ones(length(ρ))
    for i = 1:length(Hist_x)
        for j = 1:length(Hist_y)
            @inbounds for k = 1:size(Pos,2)
                Inter_X = (min(Pos[1,k]+Width[k]/2,(Hist_x[i]+Hist_bin/2))-max(Pos[1,k]-Width[k]/2,(Hist_x[i]-Hist_bin/2)))
                Inter_Y = (min(Pos[2,k]+Width[k]/2,(Hist_y[j]+Hist_bin/2))-max(Pos[2,k]-Width[k]/2,(Hist_y[j]-Hist_bin/2)))
                if Inter_X < 0 || Inter_Y < 0
                    continue
                end
                X_ray[i,j] += (L_X[k] * (Inter_Y * Inter_X))/(Hist_bin^2*Width[k]*Width[k])
            end
        end
    end
    return X_ray
end
n = 10^4
srand(1)
Pos = rand(Float64,(3,n))
M  = rand(Float64,(n)).+1
ρ  = rand(Float64,(n)).+1
T =  rand(Float64,(n)).+1
X_dist= @time X_ray_Dist(Pos,M,ρ,T)
1 Like

Here is a tweaked version. I intentionally didn’t make any dramatic changes (like devectorizing) to the code because I wanted to keep the changes to a minimum:

function X_ray_Dist(Pos, M, ρ, T)
    Width = ((M./ρ).^(1./3.))
    L_X = M .* ρ
    X_ray = zeros(Float64, length(Hist_x), length(Hist_y))
    Inter_X = zeros(Float64, size(Pos, 2))
    Inter_Y = zeros(Float64, size(Pos, 2))
    x_coords = Pos[1, :]
    y_coords = Pos[2, :]
    for i = 1:length(Hist_x)
        @. Inter_X = max(0.0, (min(x_coords + Width / 2., (Hist_x[i] + Hist_bin / 2.)) - max(x_coords - Width / 2., (Hist_x[i] - Hist_bin/2.))) / Width)
        for j = 1:length(Hist_y)
            @. Inter_Y = max(0.0, (min(y_coords + Width / 2., (Hist_y[j] + Hist_bin / 2.)) - max(y_coords - Width / 2., (Hist_y[j] - Hist_bin/2.))) / Width)
            X_ray[i, j] = mapreduce(i -> L_X[i] * Inter_Y[i] * Inter_X[i], +, 1:length(Inter_X)) / (Hist_bin^2.)            
        end
    end
    return X_ray
end

Comparison (with N = 10^4):

Main> @time X_dist = X_ray_Dist(Pos, M, ρ, T);
  1.179841 seconds (10.22 k allocations: 868.016 KiB)

Main> @time X_dist_old = X_ray_Dist_old(Pos, M, ρ, T);
  6.448615 seconds (693.68 k allocations: 19.130 GiB, 21.02% gc time)

Main> println(norm(X_dist - X_dist_old) / norm(X_dist_old))
1.3156213596542271e-16
1 Like

Awesome!! All of the solutions worked fine. And I just tested this change to the original code with parallel computing and it is much faster than before! (And I think I need to learn how to use profiler…I can’t understand the printed result right now)

Thanks for the help!

Check out ProfileView.jl for a visual (and, I think, easier to understand) profile output.

3 Likes