How to speed up calculating large array?

question

#1

Hi, I’m wondering if I could decrease the computation time for calculating large size array.

Because I was new to Julia, I almost copied the format of ‘advection_shared!(q,u)’ in https://docs.julialang.org/en/latest/manual/parallel-computing like following

@everywhere const Hist_x = collect(5000:10:15000)
@everywhere const Hist_y = collect(5000:10:15000)
@everywhere function div(X::SharedArray,q::Array)
          idx = indexpids(X)
          nchunks = length(procs(X))
          splits = [round(Int, s) for s in linspace(0,length(q),nchunks+1)]
          splits[idx]+1:splits[idx+1]
end
@everywhere function Xs!(X_ray::SharedArray{Float64,2},ρ::Array{Float64,1},Pos::Array{Float64,2},T::Array{Float64,1},Width::Array{Float64,1},M::Array{Float64,1},X_lim::UnitRange{Int64},Y_lim::UnitRange{Int64})
  One = ones(length(ρ))
  L_X = X_ray_Luminosity(M,ρ,T) #Function that returns Float64 array
  for i in X_lim, j in Y_lim
    Inter_X = (min.(Pos[1,:]+Width./2.,(Hist_x[i]+Hist_bin/2.)*One)-max.(Pos[1,:]-Width./2.,(Hist_x[i]-Hist_bin/2.)*One))./Width
    Inter_Y = (min.(Pos[2,:]+Width./2.,(Hist_y[j]+Hist_bin/2.)*One)-max.(Pos[2,:]-Width./2.,(Hist_y[j]-Hist_bin/2.)*One))./Width
    Inter_X[Inter_X.<0] = 0
    Inter_Y[Inter_Y.<0] = 0
    X_ray[i,j] = sum(L_X .* (Inter_Y .* Inter_X))/(Hist_bin^2.)
  end
  X_ray
end
@everywhere Xk!(X_ray::SharedArray{Float64,2},Params::Array{Array{Float64,N} where N})=Xs!(X_ray,Params[1],Params[2],Params[3],Params[4],Params[5],div(X_ray,Hist_x),1:length(Hist_y))
function Xx(GPos::Array{Float64,2},M::Array{Float64,1},ρ::Array{Float64,1},T::Array{Float64,1},Width::Array{Float64,1}) 
  # Length for other array = 10^6
  X_ray = SharedArray{Float64}(length(Hist_x),length(Hist_y)) #1000x1000 array
  @sync begin
    for p in procs(X_ray)
        @async remotecall_wait(Xk!,p,X_ray,[ρ,GPos,T,Width,M])
      end
  end
  X_ray
end

and the computation time for this code was extremely large(>1 days with 7 processors)
Although the number of calculation was large(10^6 * 1000 * 1000), I think it’s … well too long.

Are there any ways to reduce the time for this?


#2

Have you profiled this part of your code without running it distributed?
Once you have that running at top-speed on a single processor I would then worry about running it distributed.


#3

Thanks for replying!

Yes, previous code of this was written for single processor.(because this code was originally written in Python 3)
I just tested the code by putting Hist_bin = 1000, 100 (which makes 1010 and 100100 array) and result for @time was
30.413630 seconds (677.29 k allocations: 30.436 GiB, 58.71% gc time)
(… Still waiting but because it is O(N^2), I guess it will take…40 min? )

To make sure I’ll attach code that I ran for single processor

const Hist_bin = 0.01
const Hist_x = collect(0:Hist_bin:1)
const Hist_y = collect(0:Hist_bin:1)
function X_ray_Dist(Pos::Array{Float64,2},M::Array{Float64,1},ρ::Array{Float64,1},T::Array{Float64,1}) ::Array{Float64,2}
  Num_Dens = ρ./1.5
  Width = ((M./ρ).^(1./3.))
  L_X = M.*ρ#Lets simply say, X_ray_Luminosity(M,ρ,T)
  X_ray = Array{Float64,2}(length(Hist_x),length(Hist_y))
  One = ones(length(ρ))
  for i = 1:length(Hist_x)
    for j = 1:length(Hist_y)
      Inter_X = (min.(Pos[1,:]+Width./2.,(Hist_x[i]+Hist_bin/2.)*One)-max.(Pos[1,:]-Width./2.,(Hist_x[i]-Hist_bin/2.)*One))./Width
      Inter_Y = (min.(Pos[2,:]+Width./2.,(Hist_y[j]+Hist_bin/2.)*One)-max.(Pos[2,:]-Width./2.,(Hist_y[j]-Hist_bin/2.)*One))./Width
      Inter_X[Inter_X.<0] = 0
      Inter_Y[Inter_Y.<0] = 0
      X_ray[i,j] = sum(L_X .* (Inter_Y .* Inter_X))/(Hist_bin^2.)
    end
  end
  return X_ray
end
Pos = rand(Float64,(3,10^6))
M  = rand(Float64,(10^6)).+1
ρ  = rand(Float64,(10^6)).+1
T =  rand(Float64,(10^6)).+1
X_dist= @time X_ray_Dist(Pos,M,ρ,T)

#4

Please post the input arguments you use to the functions so that we can run it by just copy-pasting the code into the REPL.


#5

You want to get that number of allocation down :wink:

Use the profiler and the memory allocation tracker to figure out where they are coming from


#6

Sorry, I just edit the code so it can be tested right away


#7

Your code is very matlaby, you’re likely to get a very large speedup by devectorizing the code (ie replace your array operations on Inter_X/Y by a loop over the size). I shave 20% just by removing “*One”, so there’s a lot of room there :slight_smile:


#8

I’m not sure devectorizing is really needed too much here, but the vectorizations should be inplace. Make use of @. so that way the full statements are inplace, like @. x = y*z + a*b. If you restructure it like this it’ll be easy to read yet you’ll get the allocations down like @vchuravy suggested. You may need to pre-allocate a few caches.


#9

OK, here’s what I have, with devectorization, a “continue”, @inbounds, and moving some floating point divisions around (each of those had a real impact on performance). On a size 1e4 this is about 4 times as fast as the original version, I’m actually surprised it isn’t more…

const Hist_bin = 0.01
const Hist_x = collect(0:Hist_bin:1)
const Hist_y = collect(0:Hist_bin:1)
function X_ray_Dist(Pos::Array{Float64,2},M::Array{Float64,1},ρ::Array{Float64,1},T::Array{Float64,1}) ::Array{Float64,2}
    Num_Dens = ρ./1.5
    Width = ((M./ρ).^(1./3.))
    L_X = M.*ρ#Lets simply say, X_ray_Luminosity(M,ρ,T)
    X_ray = zeros(length(Hist_x),length(Hist_y))
    One = ones(length(ρ))
    for i = 1:length(Hist_x)
        for j = 1:length(Hist_y)
            @inbounds for k = 1:size(Pos,2)
                Inter_X = (min(Pos[1,k]+Width[k]/2,(Hist_x[i]+Hist_bin/2))-max(Pos[1,k]-Width[k]/2,(Hist_x[i]-Hist_bin/2)))
                Inter_Y = (min(Pos[2,k]+Width[k]/2,(Hist_y[j]+Hist_bin/2))-max(Pos[2,k]-Width[k]/2,(Hist_y[j]-Hist_bin/2)))
                if Inter_X < 0 || Inter_Y < 0
                    continue
                end
                X_ray[i,j] += (L_X[k] * (Inter_Y * Inter_X))/(Hist_bin^2*Width[k]*Width[k])
            end
        end
    end
    return X_ray
end
n = 10^4
srand(1)
Pos = rand(Float64,(3,n))
M  = rand(Float64,(n)).+1
ρ  = rand(Float64,(n)).+1
T =  rand(Float64,(n)).+1
X_dist= @time X_ray_Dist(Pos,M,ρ,T)

#10

Here is a tweaked version. I intentionally didn’t make any dramatic changes (like devectorizing) to the code because I wanted to keep the changes to a minimum:

function X_ray_Dist(Pos, M, ρ, T)
    Width = ((M./ρ).^(1./3.))
    L_X = M .* ρ
    X_ray = zeros(Float64, length(Hist_x), length(Hist_y))
    Inter_X = zeros(Float64, size(Pos, 2))
    Inter_Y = zeros(Float64, size(Pos, 2))
    x_coords = Pos[1, :]
    y_coords = Pos[2, :]
    for i = 1:length(Hist_x)
        @. Inter_X = max(0.0, (min(x_coords + Width / 2., (Hist_x[i] + Hist_bin / 2.)) - max(x_coords - Width / 2., (Hist_x[i] - Hist_bin/2.))) / Width)
        for j = 1:length(Hist_y)
            @. Inter_Y = max(0.0, (min(y_coords + Width / 2., (Hist_y[j] + Hist_bin / 2.)) - max(y_coords - Width / 2., (Hist_y[j] - Hist_bin/2.))) / Width)
            X_ray[i, j] = mapreduce(i -> L_X[i] * Inter_Y[i] * Inter_X[i], +, 1:length(Inter_X)) / (Hist_bin^2.)            
        end
    end
    return X_ray
end

Comparison (with N = 10^4):

Main> @time X_dist = X_ray_Dist(Pos, M, ρ, T);
  1.179841 seconds (10.22 k allocations: 868.016 KiB)

Main> @time X_dist_old = X_ray_Dist_old(Pos, M, ρ, T);
  6.448615 seconds (693.68 k allocations: 19.130 GiB, 21.02% gc time)

Main> println(norm(X_dist - X_dist_old) / norm(X_dist_old))
1.3156213596542271e-16

#11

Awesome!! All of the solutions worked fine. And I just tested this change to the original code with parallel computing and it is much faster than before! (And I think I need to learn how to use profiler…I can’t understand the printed result right now)

Thanks for the help!


#12

Check out ProfileView.jl for a visual (and, I think, easier to understand) profile output.