Improve performance of this index creation and accessing code

How can I improve the performance of the clp(j) function given below? Please suggest a faster version of this code.
I am converting spherical coordinates to Cartesian and placing color values at the exact locations in the data. Otherwise I get a cuboid on plotting, when I should get a distorted shape.

nx=23; ny=5; nz=17;
X=rand(nx*ny*nz); Y=rand(nx*ny*nz); Z=rand(nx*ny*nz); C=rand(nx*ny*nz);
xmin = -0.00010490733f0
ymin = -176.07669f0
zmin = -1200.0f0
sx = 0.97817904f0
sy = 0.27828765f0
sz = 0.24079268f0
field = fill(NaN32, nx,ny,nz);

function clp(j)
	xi = Int32(round((X[j] - xmin) * sx)) + 1   #slow line
	yi = Int32(round((Y[j] - ymin) * sy)) + 1   #slow line
	zi = Int32(round((Z[j] - zmin) * sz)) + 1   #slow line
	ix = max(1, min(nx, xi))
	iy = max(1, min(ny, yi))
	iz = max(1, min(nz, zi))
	field[ix, iy, iz] = C[j]
end

for j in eachindex(X)
	clp(j)
end

Another alternative form of clp(j) is:

for (xi, yi, zi, ci) in zip(X, Y, Z, C)
    ix = clamp(Int(round((xi-xmin)*sx)) + 1, 1, nx)
    iy = clamp(Int(round((yi-ymin)*sy)) + 1, 1, ny)
    iz = clamp(Int(round((zi-zmin)*sz)) + 1, 1, nz)
    field[ix, iy, iz] = ci
end

Accessing a lot of untyped globals would stop most compiler optimizations.
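
For example (a minimal sketch): either declare the globals const so their types are fixed,

const xmin = -0.00010490733f0   # likewise for ymin, zmin, sx, sy, sz, nx, ny, nz

or, usually better, pass everything the function reads as arguments, as shown below in this thread.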

5 Likes

I don’t see any improvement even after assigning types and passing them as arguments to the function clp(j,X,Y,Z,C,sx,sy,sz,nx,ny,nz,field). I get the same Profile result in both cases, with and without type annotations.

     8         8 …man/Videos/Modular.jl    ? clp(j::Int64)
     6         6 …man/Videos/Modular.jl   51 clp(j::Int64)
   670       648 …man/Videos/Modular.jl   52 clp(j::Int64)
   601       588 …man/Videos/Modular.jl   53 clp(j::Int64)
   566       550 …man/Videos/Modular.jl   54 clp(j::Int64)
    47        45 …man/Videos/Modular.jl   55 clp(j::Int64)
    63        59 …man/Videos/Modular.jl   56 clp(j::Int64)
    81        73 …man/Videos/Modular.jl   57 clp(j::Int64)
   151       122 …man/Videos/Modular.jl   58 clp(j::Int64)
nx, ny, nz = 23, 5, 17
N = nx * ny * nz
X = rand(Float32, N)
Y = rand(Float32, N)
Z = rand(Float32, N)
C = rand(Float32, N)

xmin = -0.00010490733f0
ymin = -176.07669f0
zmin = -1200.0f0
sx = 0.97817904f0
sy = 0.27828765f0
sz = 0.24079268f0

field = fill(NaN32, nx, ny, nz)

function clp!(field::Array{Float32,3}, X, Y, Z, C, j, xmin, ymin, zmin, sx, sy, sz, nx, ny, nz)
    xi = Int32(round((X[j] - xmin) * sx)) + 1
    yi = Int32(round((Y[j] - ymin) * sy)) + 1
    zi = Int32(round((Z[j] - zmin) * sz)) + 1
    
    ix = clamp(xi, 1, nx)
    iy = clamp(yi, 1, ny)
    iz = clamp(zi, 1, nz)
    
    field[ix, iy, iz] = C[j]
    return (ix, iy, iz)
end

results = Vector{NTuple{3, Int32}}(undef, N)
@time for j in 1:N
    results[j] = clp!(field, X, Y, Z, C, j, xmin, ymin, zmin, sx, sy, sz, nx, ny, nz)
end
julia> @time for j in 1:N
           results[j] = clp!(field, X, Y, Z, C, j, xmin, ymin, zmin, sx, sy, sz, nx, ny, nz)
       end
  0.006635 seconds (19.37 k allocations: 883.781 KiB, 96.97% compilation time)
julia> @time for j in 1:N
           results[j] = clp!(field, X, Y, Z, C, j, xmin, ymin, zmin, sx, sy, sz, nx, ny, nz)
       end
  0.000161 seconds (6.80 k allocations: 167.344 KiB)
1 Like

You forgot xmin, ymin, and zmin. I get an 80x speedup if I pass all the data as explicit arguments and interpolate globals:

julia> @btime clp(1);
  372.137 ns (13 allocations: 208 bytes)

julia> @btime clp(1,$X,$Y,$Z,$C,$xmin,$ymin,$zmin,$sx,$sy,$sz,$nx,$ny,$nz,$field);
  4.625 ns (0 allocations: 0 bytes)

(No explicit type declarations are required! Literally my only changes were to append X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field to the argument list, and to use BenchmarkTools.jl for benchmarking while interpolating globals with $.)
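
In other words, a sketch of the modified function, with the body left unchanged:

function clp(j, X, Y, Z, C, xmin, ymin, zmin, sx, sy, sz, nx, ny, nz, field)
	xi = Int32(round((X[j] - xmin) * sx)) + 1
	yi = Int32(round((Y[j] - ymin) * sy)) + 1
	zi = Int32(round((Z[j] - zmin) * sz)) + 1
	ix = max(1, min(nx, xi))
	iy = max(1, min(ny, yi))
	iz = max(1, min(nz, zi))
	field[ix, iy, iz] = C[j]
end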

Eliminating the global-variable penalty is the first and most important Julia performance tip. You really have to understand and internalize this.

8 Likes

Thanks, yes, it improved performance. I actually want even better performance.

    77        55 …man/Videos/Modular.jl   19 (::var"#3#4")(::Int64)
    20        10 …man/Videos/Modular.jl   48 (::var"#5#6")(::Int64)
     6         6 …man/Videos/Modular.jl   51 clp(j::Int64, X::Vector{Float32},…
    18         0 …man/Videos/Modular.jl   52 clp(j::Int64, X::Vector{Float32},…
     6         0 …man/Videos/Modular.jl   53 clp(j::Int64, X::Vector{Float32},…
     5         0 …man/Videos/Modular.jl   54 clp(j::Int64, X::Vector{Float32},…
     1         0 …man/Videos/Modular.jl   55 clp(j::Int64, X::Vector{Float32},…
     1         0 …man/Videos/Modular.jl   56 clp(j::Int64, X::Vector{Float32},…
     1         0 …man/Videos/Modular.jl   57 clp(j::Int64, X::Vector{Float32},…
     9         0 …man/Videos/Modular.jl   58 clp(j::Int64, X::Vector{Float32},…

The Int32 conversion is wasted here, since Int32(1) + 1 is an Int64, due to type promotion. The min/max or clamp calls will also promote to Int64. This is because literal integers are 64-bit on 64-bit platforms.
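
You can check this in the REPL:

julia> typeof(Int32(1) + 1)
Int64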

Also, there isn’t any reason you need 32-bit integers for indexing anyway, just use Ints.

It’s preferable to just write

xi = round(Int, (X[j] - xmin) * sx) + 1

or, if you are ok with trunc instead of round, that is a bit faster.
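
Concretely, the whole kernel might become (a sketch; substitute trunc for round if the slightly different binning is acceptable):

function clp!(field, X, Y, Z, C, j, xmin, ymin, zmin, sx, sy, sz, nx, ny, nz)
    ix = clamp(round(Int, (X[j] - xmin) * sx) + 1, 1, nx)
    iy = clamp(round(Int, (Y[j] - ymin) * sy) + 1, 1, ny)
    iz = clamp(round(Int, (Z[j] - zmin) * sz) + 1, 1, nz)
    field[ix, iy, iz] = C[j]
end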

4 Likes

When I use @threads to parallelize the for loop, I face a data race on the field variable. To tackle it I used @lock, but that gave little improvement in performance: it takes ~5000 seconds on 14 threads, while on a single thread it takes ~6000 seconds. What else should I do?

1 Like

Please post the current code again so that we can run it.

1 Like
using Base.Threads
nx=23; ny=5; nz=17;
X=rand(nx*ny*nz); Y=rand(nx*ny*nz); Z=rand(nx*ny*nz); C=rand(nx*ny*nz);
xmin = -0.00010490733f0
ymin = -176.07669f0
zmin = -1200.0f0
sx = 0.97817904f0
sy = 0.27828765f0
sz = 0.24079268f0
field = fill(NaN32, nx,ny,nz);
my_lock = ReentrantLock();
function clp(j,X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
	xi = Int32(round((X[j] - xmin) * sx)) + 1   #slow line
	yi = Int32(round((Y[j] - ymin) * sy)) + 1   #slow line
	zi = Int32(round((Z[j] - zmin) * sz)) + 1   #slow line
	ix = max(1, min(nx, xi))
	iy = max(1, min(ny, yi))
	iz = max(1, min(nz, zi))
	@lock my_lock field[ix, iy, iz] = C[j]
end

@time @threads for j in eachindex(X)
	clp(j,X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
end
@time @threads for j in eachindex(X)
	clp(j,X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
end
@time for j in eachindex(X)
	clp(j,X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
end
@time for j in eachindex(X)
	clp(j,X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
end

My data is available at this link. The actual complete code is:

Real full code
using HDF5, GLMakie, Interpolations, Statistics, Base.Threads

function modl(h5file)
	h5 = h5open(h5file, "r")
	B = h5["B"]
	Br, Bθ, Bϕ = @views B[:,:,:,:,1], B[:,:,:,:,2], B[:,:,:,:,3]
	Bp = @. hypot(Br, Bθ) / Bϕ
	x_all, y_all, z_all = h5["x1v"], h5["x2v"], h5["x3v"]
	xe_all, ye_all, ze_all = h5["x1f"], h5["x2f"], h5["x3f"]
	MeshBlockSize = read((HDF5.attributes(h5))["MeshBlockSize"])
	blocks::Int32 = read(HDF5.attributes(h5),"NumMeshBlocks")
	fig = Figure()
	ax = Axis3(fig[1,1], title="Faster", aspect=:data)
	xmins, xmaxs, ymins, ymaxs, zmins, zmaxs = ntuple(_ -> zeros(Float32, blocks* prod(MeshBlockSize.+1)), 6)

	global icx=1
	for b in 1:blocks
		r, θ, ϕ = xe_all[:, b], ye_all[:, b], ze_all[:, b]
		for rr in r, th in θ, ph in ϕ
			XX = rr * sin(th) * cos(ph)
			YY = rr * sin(th) * sin(ph)
			ZZ = rr * cos(th)
			xmins[icx], xmaxs[icx] = extrema(XX)
			ymins[icx], ymaxs[icx] = extrema(YY)
			zmins[icx], zmaxs[icx] = extrema(ZZ)
			global icx += 1
		end
	end

	xmin=minimum(xmins); ymin=minimum(ymins); zmin=minimum(zmins)
	xmax=maximum(xmaxs); ymax=maximum(ymaxs); zmax=maximum(zmaxs)

	println("  x ∈ [$xmin, $xmax], y ∈ [$ymin, $ymax], z ∈ [$zmin, $zmax]")
	nx,ny,nz = 10(MeshBlockSize.+1)
	my_lock = ReentrantLock();
	field = fill(NaN32, nx,ny,nz)

	sx = (nx-1)/(xmax-xmin); sy = (ny-1)/(ymax-ymin); sz = (nz-1)/(zmax-zmin)
	X, Y, Z, C = ntuple(_ -> zeros(Float32, 1000 * prod(MeshBlockSize .+ 1)), 4)

	function clp(X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
		xi = trunc(Int, (X-xmin) * sx) + 1
		yi = trunc(Int, (Y-ymin) * sy) + 1
		zi = trunc(Int, (Z-zmin) * sz) + 1
		ix = clamp(xi, 1, nx)
		iy = clamp(yi, 1, ny)
		iz = clamp(zi, 1, nz)
		@lock my_lock field[ix, iy, iz] = C
	end
	
	function cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,sinθ,cosθ,cosϕ,sinϕ)
		rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
		st = sinθ[jj]
		ct = cosθ[jj]
		cp = cosϕ[kk]
		sp = sinϕ[kk]

		X[iex] = rr * st * cp
		Y[iex] = rr * st * sp
		Z[iex] = rr * ct
		C[iex] = ext_itp(rr, th, ph)
	end

	@threads for b in 1:blocks
		println("Block: $b")
		r, θ, ϕ = @views x_all[:, b], y_all[:, b], z_all[:, b]
		re, θe, ϕe = @views xe_all[:, b], ye_all[:, b], ze_all[:, b]
		Bp_block = @views Bp[:, :, :, b]
		ext_itp = extrapolate(interpolate((r, θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line())

		r_eval = LinRange(re[1], re[end], 10(MeshBlockSize[1]+1))
		θ_eval = LinRange(θe[1], θe[end], 10(MeshBlockSize[2]+1))
		ϕ_eval = LinRange(ϕe[1], ϕe[end], 10(MeshBlockSize[3]+1))
		nr, nθ, nϕ = length(r_eval), length(θ_eval), length(ϕ_eval)
		sinθ = @. sin(θ_eval)
		cosθ = @. cos(θ_eval)
		sinϕ = @. sin(ϕ_eval)
		cosϕ = @. cos(ϕ_eval)
		iex::Int32 = 1
		@inbounds for ii in 1:nr, jj in 1:nθ, kk in 1:nϕ
			cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,sinθ,cosθ,cosϕ,sinϕ)
			clp(X[iex],Y[iex],Z[iex],C[iex],xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
			iex += 1
		end
	end
	finite_vals = filter(!isnan, vec(field))
    q_low, q_high = quantile(finite_vals, (0.05, 0.95))
	volume!(ax, extrema(X), extrema(Y), extrema(Z), field, transparency=true; colorrange=(q_low, q_high))
	display(fig)
end
@time modl("sane00.prim.01800.athdf")

Is this faster?

nx=Int32(23); ny=Int32(5); nz=Int32(17);
...
function clp(j)
	xi = round(Int32, (X[j] - xmin) * sx) + Int32(1) 
        ...
	ix = max(Int32(1), min(nx, xi))
        ...
	@lock my_lock field[ix, iy, iz] = C[j]
end

You didn’t incorporate any changes that were proposed earlier in this thread? I thought @stevengj established that simply passing everything as arguments into the function is 2 orders of magnitude faster.
Also the conversions to Int32 are still superfluous.

Edit: Ah, your full code actually has everything in a function. If it always has, then this thread has been a waste of everyone’s time, because the code you posted did not reflect your actual code, and thus the suggestions were not effective.

I have incorporated them in my real code. Sorry for pasting the MWE without the changes; I have edited it now.

I think I recognize this code from another thread. There I told you that reading data from a file is inherently type-unstable (because Julia cannot know at compile time what it will find in the file). Thus you should split this large function modl into smaller functions, because each of them will act as a function barrier, i.e. a boundary where Julia can infer types anew.

So try this: split this single huge function modl into smaller pieces. One function that just loads the data and returns it. Another function that receives the loaded data as arguments (!), then does the computations and returns the results. These results you feed into a third function responsible for plotting.
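
A minimal skeleton of that structure (the placeholder computation and the function names are only illustrative):

using HDF5

function load_inputs(h5file)              # I/O only; type instability is unavoidable here
    h5open(h5file, "r") do h5
        read(h5["B"]), read(h5["x1v"])    # return plain arrays
    end
end

compute(B, x1v) = sum(B) .* x1v           # receives data as arguments: the function barrier

B, x1v = load_inputs("sane00.prim.01800.athdf")
result = compute(B, x1v)                  # hand `result` to a third, plotting-only function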

Try this. If you are still unhappy with the performance, then we can profile the function that does the computations on its own and try to speed it up.

6 Likes

This is the code after splitting it into functions. I don’t see much improvement in performance even after parallelizing it, compared to the single-threaded code. I want to get the best performance.

Summary
using HDF5, GLMakie, Interpolations, Statistics, Profile, Base.Threads, BenchmarkTools
@time begin
function readhdf(h5file)
	h5 = h5open(h5file, "r")
	B = h5["B"]
	Br, Bθ, Bϕ = @views B[:,:,:,:,1], B[:,:,:,:,2], B[:,:,:,:,3]
	Bp = @. hypot(Br, Bθ) / Bϕ
	x_all, y_all, z_all = h5["x1v"], h5["x2v"], h5["x3v"]
	xe_all, ye_all, ze_all = h5["x1f"], h5["x2f"], h5["x3f"]
	MeshBlockSize = read((HDF5.attributes(h5))["MeshBlockSize"])
	blocks::Int32 = read(HDF5.attributes(h5),"NumMeshBlocks") 
	return Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks
end

Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks = readhdf("sane00.prim.01800.athdf")

function loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks)
	xmins, xmaxs, ymins, ymaxs, zmins, zmaxs = ntuple(_ -> zeros(Float32, blocks* prod(MeshBlockSize.+1)), 6)
	global icx=1
	for b in 700:blocks
		r, θ, ϕ = xe_all[:, b], ye_all[:, b], ze_all[:, b]
		for rr in r, th in θ, ph in ϕ
			XX = rr * sin(th) * cos(ph)
			YY = rr * sin(th) * sin(ph)
			ZZ = rr * cos(th)
			xmins[icx], xmaxs[icx] = extrema(XX)
			ymins[icx], ymaxs[icx] = extrema(YY)
			zmins[icx], zmaxs[icx] = extrema(ZZ)
			global icx += 1
		end
	end

	xmin=minimum(xmins); ymin=minimum(ymins); zmin=minimum(zmins)
	xmax=maximum(xmaxs); ymax=maximum(ymaxs); zmax=maximum(zmaxs)

	# println(" Bounds:")
	println("  x ∈ [$xmin, $xmax], y ∈ [$ymin, $ymax], z ∈ [$zmin, $zmax]")
	nx,ny,nz = 10(MeshBlockSize.+1)
    my_lock = ReentrantLock();
	field = fill(NaN32, nx,ny,nz)
	println("Allocated $(nx)×$(ny)×$(nz) grid.")

	sx = (nx-1)/(xmax-xmin); sy = (ny-1)/(ymax-ymin); sz = (nz-1)/(zmax-zmin)
	X, Y, Z, C = ntuple(_ -> zeros(Float32, 1000 * prod(MeshBlockSize .+ 1)), 4)

	function clp(X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
		xi = trunc(Int, (X-xmin) * sx) + 1
		yi = trunc(Int, (Y-ymin) * sy) + 1
		zi = trunc(Int, (Z-zmin) * sz) + 1
		ix = clamp(xi, 1, nx)
		iy = clamp(yi, 1, ny)
		iz = clamp(zi, 1, nz)
		@lock my_lock field[ix, iy, iz] = C
	end
	
	function cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,sinθ,cosθ,cosϕ,sinϕ)
		rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
		st = sinθ[jj]
		ct = cosθ[jj]
		cp = cosϕ[kk]
		sp = sinϕ[kk]

		X[iex] = rr * st * cp
		Y[iex] = rr * st * sp
		Z[iex] = rr * ct
		C[iex] = ext_itp(rr, th, ph)
	end

	@threads for b in 700:blocks
		println("Block: $b")
		r, θ, ϕ = @views x_all[:, b], y_all[:, b], z_all[:, b]
		re, θe, ϕe = @views xe_all[:, b], ye_all[:, b], ze_all[:, b]
		Bp_block = @views Bp[:, :, :, b]
		ext_itp = extrapolate(interpolate((r, θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line())

		r_eval = LinRange(re[1], re[end], 10(MeshBlockSize[1]+1))
		θ_eval = LinRange(θe[1], θe[end], 10(MeshBlockSize[2]+1))
		ϕ_eval = LinRange(ϕe[1], ϕe[end], 10(MeshBlockSize[3]+1))
		nr, nθ, nϕ = length(r_eval), length(θ_eval), length(ϕ_eval)
		sinθ = @. sin(θ_eval)
		cosθ = @. cos(θ_eval)
		sinϕ = @. sin(ϕ_eval)
		cosϕ = @. cos(ϕ_eval)
		iex::Int32 = 1
		@inbounds for ii in 1:nr, jj in 1:nθ, kk in 1:nϕ
			cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,sinθ,cosθ,cosϕ,sinϕ)
			clp(X[iex],Y[iex],Z[iex],C[iex],xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
			iex += 1
		end
	end
	return X,Y,Z,field
end

X,Y,Z,field = loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks)

function tdplot(X,Y,Z,field)
	finite_vals = filter(!isnan, vec(field))
    q_low, q_high = quantile(finite_vals, (0.05, 0.95))
    fig = Figure()
	ax = Axis3(fig[1,1], title="Faster", aspect=:data)
	volume!(ax, extrema(X), extrema(Y), extrema(Z), field, transparency=true; colorrange=(q_low, q_high))
	display(fig)
end

tdplot(X,Y,Z,field)
end

Data movement is often the cause of a lot of delay. It seems the X, Y, Z values, which are later quantized into the field indices, could be computed on the fly (all the cosines and sines too), and they should never be stored in any array (such as sinθ).
This should be a reasonably easy refactor, and I would expect a nice speedup.
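
For instance, the inner loop could look like this (a sketch reusing the names from your code; X, Y, Z, C and the sin/cos tables are never stored, and the extrema needed for plotting are already available as xmin etc.):

for ii in 1:nr, jj in 1:nθ, kk in 1:nϕ
    rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
    st, ct = sincos(th)                 # sine and cosine in one call
    sp, cp = sincos(ph)
    x = rr * st * cp                    # Cartesian coordinates, computed on the fly
    y = rr * st * sp
    z = rr * ct
    ix = clamp(trunc(Int, (x - xmin) * sx) + 1, 1, nx)
    iy = clamp(trunc(Int, (y - ymin) * sy) + 1, 1, ny)
    iz = clamp(trunc(Int, (z - zmin) * sz) + 1, 1, nz)
    field[ix, iy, iz] = ext_itp(rr, th, ph)
end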
Frankly, it might even be easier to read.

4 Likes

That is definitely a step in the right direction. But we still can’t really tell what needs optimizing, because the computations are all bundled up into one large function (with subfunctions). So please make multiple top-level functions out of this and then measure them separately to get a feeling for what actually is slowest. It only makes sense to optimize the slowest part of the process, because that’s where you can get the most speedup.
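
For example, once the pieces are separate top-level functions, you can time and profile them individually (a sketch using the names from your code; run the function once beforehand so compilation is not measured):

using Profile, BenchmarkTools
@btime loaddata($Bp, $x_all, $y_all, $z_all, $xe_all, $ye_all, $ze_all, $MeshBlockSize, $blocks);
@profile loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks)
Profile.print()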

Also a few comments:

Why do you have a global here? This might cause inference issues, and I think you can just delete it. If I had to guess, you put global there because you refer to that variable inside the loop that follows, but Julia’s scoping rules let you access it just fine without global.
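
For example, this needs no global at all (count_points is just an illustration), because an inner loop can read and update a local of the enclosing function:

function count_points(blocks)
    icx = 1
    for b in 1:blocks
        icx += 1        # updates the function-local icx just fine
    end
    return icx
end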

Here you copy all of the arrays. Put an @views in front of the line.

This code confuses me. Aren’t the XX, YY, ZZ just plain numbers? Then calling extrema on them does not have much of an effect. I believe extrema(1.0) just gives you (1.0, 1.0), but I haven’t checked.

1 Like
julia> extrema(1)
(1, 1)
1 Like

Thanks. Yes, you are right, my extrema code was incorrect, although it was giving the same output.

corrected code
using HDF5, GLMakie, Interpolations, Statistics, Profile, Base.Threads, BenchmarkTools
@time begin
function readhdf(h5file::String)
	h5 = h5open(h5file, "r")
	B = h5["B"]
	@views Br, Bθ, Bϕ = B[:,:,:,:,1], B[:,:,:,:,2], B[:,:,:,:,3]
	Bp = @. hypot(Br, Bθ) / Bϕ
	x_all, y_all, z_all = h5["x1v"], h5["x2v"], h5["x3v"]
	xe_all, ye_all, ze_all = h5["x1f"], h5["x2f"], h5["x3f"]
	MeshBlockSize = read((HDF5.attributes(h5))["MeshBlockSize"])
	blocks::Int32 = read(HDF5.attributes(h5),"NumMeshBlocks") 
	return Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks
end

Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks = readhdf("sane00.prim.01800.athdf")

function bound(xe_all, ye_all, ze_all, MeshBlockSize, blocks)
	XX,YY,ZZ = ntuple(_ -> zeros(Float32, blocks* prod(MeshBlockSize.+1)), 3)
	icx=1
	for b in 1:blocks
		@views r, θ, ϕ = xe_all[:, b], ye_all[:, b], ze_all[:, b]
		for rr in r, th in θ, ph in ϕ
			XX[icx] = rr * sin(th) * cos(ph)
			YY[icx] = rr * sin(th) * sin(ph)
			ZZ[icx] = rr * cos(th)
			icx += 1
		end
	end
	xmin,xmax = extrema(XX); ymin,ymax = extrema(YY); zmin,zmax = extrema(ZZ)
	println("  x ∈ [$xmin, $xmax], y ∈ [$ymin, $ymax], z ∈ [$zmin, $zmax]")
	return xmin, xmax, ymin, ymax, zmin, zmax
end
	
xmin, xmax, ymin, ymax, zmin, zmax = bound(xe_all, ye_all, ze_all, MeshBlockSize, blocks)

function clp(X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field,my_lock)
	
	xi = trunc(Int, (X-xmin) * sx) + 1
	yi = trunc(Int, (Y-ymin) * sy) + 1
	zi = trunc(Int, (Z-zmin) * sz) + 1
	ix = clamp(xi, 1, nx)
	iy = clamp(yi, 1, ny)
	iz = clamp(zi, 1, nz)
	@lock my_lock field[ix, iy, iz] = C
end
	
function cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
	rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
	X[iex] = rr * sin(th) * cos(ph)
    Y[iex] = rr * sin(th) * sin(ph)
    Z[iex] = rr * cos(th)
	C[iex] = ext_itp(rr, th, ph)
end
	
function loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks)
	nx,ny,nz = 10(MeshBlockSize.+1)
	field = fill(NaN32, nx,ny,nz)
	println(" Allocated $(nx)×$(ny)×$(nz) grid.")
	
	sx = (nx-1)/(xmax-xmin); sy = (ny-1)/(ymax-ymin); sz = (nz-1)/(zmax-zmin)
	X, Y, Z, C = ntuple(_ -> zeros(Float32, 1000 * prod(MeshBlockSize.+1)), 4)


	@threads for b in 1:blocks
		println("Block: $b")
		@views r, θ, ϕ = x_all[:, b], y_all[:, b], z_all[:, b]
		@views re, θe, ϕe = xe_all[:, b], ye_all[:, b], ze_all[:, b]
		@views Bp_block = Bp[:, :, :, b]
		ext_itp = extrapolate(interpolate((r, θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line())

		r_eval = LinRange(re[1], re[end], 10(MeshBlockSize[1]+1))
		θ_eval = LinRange(θe[1], θe[end], 10(MeshBlockSize[2]+1))
		ϕ_eval = LinRange(ϕe[1], ϕe[end], 10(MeshBlockSize[3]+1))
		nr, nθ, nϕ = length(r_eval), length(θ_eval), length(ϕ_eval)
		iex::Int32 = 1
		my_lock = ReentrantLock();
		@inbounds for ii in 1:nr, jj in 1:nθ, kk in 1:nϕ
			cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
			clp(X[iex],Y[iex],Z[iex],C[iex],xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field,my_lock)
			iex += 1
		end
	end
	return X,Y,Z,field
end

X,Y,Z,field = loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks)

function tdplot(X,Y,Z,field)
	finite_vals = filter(!isnan, vec(field))
    q_low, q_high = quantile(finite_vals, (0.05, 0.95))
    fig = Figure()
	ax = Axis3(fig[1,1], title="Faster")#, aspect=:data)
	volume!(ax, extrema(X), extrema(Y), extrema(Z), field, transparency=true; colorrange=(q_low, q_high))
	display(fig)
end

tdplot(X,Y,Z,field)
end

Have I placed my_lock = ReentrantLock() at the correct position in this latest code? I am defining the lock inside the @threads for loop, as this was faster.

Why do you need a lock? What race scenario could be bad enough to require it?
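
And if colliding writes do turn out to be a real problem, a lock-free alternative is to give each task its own copy of field and merge the copies at the end. A sketch (like the locked version, it assumes that when two points fall into the same cell, either value is acceptable):

using Base.Threads

function fill_field(X, Y, Z, C, xmin, ymin, zmin, sx, sy, sz, nx, ny, nz)
    chunks = Iterators.partition(eachindex(X), cld(length(X), nthreads()))
    tasks = map(chunks) do chunk
        @spawn begin
            f = fill(NaN32, nx, ny, nz)    # task-local grid: no sharing, no lock
            for j in chunk
                ix = clamp(trunc(Int, (X[j] - xmin) * sx) + 1, 1, nx)
                iy = clamp(trunc(Int, (Y[j] - ymin) * sy) + 1, 1, ny)
                iz = clamp(trunc(Int, (Z[j] - zmin) * sz) + 1, 1, nz)
                f[ix, iy, iz] = C[j]
            end
            f
        end
    end
    fields = fetch.(tasks)
    field = popfirst!(fields)
    for g in fields, i in eachindex(field)
        isnan(field[i]) && (field[i] = g[i])    # keep any value that was written
    end
    return field
end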

1 Like