Improve performance of this index creation and accessing code

My earlier script was having data race problem. It used to get terminated. Now it is ok. Updated code without lock is :backhand_index_pointing_down:

updated code
using HDF5, GLMakie, Interpolations, Statistics, Profile, Base.Threads, BenchmarkTools
@profile begin
function readhdf(h5file::String)
	h5 = h5open(h5file, "r")
	B = h5["B"]
	@views Br, Bθ, Bϕ = B[:,:,:,:,1], B[:,:,:,:,2], B[:,:,:,:,3]
	Bp = @. hypot(Br, Bθ) / Bϕ
	x_all, y_all, z_all = h5["x1v"], h5["x2v"], h5["x3v"]
	xe_all, ye_all, ze_all = h5["x1f"], h5["x2f"], h5["x3f"]
	MeshBlockSize = read((HDF5.attributes(h5))["MeshBlockSize"])
	blocks::Int32 = read(HDF5.attributes(h5),"NumMeshBlocks") 
	return Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks
end

Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks = readhdf("sane00.prim.01800.athdf")

function bound(xe_all, ye_all, ze_all, MeshBlockSize, blocks)
	XX,YY,ZZ = ntuple(_ -> zeros(Float32, blocks* prod(MeshBlockSize.+1)), 3)
	icx=1
	for b in 1:blocks
		@views r, θ, ϕ = xe_all[:, b], ye_all[:, b], ze_all[:, b]
		for rr in r, th in θ, ph in ϕ,  
			XX[icx] = rr * sin(th) * cos(ph)
			YY[icx] = rr * sin(th) * sin(ph)
			ZZ[icx] = rr * cos(th)
			icx += 1
		end
	end
	xmin,xmax = extrema(XX); ymin,ymax = extrema(YY); zmin,zmax = extrema(ZZ)
	println("  x ∈ [$xmin, $xmax], y ∈ [$ymin, $ymax], z ∈ [$zmin, $zmax]")
	return xmin, xmax, ymin, ymax, zmin, zmax
end
	
xmin, xmax, ymin, ymax, zmin, zmax = bound(xe_all, ye_all, ze_all, MeshBlockSize, blocks)

function clp(X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)	
	xi = trunc(Int, (X-xmin) * sx) + 1
	yi = trunc(Int, (Y-ymin) * sy) + 1
	zi = trunc(Int, (Z-zmin) * sz) + 1
	ix = clamp(xi, 1, nx)
	iy = clamp(yi, 1, ny)
	iz = clamp(zi, 1, nz)
	field[ix, iy, iz] = C
end
	
function cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
	rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
	X[iex] = rr * sin(th) * cos(ph)
    Y[iex] = rr * sin(th) * sin(ph)
    Z[iex] = rr * cos(th)
	C[iex] = ext_itp(rr, th, ph)
end
	
function loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks)
	nx,ny,nz = 10(MeshBlockSize.+1)
	field = fill(NaN32, nx,ny,nz)
	println("🧩 Allocated $(nx)×$(ny)×$(nz) grid.")
	
	sx = (nx-1)/(xmax-xmin); sy = (ny-1)/(ymax-ymin); sz = (nz-1)/(zmax-zmin)
	X, Y, Z, C = ntuple(_ -> zeros(Float32, 1000 * prod(MeshBlockSize.+1)), 4)


	@threads for b in 1:blocks
		println("Block: $b")
		@views r, θ, ϕ = x_all[:, b], y_all[:, b], z_all[:, b]
		@views re, θe, ϕe = xe_all[:, b], ye_all[:, b], ze_all[:, b]
		@views Bp_block = Bp[:, :, :, b]
		ext_itp = extrapolate(interpolate((r, θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line())

		r_eval = LinRange(re[1], re[end], 10(MeshBlockSize[1]+1))
		θ_eval = LinRange(θe[1], θe[end], 10(MeshBlockSize[2]+1))
		ϕ_eval = LinRange(ϕe[1], ϕe[end], 10(MeshBlockSize[3]+1))
		nr, nθ, nϕ = length(r_eval), length(θ_eval), length(ϕ_eval)
		iex::Int32 = 1
		@inbounds for ii in 1:nr, jj in 1:nθ, kk in 1:nϕ
			cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
			clp(X[iex],Y[iex],Z[iex],C[iex],xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
			iex += 1
		end
	end
	return X,Y,Z,field
end

X,Y,Z,field = loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks)

function tdplot(xmin, xmax, ymin, ymax, zmin, zmax, field)
	finite_vals = filter(!isnan, vec(field))
    q_low, q_high = quantile(finite_vals, (0.05, 0.95))
    fig = Figure()
    ax = LScene(fig[1,1], show_axis=false)
	#ax = Axis3(fig[1,1], title="Faster", aspect=:data)
	volume!(ax, xmin..xmax, ymin..ymax, zmin..xmax, field, transparency=true; colorrange=(q_low, q_high))
	display(fig)
end

tdplot(xmin, xmax, ymin, ymax, zmin, zmax, field)
end
Profile.print(format=:flat)

Perhaps, if you make a new MWE which runs when copy-pasted (without HDF5, but a small rand data maker), it will re-ignite enthusiasm to optimize. Leave all the code, data sizes as in real-data, but generate with rand and with a seed. Just make it run without any external data files. Drop the plotting bit as well.

4 Likes

This function is very inefficient: you allocate a huge array only to get the extrema. Also since you take all combinations anyhow, you can extremize each axis by itself. Here is my suggestion which should take a fraction of the original runtime:

function bound(xe_all, ye_all, ze_all)
    xmin = Inf; xmax = -Inf
    ymin = Inf; ymax = -Inf
    zmin = Inf; zmax = -Inf
    for block in axes(xe_all, 2)
        @views r, theta, phi = xe_all[:, b], ye_all[:, b], ze_all[:, b]
        rvals = extrema(r)
        theta_sin_vals = extrema(sin, theta)
        theta_cos_vals = extrema(cos, theta)
        phi_sin_vals = extrema(sin, phi)
        phi_cos_vals = extrema(cos, phi)
        
        new_zmin, new_zmax = extrema(r*cos_theta for r in rvals for cos_theta in theta_cos_vals)
        zmax = max(zmax, new_zmax)
        zmin = min(zmin, new_zmin)
        # do the others yourself
    end
    return xmin, xmax, ymin, ymax, zmin, zmax
end

Note that the xmin, ymin, zmin parameters are globals again. In this inner loop that will cost a lot of performance.

You never use C outside of this function and also do not return it. So you can change your code slightly to avoid allocating it at all.
Similar question for X,Y,Z you return these but don’t seem to use them. Do you really need these? If not then we can change your code around easily to avoid allocating them.

I would suggest that you put the code of cart and clp (but only those 2!) back where they are called and then look for simplifications. These functions appear to be poorly chosen abstractions and actually just obfuscate the real logic and make it harder to optimize. This is something you’ll learn with time: when is putting which code into a function a sensible abstraction that makes the code clearer and when not.

1 Like

Please ignore the performance of this round() function. I can also calculate these extreme points other ways as their extreme points only are provided separately in my data.

I need extreme points of grid i.e. xmin, xmax, ymin, ymax, zmin, zmax for volume plot. If it possible to get these variables from loaddata() variables X,Y,Z; then i can remove bound() function. If not then i don’t need X, Y and Z variables in loaddata() function.

Do you suggest this for code readability by user or for performance?

The main goal here is readability. But not in the sense that is necessarily easier to read. However, it will help when trying to optimize the code because it get easier to shuffle around pieces of code.

Ok then the use the xmin, etc. from the bound function I provided (which should be orders of magnitude faster than the original). And then get rid of the X,Y,Z arrays in loaddata.

Here I just did that and also made a few other minor improvements :

using HDF5, GLMakie, Interpolations, Statistics, Profile, Base.Threads, BenchmarkTools

function readhdf(h5file::String)
	h5 = h5open(h5file, "r")
	B = h5["B"]
	@views Br, Bθ, Bϕ = B[:,:,:,:,1], B[:,:,:,:,2], B[:,:,:,:,3]
	Bp = @. hypot(Br, Bθ) / Bϕ
	x_all, y_all, z_all = h5["x1v"], h5["x2v"], h5["x3v"]
	xe_all, ye_all, ze_all = h5["x1f"], h5["x2f"], h5["x3f"]
	MeshBlockSize = read((HDF5.attributes(h5))["MeshBlockSize"])
	blocks::Int32 = read(HDF5.attributes(h5),"NumMeshBlocks") 
	return Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks
end

Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks = readhdf("sane00.prim.01800.athdf")


"""
Find the bounding box in cartesian coordinates from sample points in polar coordinates.
"""
function compute_bounds(xe_all, ye_all, ze_all) 
	# renamed this from `bounds`
    xmin = Inf; xmax = -Inf
    ymin = Inf; ymax = -Inf
    zmin = Inf; zmax = -Inf
    for block in axes(xe_all, 2)
        @views r, theta, phi = xe_all[:, b], ye_all[:, b], ze_all[:, b]
        rvals = extrema(r)
        sinθs = extrema(sin, theta)
        cosθs = extrema(cos, theta)
        sinϕs = extrema(sin, phi)
        cosϕs = extrema(cos, phi)
        
		new_xmin, new_xmax = extrema(r*sinθ*cosϕ for r in rvals for sinθ in sinθs for cosϕ in cosϕs)
        xmax = max(xmax, new_xmax)
        xmin = min(xmin, new_xmin)
		
		new_ymin, new_ymax = extrema(r*sinθ*sinϕ for r in rvals for sinθ in sinθs for sinϕ in sinϕs)
        ymax = max(ymax, new_ymax)
        ymin = min(ymin, new_ymin)

        new_zmin, new_zmax = extrema(r*cosθ for r in rvals for cosθ in cosθs)
        zmax = max(zmax, new_zmax)
        zmin = min(zmin, new_zmin)
    end
    return (; xmin, xmax, ymin, ymax, zmin, zmax) # named tuple
end

	
bounds = compute_bounds(xe_all, ye_all, ze_all)

function loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, bounds)
	nx, ny, nz = 10 .* (MeshBlockSize .+ 1)
	field = fill(NaN32, nx, ny, nz)
	
	sx = (nx-1)/(bounds.xmax-bounds.xmin) 
	sy = (ny-1)/(bounds.ymax-bounds.ymin)
	sz = (nz-1)/(bounds.zmax-bounds.zmin)

	@threads for b in axes(x_all, 2)
		println("Block: $b")
		@views r, θ, ϕ = x_all[:, b], y_all[:, b], z_all[:, b]
		@views Bp_block = Bp[:, :, :, b]
		ext_itp = extrapolate(interpolate((r, θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line())

		r_eval = LinRange(xe_all[1, b], xe_all[end, b], nx)
		θ_eval = LinRange(ye_all[1, b], ye_all[end, b], ny)
		ϕ_eval = LinRange(ze_all[1, b], ze_all[end, b], nz)
		@inbounds for rr in r_eval, th in θ_eval, ph in ϕ_eval
			X = rr * sin(th) * cos(ph)
			Y = rr * sin(th) * sin(ph)
			Z = rr * cos(th)

			xi = trunc(Int, (X-xmin) * sx) + 1
			yi = trunc(Int, (Y-ymin) * sy) + 1
			zi = trunc(Int, (Z-zmin) * sz) + 1
			ix = clamp(xi, 1, nx)
			iy = clamp(yi, 1, ny)
			iz = clamp(zi, 1, nz)

			field[ix, iy, iz] = ext_itp(rr, th, ph)
		end
	end
	return field
end

field = loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, bounds)

function tdplot(bounds, field)
	(; xmin, xmax, ymin, ymax, zmin, zmax) = bounds
	finite_vals = filter(!isnan, vec(field))
    q_low, q_high = quantile(finite_vals, (0.05, 0.95))
    fig = Figure()
    ax = LScene(fig[1,1], show_axis=false)
	#ax = Axis3(fig[1,1], title="Faster", aspect=:data)
	# there was typo here where there was a xmax instead of zmax
	volume!(ax, xmin..xmax, ymin..ymax, zmin..zmax, field, transparency=true; colorrange=(q_low, q_high))
	display(fig)
end

tdplot(bounds, field)

Now with this refactoring it became apparent that all you do is interpolating some data you have in polar coordinates to cartesian coordinates. Note: I did not run any of this, so perhaps I made mistakes or misunderstood your code.

A couple of additional comments:

  • There are still data races should the different blocks overlap in coordinate space. You might not care about these.
  • You interpolate linearly in polar coordinate space, which is probably conceptually wrong.
  • You use equally spaced sample points of polar coordinates when choosing points in cartesian space. This is probably quite suboptimal because density of sample points will vary widely in cartesian space e.g. for small radii all the points are very close together. The same goes for values along the z-axis, where the azimuthal direction is nearly degenerate.
  • I don’t see why you have the xe_all, ye_all and ze_all arrays. You only use them to get the sampling ranges, but you probably could use the values from x_all etc instead? What is the reason for the xe_all, etc. arrays?
4 Likes

x_all, y_all and z_all are cell-body center positions while xe_all, ye_all and ze_all are interface positions and they create full sphere. If i use x_all, y_all and z_all then i will get sphere with crack/gap.

They should not overlap but can be adjacent.

What should i do for it? Your code is slower than mine now.

Does it produce a similar result at least? Did you check which part is slower or what causes the slownes? That would seriously surprise me since I eliminated a bunch of very large, unnecessary allocations and the use of several untyped globals in your innermost loop. But maybe I missed something - as I said I did not run this.
The only thing I fundamentally changed was the bound function, so you could also try changing just that back. The rest of the rewriting should have been uncontroversial.

No, It gives different plot.
Actually variable r has geometrical spacing. Here 1.6 is minimum, 1200 is maximum and 1.0963 is geometric ratio read((HDF5.attributes(h5))["RootGridX1"]) = [1.6, 1200.0, 1.0963050292915717]. θ and ϕ have constant spacing.

@abraemer Please can you improve the performance of cart() function as it is bottleneck? I tried to make a MWE but it becomes as long as real code and also may be wrong so i ignore it.

function cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
	rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
	X[iex] = rr * sin(th) * cos(ph)
    Y[iex] = rr * sin(th) * sin(ph)
    Z[iex] = rr * cos(th)
	C[iex] = ext_itp(log(rr), th, ph)
end

Full code and data to read are

Full code
using HDF5, GLMakie, Interpolations, Statistics, Profile, Base.Threads, BenchmarkTools
@profile begin
function readhdf(h5file::String)
	h5 = h5open(h5file, "r")
	B = h5["B"]
	@views Br, Bθ, Bϕ = B[:,:,:,:,1], B[:,:,:,:,2], B[:,:,:,:,3]
	Bp = @. hypot(Br, Bθ) / Bϕ
	x_all, y_all, z_all = h5["x1v"], h5["x2v"], h5["x3v"]
	xe_all, ye_all, ze_all = h5["x1f"], h5["x2f"], h5["x3f"]
	MeshBlockSize::Vector{Int32} = read((HDF5.attributes(h5))["MeshBlockSize"])
	blocks::Int32 = read(HDF5.attributes(h5),"NumMeshBlocks") 
	#@show read((HDF5.attributes(h5))["RootGridX1"])
	return Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks
end

Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks = readhdf("sane00.prim.01800.athdf")

function bound(xe_all, ye_all, ze_all, MeshBlockSize, blocks)
	XX,YY,ZZ = ntuple(_ -> zeros(Float32, blocks* prod(MeshBlockSize.+1)), 3)
	icx=1
	for b in 1:blocks
		@views r::Vector{Float32}, θ::Vector{Float32}, ϕ::Vector{Float32} = xe_all[:, b], ye_all[:, b], ze_all[:, b]
		for rr in r, th in θ, ph in ϕ,  
			XX[icx] = rr * sin(th) * cos(ph)
			YY[icx] = rr * sin(th) * sin(ph)
			ZZ[icx] = rr * cos(th)
			icx += 1
		end
	end
	xmin,xmax = extrema(XX); ymin,ymax = extrema(YY); zmin,zmax = extrema(ZZ)
	println("  x ∈ [$xmin, $xmax], y ∈ [$ymin, $ymax], z ∈ [$zmin, $zmax]")
	return xmin, xmax, ymin, ymax, zmin, zmax
end
	
xmin, xmax, ymin, ymax, zmin, zmax = bound(xe_all, ye_all, ze_all, MeshBlockSize, blocks)
	

function clp(X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)	
	xi = trunc(Int, (X-xmin) * sx) + 1
	yi = trunc(Int, (Y-ymin) * sy) + 1
	zi = trunc(Int, (Z-zmin) * sz) + 1
	ix = clamp(xi, 1, nx)
	iy = clamp(yi, 1, ny)
	iz = clamp(zi, 1, nz)
	field[ix, iy, iz] = C
end
	
function cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
	rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
	X[iex] = rr * sin(th) * cos(ph)
    Y[iex] = rr * sin(th) * sin(ph)
    Z[iex] = rr * cos(th)
	C[iex] = ext_itp(log(rr), th, ph)
end
function loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks)
	nx,ny,nz = 10(MeshBlockSize.+1)
	field = fill(NaN32, nx,ny,nz)
	println("🧩 Allocated $(nx)×$(ny)×$(nz) grid.")
	
	sx::Float32 = (nx-1)/(xmax-xmin); sy::Float32 = (ny-1)/(ymax-ymin); sz::Float32 = (nz-1)/(zmax-zmin)
	#@show typeof(sx)
	X, Y, Z, C = ntuple(_ -> zeros(Float32, 1000 *prod(MeshBlockSize.+1)), 4)


	@threads for b in 1:blocks
		#println("Block: $b")
		@views r, θ, ϕ = x_all[:, b], y_all[:, b], z_all[:, b]
		@views re, θe, ϕe = xe_all[:, b], ye_all[:, b], ze_all[:, b]
		@views Bp_block = Bp[:, :, :, b]
		#@show size(Bp_block)
		ext_itp = extrapolate(interpolate((log.(r), θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line())

		r_eval = exp.(LinRange(log(re[1]), log(re[end]), 10(MeshBlockSize[1]+1)))
		θ_eval = LinRange(θe[1], θe[end], 10(MeshBlockSize[2]+1))
		ϕ_eval = LinRange(ϕe[1], ϕe[end], 10(MeshBlockSize[3]+1))
		nr, nθ, nϕ = length(r_eval), length(θ_eval), length(ϕ_eval)
		iex::Int32 = 1
		@inbounds for ii in 1:nr, jj in 1:nθ, kk in 1:nϕ
			cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
			clp(X[iex],Y[iex],Z[iex],C[iex],xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
			iex += 1
		end
	end
	return field
end

field = loaddata(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks)

function tdplot(xmin, xmax, ymin, ymax, zmin, zmax, field)
	finite_vals = filter(!isnan, vec(field))
    q_low, q_high = quantile(finite_vals, (0.05, 0.95))
    fig = Figure()
    ax::LScene = LScene(fig[1,1], show_axis=false)
	#ax = Axis3(fig[1,1], title="Faster", aspect=:data)
	volume!(ax, xmin..xmax, ymin..ymax, zmin..zmax, field, transparency=false; colorrange=(q_low, q_high))
	display(fig)
end

tdplot(xmin, xmax, ymin, ymax, zmin, zmax, field)
end
Profile.print(format=:flat)

No I can’t. This function by itself cannot be made magically faster.
You’ll have to find gains elsewhere or rethink your code structure to enable other optimizations. I tried the latter but apparently made some mistake somewhere so the results changed (you did check that this wasn’t causes just because I fixed typo in your plotting function leading to different axis limits, right?).

Also I don’t think you should use Profile.jl like this because IIRC it in cludes compiler operations. So the results will be very hard to understand if not meaningless.

2 Likes

Just reiterating - if the source you post can be copy pasted into the REPL and runs (with no HDF5 files necessary), then the chances of community speedup will go from 20% to 80%, which sounds like a big jump to me.
So:

  1. Get rid of tdplot
  2. Add a @benchmark to time what you wish accelerated.
  3. Remove HDF5, GLMakie dependencies.
  4. Add a random generator of data of similar size to that in h5.
  5. Post the code and hope for the best.

My 2¢ (not worth much more perhaps)

P.S. Maybe an LLM can take these instructions and magically do it.

5 Likes

Let me just briefly mention that you can avoid some duplicate calculations here with regards to the sin and the cos. I’m not too clear on the details of your problem, but for some random parameters, I get a roughly 2x speed-up in the part that calculates X, Y, Z. I’d expect the ext_itp(log(rr,th,ph)) part to be relatively costly, but it is a neat little improvement anyway.

using BenchmarkTools, Random

function cart!(iex, ii, jj, kk, r_eval, θ_eval, ϕ_eval, X, Y, Z)
    rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
    X[iex] = rr * sin(th) * cos(ph)
    Y[iex] = rr * sin(th) * sin(ph)
    Z[iex] = rr * cos(th)
end

function cart2!(iex, ii, jj, kk, r_eval, θ_eval, ϕ_eval, X, Y, Z)
    rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
    xy, z = rr .* sincos(th)
    y, x = xy .* sincos(ph)
    X[iex] = x
    Y[iex] = y
    Z[iex] = z
end

function run_loops(r_eval, θ_eval, ϕ_eval, X, Y, Z)
    iex = 1
    for ii in eachindex(r_eval), jj in eachindex(θ_eval), kk in eachindex(ϕ_eval)
        cart!(iex, ii, jj, kk, r_eval, θ_eval, ϕ_eval, X, Y, Z)
        iex += 1
    end
    return X, Y, Z
end

function run_loops2(r_eval, θ_eval, ϕ_eval, X, Y, Z)
    iex = 1
    for ii in eachindex(r_eval), jj in eachindex(θ_eval), kk in eachindex(ϕ_eval)
        cart2!(iex, ii, jj, kk, r_eval, θ_eval, ϕ_eval, X, Y, Z)
        iex += 1
    end
    return X, Y, Z
end

function test(NX, NY, NZ)
    r_eval = rand(NX)
    θ_eval = rand(NY)
    ϕ_eval = rand(NZ)
    X = Vector{Float64}(undef, NX * NY * NZ)
    Y = Vector{Float64}(undef, NX * NY * NZ)
    Z = Vector{Float64}(undef, NX * NY * NZ)

    run_loops(r_eval, θ_eval, ϕ_eval, X, Y, Z) # warmup
    display(@benchmark run_loops($r_eval, $θ_eval, $ϕ_eval, $X, $Y, $Z))
    run_loops2(r_eval, θ_eval, ϕ_eval, X, Y, Z) # warmup
    display(@benchmark run_loops2($r_eval, $θ_eval, $ϕ_eval, $X, $Y, $Z))
end

test(100, 100, 100)

gives

BenchmarkTools.Trial: 224 samples with 1 evaluation per sample.
 Range (min … max):  21.852 ms … 39.721 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     22.210 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   22.352 ms ±  1.219 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

                 ▂█▄▄  ▁                                       
  ▃▁▁▃▃▃▃▄▃▄▃▄▄▅█████▇██▆▅▄▄▄▅▅▁▅▄▄▃▃▃▃▁▃▁▃▃▁▃▃▃▁▃▁▃▃▁▁▁▁▁▁▃▃ ▃
  21.9 ms         Histogram: frequency by time          23 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.
BenchmarkTools.Trial: 408 samples with 1 evaluation per sample.
 Range (min … max):  12.019 ms …  29.857 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     12.203 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   12.282 ms ± 894.821 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

     ▃▃▁█▃▅▄▅▃▅▅▂▃ ▅▂▃ ▁ ▂                                      
  ▃▄▆█████████████████▇█▆█▇▆▆▄▄▆▃▄▃▃▄▅▃▃▄▃▄▃▃▃▃▁▁▃▁▃▃▁▁▁▁▁▁▁▁▃ ▄
  12 ms           Histogram: frequency by time         12.8 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.
2 Likes

One more thing: you use fill(NaN32, ....) and zeros(Float32, ...) at a few points. You may wish to replace these with undef values instead, because you never reference these values before writing to them: e.g. replace zeros(Float32, n) by Vector{Float32}(undef, n) etc.

1 Like

Thanks, Have you tried my full code on real data? See the actual plot full code gives on real data. I don’t understand why clp() produces weird samples at some particular angles?

I use NaN because i have to keep places associated to these values transparent.

No, I haven’t tried it on real data, I just noticed these possible optimizations and wanted to mention them to you. I see that I missed the part about not all entries of field being written to necessarily, so I agree that filling them with NaN32 is useful.

I’m happy to take a look if time permits, but then I would need you to provide some MWE that generates dummy data and doesn’t generate a plot (so code that doesn’t rely on HDF5 and GLMakie), as suggested by others in this thread.

1 Like

I hope this MWE can help you understand gaps that i gets in some parts.

using Interpolations
r = Float32[1.677044, 1.8385518, 2.0156136, 2.2097273, 2.4225352, 2.6558375, 2.911608, 3.1920104, 3.499417, 3.8364286, 4.205896, 4.6109447, 5.055002, 5.5418243, 6.0755296, 6.660634, 7.3020864, 8.005314];
θ = Float32[0.049087387, 0.14726216, 0.24543692, 0.3436117];
ϕ = Float32[0.19634955, 0.5890486, 0.9817477, 1.3744467, 1.7671459, 2.1598449, 2.552544, 2.9452431, 3.3379421, 3.7306414, 4.12334, 4.5160394, 4.9087386, 5.3014374, 5.6941366, 6.086836];
Bp_block = rand(18,4,16);

function clp(X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)	
	xi = trunc(Int, (X-xmin) * sx) + 1
	yi = trunc(Int, (Y-ymin) * sy) + 1
	zi = trunc(Int, (Z-zmin) * sz) + 1
	ix = clamp(xi, 1, nx)
	iy = clamp(yi, 1, ny)
	iz = clamp(zi, 1, nz)
	field[ix, iy, iz] = C
end
	
function cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
	rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
	xy, z = rr .* sincos(th)
    y, x = xy .* sincos(ph)
    X[iex] = x
    Y[iex] = y
    Z[iex] = z
	C[iex] = ext_itp(log(rr), th, ph)
end
ext_itp = extrapolate(interpolate((log.(r), θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line());
MeshBlockSize = Int32[18, 4, 16];
nx,ny,nz = 10(MeshBlockSize.+1)
xmin, xmax, ymin, ymax, zmin, zmax = -3.2042396f0, 3.2042396f0, -3.2042396f0, 3.2042396f0, 1.4782072f0, 8.373081f0;
sx::Float32 = (nx-1)/(xmax-xmin); sy::Float32 = (ny-1)/(ymax-ymin); sz::Float32 = (nz-1)/(zmax-zmin)
field = fill(NaN32, nx,ny,nz);
r_eval = exp.(LinRange(log(1.6), log(8.373081), 10(MeshBlockSize[1]+1)))
θ_eval = LinRange(0.0, 0.3926991, 10(MeshBlockSize[2]+1))
ϕ_eval = LinRange(0.0, 6.2831855, 10(MeshBlockSize[3]+1))
nr, nθ, nϕ = length(r_eval), length(θ_eval), length(ϕ_eval)
X, Y, Z, C= ntuple(_ -> Vector{Float32}(undef, 1000 *prod(MeshBlockSize.+1)), 4);
iex::Int32 = 1
@inbounds for ii in 1:nr, jj in 1:nθ, kk in 1:nϕ
	cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
	clp(X[iex],Y[iex],Z[iex],C[iex],xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
	iex += 1
end
count(isnan, field)
count(!isnan, field)

Thanks! I can’t really improve on the extrapolation part, but let me point out that you can 1) try to multi-thread the for-loop and 2) pre-compute the log(rr), which gives a modest speed-up on my device:

using Interpolations, Random, BenchmarkTools

function clp(X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)	
	xi = trunc(Int, (X-xmin) * sx) + 1
	yi = trunc(Int, (Y-ymin) * sy) + 1
	zi = trunc(Int, (Z-zmin) * sz) + 1
	ix = clamp(xi, 1, nx)
	iy = clamp(yi, 1, ny)
	iz = clamp(zi, 1, nz)
	field[ix, iy, iz] = C
end
	
function cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
	rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
	xy, z = rr .* sincos(th)
    y, x = xy .* sincos(ph)
    X[iex] = x
    Y[iex] = y
    Z[iex] = z
	C[iex] = ext_itp(log(rr), th, ph)
end
function cart2(iex,ii,jj,kk,r_eval,log_r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
	rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
	xy, z = rr .* sincos(th)
    y, x = xy .* sincos(ph)
    X[iex] = x
    Y[iex] = y
    Z[iex] = z
	C[iex] = ext_itp(log_r_eval[ii], th, ph)
end

function run_stuff()
    Random.seed!(42)
    r = Float32[1.677044, 1.8385518, 2.0156136, 2.2097273, 2.4225352, 2.6558375, 2.911608, 3.1920104, 3.499417, 3.8364286, 4.205896, 4.6109447, 5.055002, 5.5418243, 6.0755296, 6.660634, 7.3020864, 8.005314];
    θ = Float32[0.049087387, 0.14726216, 0.24543692, 0.3436117];
    ϕ = Float32[0.19634955, 0.5890486, 0.9817477, 1.3744467, 1.7671459, 2.1598449, 2.552544, 2.9452431, 3.3379421, 3.7306414, 4.12334, 4.5160394, 4.9087386, 5.3014374, 5.6941366, 6.086836];
    Bp_block = rand(18,4,16);
    ext_itp = extrapolate(interpolate((log.(r), θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line());
    MeshBlockSize = Int32[18, 4, 16];
    nx,ny,nz = 10(MeshBlockSize.+1)
    xmin, xmax, ymin, ymax, zmin, zmax = -3.2042396f0, 3.2042396f0, -3.2042396f0, 3.2042396f0, 1.4782072f0, 8.373081f0;
    sx::Float32 = (nx-1)/(xmax-xmin); sy::Float32 = (ny-1)/(ymax-ymin); sz::Float32 = (nz-1)/(zmax-zmin)
    field = fill(NaN32, nx,ny,nz);
    r_eval = exp.(LinRange(log(1.6), log(8.373081), 10(MeshBlockSize[1]+1)))
    θ_eval = LinRange(0.0, 0.3926991, 10(MeshBlockSize[2]+1))
    ϕ_eval = LinRange(0.0, 6.2831855, 10(MeshBlockSize[3]+1))
    nr, nθ, nϕ = length(r_eval), length(θ_eval), length(ϕ_eval)
    X, Y, Z, C = ntuple(_ -> Vector{Float32}(undef, 1000 *prod(MeshBlockSize.+1)), 4);
    iex::Int32 = 1
    @inbounds for ii in 1:nr, jj in 1:nθ, kk in 1:nϕ
        cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
        clp(X[iex],Y[iex],Z[iex],C[iex],xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
        iex += 1
    end
end

function run_stuff2()
    Random.seed!(42)
    r = Float32[1.677044, 1.8385518, 2.0156136, 2.2097273, 2.4225352, 2.6558375, 2.911608, 3.1920104, 3.499417, 3.8364286, 4.205896, 4.6109447, 5.055002, 5.5418243, 6.0755296, 6.660634, 7.3020864, 8.005314];
    θ = Float32[0.049087387, 0.14726216, 0.24543692, 0.3436117];
    ϕ = Float32[0.19634955, 0.5890486, 0.9817477, 1.3744467, 1.7671459, 2.1598449, 2.552544, 2.9452431, 3.3379421, 3.7306414, 4.12334, 4.5160394, 4.9087386, 5.3014374, 5.6941366, 6.086836];
    Bp_block = rand(18,4,16);
    ext_itp = extrapolate(interpolate((log.(r), θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line());
    MeshBlockSize = Int32[18, 4, 16];
    nx,ny,nz = 10(MeshBlockSize.+1)
    xmin, xmax, ymin, ymax, zmin, zmax = -3.2042396f0, 3.2042396f0, -3.2042396f0, 3.2042396f0, 1.4782072f0, 8.373081f0;
    sx::Float32 = (nx-1)/(xmax-xmin); sy::Float32 = (ny-1)/(ymax-ymin); sz::Float32 = (nz-1)/(zmax-zmin)
    field = fill(NaN32, nx,ny,nz);
    r_eval = exp.(LinRange(log(1.6), log(8.373081), 10(MeshBlockSize[1]+1)))
    log_r_eval = log.(r_eval)
    θ_eval = LinRange(0.0, 0.3926991, 10(MeshBlockSize[2]+1))
    ϕ_eval = LinRange(0.0, 6.2831855, 10(MeshBlockSize[3]+1))
    nr, nθ, nϕ = length(r_eval), length(θ_eval), length(ϕ_eval)
    X, Y, Z, C = ntuple(_ -> Vector{Float32}(undef, 1000 *prod(MeshBlockSize.+1)), 4);
    iex::Int32 = 1
    @inbounds for ii in 1:nr, jj in 1:nθ, kk in 1:nϕ
        cart2(iex,ii,jj,kk,r_eval,log_r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
        clp(X[iex],Y[iex],Z[iex],C[iex],xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
        iex += 1
    end
end

function run_benchmarks()
    @time run_stuff()
    display(@benchmark run_stuff())
    @time run_stuff2() 
    display(@benchmark run_stuff2())
end
run_benchmarks()

gives

  0.142551 seconds (30 allocations: 30.825 MiB)
BenchmarkTools.Trial: 35 samples with 1 evaluation per sample.
 Range (min … max):  141.376 ms … 168.789 ms  ┊ GC (min … max): 0.00% … 15.00%
 Time  (median):     142.942 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   144.379 ms ±   5.362 ms  ┊ GC (mean ± σ):  1.00% ±  2.53%

  ▁ █▄                                                           
  █▇██▇▇▄▇▄▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
  141 ms           Histogram: frequency by time          169 ms <

 Memory estimate: 30.83 MiB, allocs estimate: 30.
  0.127622 seconds (31 allocations: 30.827 MiB, 3.05% gc time)
BenchmarkTools.Trial: 40 samples with 1 evaluation per sample.
 Range (min … max):  123.669 ms … 153.238 ms  ┊ GC (min … max): 0.00% … 18.74%
 Time  (median):     125.134 ms               ┊ GC (median):    0.20%
 Time  (mean ± σ):   126.532 ms ±   5.086 ms  ┊ GC (mean ± σ):  1.28% ±  2.97%

  ▆█▃ ▆                                                          
  ███▆█▆▆▆▄▁▁▁▁▄▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
  124 ms           Histogram: frequency by time          153 ms <

 Memory estimate: 30.83 MiB, allocs estimate: 31.

No, I am asking why i got weird gaps in plot of field? See post 35.

I’m not sure unfortunately.

1 Like