Improve performance of this index creation and accessing code

Dan · December 17, 2025, 7:15pm

Now that there is some easier code to play with, I’ve tried running run_benchmark. Once with the given method, and once with:

        # C[iex] = ext_itp(log(rr), th, ph)
        C[iex] = log(rr)+th+ph

replacing the interpolation. The conclusion: > 60% of the time is spent in the interpolation. So, no dramatic efficiency gain from rearranging stuff around that (though I did manage to get a 10% gain by fiddling a bit).

raman_kumar · December 17, 2025, 7:25pm

What is use of it? It is fictitious. I will get realistic color on Interpolation.

Dan · December 17, 2025, 8:27pm

Useful just for profiling / benchmarking. Comparing with vs. without interpolation.

eldee · December 19, 2025, 1:35pm

You’re still using the global variables xmin etc. in loaddata. After that’s corrected, there still seems to be type instability in indexing in e.g. x_all:

julia> @btime $x_all[1, 1]
  7.475 μs (20 allocations: 624 bytes)
1.677044f0

So it’s better to just collect it once in advance, and then work with an Array:

julia> cx_all = @btime reshape(collect($x_all), size($x_all));
  24.100 μs (21 allocations: 61.93 KiB)

As a proof of concept, when you replace the @threads for b in 1:blocks by for b in 1, the difference in allocations is already massive:

julia> @time loaddata_original(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks); # no xmin, xmax, etc.
🧩 Allocated 190×50×170 grid.
  0.599230 seconds (14.53 M allocations: 252.602 MiB, 19.25% gc time)

julia> @time loaddata_minmax(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks, xmin, xmax, ymin, ymax, zmin, zmax);  # typeof(x_all) == HDF5.Dataset
🧩 Allocated 190×50×170 grid.
  0.364104 seconds (3.23 M allocations: 80.101 MiB, 24.37% gc time)

julia> @time loaddata_minmax(Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks, xmin, xmax, ymin, ymax, zmin, zmax);  # typeof(cx_all) == Matrix{Float32}
🧩 Allocated 190×50×170 grid.
  0.184638 seconds (49 allocations: 30.811 MiB, 2.06% gc time)

For the full loaddata (threaded (4C 8T), all blocks) I then get

julia> @time loaddata_original(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks);
🧩 Allocated 190×50×170 grid.
122.128465 seconds (12.67 G allocations: 188.904 GiB, 10.82% gc time, 770 lock conflicts)

julia> @time loaddata_minmax(Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks, xmin, xmax, ymin, ymax, zmin, zmax);
🧩 Allocated 190×50×170 grid.
 45.027201 seconds (6.17 k allocations: 35.547 MiB)

You could still reduce allocations by getting rid of the log. and exp. in

ext_itp = extrapolate(interpolate((log.(r), θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line())
r_eval = exp.(LinRange(log(re[1]), log(re[end]), 10(MeshBlockSize[1]+1)))

by precomputing / lazily computing / using Base.LogRange / … But I imagine that fundamentally at this point really only the for ii ... loop matters (the body of which will in total be executed 1.4 billion times). @JADekker has already given some ideas for that above. If you split the for ..., ..., ... into three nested loops, you might also be able to reuse some calculations. Likely the only way to get a major performance boost (if at all possible) is to try to make the interpolation more efficient somehow.

raman_kumar · December 19, 2025, 8:23pm

I also notice speedup but there is some instability in code performance. Notice too much variation in allocation.

julia> include("Modular2.jl")
 64.548171 seconds (7.06 G allocations: 274.009 GiB, 20.45% gc time, 18.34% compilation time: 3% of which was recompilation)
GLMakie.Screen(...)

julia> include("Modular2.jl")
 38.827293 seconds (1.66 M allocations: 163.253 MiB, 0.10% gc time, 4.24% compilation time)
GLMakie.Screen(...)

julia> include("Modular2.jl")
 59.855314 seconds (7.04 G allocations: 272.950 GiB, 23.98% gc time, 5.20% compilation time)
GLMakie.Screen(...)

See the code

full code

using HDF5, GLMakie, Interpolations, Statistics, Profile, Base.Threads, BenchmarkTools
@time begin
function readhdf(h5file::String)
	h5 = h5open(h5file, "r")
	B = h5["B"]
	@views Br::Array{Float32, 4}, Bθ::Array{Float32, 4}, Bϕ::Array{Float32, 4} = B[:,:,:,:,1], B[:,:,:,:,2], B[:,:,:,:,3]
	Bp = @. hypot(Br, Bθ) / Bϕ
	x_all, y_all, z_all = h5["x1v"], h5["x2v"], h5["x3v"]
	xe_all, ye_all, ze_all = h5["x1f"], h5["x2f"], h5["x3f"]
	MeshBlockSize::Vector{Int32} = read((HDF5.attributes(h5))["MeshBlockSize"])
	blocks::Int32 = read(HDF5.attributes(h5),"NumMeshBlocks") 
	cx_all::Matrix{Float32} = reshape(collect(x_all), size(x_all))
	cy_all::Matrix{Float32} = reshape(collect(y_all), size(y_all))
	cz_all::Matrix{Float32} = reshape(collect(z_all), size(z_all))
	cxe_all::Matrix{Float32} = reshape(collect(xe_all), size(xe_all))
	cye_all::Matrix{Float32} = reshape(collect(ye_all), size(ye_all))
	cze_all::Matrix{Float32} = reshape(collect(ze_all), size(ze_all))
	return Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks
end

Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks = readhdf("sane00.prim.01800.athdf")

function bound(cxe_all, cye_all, cze_all, MeshBlockSize, blocks)
	XX,YY,ZZ = ntuple(_ -> zeros(Float32, blocks* prod(MeshBlockSize.+1)), 3)
	icx=1
	for b in 1:blocks
		@views r::Vector{Float32}, θ::Vector{Float32}, ϕ::Vector{Float32} = cxe_all[:, b], cye_all[:, b], cze_all[:, b]
		for rr in r, th in θ, ph in ϕ,  
			XX[icx] = rr * sin(th) * cos(ph)
			YY[icx] = rr * sin(th) * sin(ph)
			ZZ[icx] = rr * cos(th)
			icx += 1
		end
	end
	xmin,xmax = extrema(XX); ymin,ymax = extrema(YY); zmin,zmax = extrema(ZZ)
	println("  x ∈ [$xmin, $xmax], y ∈ [$ymin, $ymax], z ∈ [$zmin, $zmax]")
	return xmin, xmax, ymin, ymax, zmin, zmax
end
	
xmin, xmax, ymin, ymax, zmin, zmax = bound(cxe_all, cye_all, cze_all, MeshBlockSize, blocks)
	

function clp(X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)	
	xi = trunc(Int, (X-xmin) * sx) + 1
	yi = trunc(Int, (Y-ymin) * sy) + 1
	zi = trunc(Int, (Z-zmin) * sz) + 1
	ix = clamp(xi, 1, nx)
	iy = clamp(yi, 1, ny)
	iz = clamp(zi, 1, nz)
	field[ix, iy, iz] = C
end
	
function cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
	rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
	xy, z = rr .* sincos(th)
    y, x = xy .* sincos(ph)
    X[iex] = x
    Y[iex] = y
    Z[iex] = z
	C[iex] = ext_itp(log(rr), th, ph)
end

function loaddata(Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks, xmin, xmax, ymin, ymax, zmin, zmax)
	nx,ny,nz = 10(MeshBlockSize.+1)
	field = fill(NaN32, nx,ny,nz)

	sx::Float32 = (nx-1)/(xmax-xmin); sy::Float32 = (ny-1)/(ymax-ymin); sz::Float32 = (nz-1)/(zmax-zmin)
	X, Y, Z, C = ntuple(_ -> zeros(Float32, nx*ny*nz), 4)

	@threads for b in 1:blocks
		@views r, θ, ϕ = cx_all[:, b], cy_all[:, b], cz_all[:, b]
		@views re, θe, ϕe = cxe_all[:, b], cye_all[:, b], cze_all[:, b]
		@views Bp_block = Bp[:, :, :, b]
		@views size(Bp_block)
		ext_itp = extrapolate(interpolate((log.(r), θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line())
		r_eval = exp.(LinRange(log(re[1]), log(re[end]), nx))
		θ_eval = LinRange(θe[1], θe[end], ny)
		ϕ_eval = LinRange(ϕe[1], ϕe[end], nz)
		iex::Int32 = 1
		@inbounds for ii in 1:nx, jj in 1:ny, kk in 1:nz
			cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
			clp(X[iex],Y[iex],Z[iex],C[iex],xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
			iex += 1
		end
	end
	return field
end

field = loaddata(Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks, xmin, xmax, ymin, ymax, zmin, zmax)

function tdplot(xmin, xmax, ymin, ymax, zmin, zmax, field)
	finite_vals = filter(!isnan, vec(field))
    q_low, q_high = quantile(finite_vals, (0.05, 0.95))
    fig = Figure()
    ax::LScene = LScene(fig[1,1], show_axis=false)
	volume!(ax, xmin..xmax, ymin..ymax, zmin..zmax, field, transparency=true; colorrange=(q_low, q_high))
	display(fig)
end

tdplot(xmin, xmax, ymin, ymax, zmin, zmax, field)
end

Topic		Replies	Views
Memory kills this code Performance performance , memory-allocation , parallel-computing	5	449	November 3, 2025
Possible to speed up this function for calculating cartesian indices? Performance	19	304	March 31, 2024
Understanding benchmark results General Usage	12	2059	January 12, 2017
Parallelizing for loop in the computation of a gradient Performance question	19	2646	February 26, 2018
Its possible to use all cores or parallel cores to do some project/calculations? Performance	41	1954	September 19, 2020

Improve performance of this index creation and accessing code

Related topics