Improve performance of this index creation and accessing code

Now that there is some easier code to play with, I’ve tried running run_benchmark. Once with the given method, and once with:

        # C[iex] = ext_itp(log(rr), th, ph)
        C[iex] = log(rr)+th+ph

replacing the interpolation. The conclusion: > 60% of the time is spent in the interpolation. So, no dramatic efficiency gain from rearranging stuff around that (though I did manage to get a 10% gain by fiddling a bit).

1 Like

What is use of it? It is fictitious. :joy: I will get realistic color on Interpolation.

Useful just for profiling / benchmarking. Comparing with vs. without interpolation.

1 Like

You’re still using the global variables xmin etc. in loaddata. After that’s corrected, there still seems to be type instability in indexing in e.g. x_all:

julia> @btime $x_all[1, 1]
  7.475 μs (20 allocations: 624 bytes)
1.677044f0

So it’s better to just collect it once in advance, and then work with an Array:

julia> cx_all = @btime reshape(collect($x_all), size($x_all));
  24.100 μs (21 allocations: 61.93 KiB)

As a proof of concept, when you replace the @threads for b in 1:blocks by for b in 1, the difference in allocations is already massive:

julia> @time loaddata_original(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks); # no xmin, xmax, etc.
🧩 Allocated 190×50×170 grid.
  0.599230 seconds (14.53 M allocations: 252.602 MiB, 19.25% gc time)

julia> @time loaddata_minmax(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks, xmin, xmax, ymin, ymax, zmin, zmax);  # typeof(x_all) == HDF5.Dataset
🧩 Allocated 190×50×170 grid.
  0.364104 seconds (3.23 M allocations: 80.101 MiB, 24.37% gc time)

julia> @time loaddata_minmax(Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks, xmin, xmax, ymin, ymax, zmin, zmax);  # typeof(cx_all) == Matrix{Float32}
🧩 Allocated 190×50×170 grid.
  0.184638 seconds (49 allocations: 30.811 MiB, 2.06% gc time)

For the full loaddata (threaded (4C 8T), all blocks) I then get

julia> @time loaddata_original(Bp, x_all, y_all, z_all, xe_all, ye_all, ze_all, MeshBlockSize, blocks);
🧩 Allocated 190×50×170 grid.
122.128465 seconds (12.67 G allocations: 188.904 GiB, 10.82% gc time, 770 lock conflicts)

julia> @time loaddata_minmax(Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks, xmin, xmax, ymin, ymax, zmin, zmax);
🧩 Allocated 190×50×170 grid.
 45.027201 seconds (6.17 k allocations: 35.547 MiB)

You could still reduce allocations by getting rid of the log. and exp. in

ext_itp = extrapolate(interpolate((log.(r), θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line())
r_eval = exp.(LinRange(log(re[1]), log(re[end]), 10(MeshBlockSize[1]+1)))

by precomputing / lazily computing / using Base.LogRange / … But I imagine that fundamentally at this point really only the for ii ... loop matters (the body of which will in total be executed 1.4 billion times). @JADekker has already given some ideas for that above. If you split the for ..., ..., ... into three nested loops, you might also be able to reuse some calculations. Likely the only way to get a major performance boost (if at all possible) is to try to make the interpolation more efficient somehow.

2 Likes

I also notice speedup but there is some instability in code performance. Notice too much variation in allocation.

julia> include("Modular2.jl")
 64.548171 seconds (7.06 G allocations: 274.009 GiB, 20.45% gc time, 18.34% compilation time: 3% of which was recompilation)
GLMakie.Screen(...)

julia> include("Modular2.jl")
 38.827293 seconds (1.66 M allocations: 163.253 MiB, 0.10% gc time, 4.24% compilation time)
GLMakie.Screen(...)

julia> include("Modular2.jl")
 59.855314 seconds (7.04 G allocations: 272.950 GiB, 23.98% gc time, 5.20% compilation time)
GLMakie.Screen(...)

See the code

full code
using HDF5, GLMakie, Interpolations, Statistics, Profile, Base.Threads, BenchmarkTools
@time begin
function readhdf(h5file::String)
	h5 = h5open(h5file, "r")
	B = h5["B"]
	@views Br::Array{Float32, 4}, Bθ::Array{Float32, 4}, Bϕ::Array{Float32, 4} = B[:,:,:,:,1], B[:,:,:,:,2], B[:,:,:,:,3]
	Bp = @. hypot(Br, Bθ) / Bϕ
	x_all, y_all, z_all = h5["x1v"], h5["x2v"], h5["x3v"]
	xe_all, ye_all, ze_all = h5["x1f"], h5["x2f"], h5["x3f"]
	MeshBlockSize::Vector{Int32} = read((HDF5.attributes(h5))["MeshBlockSize"])
	blocks::Int32 = read(HDF5.attributes(h5),"NumMeshBlocks") 
	cx_all::Matrix{Float32} = reshape(collect(x_all), size(x_all))
	cy_all::Matrix{Float32} = reshape(collect(y_all), size(y_all))
	cz_all::Matrix{Float32} = reshape(collect(z_all), size(z_all))
	cxe_all::Matrix{Float32} = reshape(collect(xe_all), size(xe_all))
	cye_all::Matrix{Float32} = reshape(collect(ye_all), size(ye_all))
	cze_all::Matrix{Float32} = reshape(collect(ze_all), size(ze_all))
	return Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks
end

Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks = readhdf("sane00.prim.01800.athdf")

function bound(cxe_all, cye_all, cze_all, MeshBlockSize, blocks)
	XX,YY,ZZ = ntuple(_ -> zeros(Float32, blocks* prod(MeshBlockSize.+1)), 3)
	icx=1
	for b in 1:blocks
		@views r::Vector{Float32}, θ::Vector{Float32}, ϕ::Vector{Float32} = cxe_all[:, b], cye_all[:, b], cze_all[:, b]
		for rr in r, th in θ, ph in ϕ,  
			XX[icx] = rr * sin(th) * cos(ph)
			YY[icx] = rr * sin(th) * sin(ph)
			ZZ[icx] = rr * cos(th)
			icx += 1
		end
	end
	xmin,xmax = extrema(XX); ymin,ymax = extrema(YY); zmin,zmax = extrema(ZZ)
	println("  x ∈ [$xmin, $xmax], y ∈ [$ymin, $ymax], z ∈ [$zmin, $zmax]")
	return xmin, xmax, ymin, ymax, zmin, zmax
end
	
xmin, xmax, ymin, ymax, zmin, zmax = bound(cxe_all, cye_all, cze_all, MeshBlockSize, blocks)
	

function clp(X,Y,Z,C,xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)	
	xi = trunc(Int, (X-xmin) * sx) + 1
	yi = trunc(Int, (Y-ymin) * sy) + 1
	zi = trunc(Int, (Z-zmin) * sz) + 1
	ix = clamp(xi, 1, nx)
	iy = clamp(yi, 1, ny)
	iz = clamp(zi, 1, nz)
	field[ix, iy, iz] = C
end
	
function cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
	rr, th, ph = r_eval[ii], θ_eval[jj], ϕ_eval[kk]
	xy, z = rr .* sincos(th)
    y, x = xy .* sincos(ph)
    X[iex] = x
    Y[iex] = y
    Z[iex] = z
	C[iex] = ext_itp(log(rr), th, ph)
end

function loaddata(Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks, xmin, xmax, ymin, ymax, zmin, zmax)
	nx,ny,nz = 10(MeshBlockSize.+1)
	field = fill(NaN32, nx,ny,nz)

	sx::Float32 = (nx-1)/(xmax-xmin); sy::Float32 = (ny-1)/(ymax-ymin); sz::Float32 = (nz-1)/(zmax-zmin)
	X, Y, Z, C = ntuple(_ -> zeros(Float32, nx*ny*nz), 4)

	@threads for b in 1:blocks
		@views r, θ, ϕ = cx_all[:, b], cy_all[:, b], cz_all[:, b]
		@views re, θe, ϕe = cxe_all[:, b], cye_all[:, b], cze_all[:, b]
		@views Bp_block = Bp[:, :, :, b]
		@views size(Bp_block)
		ext_itp = extrapolate(interpolate((log.(r), θ, ϕ), Bp_block, Gridded(Linear())), Interpolations.Line())
		r_eval = exp.(LinRange(log(re[1]), log(re[end]), nx))
		θ_eval = LinRange(θe[1], θe[end], ny)
		ϕ_eval = LinRange(ϕe[1], ϕe[end], nz)
		iex::Int32 = 1
		@inbounds for ii in 1:nx, jj in 1:ny, kk in 1:nz
			cart(iex,ii,jj,kk,r_eval,θ_eval,ϕ_eval,ext_itp,X,Y,Z,C)
			clp(X[iex],Y[iex],Z[iex],C[iex],xmin,ymin,zmin,sx,sy,sz,nx,ny,nz,field)
			iex += 1
		end
	end
	return field
end

field = loaddata(Bp, cx_all, cy_all, cz_all, cxe_all, cye_all, cze_all, MeshBlockSize, blocks, xmin, xmax, ymin, ymax, zmin, zmax)

function tdplot(xmin, xmax, ymin, ymax, zmin, zmax, field)
	finite_vals = filter(!isnan, vec(field))
    q_low, q_high = quantile(finite_vals, (0.05, 0.95))
    fig = Figure()
    ax::LScene = LScene(fig[1,1], show_axis=false)
	volume!(ax, xmin..xmax, ymin..ymax, zmin..zmax, field, transparency=true; colorrange=(q_low, q_high))
	display(fig)
end

tdplot(xmin, xmax, ymin, ymax, zmin, zmax, field)
end