Memory kills this code

This code produces an accurate plot, but it needs too much memory and is slow. I have been working for a few months to get a 3D volume plot of the astrophysical data linked here. You can download the data by clicking the Download icon; no sign-in is needed. The parallel code works, but it consumes too much RAM and the process gets killed.

julia> @time fig = plothdf_volume_parallel("sane00.prim.01800.athdf"; samples_per_axis = 20, grid_res = 100)
Processing 872 blocks with 14 threads...
Killed julia -t auto

The data contains many mesh blocks (872). My 32 GB of RAM fills up before about 60 mesh blocks have been processed in the @threads for b in 1:num_blocks iteration. To get a working plot, you can replace this with @threads for b in 5 (line 28 of the code), putting in the number of whichever single mesh block you want to plot.

  • What should I do to reduce memory usage and also improve performance? I want to make it run as fast as possible, since I have to produce many such plots.

using GLMakie, Random, Interpolations
using HDF5, Meshes, CoordRefSystems, Unitful, Statistics
using Base.Threads

function plothdf_volume_parallel(h5file::String; samples_per_axis::Int = 6, grid_res::Int = 100)
    # --- Load HDF5 data ---
    h5 = h5open(h5file, "r")
    num_blocks::Int32 = read(HDF5.attributes(h5), "NumMeshBlocks")
    MeshBlockSize::Vector{Int32} = read(HDF5.attributes(h5), "MeshBlockSize")
    B = read(h5["B"])
    Br, Bθ, Bϕ = @views B[:,:,:,:,1], B[:,:,:,:,2], B[:,:,:,:,3]
    Bp = @. hypot(Br, Bθ) / Bϕ
    x_all::Array{Float32, 2} = read(h5["x1f"])
    y_all::Array{Float32, 2} = read(h5["x2f"])
    z_all::Array{Float32, 2} = read(h5["x3f"])
    close(h5)

    # --- Visualization setup ---
    fig = Figure(size = (1000, 800))
    ax = Axis3(fig[1, 1], title = "Interpolated Volume", aspect = :data, viewmode = :free)

    println("Processing $num_blocks blocks with $(nthreads()) threads...")

    # --- Thread-local accumulators (stored in a Dict) ---
    accumulators = Dict{Int, NamedTuple{(:X, :Y, :Z, :C), NTuple{4, Vector{Float32}}}}()

    # --- Parallel over mesh blocks ---
    @threads for b in 1:num_blocks
        tid = threadid()
        acc = get!(accumulators, tid) do
            (X = Float32[], Y = Float32[], Z = Float32[], C = Float32[])
        end

        @views Bp_block = Bp[:, :, :, b]
        r = x_all[:, b]
        θ = y_all[:, b]
        ϕ = z_all[:, b]
        g = RectilinearGrid{𝔼, typeof(Spherical(0,0,0))}((r, θ, ϕ))

        for i in 1:nelements(g)
            el = element(g, i)

            # ---- Corner values ----
            clr = Vector{Float32}(undef, 8)
            bd = Boundary{3,0}(topology(g))(i)
            for (ii, id) in enumerate(bd)
                adels = Coboundary{0,3}(topology(g))(id)
                s = 0.0f0
                for idx in adels
                    inds = elem2cart(topology(g), idx)
                    s += Bp_block[inds...]
                end
                clr[ii] = s / length(adels)
            end

            # ---- Vertex coordinates ----
            verts = [Float64.(ustrip.((c.r, c.θ, c.ϕ))) for c in coords.(vertices(el))]
            rgrid = LinRange(extrema(getindex.(verts, 1))..., 2)
            θgrid = LinRange(extrema(getindex.(verts, 2))..., 2)
            ϕgrid = LinRange(extrema(getindex.(verts, 3))..., 2)

            # ---- Build 2×2×2 data cube ----
            data = Array{Float32}(undef, 2, 2, 2)
            data[1,1,1] = clr[1]; data[2,1,1] = clr[2]
            data[2,2,1] = clr[3]; data[1,2,1] = clr[4]
            data[1,1,2] = clr[5]; data[2,1,2] = clr[6]
            data[2,2,2] = clr[7]; data[1,2,2] = clr[8]

            itp = interpolate((rgrid, θgrid, ϕgrid), data, Gridded(Linear()))

            r_eval = LinRange(rgrid[1], rgrid[end], samples_per_axis)
            θ_eval = LinRange(θgrid[1], θgrid[end], samples_per_axis)
            ϕ_eval = LinRange(ϕgrid[1], ϕgrid[end], samples_per_axis)

            for rr in r_eval, th in θ_eval, ph in ϕ_eval
                val = itp(rr, th, ph)
                x = rr * sin(th) * cos(ph)
                y = rr * sin(th) * sin(ph)
                z = rr * cos(th)
                push!(acc.X, x)
                push!(acc.Y, y)
                push!(acc.Z, z)
                push!(acc.C, val)
            end
        end
    end

    println("\nInterpolation complete. Merging results...")

    # --- Merge all thread accumulators ---
    Xacc_all = reduce(vcat, [v.X for v in values(accumulators)])
    Yacc_all = reduce(vcat, [v.Y for v in values(accumulators)])
    Zacc_all = reduce(vcat, [v.Z for v in values(accumulators)])
    Cacc_all = reduce(vcat, [v.C for v in values(accumulators)])

    # --- Compute bounding box ---
    x_min, x_max = extrema(Xacc_all)
    y_min, y_max = extrema(Yacc_all)
    z_min, z_max = extrema(Zacc_all)

    nx = ny = nz = grid_res
    x_range = LinRange(x_min, x_max, nx)
    y_range = LinRange(y_min, y_max, ny)
    z_range = LinRange(z_min, z_max, nz)

    # --- Fill 3D grid with nearest-point assignment ---
    field = fill(NaN32, nx, ny, nz)
    for (xi, yi, zi, ci) in zip(Xacc_all, Yacc_all, Zacc_all, Cacc_all)
        ix = clamp(Int(round((xi - x_min)/(x_max - x_min) * (nx - 1))) + 1, 1, nx)
        iy = clamp(Int(round((yi - y_min)/(y_max - y_min) * (ny - 1))) + 1, 1, ny)
        iz = clamp(Int(round((zi - z_min)/(z_max - z_min) * (nz - 1))) + 1, 1, nz)
        field[ix, iy, iz] = ci
    end

    # --- Compute color normalization ---
    finite_vals = filter(!isnan, vec(field))
    q_low  = quantile(finite_vals, 0.05)
    q_high = quantile(finite_vals, 0.95)
    cr = (q_low, q_high)


    println("Rendering volume...")
    volume!(ax, x_min..x_max, y_min..y_max, z_min..z_max, field; colorrange=cr)
    fig
end

@time fig = plothdf_volume_parallel("sane00.prim.01800.athdf"; samples_per_axis = 20, grid_res = 100)
display(fig)

This looks like rather complex code with many dependencies. If you want people to help, could you try profiling first (with @profview and @profview_allocs) to reduce the MWE and focus it on the parts that truly impact performance?
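For example, here is a minimal sketch of how that could be run (@profview and @profview_allocs are provided by the Julia VS Code extension; ProfileView.jl also offers @profview). The reduced samples_per_axis and grid_res values below are only chosen so the run finishes without being killed:

# First call is only to compile; profile the second, smaller run.
plothdf_volume_parallel("sane00.prim.01800.athdf"; samples_per_axis = 4, grid_res = 50)
@profview plothdf_volume_parallel("sane00.prim.01800.athdf"; samples_per_axis = 4, grid_res = 50)
@profview_allocs plothdf_volume_parallel("sane00.prim.01800.athdf"; samples_per_axis = 4, grid_res = 50)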


Just a few general comments that would make it easier to work with such complex code:

  1. Right now you have tangled up loading the data, processing the data, and plotting. Loading in particular is inherently type unstable (although you mitigate that to a large extent with type annotations; B, however, is missing one). So to improve modularity and avoid type-inference issues, I propose splitting this up into at least three functions: loading, processing, and plotting (a rough skeleton is sketched at the end of this post). Note that you can still have a wrapper that combines all three, so you keep the interface you have now. After the split, we can focus on making the processing function as fast as possible without any other part interfering.
  2. Your accumulator pattern (a Dict indexed by threadid()) is not thread-safe.

For an explanation please refer to
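As a minimal sketch, one thread-safe alternative (not necessarily the best one) is to give each chunk of blocks its own buffer, indexed by the chunk number rather than by threadid(), so that no two tasks ever write to the same accumulator:

# Chunk-owned buffers instead of threadid()-indexed state (sketch).
nchunks = Threads.nthreads()
block_chunks = collect(Iterators.partition(1:num_blocks, cld(num_blocks, nchunks)))
buffers = [(X = Float32[], Y = Float32[], Z = Float32[], C = Float32[]) for _ in block_chunks]

Threads.@threads for ichunk in eachindex(block_chunks)
    acc = buffers[ichunk]              # owned exclusively by this chunk's task
    for b in block_chunks[ichunk]
        # ... process block b exactly as before, push!-ing into acc ...
    end
end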

I wonder whether you could instead preallocate all the workspace you need. That would also bring some performance improvements. It seems that you know how many elements each block adds to an accumulator, and you know how many blocks there are. So instead of using this attempted thread-local state, push!-ing elements and then using vcat, you could just preallocate some vectors and write into them. Then you avoid allocations and don't need the merge step.
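A sketch of what that could look like, assuming every block contributes the same number of points (prod(MeshBlockSize) cells times samples_per_axis^3 samples per cell; MeshBlockSize is already read from the file but currently unused). Each block writes into its own disjoint slice, so this is also thread-safe without any Dict or merge step:

pts_per_block = prod(Int, MeshBlockSize) * samples_per_axis^3
Xacc_all = Vector{Float32}(undef, num_blocks * pts_per_block)
Yacc_all = similar(Xacc_all); Zacc_all = similar(Xacc_all); Cacc_all = similar(Xacc_all)

Threads.@threads for b in 1:num_blocks
    offset = (b - 1) * pts_per_block   # start of this block's slice
    k = 0
    # ... same per-element loop as before, but instead of push!:
    #     k += 1
    #     Xacc_all[offset + k] = x; Yacc_all[offset + k] = y
    #     Zacc_all[offset + k] = z; Cacc_all[offset + k] = val
end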

  3. Small observation:

Here (in the 2×2×2 data cube assignment) you almost assign the values in order, i.e. it is almost a reshape; only data[1,2,:] and data[2,2,:] are swapped. Is that intended? I have no idea what you are doing, so I can't judge whether that is correct; it may well be correct :slight_smile: It just stood out to me while reading through the code.
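A rough skeleton of the split from point 1 could look like the following (the function names are placeholders, and the heavy bodies are elided; they would simply be moved over from the current implementation):

function load_data(h5file::AbstractString)
    h5open(h5file, "r") do h5
        (num_blocks = Int(read(HDF5.attributes(h5), "NumMeshBlocks")),
         B     = read(h5["B"]),       # consider asserting a concrete type here
         x_all = read(h5["x1f"]),
         y_all = read(h5["x2f"]),
         z_all = read(h5["x3f"]))
    end
end

# Function barrier: inside this call all argument types are concrete,
# regardless of what inference could prove during loading.
function build_field(B, x_all, y_all, z_all, num_blocks; samples_per_axis, grid_res)
    # ... the @threads interpolation loop and nearest-point gridding ...
end

function plot_field(field, xr, yr, zr)
    # ... Figure / Axis3 / volume! ...
end

function plothdf_volume_parallel(h5file; samples_per_axis = 6, grid_res = 100)
    d = load_data(h5file)
    field, xr, yr, zr = build_field(d.B, d.x_all, d.y_all, d.z_all, d.num_blocks;
                                    samples_per_axis, grid_res)
    plot_field(field, xr, yr, zr)
end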
