@sgaure Got it, thanks for the direction! I did find that since our HPC moved to Rocky Linux there have been some issues with OpenMPI and the InfiniBand (IB) interconnect.
Here’s the website: Job example scripts — RCIC 1.0.0 documentation
Its example shows that
#SBATCH --constraint="mlx5_ib" ## run only on nodes with updated IB firmware
# set these UCX parameters for openmpi
export OMP_NUM_THREADS=1
export UCX_TLS=rc,mm
export UCX_NET_DEVICES=mlx5_0:1
should be set, and it looks like they suggest using mpirun with:
# original command is updated with: -mca pml ucx
mpirun -np $SLURM_NTASKS -mca pml ucx vasp_std
Although I notice there might be a missing - in front of what should be --mca?
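As a sanity check on which MPI build Julia actually ends up loading with these settings, I think something along these lines should do it (the exact output details vary by MPI.jl version, so this is just a sketch):
using MPI
MPI.versioninfo()   # should report binary = system and the Open MPI 4.1.2 library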
@mkitti Thanks. With all of the above I've reformulated my Slurm script. I've constrained it to just Intel processors to make troubleshooting easier:
#!/bin/bash
#SBATCH -p free
#SBATCH --job-name=SRRS
#SBATCH --constraint="mlx5_ib&intel"
#SBATCH --nodes=1-4
#SBATCH --ntasks=20
#SBATCH --error=err-%j.log
#SBATCH --output=output-%j.log
#SBATCH --mem-per-cpu=8G
#SBATCH --mail-type=begin
#SBATCH --mail-type=end # send email when job ends
#SBATCH --mail-type=fail # send email if job fails
#SBATCH --mail-user=ernestob@uci.edu
echo $SLURM_JOB_NUM_NODES >> "julia-$SLURM_JOB_ID.out"
echo $SLURM_NTASKS >> "julia-$SLURM_JOB_ID.out"
echo $SLURM_CPUS_PER_TASK >> "julia-$SLURM_JOB_ID.out"
module load openmpi/4.1.2/gcc.11.2.0
module load hdf5/1.14.1/gcc.11.2.0-openmpi.4.1.2
# set these UCX parameters for openmpi
export OMP_NUM_THREADS=1
export UCX_TLS=rc,mm
export UCX_NET_DEVICES=mlx5_0:1
export UCX_ERROR_SIGNALS="SIGILL,SIGBUS,SIGFPE"
export JULIA_CPU_TARGET="generic;skylake-avx512,clone_all;skylake,clone_all;icelake-server,clone_all;"
export JULIA_MPI_LIBRARY="/opt/apps/openmpi/4.1.2/gcc/11.2.0/lib/libmpi"
echo "Precompiling Master" >> "julia-$SLURM_JOB_ID.out"
julia --project=. -e 'using Pkg; Pkg.instantiate(); Pkg.precompile()' >> "julia-$SLURM_JOB_ID.out"
mpirun --mca pml ucx -n $SLURM_NTASKS /data/homezvol0/ernestob/.juliaup/bin/julia --project=. /dfs6/pub/ernestob/Julia/NRL/DistributedSRRSMASTER/PencilTest.jl >> "julia-$SLURM_JOB_ID.out"
Below is my Project.toml:
[deps]
AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
BPMDistributed = "634e16ad-23c4-4c3d-8de6-2ae1fdbe117a"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a"
Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LinearOperators = "5c8ed15e-5a4c-59e4-a42b-c7e8811fb125"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
PencilArrays = "0e08944d-e94e-41b1-9406-dcf66b6a9d2e"
PencilFFTs = "4a48f351-57a6-4416-9ec4-c37015456aae"
PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
SRRSCalc = "c215410e-53fa-4750-bffd-73de35b87ca4"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
[HDF5]
libhdf5 = "/path/to/your/libhdf5.so"
libhdf5_hl = "/path/to/your/libhdf5_hl.so"
[MPIPreferences]
__clear__ = ["preloads_env_switch"]
_format = "1.0"
abi = "OpenMPI"
binary = "system"
cclibs = []
libmpi = "/opt/apps/openmpi/4.1.2/gcc/11.2.0/lib/libmpi"
mpiexec = "/opt/apps/openmpi/4.1.2/gcc/11.2.0/bin/mpirun"
preloads = []
[extras]
HDF5_jll = "0234f1f7-429e-5d53-9886-15a909be8d59"
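For reference, I believe the [MPIPreferences] and [HDF5] blocks above can also be generated from within Julia instead of editing the TOML by hand; this is just a sketch using the same paths as above, and the exact keyword arguments may differ between package versions:
using MPIPreferences, HDF5
# Point MPI.jl at the system Open MPI (same library/mpirun paths as in the TOML above).
MPIPreferences.use_system_binary(;
    library_names = ["/opt/apps/openmpi/4.1.2/gcc/11.2.0/lib/libmpi"],
    mpiexec = "/opt/apps/openmpi/4.1.2/gcc/11.2.0/bin/mpirun",
)
# Point HDF5.jl at the cluster's parallel HDF5 build (placeholder paths, as above).
HDF5.API.set_libraries!("/path/to/your/libhdf5.so", "/path/to/your/libhdf5_hl.so")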
My PencilTest.jl is below. As an FYI, I am trying to test out part of this example: Gradient of a scalar field · PencilFFTs.jl, but with in-place transforms. A condensed sketch of the in-place pattern I'm following is shown first, then the full script.
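This is my reading of the PencilFFTs in-place transform docs, so treat the details as approximate:
using MPI, PencilFFTs
using Random: randn!
MPI.Init()
comm = MPI.COMM_WORLD
# 2D pencil decomposition; FFT! along x, NoTransform! along y.
pen = Pencil((64, 32), comm)
plan = PencilFFTPlan(pen, (Transforms.FFT!(), Transforms.NoTransform!()))
# For in-place plans, allocate_input returns a ManyPencilArray:
# first(A) is the physical-space view, last(A) the transformed view.
A = allocate_input(plan)
randn!(first(A))
plan * A              # forward transform, in place
theta_hat = last(A)   # transformed data lives in the output view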
using Pkg
using MPI
#using MPIPreferences
# ENV["JULIA_MPI_BINARY"]="system"
using BenchmarkTools
using Test
using LinearAlgebra
using FFTW
using Krylov
using LinearOperators
using HDF5
using PencilArrays
using PencilFFTs
using TimerOutputs
using PencilArrays.PencilIO
using Random
using AbstractFFTs: fftfreq, rfftfreq
#loading personal modules
# println("Loading SRRSCalc.jl and BPMDistributed.jl Functions");
# # # include("SRRSCalc.jl")
# # # include("BPMFFT.jl")
# # #include("SRRSCalcMod.jl")
# # #include("BPMFFTMod.jl")
# # # Pkg.develop(PackageSpec(path="./BPMFFT")) # dev ./BPMFFT
# # # Pkg.develop(PackageSpec(path="./SRRSCalc"))
# # # using .SRRSCalcMod
# # # using .BPMFFTMod
# push!(LOAD_PATH, "/dfs6/pub/ernestob/Julia/NRL/DistributedSRRSMASTER/SRRSCalc");
# push!(LOAD_PATH, "/dfs6/pub/ernestob/Julia/NRL/DistributedSRRSMASTER/BPMDistributed");
# using SRRSCalc
# using BPMDistributed
MPI.Init()
comm = MPI.COMM_WORLD # MPI communicator
rank = MPI.Comm_rank(comm) # rank of local process
root = 0
MPI.Barrier(comm)
if rank == root
println("Finished Activating Project and Loading Packages")
flush(stdout)
println("-------------------")
flush(stdout)
println("Is HDF5 parallel?")
println("-------------------")
flush(stdout)
println(HDF5.has_parallel())
flush(stdout)
println("-------------------")
flush(stdout)
println("Beginning PencilFFT Tutorial")
println("-------------------")
flush(stdout)
end
#wait for all MPI processes to catch up
MPI.Barrier(comm)
# Input data dimensions (Nx × Ny × Nz)
Nx = 64
Ny = 32
dims = (64, 32)#(64, 32, 12)
# Apply a 2D real-to-complex (r2c) FFT.
transform = (Transforms.FFT(), Transforms.NoTransform())#, Transforms.NoTransform())
inplacefft = (Transforms.FFT!(), Transforms.NoTransform!())
pen = Pencil(dims, comm)
#wait for all MPI processes to catch up
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Pencils")
println("-------------------")
flush(stdout)
println(summary(pen))
flush(stdout)
end
#wait for all MPI processes to catch up
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Creating Pencil Array")
flush(stdout)
println("-------------------")
flush(stdout)
end
# Create plan
#plan = PencilFFTPlan(pen, transform)#, permute_dims = Val(false))
planinplace = PencilFFTPlan(pen, inplacefft )
MPI.Barrier(comm)
# Allocate data and initialise field
# theta = allocate_input(plan)
# randn!(theta)
Theta1 = allocate_input(planinplace)
Theta2 = allocate_input(planinplace)
theta1 = first(Theta1)
theta2 = first(Theta2)
randn!(theta1)
#@. theta2 = theta1
randn!(theta2)
if rank==root
println("-------------------")
flush(stdout)
println("Gathering Pencil Array theta To Root")
println("-------------------")
flush(stdout)
end
theta0 = gather(theta1)
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Checking Gathering To Root")
println("-------------------")
flush(stdout)
println("Size of gather theta0: ",size(theta0))
flush(stdout)
end
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Creating 1D Array theta1D in Root")
flush(stdout)
println("-------------------")
flush(stdout)
theta1D = zeros(ComplexF64, dims[1])#allocate_input(plan1D)
println("Size of this array: ", size(theta1D))
flush(stdout)
println("-------------------")
flush(stdout)
println("Setting 1D Array theta1D to gather theta0[:,1,1]" )
flush(stdout)
println("-------------------")
flush(stdout)
@. theta1D = theta0[:,1]#theta0[:,1,1]
end
MPI.Barrier(comm)
# theta_glob = global_view(theta)
# @. theta1D = theta_glob[:,1,1]
if rank==root
println("-------------------")
flush(stdout)
println("Checking Sizes of Pencil Array")
println("-------------------")
flush(stdout)
end
MPI.Barrier(comm)
flush(stdout)
println(size(theta1))
flush(stdout)
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Performing Pencil FFT")
println("-------------------")
flush(stdout)
end
# theta_hat = plan * theta
planinplace * Theta1;
theta_hat = last(Theta1)
thetaf = gather(theta_hat)
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Checking size of Pencil FFT Output")
println("-------------------")
flush(stdout)
end
println(size(theta_hat))
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Performing 1D FFTW")
println("-------------------")
flush(stdout)
thetaft1D = FFTW.fft(theta1D) #plan1D * theta1D
end
#wait for all MPI processes to catch up
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Check to see if 1D transform worked")
println("-------------------")
flush(stdout)
errors = zeros(Float64, length(thetaft1D ))
# @. errors = abs2(thetaf_glob1D - thetaf_glob[:,1,1])
@. errors = abs2(thetaft1D - thetaf[:,1,1])
all_small = all(x->x<(1.0e-5), errors)
if all_small
println(rank, ": The Error is less than 1.0e-5")
flush(stdout)
else
println(rank, ": The Error is greater than 1.0e-5")
flush(stdout)
end
end
#wait for all MPI processes to catch up
MPI.Barrier(comm)
# Finally, set up the output that will hold ∂θ/∂x in Fourier space. Here I reuse the output half of the in-place array (the docs example stores the full 3D gradient as a tuple of 3 PencilArrays).
gradTheta_hat = last(Theta1)#allocate_output(plan)
MPI.Barrier(comm)
# # This is equivalent:
# # ∇θ_hat = ntuple(d -> similar(θ_hat), Val(3))
if rank==root
println("-------------------")
flush(stdout)
println("FFT Output Plan of Grad Theta")
println("-------------------")
flush(stdout)
# println(summary(gradTheta_hat))
# flush(stdout)
end
println(size(gradTheta_hat))
#wait for all MPI processes to catch up
MPI.Barrier(comm)
# #fourier wave number vectors
if rank==root
println("-------------------")
flush(stdout)
println("Creating FFT Wave Numbers")
println("-------------------")
flush(stdout)
# println(summary(gradTheta_hat))
# flush(stdout)
end
box_size = (2*pi, 2*pi) # Lx, Ly
sample_rate = 2*pi .* dims ./ box_size
# In our case (Lx = 2π and Nx even), fftfreq gives kx = [0, 1, ..., Nx/2-1, -Nx/2, ..., -1].
kx = fftfreq(dims[1], sample_rate[1])
ky = ones(ComplexF64, dims[2])
#wait for all MPI processes to catch up
MPI.Barrier(comm)
# #fourier wave number vectors
if rank==root
println("-------------------")
flush(stdout)
println("Creating FFT Wave Numbers Local Grid")
println("-------------------")
flush(stdout)
println("Need to create a 2D local grid and Kvec grid")
# println(summary(gradTheta_hat))
# flush(stdout)
end
# #Local Indexing
# PencilFFTs.localgrid()
xs = range(1, Nx; length = Nx)
ys = range(1, Ny; length = Ny)
gridx = localgrid(theta_hat, (xs,ys))
yones = ones(Float64, Ny)
grid_fourier = localgrid(theta_hat, (kx,yones))
MPI.Barrier(comm)
println(grid_fourier)
MPI.Barrier(comm)
# #fourier wave number vectors
if rank==root
println("-------------------")
flush(stdout)
println("Local Grid Indexing")
println("-------------------")
end
flush(stdout)
println(summary(grid_fourier))
flush(stdout)
MPI.Barrier(comm)
# #computing gradient
if rank==root
println("-------------------")
flush(stdout)
println("Computing FFT Gradient")
println("-------------------")
flush(stdout)
# println(summary(gradTheta_hat))
# flush(stdout)
end
@inbounds for I in eachindex(grid_fourier)
# Wave number vector associated to current Cartesian index.
#i, j = Tuple(I)
kkx, yy= grid_fourier[I]
# u = im * θ_hat[I]
gradTheta_hat[I] = 1.0im * kkx * theta_hat[I]
end
if rank==root
println("-------------------")
flush(stdout)
println("Gathering PencilFFT grad To Root")
println("-------------------")
flush(stdout)
end
gradtheta0 = gather(gradTheta_hat)
MPI.Barrier(comm)
# check gradient
if rank==root
println("-------------------")
flush(stdout)
println("Check PencilFFT is the Same as FFTW gradient")
println("-------------------")
flush(stdout)
println("Doing Root FFTW gradient calculation")
# @. gradTheta_hat = 1.0im * grid_fourier * theta_hat
gradthetaft1D = zeros(ComplexF64, size(thetaft1D))
@. gradthetaft1D = 1.0im * kx * thetaft1D
end
#wait for all MPI processes to catch up
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Check to see if 1D FFT gradient worked")
println("-------------------")
flush(stdout)
errors = zeros(Float64, length(gradthetaft1D))
# @. errors = abs2(thetaf_glob1D - thetaf_glob[:,1,1])
@. errors = abs2(gradthetaft1D - gradtheta0[:,1,1])
all_small = all(x->x<(1.0e-5), errors)
if all_small
println(rank, ": The Error is less than 1.0e-5")
flush(stdout)
else
println(rank, ": The Error is greater than 1.0e-5")
flush(stdout)
end
end
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Saving Data to HDF5")
println("-------------------")
flush(stdout)
end
# comm = get_comm(gradTheta_hat)
ff = open(PHDF5Driver(), "/dfs6/pub/ernestob/Julia/NRL/DistributedSRRSMASTER/data/hdf5test.hdf", comm; write=true)
gradtheta1 = last(Theta1)
ff["gradTheta_hat1"] = gradtheta1
gradtheta2 = first(Theta2)
ff["gradTheta_hat2"] = gradtheta2
# ff["gradTheta_hat2"] = gradTheta_hat[2]
# ff["gradTheta_hat3"] = gradTheta_hat[3]
close(ff)
MPI.Barrier(comm)
if rank==root
println("-------------------")
flush(stdout)
println("Finished Saving to HDF5")
println("-------------------")
flush(stdout)
end