Struggling to Run Distributed I/O Operations on SLURM Cluster

I prefer splitting the distributed setup from the actual code. @everywhere is considered evil xD.

So, as an example, I often have a file called setup.jl that can be more or less complex:

using Distributed
using ClusterManagers

# Usage:
# - Set `export JULIA_PROJECT=`pwd`` before launching Julia.

# Pick the cluster manager: one worker per Slurm task when running inside a
# Slurm job, otherwise two local workers that split the machine's CPU threads.
if haskey(ENV, "SLURM_JOB_ID")
  jobid = ENV["SLURM_JOB_ID"]
  ntasks = parse(Int, ENV["SLURM_NTASKS"])
  # SLURM_CPUS_PER_TASK is only exported when `--cpus-per-task` was requested;
  # fall back to one thread per worker instead of failing with a KeyError.
  cpus_per_task = parse(Int, get(ENV, "SLURM_CPUS_PER_TASK", "1"))
  @info "Running on Slurm cluster" jobid ntasks cpus_per_task
  # NOTE(review): Slurm >= 22.05 reportedly no longer forwards
  # `--cpus-per-task` from sbatch to srun; confirm each worker really gets
  # `cpus_per_task` CPUs (e.g. via SRUN_CPUS_PER_TASK) -- TODO confirm.
  manager = SlurmManager(ntasks)
else
  ntasks = 2
  # Never ask for zero threads (single-CPU machines or containers).
  cpus_per_task = max(1, div(Sys.CPU_THREADS, ntasks))
  @info "Running locally" ntasks
  manager = Distributed.LocalManager(ntasks, false)
end
# Make sure the messages above are written out before workers start.
flush(stderr)

# Launch workers, each with `cpus_per_task` threads. The `--threads=N` form
# keeps flag and value in a single, unambiguous command-line argument.
addprocs(manager; exeflags = ["--threads=$cpus_per_task"])

# Configure logging identically on the primary and on every worker.
@everywhere begin
  import Dates
  using Logging, LoggingExtras
  const date_format = "HH:MM:SS"

  # Wrap `logger` so that only records at `Logging.Info` or above pass through
  # and every message is prefixed with the current time (formatted with
  # `date_format`) and the id of the emitting process.
  function dagger_logger(logger)
    logger = MinLevelLogger(logger, Logging.Info)
    logger = TransformerLogger(logger) do log
      merge(log, (; message = "$(Dates.format(Dates.now(), date_format)) ($(myid())) $(log.message)"))
    end
    return logger
  end

  # Set the global logger: a ConsoleLogger unless stderr is an IOStream, in
  # which case a FileLogger that flushes after every record is used. The sink
  # is passed through `dagger_logger` so the level filter and the
  # time/process prefix defined above actually take effect (previously the
  # helper was defined but never applied).
  if !(stderr isa IOStream)
    ConsoleLogger(stderr)
  else
    FileLogger(stderr, always_flush=true)
  end |> dagger_logger |> global_logger
end

# Report, on every process, how it was started.
@everywhere begin
  # Only workers (every process except id 1) announce their thread count.
  myid() == 1 || @info "Worker started" Base.Threads.nthreads()

  # Log the system image path from the runtime options and the active project.
  sysimg = unsafe_string(Base.JLOptions().image_file)
  project = Base.active_project()
  @info "Environment" sysimg project
end

# Evaluate code.jl on every process so its definitions exist everywhere
@everywhere include("code.jl")

code.jl then contains the actual code definitions.

And then I have a driver.jl, which contains the code that is executed on the primary process to manage the computation.

I then have a slurm script:

#!/bin/bash
# Batch script that starts the Julia primary process: setup.jl is passed via
# -L and driver.jl is the script that is run.
# --- Slurm resource request ---
#SBATCH --job-name=Example
#SBATCH --time=1:00:00
#SBATCH --mem=0
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=16

# Start from a clean module environment; all output of the purge is discarded
module purge &> /dev/null

module load julia

# Use the current working directory as the active Julia project
JULIA_PROJECT="$(pwd)"
export JULIA_PROJECT

# Record which host the primary process runs on
HOSTNAME="$(hostname)"
echo "Primary runs on ${HOSTNAME}"

julia -L setup.jl driver.jl
1 Like