I have problems getting CUDA-aware MPI running in Julia. I have a C++ example that works perfectly when I run it with mpiexec -n 2 ./alltoall_test. The equivalent Julia code, run with mpiexec -n 2 julia alltoall_test.jl, fails with a segmentation fault and the UCX error [1642930332.032032] [gcn19:4087661:0] gdr_copy_md.c:122 UCX ERROR gdr_pin_buffer failed. length :65536 ret:22.
I have set up MPI.jl to use the system MPI, and it reports the correct version, so my impression is that the problem lies on the CUDA side: CUDA.jl wants to download artefacts as soon as I use a CuArray, despite CUDA being installed on the system. My suspicion is that I am using a CUDA version that is incompatible with the one MPI was compiled against, but I do not know how to verify this. I am using the JULIA_CUDA_USE_BINARY_BUILDER=false setting.
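If it helps, a minimal check along these lines (assuming MPI.Get_library_version, MPI.has_cuda and CUDA.versioninfo are the relevant calls for this) should show which libraries Julia actually loads on each rank:
using MPI
using CUDA
MPI.Init()
# Version string of the libmpi that MPI.jl has actually loaded.
println(MPI.Get_library_version())
# Whether MPI.jl reports CUDA support for that library.
println("has_cuda: ", MPI.has_cuda())
# Which CUDA toolkit and driver CUDA.jl resolves to (artifact vs. local installation).
CUDA.versioninfo()
MPI.Finalize()
If the system MPI is really picked up, the first line should print the version string of that build rather than of an MPI binary Julia downloaded itself.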
The two test codes are:
using MPI
using CUDA
np = 2
MPI.Init()
comm = MPI.COMM_WORLD
mpiid = MPI.Comm_rank(comm)
print("The MPI rank is: $mpiid\n")
device!(mpiid)
print("The CUDA device is: $(device())\n")
n = 1024
data_cpu = rand(n)
data_out_cpu = similar(data_cpu)
data = CuArray(data_cpu)
data_out = similar(data)
# Test the alltoall on the CPU
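# With 1024 elements and 2 ranks, each 512-element block goes to one rank.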
mpi_data_cpu = MPI.UBuffer(data_cpu, 512)
mpi_data_out_cpu = MPI.UBuffer(data_out_cpu, 512)
@time MPI.Alltoall!(mpi_data_cpu, mpi_data_out_cpu, comm)
@time MPI.Alltoall!(mpi_data_cpu, mpi_data_out_cpu, comm)
# Test the alltoall on the GPU
print("$mpiid has CUDA: $(MPI.has_cuda())\n")
mpi_data = MPI.UBuffer(data, 512)
mpi_data_out = MPI.UBuffer(data_out, 512)
@time MPI.Alltoall!(mpi_data, mpi_data_out, comm)
@time MPI.Alltoall!(mpi_data, mpi_data_out, comm)
# Close the MPI.
MPI.Finalize()
and
#include <iostream>
#include <vector>
#include <mpi.h>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <chrono>
int main()
{
    MPI_Init(NULL, NULL);
    int n, id;
    MPI_Comm_size(MPI_COMM_WORLD, &n);
    MPI_Comm_rank(MPI_COMM_WORLD, &id);
    const size_t size_tot = 1024*1024*1024;
    const size_t size_max = size_tot / n;
    // CPU TEST
    std::vector<double> a_cpu_in (size_tot);
    std::vector<double> a_cpu_out(size_tot);
    std::fill(a_cpu_in.begin(), a_cpu_in.end(), id);
    std::cout << id << ": Starting CPU all-to-all\n";
    auto time_start = std::chrono::high_resolution_clock::now();
    MPI_Alltoall(
        a_cpu_in .data(), size_max, MPI_DOUBLE,
        a_cpu_out.data(), size_max, MPI_DOUBLE,
        MPI_COMM_WORLD);
    auto time_end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration<double, std::milli>(time_end-time_start).count();
    std::cout << id << ": Finished CPU all-to-all in " << std::to_string(duration) << " (ms)\n";
    // GPU TEST
    int id_local = id % 4;
    cudaSetDevice(id_local);
    double* a_gpu_in;
    double* a_gpu_out;
    cudaMalloc((void **)&a_gpu_in , size_tot * sizeof(double));
    cudaMalloc((void **)&a_gpu_out, size_tot * sizeof(double));
    cudaMemcpy(a_gpu_in, a_cpu_in.data(), size_tot*sizeof(double), cudaMemcpyHostToDevice);
    int id_gpu;
    cudaGetDevice(&id_gpu);
    std::cout << id << ", " << id_local << ", " << id_gpu << ": Starting GPU all-to-all\n";
    time_start = std::chrono::high_resolution_clock::now();
    MPI_Alltoall(
        a_gpu_in , size_max, MPI_DOUBLE,
        a_gpu_out, size_max, MPI_DOUBLE,
        MPI_COMM_WORLD);
    time_end = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration<double, std::milli>(time_end-time_start).count();
    std::cout << id << ", " << id_local << ", " << id_gpu << ": Finished GPU all-to-all in " << std::to_string(duration) << " (ms)\n";
    MPI_Finalize();
    return 0;
}
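For completeness, the C++ test is built with the MPI compiler wrapper and linked against the CUDA runtime, roughly mpicxx -O2 alltoall_test.cpp -o alltoall_test -lcudart (the exact wrapper name and CUDA include/library paths depend on the local installation, so take this as an approximation), and launched with mpiexec -n 2 ./alltoall_test as mentioned above.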