Julia -> C function (Create thead) -> Julia CUDA kernel issue

I am facing a weird problem in our application. We have a julia function calling a C function, which is creating a pthread and calling Julia CUDA kernel. I have created a small example to illustrate and reproduce the problem. Unfortunately, we are not able to make it even more simpler. This is the simplest example to reproduce the issue.

The "call_c_function_direct” Julia function calls the C function “c_function_direct”, which calls Julia CUDA kernel “ccall_saxpy() -> saxpy_kernel()”. It works without any issue.

However, when I create a pthread inside C function and call Julia CUDA kernel, it hangs the execution and no useful stack trace is available.

The "call_c_function_pthread” Julia function calls the C function “c_function_pthread”, which creates a pthread and calls Julia CUDA kernel “ccall_saxpy() -> saxpy_kernel()”. It hangs the execution when it calls @cuda saxpy_kernel”.

To control the execution of Julia CUDA kernel either through direct or pthread based, a Julia variable is added in the file “julia_cuda.jl” with “direct = true”. You can set it to false to run using pthread.

C code: c_cuda.c

#include <julia.h>
#include <pthread.h>
#include <cuda_runtime.h>
#include <cuda.h>

typedef struct {
    float *A;
    float *B;
    float *C;
    float alpha;
    int n;
} thread_data_t;

// Declare the Julia function
typedef void (*julia_saxpy_t)(void *ctx, int device, float *A, float *B, float *C, float alpha, int n);

julia_saxpy_t ccall_saxpy = NULL;
CUcontext cuContext;
CUdevice cuDevice;

void initialize_c(void *func) {
    jl_init();
    ccall_saxpy = (julia_saxpy_t) func;
    cuInit(0);
    cuDeviceGet(&cuDevice, 0);
    cuCtxCreate(&cuContext, 0, cuDevice);
    printf("C CUDA ctx: %p\n", cuContext);
}

void c_println(char *data) {
    printf("%s\n", data);
}
void call_cuda_saxpy(float *A, float *B, float *C, float alpha, int n) {
    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cuCtxSetCurrent(cuContext);
    cudaMalloc((void**)&d_A, n * sizeof(float));
    cudaMalloc((void**)&d_B, n * sizeof(float));
    cudaMalloc((void**)&d_C, n * sizeof(float));

    // Copy data from host to device
    cudaMemcpy(d_A, A, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, n * sizeof(float), cudaMemcpyHostToDevice);

    // Call the Julia function
    ccall_saxpy((void *)&cuContext, (int)cuDevice, d_A, d_B, d_C, alpha, n);

    // Copy result from device to host
    cudaMemcpy(C, d_C, n * sizeof(float), cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

void *thread_function(void *arg) {
    thread_data_t *data = (thread_data_t *)arg;
    call_cuda_saxpy(data->A, data->B, data->C, data->alpha, data->n);
    return NULL;
}
void c_function_pthread(float *A, float *B, float *C, float alpha, int n) {
    pthread_t thread;
    thread_data_t data = {A, B, C, alpha, n};
    pthread_create(&thread, NULL, thread_function, &data);
    pthread_join(thread, NULL);
}
void c_function_direct(float *A, float *B, float *C, float alpha, int n) {
    call_cuda_saxpy(A, B, C, alpha, n);
}

Julia code: julia_code.jl

using CUDA

# Define the CUDA kernel for saxpy
function saxpy_kernel(A, B, C, alpha)
    i = threadIdx().x
    if i <= length(A)
        C[i] = alpha * A[i] + B[i]
    end
    return
end

# Make the Julia function callable from C
export ccall_saxpy
function ccall_saxpy(ctx::Ptr{Cvoid}, device::Cint, A::Ptr{Float32}, B::Ptr{Float32}, C::Ptr{Float32}, alpha::Cfloat, n::Cint)::Cvoid
    cu_ctx = unsafe_load(reinterpret(Ptr{CuContext}, ctx))
    c_println("CUDA ctx: $cu_ctx Device:$device")
    CUDA.context!(cu_ctx)
    CUDA.device!(device)
    size_dims=Tuple(Int64[n])
    A_array = unsafe_wrap(CuArray, reinterpret(CuPtr{Float32}, A), size_dims, own=false)
    B_array = unsafe_wrap(CuArray, reinterpret(CuPtr{Float32}, B), size_dims, own=false)
    C_array = unsafe_wrap(CuArray, reinterpret(CuPtr{Float32}, C), size_dims, own=false)
    c_println("A: $A_array, B:$B_array, C:$C_array Alpha:$alpha")
    c_println("Calling CUDA function")
    CUDA.@sync @cuda threads=size_dims saxpy_kernel(A_array, B_array, C_array, alpha)
    c_println("CUDA call completed")
end

# Initialize the C function
function initialize_c_function()
    func_ptr = @cfunction(ccall_saxpy, Cvoid, (Ptr{Cvoid}, Cint, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cfloat, Cint))
    global stored_func_ptr = func_ptr
    ccall((:initialize_c, "libcfunction"), Cvoid, (Ptr{Cvoid},), func_ptr)
end

function c_println(data::String)::Cvoid
    ccall((:c_println, "libcfunction"), Cvoid, (Ptr{Cchar},), pointer(data))
end

# Define a function to call the C function
function call_c_function_pthread(A::Vector{Float32}, B::Vector{Float32}, C::Vector{Float32}, alpha::Float32, n::Cint)
    ccall((:c_function_pthread, "./libcfunction.so"), Cvoid,
          (Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Float32, Cint),
          A, B, C, alpha, n)
end
# Define a function to call the C function
function call_c_function_direct(A::Vector{Float32}, B::Vector{Float32}, C::Vector{Float32}, alpha::Float32, n::Cint)
    ccall((:c_function_direct, "./libcfunction.so"), Cvoid,
          (Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Float32, Cint),
          A, B, C, alpha, n)
end

# Example usage
A = Float32[1.0, 2.0, 3.0]
B = Float32[4.0, 5.0, 6.0]
C = Float32[0.0, 0.0, 0.0]
alpha = 2.0f0
n = 3
#direct = true
direct = false
initialize_c_function()
# Call the C function
if direct
    call_c_function_direct(A, B, C, alpha, Int32(n))
else
    call_c_function_pthread(A, B, C, alpha, Int32(n))
end
# Print the result
println("Result: $C")

Build commands:

$ gcc -g -O0 -fPIC -shared  -o libcfunction.so c_cuda.c -I$(JULIA)/include/julia -L$(JULIA)/lib -ljulia -lpthread -I$(NVHPC_ROOT)/cuda/include -L$(NVHPC_ROOT)/cuda/lib64  -lcuda -lcudart
$ julia julia_cuda.jl

I currently don’t have time to look at this, but I would double check that Julia and C are linking against the same CUDA.

You can tell CUDA.jl to set the version it ought to use.

Secondly, the context management might be an issue. Julia handles CUDA contexts internally and I have forgotten what the requirements for memory is…

In any case if the program hangs you should be able to attach GDB and print a backtrace to see where.