Hello
I have written a GPU kernel in Julia to accelerate a custom PyTorch layer. Everything works great, except when I try to use PyTorch's DistributedDataParallel (DDP). Here is a simple MWE that reproduces the issue.
main.py
from julia import Julia
jpath = "~path to julia~"
jl = Julia(runtime=jpath, compiled_modules=False)
from julia import Pkg
Pkg.activate("./matmul")
from julia import matmul
import torch
dev = '0'
x = torch.randn(1024, 1024).to(f"cuda:{dev}")
w = torch.randn(1024, 1024).to(f"cuda:{dev}")
y = torch.zeros(1024, 1024).to(f"cuda:{dev}")
matmul.MatMul(y, x, w, int(dev))  # launches the Julia CUDA kernel on cuda:0
yGolden = torch.matmul(w, x)  # DLPack presents the row-major tensors to Julia with dims reversed, so the kernel result corresponds to w @ x
print(torch.allclose(y, yGolden, atol=1e-2))
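(The device index is hard-coded to '0' only to keep the MWE small. In the full project each process would pick its GPU from the environment the launcher sets; a rough sketch of what I have in mind is below, using the LOCAL_RANK variable that torchrun provides, as mentioned in the warning further down.)
import os
import torch

# Sketch: per-process device selection when launched with torchrun,
# which sets LOCAL_RANK in the environment for each worker.
dev = os.environ.get("LOCAL_RANK", "0")
torch.cuda.set_device(int(dev))
x = torch.randn(1024, 1024, device=f"cuda:{dev}")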
matmul/src/matmul.jl
__precompile__()
module matmul

using DLPack
using PyCall
using CUDA

const torch = PyNULL()
const dl = PyNULL()

function __init__()
    copy!(torch, pyimport("torch"))
    copy!(dl, pyimport("torch.utils.dlpack"))
end

# Naive matmul kernel: one thread per output element, accumulating over the
# inner dimension (hard-coded to 1024 for this MWE).
function matrixMul(y, x, w)
    tx = threadIdx().x
    ty = threadIdx().y
    bx = blockIdx().x
    by = blockIdx().y
    row = tx + (bx - 1) * blockDim().x
    col = ty + (by - 1) * blockDim().y
    for i in 1:1024
        y[row, col] += x[row, i] * w[i, col]
    end
    return
end

# Wrap the incoming PyTorch tensors as CuArrays via DLPack (no copy) and
# launch the kernel on the requested device.
function MatMul(yTensor, xInTensor, wTensor, dev)
    device!(dev)
    y = DLPack.wrap(yTensor, o -> @pycall dl.to_dlpack(o)::PyObject)::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
    xIN = DLPack.wrap(xInTensor, o -> @pycall dl.to_dlpack(o)::PyObject)::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
    w = DLPack.wrap(wTensor, o -> @pycall dl.to_dlpack(o)::PyObject)::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
    ySize = size(y)
    tthreads = (16, 16)
    bblocks = (ceil(Int, ySize[1] / tthreads[1]), ceil(Int, ySize[2] / tthreads[2]))
    @cuda threads=tthreads blocks=bblocks matrixMul(y, xIN, w)
end

export MatMul

end # module matmul
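For anyone trying to reproduce this: the matmul project's only direct dependencies are CUDA, DLPack and PyCall. A sketch (from memory) of how I set up the environment from Python is below; the package names are the only part that matters.
# Rough one-time setup of the ./matmul environment (sketch from memory).
from julia import Julia
jl = Julia(compiled_modules=False)
from julia import Pkg
Pkg.activate("./matmul")
Pkg.add("CUDA")    # CuArray and kernel compilation
Pkg.add("DLPack")  # zero-copy wrapping of the PyTorch tensors
Pkg.add("PyCall")  # calling back into torch.utils.dlpack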
When I run this as python-jl main.py, it works. But when I launch it through the distributed launcher, python-jl -m torch.distributed.launch --nproc_per_node=1 main.py, I get the following error:
/opt/miniconda3/envs/torch/lib/python3.10/site-packages/torch/distributed/launch.py:180: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects `--local_rank` argument to be set, please
change it to read from `os.environ['LOCAL_RANK']` instead. See
https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
warnings.warn(
Activating project at `~/ddpJulia/matmul`
Traceback (most recent call last):
File "/ddpJulia/main.py", line 13, in <module>
matmul.MatMul(y, x, w, int(dev))
RuntimeError: <PyCall.jlwrap (in a Julia function called from Python)
JULIA: InvalidIRError: compiling MethodInstance for matmul.matrixMul(::CUDA.CuDeviceMatrix{Float32, 1}, ::CUDA.CuDeviceMatrix{Float32, 1}, ::CUDA.CuDeviceMatrix{Float32, 1}) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to pointerref(ptr::Core.LLVMPtr{T, A}, i::I, ::Val{align}) where {T, A, I, align} in LLVM.Interop at /.julia/packages/LLVM/Od0DH/src/interop/pointer.jl:9)
Stacktrace:
[1] unsafe_load
@ ~/.julia/packages/LLVM/Od0DH/src/interop/pointer.jl:85
[2] arrayref_bits
@ ~/.julia/packages/CUDA/tVtYo/src/device/array.jl:91
[3] #arrayref
@ ~/.julia/packages/CUDA/tVtYo/src/device/array.jl:85
[4] getindex
@ ~/.julia/packages/CUDA/tVtYo/src/device/array.jl:164
[5] getindex
@ ~/.julia/packages/CUDA/tVtYo/src/device/array.jl:176
[6] matrixMul
@ ~/ddpJulia/matmul/src/matmul.jl:23
Reason: unsupported dynamic function invocation (call to *)
Stacktrace:
[1] matrixMul
@ ~/ddpJulia/matmul/src/matmul.jl:23
Reason: unsupported dynamic function invocation (call to +)
Stacktrace:
[1] matrixMul
@ ~/ddpJulia/matmul/src/matmul.jl:23
Reason: unsupported dynamic function invocation (call to convert)
Stacktrace:
[1] setindex!
@ ~/.julia/packages/CUDA/tVtYo/src/device/array.jl:166
[2] setindex!
@ ~/.julia/packages/CUDA/tVtYo/src/device/array.jl:179
[3] matrixMul
@ ~/ddpJulia/matmul/src/matmul.jl:23
Reason: unsupported dynamic function invocation (call to pointerset(ptr::Core.LLVMPtr{T, A}, x::T, i::I, ::Val{align}) where {T, A, I, align} in LLVM.Interop at /.julia/packages/LLVM/Od0DH/src/interop/pointer.jl:46)
Stacktrace:
[1] unsafe_store!
@ ~/.julia/packages/LLVM/Od0DH/src/interop/pointer.jl:88
[2] arrayset_bits
@ ~/.julia/packages/CUDA/tVtYo/src/device/array.jl:134
[3] #arrayset
@ ~/.julia/packages/CUDA/tVtYo/src/device/array.jl:127
[4] setindex!
@ ~/.julia/packages/CUDA/tVtYo/src/device/array.jl:166
[5] setindex!
@ ~/.julia/packages/CUDA/tVtYo/src/device/array.jl:179
[6] matrixMul
@ ~/ddpJulia/matmul/src/matmul.jl:23
Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl
Stacktrace:
[1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, args::LLVM.Module)
@ GPUCompiler ~/.julia/packages/GPUCompiler/l8TxP/src/validation.jl:149
[2] macro expansion
@ ~/.julia/packages/GPUCompiler/l8TxP/src/driver.jl:415 [inlined]
[3] macro expansion
@ ~/.julia/packages/TimerOutputs/RsWnF/src/TimerOutput.jl:253 [inlined]
[4] macro expansion
@ ~/.julia/packages/GPUCompiler/l8TxP/src/driver.jl:414 [inlined]
[5] emit_llvm(job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, only_entry::Bool, validate::Bool)
@ GPUCompiler ~/.julia/packages/GPUCompiler/l8TxP/src/utils.jl:89
[6] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing)
@ GPUCompiler ~/.julia/packages/GPUCompiler/l8TxP/src/driver.jl:129
[7] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, toplevel::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool)
@ GPUCompiler ~/.julia/packages/GPUCompiler/l8TxP/src/driver.jl:106
[8] compile
@ ~/.julia/packages/GPUCompiler/l8TxP/src/driver.jl:98 [inlined]
[9] #1037
@ ~/.julia/packages/CUDA/tVtYo/src/compiler/compilation.jl:104 [inlined]
[10] JuliaContext(f::CUDA.var"#1037#1040"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
@ GPUCompiler ~/.julia/packages/GPUCompiler/l8TxP/src/driver.jl:58
[11] compile(job::GPUCompiler.CompilerJob)
@ CUDA ~/.julia/packages/CUDA/tVtYo/src/compiler/compilation.jl:103
[12] actual_compilation(cache::Dict{Any, CUDA.CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/l8TxP/src/execution.jl:125
[13] cached_compilation(cache::Dict{Any, CUDA.CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/l8TxP/src/execution.jl:103
[14] macro expansion
@ ~/.julia/packages/CUDA/tVtYo/src/compiler/execution.jl:318 [inlined]
[15] macro expansion
@ ./lock.jl:223 [inlined]
[16] cufunction(f::typeof(matmul.matrixMul), tt::Type{Tuple{CUDA.CuDeviceMatrix{Float32, 1}, CUDA.CuDeviceMatrix{Float32, 1}, CUDA.CuDeviceMatrix{Float32, 1}}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA ~/.julia/packages/CUDA/tVtYo/src/compiler/execution.jl:313
[17] cufunction
@ ~/.julia/packages/CUDA/tVtYo/src/compiler/execution.jl:310 [inlined]
[18] macro expansion
@ ~/.julia/packages/CUDA/tVtYo/src/compiler/execution.jl:104 [inlined]
[19] MatMul(yTensor::PyObject, xInTensor::PyObject, wTensor::PyObject, dev::Int64)
@ matmul ~/ddpJulia/matmul/src/matmul.jl:38
[20] invokelatest(::Any, ::Any, ::Vararg{Any}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Base ./essentials.jl:729
[21] invokelatest(::Any, ::Any, ::Vararg{Any})
@ Base ./essentials.jl:726
[22] _pyjlwrap_call(f::Function, args_::Ptr{PyCall.PyObject_struct}, kw_::Ptr{PyCall.PyObject_struct})
@ PyCall ~/.julia/packages/PyCall/SBNSg/src/callback.jl:28
[23] pyjlwrap_call(self_::Ptr{PyCall.PyObject_struct}, args_::Ptr{PyCall.PyObject_struct}, kw_::Ptr{PyCall.PyObject_struct})
@ PyCall ~/.julia/packages/PyCall/SBNSg/src/callback.jl:44>
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 49569) of binary: /opt/miniconda3/envs/torch/bin/python
Traceback (most recent call last):
File "/opt/miniconda3/envs/torch/lib/python3.10/site-packages/julia/pseudo_python_cli.py", line 308, in main
python(**vars(ns))
File "/opt/miniconda3/envs/torch/lib/python3.10/site-packages/julia/pseudo_python_cli.py", line 54, in python
scope = runpy.run_module(module, run_name="__main__", alter_sys=True)
File "/opt/miniconda3/envs/torch/lib/python3.10/runpy.py", line 224, in run_module
return _run_module_code(code, init_globals, run_name, mod_spec)
File "/opt/miniconda3/envs/torch/lib/python3.10/runpy.py", line 96, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "/opt/miniconda3/envs/torch/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/opt/miniconda3/envs/torch/lib/python3.10/site-packages/torch/distributed/launch.py", line 195, in <module>
main()
File "/opt/miniconda3/envs/torch/lib/python3.10/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/opt/miniconda3/envs/torch/lib/python3.10/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/opt/miniconda3/envs/torch/lib/python3.10/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/miniconda3/envs/torch/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/miniconda3/envs/torch/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
main.py FAILED
This MWE may look contrived, but the same structure makes more sense in my larger project. Since I intend to train and run inference with large neural networks, I need DDP for multi-GPU and multi-node runs. Any help understanding what is causing this InvalidIRError would be greatly appreciated.
Thanks!