Hi,
I am learning to use GPU in Julia. From the limited tutorials, I notice that it is possible to write cuda kernels in Julia directly. Then I tried the simple naïve matrix multiplication in CUDA and converted it into Julia:
# Matrix multiplication in GPU Julia
using CUDAnative
""" Compute C = A * B """
function kernel_matmul(C, A, B, m, n, p)
tx = (blockIdx().x-1) * blockDim().x + threadIdx().x
ty = (blockIdx().y-1) * blockDim().y + threadIdx().y
Cvalue = 0.0
if (tx < n) && (ty < m)
for k = 1:p
Cvalue += A[ty*p + k]*B[k*n + tx]
end
# Write the matrix to device memory; each thread writes one element
C[ty*n + tx] = Cvalue
end
return nothing
end
using CUDAdrv, CuArrays
using Test
# Turn this on for speed
#CuArrays.allowscalar(false)
precision = Float32
m, n, p = 8, 8, 8 # matrix sizes: C[m,n] = A[m,p] * B[p,n]
A, B, C = rand(precision,m,p), rand(precision,p,n), rand(precision,m,n)
Ad, Bd, Cd = CuArray(A), CuArray(B), CuArray(C)
# CPU
C = A*B
# CUBLAS
Cd = Ad * Bd
TILE_WIDTH = 4
blocks = (TILE_WIDTH,TILE_WIDTH,1)
threads = (floor(Int,(n+blocks[1]-1)/blocks[1]),
floor(Int,(m+blocks[2]-1)/blocks[2]),
1)
@cuda blocks=blocks threads=threads kernel_matmul(Cd, Ad, Bd, m, n, p)
@assert C == Cd
However, it showed me the error:
ERROR: a exception was thrown during kernel execution.
Run Julia on debug level 2 for device stack traces.
ERROR: LoadError:
Stacktrace:
[1] cuMemcpyDtoH_v2(::Ptr{Float32}, ::CuPtr{Float32}, ::Int64) at C:\Users\henry\.julia\packages\CUDAdrv\3EzC1\src\error.jl:123
[2] #unsafe_copyto!#5 at C:\Users\henry\.julia\packages\CUDAdrv\3EzC1\src\memory.jl:285 [inlined]
[3] unsafe_copyto! at C:\Users\henry\.julia\packages\CUDAdrv\3EzC1\src\memory.jl:278 [inlined]
[4] copyto!(::Array{Float32,1}, ::Int64, ::CuArray{Float32,2,Nothing}, ::Int64, ::Int64) at C:\Users\henry\.julia\packages\CuArrays\7z7MV\src\array.jl:265
[5] _getindex at C:\Users\henry\.julia\packages\GPUArrays\0lvhc\src\indexing.jl:49 [inlined]
[6] getindex at C:\Users\henry\.julia\packages\GPUArrays\0lvhc\src\indexing.jl:55 [inlined]
[7] iterate at .\abstractarray.jl:914 [inlined]
[8] iterate at .\abstractarray.jl:912 [inlined]
[9] _zip_iterate_some at .\iterators.jl:350 [inlined]
[10] _zip_iterate_some at .\iterators.jl:352 [inlined]
[11] _zip_iterate_all at .\iterators.jl:342 [inlined]
[12] iterate at .\iterators.jl:332 [inlined]
[13] ==(::Array{Float32,2}, ::CuArray{Float32,2,Nothing}) at .\abstractarray.jl:1775
[14] top-level scope at D:\scripts\GPU_collection\julia\GPU_MatMuliply.jl:53
[15] include at .\boot.jl:328 [inlined]
[16] include_relative(::Module, ::String) at .\loading.jl:1105
[17] include(::Module, ::String) at .\Base.jl:31
[18] include(::String) at .\client.jl:424
[19] top-level scope at none:0ERROR: KernelException: exception thrown during kernel execution on device GeForce GTX 1080
Stacktrace:
[1] check_exceptions() at C:\Users\henry\.julia\packages\CUDAnative\2WQzk\src\exceptions.jl:55
[2] check_exception_hook(::Symbol) at C:\Users\henry\.julia\packages\CUDAnative\2WQzk\src\exceptions.jl:66
[3] macro expansion at C:\Users\henry\.julia\packages\CUDAdrv\3EzC1\src\error.jl:119 [inlined]
[4] cuGetErrorString(::CuError, ::Base.RefValue{Cstring}) at C:\Users\henry\.julia\packages\CUDAdrv\3EzC1\src\libcuda.jl:6
[5] description at C:\Users\henry\.julia\packages\CUDAdrv\3EzC1\src\error.jl:65 [inlined]
[6] showerror(::IOContext{Base.GenericIOBuffer{Array{UInt8,1}}}, ::CuError) at C:\Users\henry\.julia\packages\CUDAdrv\3EzC1\src\error.jl:71
[7] (::Base.var"#657#658"{CuError})(::IOContext{Base.GenericIOBuffer{Array{UInt8,1}}}) at .\errorshow.jl:76
[8] #with_output_color#707(::Bool, ::typeof(Base.with_output_color), ::Function, ::Symbol, ::IOContext{Base.TTY}) at .\util.jl:365
[9] with_output_color(::Function, ::Symbol, ::IOContext{Base.TTY}) at .\util.jl:363
[10] #showerror#656(::Bool, ::typeof(showerror), ::IOContext{Base.TTY}, ::CuError, ::Array{Base.StackTraces.StackFrame,1}) at .\errorshow.jl:75
[11] (::Base.var"#kw##showerror")(::NamedTuple{(:backtrace,),Tuple{Bool}}, ::typeof(showerror), ::IOContext{Base.TTY}, ::CuError, ::Array{Base.StackTraces.StackFrame,1}) at .\none:0
[12] #showerror#659(::Bool, ::typeof(showerror), ::IOContext{Base.TTY}, ::LoadError, ::Array{Base.StackTraces.StackFrame,1}) at .\errorshow.jl:85
[13] showerror at .\errorshow.jl:84 [inlined]
[14] display_error(::Base.TTY, ::LoadError, ::Array{Base.StackTraces.StackFrame,1}) at C:\Users\henry\.julia\packages\Atom\lBERI\src\repl.jl:138
[15] (::Atom.var"#172#174"{Module})() at C:\Users\henry\.julia\packages\Atom\lBERI\src\repl.jl:177
[16] with_logstate(::Atom.var"#172#174"{Module}, ::Base.CoreLogging.LogState) at .\logging.jl:395
[17] with_logger at .\logging.jl:491 [inlined]
[18] evalrepl(::Module, ::String) at C:\Users\henry\.julia\packages\Atom\lBERI\src\repl.jl:162
[19] top-level scope at C:\Users\henry\.julia\packages\Atom\lBERI\src\repl.jl:207
[20] eval(::Module, ::Any) at .\boot.jl:330
[21] eval_user_input(::Any, ::REPL.REPLBackend) at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.3\REPL\src\REPL.jl:86
[22] macro expansion at D:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.3\REPL\src\REPL.jl:118 [inlined]
[23] (::REPL.var"#26#27"{REPL.REPLBackend})() at .\task.jl:333
What is the error here?