The code below follows the example in the CUDA.jl documentation: https://cuda.juliagpu.org/stable/api/kernel/#Element-access-and-broadcasting
using Test
using CUDA
# Host-side operands: A and B in half precision, C in single precision.
a = rand(Float16, 16, 16)
b = rand(Float16, 16, 16)
c = rand(Float32, 16, 16)

# Copy each operand to the GPU; the output buffer is allocated with the same
# element type and shape as c_dev.
a_dev, b_dev, c_dev = CuArray(a), CuArray(b), CuArray(c)
d_dev = similar(c_dev)
"""
    kernel(a_dev, b_dev, c_dev, d_dev)

Load fragments from `a_dev`, `b_dev` and `c_dev`, scale the `c` fragment by `0.5f0`,
run a WMMA multiply-accumulate on them, and store the result into `d_dev`.

All loads and the store use a `WMMA.Config{16, 16, 16, Float32}` configuration,
column-major layout and a stride of 16. Returns `nothing`.
"""
function kernel(a_dev, b_dev, c_dev, d_dev)
    cfg = WMMA.Config{16, 16, 16, Float32}
    layout = WMMA.ColMajor

    # Bring the three input operands into WMMA fragments.
    frag_a = WMMA.load_a(pointer(a_dev), 16, layout, cfg)
    frag_b = WMMA.load_b(pointer(b_dev), 16, layout, cfg)
    frag_c = WMMA.load_c(pointer(c_dev), 16, layout, cfg)

    # Scale the accumulator input element-wise before the multiply-accumulate.
    scaled_c = 0.5f0 .* frag_c

    frag_d = WMMA.mma(frag_a, frag_b, scaled_c, cfg)
    WMMA.store_d(pointer(d_dev), frag_d, 16, layout, cfg)

    return nothing
end
# WMMA intrinsics can only be compiled for GPUs with tensor cores, i.e. compute
# capability 7.0 (Volta) or newer. On older devices kernel compilation aborts with
# "LLVM error: Cannot select: intrinsic %llvm.nvvm.wmma...", so check the active
# device first and skip the test instead of crashing.
if capability(device()) >= v"7.0"
    # The kernel is launched with 32 threads, matching the original launch.
    @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev)
    d = Array(d_dev)
    @test all(isapprox.(a * b + 0.5 * c, d; rtol=0.01))
else
    @warn "Skipping WMMA test: device lacks tensor cores (needs compute capability >= 7.0)" capability = capability(device())
    # Recorded as a skipped test; the expression itself is not evaluated.
    @test_skip all(isapprox.(a * b + 0.5 * c, d; rtol=0.01))
end
Running this code produces the following error:
ERROR: LLVM error: Cannot select: intrinsic %llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32
Does anyone know the solution?
Kind regards