I wonder why CUDA's `mul!` performance isn't very good for `Int32` matrices.
using CUDA
N = 2^10
A = CUDA.ones(Int32, N, N)
B = CUDA.ones(Int32, N, N)
# Warm up first: the first call JIT-compiles the kernel (CUBLAS has no integer
# gemm, so the Int32 path compiles a native Julia fallback), which would
# otherwise dominate the first `CUDA.@time` and skew the comparison.
A * B
CUDA.@time A * B
A = CUDA.ones(Float32, N, N)
B = CUDA.ones(Float32, N, N)
A * B  # warm up the Float32 (CUBLAS) path as well, for a fair comparison
CUDA.@time A * B
which results in:
0.007594 seconds (103 CPU allocations: 3.453 KiB) (1 GPU allocation: 4.000 MiB, 0.23% memmgmt time)
0.001069 seconds (74 CPU allocations: 2.000 KiB) (3 GPU allocations: 4.000 MiB, 2.31% memmgmt time)
This makes even a simple hand-written tiled kernel faster than `mul!`:
using CUDA,KernelAbstractions,BenchmarkTools,LinearAlgebra
# Tiled matrix multiply: C = A*B, each workgroup computes one gs×gs tile of C,
# staging gs×gs tiles of A and B through shared (local) memory.
@kernel inbounds=true unsafe_indices=true cpu=false function matrixMul!(C, @Const(A), @Const(B))
    gs, _ = @groupsize()
    N, _ = @ndrange()
    # One shared-memory tile each for A and B, reused across the k-loop.
    a = KernelAbstractions.@localmem eltype(C) (gs, gs)
    b = KernelAbstractions.@localmem eltype(C) (gs, gs)
    row, col = @index(Global, NTuple) # global element this thread computes
    ig, jg = @index(Local, NTuple)    # position within the workgroup
    tmp = zero(eltype(C))
    for i in Base.OneTo((N + gs - 1) ÷ gs)
        id = (i - 1) * gs
        # Cooperative load of the current tiles. NOTE(review): with
        # inbounds=true this assumes N is a multiple of gs; for other N these
        # reads would be out of bounds — confirm before generalizing.
        a[ig, jg] = A[row, id + jg]
        b[ig, jg] = B[id + ig, col]
        # Wait until the whole workgroup has finished loading the tiles.
        KernelAbstractions.@synchronize()
        for j in Base.OneTo(gs)
            tmp += a[ig, j] * b[j, jg]
        end
        # BUG FIX: barrier must sit INSIDE the loop, after the inner product.
        # Without it, fast threads start the next iteration and overwrite
        # a/b while slower threads are still reading them (a data race).
        # The original had this @synchronize after the loop, where it
        # protects nothing — `tmp` lives in a register.
        KernelAbstractions.@synchronize()
    end
    C[row, col] = tmp
end
"""
    run_kernel(kernel, C, A, B, N)

Launch `kernel` over an `N`×`N` ndrange writing into `C`, then block until
all queued GPU work has completed so the call can be timed meaningfully.
"""
function run_kernel(kernel, C, A, B, N)
    kernel(C, A, B; ndrange = (N, N))
    CUDA.synchronize()
    return nothing
end
"""
    main(; N = 2^10, T = Int32, tile = 2^4)

Benchmark the hand-written tiled kernel against `LinearAlgebra.mul!` for
`N`×`N` matrices of ones with element type `T`, using `tile`×`tile`
workgroups. Prints the two timings and whether the results agree, and
returns the two result matrices `(C1, C2)`.

Defaults reproduce the original hard-coded configuration, so `main()` is
backward compatible.
"""
function main(; N = 2^10, T = Int32, tile = 2^4)
    A = CUDA.ones(T, N, N)
    B = CUDA.ones(T, N, N)
    C1 = CUDA.zeros(T, N, N)
    C2 = CUDA.zeros(T, N, N)
    bck = get_backend(C1)
    # Instantiate the kernel with a fixed tile×tile workgroup size.
    kernel = matrixMul!(bck, (tile, tile))
    println("kernel")
    @btime run_kernel($kernel, $C1, $A, $B, $N)
    println("mul!")
    @btime begin
        mul!($C2, $A, $B)
        CUDA.synchronize() # mul! is asynchronous; include completion in the timing
    end
    # Sanity check: the custom kernel must match the library result.
    println(C1 ≈ C2)
    return C1, C2
end
main(); # run the benchmark; `;` suppresses printing the returned matrices
kernel
2.158 ms (58 allocations: 1.75 KiB)
mul!
3.138 ms (98 allocations: 3.25 KiB)
true