I am learning GPU programming via CUDA.jl. I want to implement an efficient LU factorization for matrices over a finite field (I represent the field elements as UInt8/UInt16/... with addition and multiplication done mod p).
For simplicity, however, in this post I will work over floats and focus solely on the rank-1 update (the real bottleneck), so no pivoting etc. The simplified CPU code is:
function _rank1_update!(X::Matrix{tw}, k, m, n) where {tw}
    @inbounds for j = k+1:n
        @simd for i = k+1:m
            X[i,j] -= X[i,k] * X[k,j]
        end
    end
end;
function main_fn(X)
    m, n = size(X)
    for k = 1:min(m, n)
        _rank1_update!(X, k, m, n)
    end
end;
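For intuition, each call is just the trailing-submatrix (Schur complement) update written entrywise; an equivalent vectorized one-liner (allocating, shown purely for clarity) would be:
# equivalent to _rank1_update!(X, k, m, n), up to temporaries:
X[k+1:m, k+1:n] .-= X[k+1:m, k] .* X[k:k, k+1:n]  # outer product of pivot column and pivot row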
With ChatGPT's help, I transformed this into CUDA code in 3 different ways (parallelizing over all entries / over columns / over rows); all three kernels give the correct result:
using LinearAlgebra, CUDA
@inline _idx1D() = (blockIdx().x-Int32(1)) * blockDim().x + threadIdx().x;  # global 1-based thread index
@inline _idx2D() = (blockIdx().y-Int32(1)) * blockDim().y + threadIdx().y, (blockIdx().x-Int32(1)) * blockDim().x + threadIdx().x;  # global 1-based (row, col) from the y/x launch dimensions
function _kernel1!(X::CuDeviceMatrix{tw}, k, m, n) where {tw}
    @inbounds begin
        i, j = _idx2D(); i += k; j += k
        !(k < i ≤ m && k < j ≤ n) && return
        X[i,j] -= X[i,k] * X[k,j]
        return
    end
end;
function _kernel2!(X::CuDeviceMatrix{tw}, k, m, n) where {tw}
    @inbounds begin
        j = k + _idx1D(); !(k < j ≤ n) && return
        xkj = X[k,j]
        i = k + 1
        while i ≤ m
            X[i,j] -= X[i,k] * xkj
            i += 1
        end
        return
    end
end;
function _kernel3!(X::CuDeviceMatrix{tw}, k, m, n) where {tw}
    @inbounds begin
        i = k + _idx1D(); !(k < i ≤ m) && return
        xik = X[i,k]
        j = k + 1
        while j ≤ n
            X[i,j] -= xik * X[k,j]
            j += 1
        end
        return
    end
end;
function _rank1_update1!(X::CuMatrix{tw}, k, m, n) where {tw}
    k == min(m, n) && return nothing
    ker = @cuda launch=false always_inline=true _kernel1!(X, k, m, n)
    Φ = CUDA.launch_configuration(ker.fun)  # use the compiler/Φ to determine the optimal number of threads for this input
    t = Φ.threads; h = m - k; w = n - k; r = w / h
    tx = clamp(round(Int, sqrt(t * r)), 1, w); bx = cld(w, tx)
    ty = clamp(round(Int, sqrt(t / r)), 1, h); by = cld(h, ty)
    CUDA.@sync ker(X, k, m, n; threads=(tx, ty), blocks=(bx, by))
end;
function _rank1_update2!(X::CuMatrix{tw}, k, m, n) where {tw}
    k == min(m, n) && return nothing
    ker = @cuda launch=false always_inline=true _kernel2!(X, k, m, n)
    Φ = CUDA.launch_configuration(ker.fun)
    t = clamp(Φ.threads, Int32(1), n - k); b = cld(n - k, t)
    CUDA.@sync ker(X, k, m, n; threads=t, blocks=b)
end;
function _rank1_update3!(X::CuMatrix{tw}, k, m, n) where {tw}
    k == min(m, n) && return
    ker = @cuda launch=false always_inline=true _kernel3!(X, k, m, n)
    Φ = CUDA.launch_configuration(ker.fun)
    t = clamp(Φ.threads, Int32(1), m - k); b = cld(m - k, t)
    CUDA.@sync ker(X, k, m, n; threads=t, blocks=b)
end;
function main_fn!(X::CuMatrix{tw}, mtd::Int=3) where {tw}
    m, n = size(X)
    for k = 1:min(m, n)
        if mtd == 1
            _rank1_update1!(X, Int32(k), Int32(m), Int32(n))
        elseif mtd == 2
            _rank1_update2!(X, Int32(k), Int32(m), Int32(n))
        elseif mtd == 3
            _rank1_update3!(X, Int32(k), Int32(m), Int32(n))
        end
    end
end;
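(Correctness was checked against the CPU version along these lines; small sizes, with a loose tolerance since muladd contraction on the GPU can reorder float operations:)
using Test
A = rand(-1:Float32(1e-6):1, 64, 48)            # small test case, same distribution as the benchmarks below
Xref = copy(A); main_fn(Xref)                   # CPU reference
for mtd in 1:3
    Xg = cu(copy(A)); main_fn!(Xg, mtd)
    @test isapprox(Array(Xg), Xref; rtol=1f-3)  # all three kernels match the CPU loop
end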
Benchmarking on an NVIDIA GeForce RTX 2080 Ti shows that mtd=3 is the fastest of the three, but it is still over two orders of magnitude slower than the built-in lu!:
julia> m=n=30_000; X = cu(rand(-1:Float32(1e-6):1,m,n)); CUDA.@time lu!(X);
2.179116 seconds (73 CPU allocations: 3.000 KiB, 7.00% gc time) (1 GPU allocation: 117.188 KiB, 0.00% memmgmt time)
julia> X = cu(rand(-1:Float32(1e-6):1,m,n)); CUDA.@time main_fn!(X,1);
527.408564 seconds (1.47 M CPU allocations: 60.880 MiB)
julia> X = cu(rand(-1:Float32(1e-6):1,m,n)); CUDA.@time main_fn!(X,2);
1386.822122 seconds (389.96 k CPU allocations: 7.783 MiB, 1.05% gc time)
julia> X = cu(rand(-1:Float32(1e-6):1,m,n)); CUDA.@time main_fn!(X,3);
294.605763 seconds (389.13 k CPU allocations: 7.769 MiB, 0.05% gc time) (0.00% GPU memmgmt time)
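Back-of-the-envelope (my own arithmetic, to quantify the gap): the updates total roughly 2/3·n^3 flops, so
flops = 2/3 * 30_000^3   # ≈ 1.8e13 flops in total
flops / 2.179            # lu!:     ≈ 8.3e12 flop/s (~8 TFLOP/s)
flops / 294.6            # mtd = 3: ≈ 6.1e10 flop/s (~60 GFLOP/s)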
I applied several of the official performance tips: passing Int32 instead of Int64 indices, maxregs=32, always_inline=true, and replacing for j = k+1:n with a while loop, but none of it helped.
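(The maxregs experiment is not shown in the code above; it was just the launch line with CUDA.jl's maxregs compiler keyword added, i.e.
ker = @cuda launch=false always_inline=true maxregs=32 _kernel3!(X, k, m, n)
with everything else unchanged.)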
- Most of all, I would appreciate any suggestions on how to improve the performance of mtd=3.
- Why are there so many CPU allocations, and how can I cut down on them?
- Why is mtd=3 (row-parallel) more efficient than mtd=2 (column-parallel)? I thought CuMatrix is column-major.
Also, note that if in main_fn! I replace _rank1_update3! with just the functions for finding the pivot, swapping the row and column, and scaling the pivot column (all of them GPU kernels), execution is very fast (about 4 s on a 100K x 100K matrix). So the problem must lie in how _rank1_update3! is implemented, not in launching a kernel many times; a schematic of that fast variant follows below.
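Schematically, that fast variant looks as follows (the helper names are placeholders for my actual GPU kernels, which I omit here):
function main_fn_pivot_only!(X::CuMatrix{tw}) where {tw}  # illustrative sketch only
    m, n = size(X)
    for k = 1:min(m, n)
        p = _find_pivot(X, Int32(k), Int32(m), Int32(n))  # GPU kernel (placeholder name)
        _swap_row_col!(X, Int32(k), p)                    # GPU kernel (placeholder name)
        _scale_pivot_col!(X, Int32(k), Int32(m))          # GPU kernel (placeholder name)
        # _rank1_update3! deliberately omitted: this loop finishes in ~4 s at 100K x 100K
    end
end;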