Hello everyone!
I have worked quite a bit on setting up a robust foundation for structuring my code and computation loop, with quite a few good pointers from you guys on the forum, so I hope to be able to learn more from you
I have a self-contained MWE with automatic benchmarking produced here:
Code Block (MWE)
# Packages
using CUDA
using Parameters
using StaticArrays
using Adapt
using LinearAlgebra
using BenchmarkTools
# Constants
const T = Float32
const ST = 3
const H = T(0.5)
const αD = T((7/(4*π*H^2)))
const ϵ = T(1e-6)
const NP = 10^4
const NL = 10^7
# Main Struct for Gradient Values
@with_kw struct ∇ᵢWᵢⱼStruct{T,ST}
NL::Base.RefValue{Int}
MaxValidIndex::Base.RefValue{Int} = deepcopy(NL)
# Input for calculation
xᵢⱼ ::AbstractVector{SVector{ST,T}} = zeros(SVector{ST,T},NL[])
xᵢⱼ² ::AbstractVector{T} = similar(xᵢⱼ,T,NL[])
dᵢⱼ ::AbstractVector{T} = similar(xᵢⱼ,T,NL[])
qᵢⱼ ::AbstractVector{T} = similar(xᵢⱼ,T,NL[])
∇ᵢWᵢⱼ ::AbstractVector{SVector{ST,T}} = similar(xᵢⱼ,NL[])
end
# Helper struct to convert onto GPU
struct ∇ᵢWᵢⱼDeviceStruct{ST,T}
xᵢⱼ ::CuDeviceVector{SVector{ST,T}, 1}
xᵢⱼ² ::CuDeviceVector{T, 1}
dᵢⱼ ::CuDeviceVector{T, 1}
qᵢⱼ ::CuDeviceVector{T, 1}
∇ᵢWᵢⱼ ::CuDeviceVector{SVector{ST,T}, 1}
end
# Helper function to resize the specific structs and
# its arrays as how I want it to happen
function Base.resize!(object::∇ᵢWᵢⱼStruct, N::Integer)
# If the current N is larger than the allocated size
# then reallocate a new array with size of N
if N > object.NL[]
object.NL[] = N
for P in propertynames(object)
arr = getfield(object, P)
if isa(arr,AbstractVector)
resize!(arr, N)
fill!(arr,zero(eltype(arr)))
end
end
else
# If the current N is <= to the allocated size
# keep the current array, update MaxValidIndex
# and reset values up to that index instead
object.MaxValidIndex[] = N
for P in propertynames(object)
arr = getfield(object, P)
if P !== :xᵢⱼ
if isa(arr,AbstractVector)
fill!(@view(arr[1:N]),zero(eltype(arr)))
end
end
end
end
return nothing
end
# Adapt Structure for Conversion to GPU
Adapt.adapt_structure(to, s::∇ᵢWᵢⱼStruct) = ∇ᵢWᵢⱼDeviceStruct(
adapt(to, s.xᵢⱼ ),
adapt(to, s.xᵢⱼ² ),
adapt(to, s.dᵢⱼ ),
adapt(to, s.qᵢⱼ ),
adapt(to, s.∇ᵢWᵢⱼ),
)
# Simple Grad Version
function GradVersion0(V::∇ᵢWᵢⱼStruct)
NV = V.MaxValidIndex[]
V.xᵢⱼ²[1:NV] .= dot.(@view(V.xᵢⱼ[1:NV]),@view(V.xᵢⱼ[1:NV]))
V.dᵢⱼ[1:NV] .= sqrt.(@view(V.xᵢⱼ²[1:NV]))
V.qᵢⱼ[1:NV] .= @view(V.dᵢⱼ[1:NV]) ./ H
@. V.∇ᵢWᵢⱼ[1:NV] = αD *5 * (@view(V.qᵢⱼ[1:NV])- 2)^3 * @view(V.qᵢⱼ[1:NV]) / (8H * (@view(V.qᵢⱼ[1:NV]) * H + ϵ)) * @view(V.xᵢⱼ[1:NV])
return nothing
end
### --------------------------- Writing Our Own Custom Kernel ---------------------------
# Actual Calculation of Gradient in Kernel
# Per default
# CUDA.registers(kernel)
# 9
# We wish to lower as specified by https://github.com/JuliaComputing/Training/blob/master/AdvancedGPU/2-2-kernel_analysis_optimization.ipynb
function GPU_GRAD(V,Nmax)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x
for i = index:stride:Nmax
xdot = dot(V.xᵢⱼ[i],V.xᵢⱼ[i])
@inbounds V.xᵢⱼ²[i] += xdot
dist = sqrt(xdot)
@inbounds V.dᵢⱼ[i] += dist
q = dist/H
@inbounds V.qᵢⱼ[i] += q
wg = αD * 5 * (q - 2)^3 * q / (8H * (q * H + ϵ)) * V.xᵢⱼ[i]
@inbounds V.∇ᵢWᵢⱼ[i] += wg
end
return nothing
end
# Actual Function to Call for Kernel Gradient Calc
function GradVersion0_Kernel(V)
Nmax = V.MaxValidIndex[]
kernel = @cuda launch=false GPU_GRAD(V,Nmax)
config = launch_configuration(kernel.fun)
threads = min(Nmax, config.threads)
blocks = cld(Nmax, threads)
CUDA.@sync begin
kernel(V, Nmax; threads, blocks)
end
end
### --------------------------- Benchmarking ---------------------------
IniX = rand(SVector{ST,T},NL)
V_CPU = ∇ᵢWᵢⱼStruct{T,ST}(NL=Ref(NL),xᵢⱼ=IniX)
V_GPU_Simple = ∇ᵢWᵢⱼStruct{T,ST}(NL=Ref(NL),xᵢⱼ=CuArray(IniX))
V_GPU_Kernel = ∇ᵢWᵢⱼStruct{T,ST}(NL=Ref(NL),xᵢⱼ=CuArray(IniX))
@CUDA.time GradVersion0(V_CPU)
@CUDA.time GradVersion0(V_GPU_Simple)
@CUDA.time GradVersion0_Kernel(V_GPU_Kernel)
if isapprox(V_CPU.∇ᵢWᵢⱼ,Array(V_GPU_Simple.∇ᵢWᵢⱼ), rtol=1e-6) & isapprox(V_CPU.∇ᵢWᵢⱼ,Array(V_GPU_Kernel.∇ᵢWᵢⱼ), rtol=1e-6)
println("All three approaches produce the same result with rtol=1e-6.")
end
V_CPU_Bench = @benchmark GradVersion0($V_CPU)
V_GPU_Simple_Bench = @benchmark @CUDA.sync GradVersion0($V_GPU_Simple)
V_GPU_Kernel_Bench = @benchmark @CUDA.sync GradVersion0_Kernel($V_GPU_Kernel)
println("V_CPU_Bench")
display(V_CPU_Bench)
println("V_GPU_Simple_Bench")
display(V_GPU_Simple_Bench)
println("V_GPU_Kernel_Bench")
display(V_GPU_Kernel_Bench)
It runs through the same calculation on; CPU, GPU using high-level Julia abstractions and a third version using a GPU kernel. The timings of each are shown below:
All three approaches produce the same result with rtol=1e-6.
V_CPU_Bench
BenchmarkTools.Trial: 60 samples with 1 evaluation.
Range (min … max): 68.682 ms … 122.078 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 78.529 ms ┊ GC (median): 0.00%
Time (mean ± σ): 83.543 ms ± 13.425 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
▂ ▂▅▂▅█ ▂ ▅
█▅▅███████▅██▁▅▅███▅▅█▁▁▅▁█▁▁▁▁▅▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▅▁▅▅▁▁▁▁▁▁▅▁▅ ▁
68.7 ms Histogram: frequency by time 122 ms <
Memory estimate: 3.30 KiB, allocs estimate: 50.
V_GPU_Simple_Bench
BenchmarkTools.Trial: 2082 samples with 1 evaluation.
Range (min … max): 1.931 ms … 32.706 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 2.143 ms ┊ GC (median): 0.00%
Time (mean ± σ): 2.393 ms ± 1.542 ms ┊ GC (mean ± σ): 0.00% ± 0.00%
▆█▆▄▄▂▁
████████▇▇▅▆▄▆▄▄▁▆▄▄▄▁▄▃▄▅▃▃▅▃▁▃▃▃▁▁▅▁▁▃▁▁▁▁▃▁▃▃▁▅▁▁▁▁▁▁▁▃ █
1.93 ms Histogram: log(frequency) by time 8.67 ms <
Memory estimate: 18.20 KiB, allocs estimate: 246.
V_GPU_Kernel_Bench
BenchmarkTools.Trial: 2461 samples with 1 evaluation.
Range (min … max): 1.757 ms … 13.909 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 1.950 ms ┊ GC (median): 0.00%
Time (mean ± σ): 2.022 ms ± 483.841 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁▄▇▆█▂
▃▃▄▆▇██████▆▄▃▃▃▃▃▄▄▃▄▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▂▂▂▂▁▁▁▁▂▁▂ ▃
1.76 ms Histogram: frequency by time 3.06 ms <
Memory estimate: 5.98 KiB, allocs estimate: 93.
It is clear to me that the custom kernel function;
function GPU_GRAD(V,Nmax)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x
for i = index:stride:Nmax
xdot = dot(V.xᵢⱼ[i],V.xᵢⱼ[i])
@inbounds V.xᵢⱼ²[i] += xdot
dist = sqrt(xdot)
@inbounds V.dᵢⱼ[i] += dist
q = dist/H
@inbounds V.qᵢⱼ[i] += q
wg = αD * 5 * (q - 2)^3 * q / (8H * (q * H + ϵ)) * V.xᵢⱼ[i]
@inbounds V.∇ᵢWᵢⱼ[i] += wg
end
return nothing
end
My question now is how, if any way, can I speed up this calculation?
To be a bit frank, I think the code “looks simple” and I’ve seen a lot more complex codes with something about shared memory and other tricks etc. If I wish to save the intermediate results as I do in the code currently, would there be any way to speed this up?
Kind regards