Hi All,
I’m relatively new to Julia and I’ve been trying to benchmark a nested-loop function over 2 arrays, on both the CPU and GPU, however, the CPU version is significantly quicker.
Now, I’m sure that this is due to my inexperience with Julia and writing GPU kernel directly. So, I’d like to ask the Julia community what I’m doing wrong?
I’ve attached a minimal reproducible example script at the bottom of this topic.
I’ve ran a quick benchmark as a function of the size of the arrays, from 10 through 1000, and the walltime of the CPU function is significantly faster than the GPU (near an order of magnitude in some cases).
| Size | CPU walltime | CUDA walltime | CPU Allocations | CUDA Allocations |
| 10 | 10.197 μs | 306.622 μs | 64 (6.88 KiB) | 199 (4.97KiB) |
| 100 | 5.191 ms | 155.692 ms | 64 (6.88 KiB) | 200 (4.98KiB) |
| 1000 | | 118.089 s | 872.761 s | 64 (6.88KiB) | 307 (9.58 KiB) |
Any help is appreciated! Thanks
Here’s the minimal reproducible example,
using Random
using LinearAlgebra
using ProgressMeter
using BenchmarkTools
using CUDA
# numerical constants for our benchmark here
const MAX_INT = 2^12 # max integer in our array
@inline function lookup_cpu(bit::Int64, bitword::Vector{Int64}, coeffs::Vector{Float64})
insertion_index = searchsortedfirst(bitword, bit)
# Return 0.0 if index is out of bounds or if the state doesn't match
if insertion_index > size(bitword,1) || bitword[insertion_index] != bit
return 0.0
end
return coeffs[insertion_index]
end
@inline function calc_multi_factor(i::Int64, j::Int64, k::Int64, l::Int64)::Float64
""" Function to compute occurance of each 4-index tuple,
i.e. compute once times by factor """
if i == j && j == k && k == l
return 1.0
elseif (i == j && j == k) || (i == j && j == l) || (i == k && k == l) || (j == k && k == l)
return 4.0
elseif (i == j && k == l) || (i == k && j == l) || (i == l && j == k)
return 6.0
elseif (i == j) || (i == k) || (i == l) || (j == k) || (j == l) || (k == l)
return 12.0
else
return 24.0
end
end
@inline function cpu_function(bitword::Vector{Int64}, coeffs::Vector{Float64})
N_states = size(bitword,1)
n_threads = Threads.nthreads()
chunk_size = ceil(Int, N_states/n_threads)
thread_totals = zeros(Float64, n_threads)
Threads.@threads for thread_id in 1:n_threads # separate the outer loop across all threads
# set a subset of indices in `i` loop to each available thread
start_i = (thread_id-1)*chunk_size + 1
end_i = min(thread_id*chunk_size, N_states)
thread_total = 0.0 # local thread accumulator
for i in start_i:end_i
for j in i:N_states
for k in j:N_states
b123 = bitword[i] ⊻ bitword[j] ⊻ bitword[k]
coeff1 = lookup_cpu(b123, bitword, coeffs)
if coeff1 == 0.0
continue # if coeff is 0. skip rest of calc
end
base_coeff = coeffs[i] * coeffs[j] * coeffs[k]
for l in k:N_states
mult_factor = calc_multi_factor(i,j,k,l)
state2 = bitword[i] ⊻ bitword[k] ⊻ bitword[l]
coeff2 = lookup_cpu(state2, bitword, coeffs)
if coeff2 == 0.0
continue
end
state3 = bitword[j] ⊻ bitword[k] ⊻ bitword[l]
coeff3 = lookup_cpu(state3, bitword, coeffs)
if coeff3 == 0.0
continue
end
state4 = bitword[i] ⊻ bitword[j] ⊻ bitword[l]
coeff4 = lookup_cpu(state4, bitword, coeffs)
if coeff4 == 0.0
continue
end
thread_total += mult_factor * base_coeff * coeffs[l] *
coeff1 * coeff2 * coeff3 * coeff4
end
end
end
end
thread_totals[thread_id] = thread_total # Store result for this thread
end
return -log2(sum(thread_totals)) # Combine results from all threads
end
@inline function lookup_gpu(basis::Int64, bitword::CuDeviceVector{Int64}, coeffs::CuDeviceVector{Float64}, n_configs::Int64)::Float64
# binary tree
@inbounds begin
low = 1
high = n_configs
while low <= high
mid = (low + high) >>> 1
mid_val = bitword[mid]
if mid_val < basis
low = mid + 1
elseif mid_val > basis
high = mid - 1
else
return coeffs[mid]
end
end
end
return 0.0
end
function gpu_kernel(
bitword::CuDeviceVector{Int64},
coeffs::CuDeviceVector{Float64},
N_states::Int,
result::CuDeviceVector{Float64}
)
tid = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
total_threads = blockDim().x * gridDim().x
acc = 0.0f0
@inbounds begin
for i in tid+1:total_threads:N_states
bi = bitword[i]
ci = coeffs[i]
for j in i:N_states
bj = bitword[j]
cj = coeffs[j]
for k in j:N_states
bk = bitword[k]
ck = coeffs[k]
b123 = bi ⊻ bj ⊻ bk
coeff1 = lookup_gpu(b123, bitword, coeffs, N_states)
if coeff1 == 0.0f0
continue
end
base = ci * cj * ck
ik = bi ⊻ bk
jk = bj ⊻ bk
ij = bi ⊻ bj
for l in k:N_states
bl = bitword[l]
cl = coeffs[l]
coeff2 = lookup_gpu(ik ⊻ bl, bitword, coeffs, N_states)
if coeff2 == 0.0f0; continue; end
coeff3 = lookup_gpu(jk ⊻ bl, bitword, coeffs, N_states)
if coeff3 == 0.0f0; continue; end
coeff4 = lookup_gpu(ij ⊻ bl, bitword, coeffs, N_states)
if coeff4 == 0.0f0; continue; end
acc += calc_multi_factor(i,j,k,l) * base * cl * coeff1 * coeff2 * coeff3 * coeff4
end
end
end
end
end
result[tid+1] = acc # 1-based in Julia
return
end
function gpu_function(bitword::Vector{Int64}, coeffs::Vector{Float64})
N::Int64 = size(bitword,1)
d_bitword = CuArray(bitword)
d_coeffs = CuArray(coeffs)
# Create dummy kernel to get launch configurations
dummy_kernel = @cuda launch=false gpu_kernel(
d_bitword,
d_coeffs,
N,
CUDA.zeros(Float64, 1)
)
# get number of threads/blocks from dummy kernel
config = launch_configuration(dummy_kernel.fun)
threads = min(N, config.threads)
blocks = cld(N, threads)
d_result = CUDA.zeros(Float64, threads * blocks)
@cuda threads=threads blocks=blocks gpu_kernel(
d_bitword, d_coeffs, N, d_result
)
total = sum(d_result)
value = -log2(total)
return value
end
# Our script begins here
NITEMS::Int64 = 100
bitword::Vector{Int64} = rand(1:MAX_INT,NITEMS) # array of integers representing binary strings
coeffs::Vector{Float64} = randn(NITEMS) # coeffs for each item in `bitword`
coeffs ./= norm(coeffs) # normalize to 1
sorted_indices = sortperm(bitword) # they need to be sorted for searchsortedfirst to work
bitword = bitword[sorted_indices]
coeffs = coeffs[sorted_indices]
# CPU code
results = @btime cpu_function($bitword, $coeffs)
println("[CPU] Size: ",size(bitword,1)," | value: ", results)
# GPU code
results_gpu = @btime gpu_function(bitword, coeffs)
println("[CUDA] Size: ",size(bitword,1)," | value: ", results_gpu)