Hi all,
I have a GPU kernel where the main bottleneck is finding the index of an unsigned integer in a large constant array (lll_basis in the code below).
Typical array sizes are around \sim 10^7, but I am hoping to work with sizes of 10^9.
Here’s the kernel (for reference):
function operator_action_gpu!(
y, x, creation_ops, creation_op_phase_helpers,
annhilation_ops, annhilation_phase_helpers,
op_matrix_elements, lll_basis
)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x
i = index
while i <= length(y)
op_sum = zero(eltype(y))
@inbounds fock_state = lll_basis[i]
op_iter = 1
while op_iter <= length(annhilation_ops)
@inbounds annhilated_state = ~annhilation_ops[op_iter] & fock_state
@inbounds if isequal(annhilation_ops[op_iter], annhilation_ops[op_iter] & fock_state) &&
isequal(creation_ops[op_iter], creation_ops[op_iter] & (~annhilated_state))
@inbounds op_sum += x[searchsortedfirst(lll_basis, annhilated_state | creation_ops[op_iter])] *
op_matrix_elements[op_iter] *
(-1)^(count_ones(annhilation_phase_helpers[op_iter] & fock_state) +
count_ones(creation_op_phase_helpers[op_iter] & annhilated_state))
end
op_iter += 1
end
@inbounds y[i] = op_sum
i += stride
end
return
end
Currently, I use searchsortedfirst for the lookup.
I also tried an alternative where I compute the index directly:
function state_to_index(indexer::ConstrainedIndexer, state::T)::UInt64 where {T<:Unsigned}
_validate_state(indexer, state)
counts = indexer.counts
remaining_particles = indexer.num_particles
remaining_sum = indexer.total_sum
rank = UInt64(0)
@inbounds for pos in (indexer.num_orbitals - 1):-1:0
remaining_particles == 0 && break
without = counts[pos + 1, remaining_particles + 1, remaining_sum + 1]
if ((state >>> pos) & one(T)) != zero(T)
rank += without
remaining_particles -= 1
remaining_sum -= pos
end
end
return rank + UInt64(1)
end
However, for \sim 10^6 entries, this direct indexing approach is about 2\times slower than the binary search version (just comparing the search portions) (with the factor growing with array length).
My question is:
- Is there a faster alternative for this kind of lookup on the GPU?
- In particular, is there a dictionary-like or hash-table structure available in
CUDA.jl?
I know NVIDIA’scuCollectionshas hash tables; should I try to hook into that?
Any suggestions / help would be greatly appreciated!