I am developing a kernel that compares two one-dimensional boolean arrays, counting true positives, false positives and false negatives (details in the function incr_shmem below).
I have run into one bizarre error: when I include a CUDA.@cuprint statement, the code starts to give correct results, and without it it does not. My primary hypothesis was that the print gives the threads time to synchronize - but in that case sync_warp() / sync_threads() should make it work as well, and they do not.
Expected result - tp = 18; fp = 9; fn = 27
What goes wrong - unless I include the print statement (in the position marked below) and the array is not too big, the result is incorrect - usually too big.
I tested multiple hypotheses and all were rejected:
- the incr_shmem function is invoked in an incorrect setting - I included a print statement inside the function, and in all cases it is invoked the correct number of times
- the indices are calculated incorrectly and overlap - I printed the indices and did not observe any overlap
- incr_shmem is somehow not synchronized inside the unrolled loop - the loop is intended to affect only a single lane, as it references the thread id, so there should be no data race - I still checked this hypothesis by putting sync_warp() / sync_threads() inside the loop, and it did not help
- the reduce_warp calls interact with each other - I tested this by putting sync_threads() between the calls and after each of them, and it did not help; besides, shfl_down_sync, if I understand it correctly, synchronizes on its own
- there is an error in the test data - I checked that the sum of all true entries in the test data is 45, and the code gives back numbers like 46000, so it is far from that
- in the line reduce_warp(shmem[threadIdx().x,1],32), shmem[threadIdx().x,1] should first be stored in a separate variable and only that variable passed to the reduce call - I tried it (see the sketch right after this list) and it did not help
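For the last point, what I tried looked roughly like this (a fragment of the reduction part of the kernel shown further below, not standalone code):

# pull the shared-memory values into locals first, then reduce only the locals
@inbounds locFn = shmem[threadIdx().x, 1]
@inbounds locFp = shmem[threadIdx().x, 2]
@inbounds locTp = shmem[threadIdx().x, 3]
sumFn = reduce_warp(locFn, 32)
sumFp = reduce_warp(locFp, 32)
sumTp = reduce_warp(locTp, 32)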
Additional observations:
As mentioned, I get correct results if the print statement is included and the array is small enough. When the array gets big, adding the print statement no longer guarantees correct results.
I am quite sure the problem is somehow related to the reduce_warp function, but its implementation is taken from CUDA.jl/mapreduce.jl at afe81794038dddbda49639c8c26469496543d831 · JuliaGPU/CUDA.jl (github.com), so I think it should be correct.
Thank you for any help. If you are ever in Poland, I will have a pile of chocolates for you :D!
using CUDA

const FULL_MASK = 0xffffffff   # full 32-lane warp mask used by shfl_down_sync

# Reduce a value across a warp
@inline function reduce_warp(vall, lanesNumb)
    offset = UInt32(1)
    while offset < lanesNumb
        vall += shfl_down_sync(FULL_MASK, vall, offset)
        offset <<= 1
    end
    return vall
end
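To double-check that, a minimal isolation test of reduce_warp could look like this (a sketch: one warp of 32 threads, each lane contributing 1, so lane 1 should end up with the warp-wide sum of 32; check_reduce_warp and out are throwaway names introduced only for this test):

# throwaway kernel: every lane feeds 1 into reduce_warp, lane 1 writes out the warp sum
function check_reduce_warp(out)
    v = reduce_warp(UInt32(1), 32)
    if threadIdx().x == 1
        @inbounds out[1] = v
    end
    return
end

out = CUDA.zeros(UInt32, 1)
@cuda threads=32 blocks=1 check_reduce_warp(out)
Array(out)[1]   # expected: 32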
"""
add value to the shared memory in the position i, x where x is 1 ,2 or 3 and is calculated as described below
boolGold & boolSegm + boolGold +1 will evaluate to
3 in case of true positive
2 in case of false positive
1 in case of false negative
"""
@inline function incr_shmem( primi::Int64,boolGold::Bool,boolSegm::Bool,shmem )
@inbounds shmem[ primi, (boolGold & boolSegm + boolSegm +1) ]+=(boolGold | boolSegm)
return true
end
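To make the mapping explicit, the same expression can be evaluated on the CPU for the four possible input combinations (plain Julia, no GPU needed):

# column index and increment produced by incr_shmem for each gold/segm combination
for (gold, segm) in ((true, true), (false, true), (true, false), (false, false))
    col  = (gold & segm) + segm + 1   # column: 3 = tp, 2 = fp, 1 = fn (or tn)
    incr = gold | segm                # what gets added (0 for a true negative)
    @show (gold, segm, col, incr)
end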
function getBlockTpFpFn(goldBoolGPU::CuDeviceArray{Bool,1,1},
                        segmBoolGPU::CuDeviceArray{Bool,1,1},
                        tp, tn, fp, fn)
    # used as a basis to get the data we want from global memory
    i = (threadIdx().x * 16) + ((blockIdx().x - 1) * 16) * blockDim().x
    blockId = blockIdx().x
    wid, lane = fldmod1(threadIdx().x, 32)
    shmem = @cuStaticSharedMem(UInt16, (513, 3))
    # for storing results from warp reductions
    shmemSum = @cuStaticSharedMem(UInt16, (33, 3))
    # incrementing - taking multiple data points per lane
    @unroll for k in 0:15
        incr_shmem(threadIdx().x, goldBoolGPU[i+k], segmBoolGPU[i+k], shmem)
    end#for
    sync_warp()
    # reducing across the warp
    @inbounds sumFn = reduce_warp(shmem[threadIdx().x, 1], 32)
    @inbounds sumFp = reduce_warp(shmem[threadIdx().x, 2], 32)
    @inbounds sumTp = reduce_warp(shmem[threadIdx().x, 3], 32)
    if lane == 1
        if sumFp > 0
            # !!!!!!!!!! HERE is this magical print statement !!!!!!!!!!!!
            CUDA.@cuprint "sumFp $(sumFp) wid $(wid) "
        end
        @inbounds shmemSum[wid, 1] = sumFn
        @inbounds shmemSum[wid, 2] = sumFp
        @inbounds shmemSum[wid, 3] = sumTp
    end#if
    sync_threads()
    # now all the data of interest should be in shared memory, so the first warp reduces the per-warp results stored there
    if wid == 1
        vallTp = reduce_warp(shmemSum[lane, 3], 32)
        vallFp = reduce_warp(shmemSum[lane, 2], 32)
        vallFn = reduce_warp(shmemSum[lane, 1], 32)
        if lane == 1
            @inbounds @atomic tp[] += vallTp
            @inbounds @atomic fp[] += vallFp
            @inbounds @atomic fn[] += vallFn
        end
    end
    return
end
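Related to hypothesis 2, here is a host-side sketch of which global indices each (block, thread) pair reads with the formula for i above (plain Julia; nthreads, perthread and nblocks are stand-in names that mirror the launch configuration and the blockss value further below):

# enumerate every thread's 16-element chunk and check that the chunks do not overlap
nthreads, perthread = 512, 16
nblocks = Int64(round((512 * 512 * 500 + 7) / nthreads / perthread)) - 1   # same value as blockss below
starts = sort!([t * perthread + (b - 1) * perthread * nthreads
                for b in 1:nblocks for t in 1:nthreads])
all(diff(starts) .== perthread)               # true -> disjoint, back-to-back 16-element chunks
(first(starts), last(starts) + perthread - 1) # range of global indices actually read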
Below is the code that creates the test data:
nx = 512
ny = 512
nz = 500
# first we initialize the masks on the CPU so we can modify them more easily
goldBool = falses(nx, ny, nz);  # mimics the gold standard mask
segmBool = falses(nx, ny, nz);  # mimics the segmentation mask
# so we have two boxes that overlap in two thirds of the smaller one
cartTrueGold = CartesianIndices(zeros(3, 3, 5)) .+ CartesianIndex(5, 5, 5);
cartTrueSegm = CartesianIndices(zeros(3, 3, 3)) .+ CartesianIndex(4, 5, 5);
goldBool[cartTrueGold] .= true
segmBool[cartTrueSegm] .= true
# for storing the output totals: tp, tn, fp and fn
tp = CuArray([0]);
tn = CuArray([0]);
fp = CuArray([0]);
fn = CuArray([0]);
# flatten to 1D and pad with a few trailing falses
FlattG = push!(vec(goldBool), false, false, false, false, false, false, false);
FlattSeg = push!(vec(segmBool), false, false, false, false, false, false, false);
FlattGoldGPU = CuArray(FlattG)
FlattSegGPU = CuArray(FlattSeg)
goldBoolGPU = CuArray(goldBool)
segmBoolGPU = CuArray(segmBool)
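For reference, the expected values can be computed on the CPU directly from these arrays:

# CPU reference for the expected values quoted at the top (no GPU involved)
sum(goldBool .& segmBool)     # tp = 18
sum(.!goldBool .& segmBool)   # fp = 9
sum(goldBool .& .!segmBool)   # fn = 27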
And finally it is invoked like this:
blockss = Int64(round((length(FlattGoldGPU) / 512) / 16)) - 1
#@benchmark CUDA.@sync
@cuda threads=512 blocks=blockss getBlockTpFpFn(FlattGoldGPU, FlattSegGPU, tp, tn, fp, fn)
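Afterwards the counters are read back on the host, roughly like this, and compared with the expected values:

CUDA.synchronize()
Array(tp)[1], Array(fp)[1], Array(fn)[1]   # expected (18, 9, 27), but see the problem description above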