Kernel for building histogram on GPU

Is it possible that an issue remains with atomic operations on shared memory, even after the fix in CUDAnative #642? Using a variation of the test that was introduced with that fix, the following kernel works:

using CUDA
# Working reproducer: a single atomic add inside shared memory, followed by an
# atomic add of the result into the array `x`.
function kernel3(x)
    # x-component of this thread's index within its block.
    tid = threadIdx().x
    # Statically sized shared-memory buffer holding 4 Float32 values.
    shared = @cuStaticSharedMem(Float32, 4)
    # Every thread writes 1f0 into all 4 slots (redundant, but the value is the same).
    fill!(shared, 1f0)
    # Barrier so the fill is complete before any slot is read or updated.
    sync_threads()
    # Atomically add shared[tid + 2] into shared[tid].
    # NOTE: tid + 2 stays within the 4-element buffer only for tid <= 2.
    CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
    # Barrier so all shared-memory updates finish before shared[1] is read.
    sync_threads()
    # Each thread atomically adds shared[1] into x[1].
    CUDA.atomic_add!(pointer(x, 1), shared[1])
    return
end

# Allocate a 4-element zero-initialized device array as the output buffer.
x = CUDA.zeros(4)
# Launch kernel3 with 2 threads, so tid + 2 inside the kernel is 3 or 4.
@cuda threads = 2 kernel3(x)
# Evaluate `x` so the REPL displays the result.
x

However, it throws an error if the atomic add within shared memory is repeated a second time (a simplification of the iterations that would happen within a loop). The operations seem legitimate to me, given the sync_threads() calls that occur between those atomic adds. Am I missing something?

# Failing reproducer: identical to kernel3 except that the atomic add inside
# shared memory is performed twice, with a barrier between the two.
function kernel4(x)
    # x-component of this thread's index within its block.
    tid = threadIdx().x
    # Statically sized shared-memory buffer holding 4 Float32 values.
    shared = @cuStaticSharedMem(Float32, 4)
    # Every thread writes 1f0 into all 4 slots (redundant, but the value is the same).
    fill!(shared, 1f0)
    # Barrier so the fill is complete before any slot is read or updated.
    sync_threads()
    # First atomic add of shared[tid + 2] into shared[tid].
    # NOTE: tid + 2 stays within the 4-element buffer only for tid <= 2.
    CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
    sync_threads()
    # Second, repeated atomic add into the same slot; this repetition is the
    # only difference from kernel3 and is what the report associates with the
    # illegal memory access.
    CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
    # Barrier so all shared-memory updates finish before shared[1] is read.
    sync_threads()
    # Each thread atomically adds shared[1] into x[1].
    CUDA.atomic_add!(pointer(x, 1), shared[1])
    return
end

# Allocate a fresh 4-element zero-initialized device array as the output buffer.
x = CUDA.zeros(4)
# Launch kernel4 with 2 threads, so tid + 2 inside the kernel is 3 or 4.
@cuda threads = 2 kernel4(x)
# Evaluate `x` so the REPL displays it; per the stack trace, the error is
# reported here, when the array is copied back to the host for printing.
x

4-element CuArray{Float32,1}:
ERROR: CUDA error: an illegal memory access was encountered (code 700, ERROR_ILLEGAL_ADDRESS)
Stacktrace:
 [1] throw_api_error(::CUDA.cudaError_enum) at C:\Users\jerem\.julia\packages\CUDA\YeS8q\lib\cudadrv\error.jl:97
 [2] macro expansion at C:\Users\jerem\.julia\packages\CUDA\YeS8q\lib\cudadrv\error.jl:104 [inlined]
 [3] cuMemcpyDtoH_v2(::Ptr{Float32}, ::CuPtr{Float32}, ::Int64) at C:\Users\jerem\.julia\packages\CUDA\YeS8q\lib\utils\call.jl:93
 [4] #unsafe_copyto!#6 at C:\Users\jerem\.julia\packages\CUDA\YeS8q\lib\cudadrv\memory.jl:395 [inlined]
 [5] unsafe_copyto! at C:\Users\jerem\.julia\packages\CUDA\YeS8q\lib\cudadrv\memory.jl:388 [inlined]
 [6] unsafe_copyto! at C:\Users\jerem\.julia\packages\CUDA\YeS8q\src\array.jl:299 [inlined]
 [7] copyto!(::Array{Float32,1}, ::Int64, ::CuArray{Float32,1}, ::Int64, ::Int64) at C:\Users\jerem\.julia\packages\CUDA\YeS8q\src\array.jl:268
 [8] copyto! at C:\Users\jerem\.julia\packages\CUDA\YeS8q\src\array.jl:272 [inlined]
 [9] copyto_axcheck! at .\abstractarray.jl:946 [inlined]
 [10] Array at .\array.jl:562 [inlined]
 [11] Array at .\boot.jl:430 [inlined]
 [12] convert at .\array.jl:554 [inlined]
 [13] adapt_storage at C:\Users\jerem\.julia\packages\CUDA\YeS8q\src\array.jl:243 [inlined]
 [14] adapt_structure at C:\Users\jerem\.julia\packages\Adapt\8kQMV\src\Adapt.jl:42 [inlined]
 [15] adapt at C:\Users\jerem\.julia\packages\Adapt\8kQMV\src\Adapt.jl:40 [inlined]
 [16] convert_to_cpu at C:\Users\jerem\.julia\packages\GPUArrays\jhRU7\src\host\abstractarray.jl:45 [inlined]
 [17] print_array at C:\Users\jerem\.julia\packages\GPUArrays\jhRU7\src\host\abstractarray.jl:50 [inlined]
 [18] show(::IOContext{REPL.Terminals.TTYTerminal}, ::MIME{Symbol("text/plain")}, ::CuArray{Float32,1}) at .\arrayshow.jl:358
 [19] display(::REPL.REPLDisplay, ::MIME{Symbol("text/plain")}, ::Any) at C:\Users\jerem\AppData\Local\Programs\Julia-1.5.2\share\julia\stdlib\v1.5\REPL\src\REPL.jl:214
 [20] display(::REPL.REPLDisplay, ::Any) at C:\Users\jerem\AppData\Local\Programs\Julia-1.5.2\share\julia\stdlib\v1.5\REPL\src\REPL.jl:218
 [21] display(::Any) at .\multimedia.jl:328
 [22] #invokelatest#1 at .\essentials.jl:710 [inlined]
 [23] invokelatest at .\essentials.jl:709 [inlined]
 [24] (::VSCodeServer.var"#61#65"{String,Int64,Int64,String,Module,Bool,VSCodeServer.ReplRunCodeRequestParams})() at c:\Users\jerem\.vscode\extensions\julialang.language-julia-1.0.10\scripts\packages\VSCodeServer\src\eval.jl:157
 [25] withpath(::VSCodeServer.var"#61#65"{String,Int64,Int64,String,Module,Bool,VSCodeServer.ReplRunCodeRequestParams}, ::String) at c:\Users\jerem\.vscode\extensions\julialang.language-julia-1.0.10\scripts\packages\VSCodeServer\src\repl.jl:124
 [26] (::VSCodeServer.var"#60#64"{String,Int64,Int64,String,Module,Bool,Bool,VSCodeServer.ReplRunCodeRequestParams})() at c:\Users\jerem\.vscode\extensions\julialang.language-julia-1.0.10\scripts\packages\VSCodeServer\src\eval.jl:142
 [27] hideprompt(::VSCodeServer.var"#60#64"{String,Int64,Int64,String,Module,Bool,Bool,VSCodeServer.ReplRunCodeRequestParams}) at c:\Users\jerem\.vscode\extensions\julialang.language-julia-1.0.10\scripts\packages\VSCodeServer\src\repl.jl:36
 [28] (::VSCodeServer.var"#59#63"{String,Int64,Int64,String,Module,Bool,Bool,VSCodeServer.ReplRunCodeRequestParams})() at c:\Users\jerem\.vscode\extensions\julialang.language-julia-1.0.10\scripts\packages\VSCodeServer\src\eval.jl:110
 [29] with_logstate(::Function, ::Any) at .\logging.jl:408
 [30] with_logger at .\logging.jl:514 [inlined]
 [31] (::VSCodeServer.var"#58#62"{VSCodeServer.ReplRunCodeRequestParams})() at c:\Users\jerem\.vscode\extensions\julialang.language-julia-1.0.10\scripts\packages\VSCodeServer\src\eval.jl:109
 [32] #invokelatest#1 at .\essentials.jl:710 [inlined]
 [33] invokelatest(::Any) at .\essentials.jl:709
 [34] macro expansion at c:\Users\jerem\.vscode\extensions\julialang.language-julia-1.0.10\scripts\packages\VSCodeServer\src\eval.jl:27 [inlined]
 [35] (::VSCodeServer.var"#56#57")() at .\task.jl:356