using CUDAnative, CuArrays, DoubleFloats
# Element type of the random input arrays created further down in this script.
T = Double64
# Alternative element type; swap with the line above to run the same script
# with `Float64` inputs instead.
# T = Float64
"""
    sum_plus_mul!(c, a, b)

Launch `kernel` on the GPU so that `c[i] = a[i] + b[i] + a[i] * b[i]` for every
index `i` of `c`, then call `CUDAnative.synchronize()` before returning.

The kernel is launched with a single block of 32 threads.

# Arguments
- `c`: output device array; overwritten in place.
- `a`, `b`: input device arrays; each must provide at least `length(c)` elements.

# Returns
The value returned by `CUDAnative.synchronize()`.

# Throws
- `DimensionMismatch` if `a` or `b` has fewer elements than `c`.
"""
function sum_plus_mul!(c, a, b)
    n = length(c)
    # The kernel reads a[i] and b[i] for every i in 1:length(c); reject short
    # inputs on the host instead of letting the device index out of bounds.
    if length(a) < n || length(b) < n
        throw(DimensionMismatch(
            "inputs must have at least length(c) = $n elements, " *
            "got length(a) = $(length(a)), length(b) = $(length(b))",
        ))
    end
    @cuda blocks = 1 threads = 32 kernel(c, a, b)
    return CUDAnative.synchronize()
end
"""
    kernel(c, a, b)

GPU kernel body: store `a[i] + b[i] + a[i] * b[i]` into `c[i]` for every index
`i` assigned to the calling thread.

Each thread starts at its global thread index and advances by the total number
of launched threads until it passes `length(c)`. For a launch with
`blocks = 1 threads = 32` this visits indices `t, t + 32, t + 64, ...` for
thread `t`; other launch configurations are covered without changing the kernel.

`a` and `b` are indexed with the same `i` as `c`, so the caller must make sure
they hold at least `length(c)` elements.
"""
function kernel(c, a, b)
    # Take start and stride from the launch configuration instead of assuming
    # a fixed thread count; this also keeps `i` and the stride the same kind of
    # value throughout the loop.
    stride = blockDim().x * gridDim().x
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    n = length(c)
    while i <= n
        # Load each operand once and reuse it for both the sum and the product.
        x = a[i]
        y = b[i]
        c[i] = x + y + x * y
        i += stride
    end
    return
end
# Driver: build two random length-10 device vectors of element type `T`,
# allocate an output of matching shape, and run the kernel wrapper on them.
n_elements = 10
a = CuArrays.CuArray(rand(T, n_elements))
b = CuArrays.CuArray(rand(T, n_elements))
c = similar(a)
sum_plus_mul!(c, a, b)