CUDA Float64 Float32 difference

Hello. I am using an RTX 3070. Why does the benchmark below show no performance difference between Float32 and Float64?

using CUDA, BenchmarkTools
# Add one (converted to the element type `T`) to every entry of `X`, writing the
# result back into `X`, and return that same vector.
function add!(X::AbstractVector{T}) where T
    X .= X .+ T(1)
    return X
end
# Two vectors created with CUDA.zeros, same length, differing only in element type.
A = CUDA.zeros(Float32, 10_000_000);
B = CUDA.zeros(Float64, 10_000_000);
# NOTE(review): these calls are timed without CUDA.@sync. CUDA.jl runs GPU work
# asynchronously, so this presumably measures only the host-side launch overhead,
# not the computation itself -- which would explain the near-identical timings.
@btime add!(A);  # -> 5.250 μs (37 allocations: 1.09 KiB)
@btime add!(B);  # -> 5.267 μs (37 allocations: 1.09 KiB)
using CUDA, BenchmarkTools
# Add one (converted to the element type `T`) to every entry of `X`, writing the
# result back into `X`, and return that same vector.
function add!(X::AbstractVector{T}) where T
    X .= X .+ T(1)
    return X
end
# Two vectors created with CUDA.zeros, same length, differing only in element type.
A = CUDA.zeros(Float32, 10_000_000);
B = CUDA.zeros(Float64, 10_000_000);
# Each call is wrapped in CUDA.@sync so the benchmark waits for the GPU work to
# finish. With it, the recorded Float64 time is roughly twice the Float32 time
# (presumably because twice as many bytes are moved -- TODO confirm by profiling).
@btime CUDA.@sync add!(A);  # -> 225.800 μs (91 allocations: 1.94 KiB)
@btime CUDA.@sync add!(B);  # -> 437.800 μs (77 allocations: 1.72 KiB)
1 Like