CUDA perf on Int types

I wonder why CUDA matrix-multiplication (`mul!` / `*`) performance isn't as good for `Int32` as it is for `Float32`:

# Reproducer: time a 1024×1024 GPU matrix multiply with Int32 elements
# versus Float32 elements.
using CUDA
N  = 2^10
A = CUDA.ones(Int32,N, N)
B = CUDA.ones(Int32,N, N)
# NOTE(review): the first CUDA.@time of each method also pays one-time
# kernel compilation / library initialization cost — warm up with an
# untimed call first for a fair comparison. TODO confirm how much of the
# Int32 gap below is compilation vs. steady-state throughput.
CUDA.@time A*B
A = CUDA.ones(Float32,N, N)
B = CUDA.ones(Float32,N, N)
CUDA.@time A*B

results in

 0.007594 seconds (103 CPU allocations: 3.453 KiB) (1 GPU allocation: 4.000 MiB, 0.23% memmgmt time)
  0.001069 seconds (74 CPU allocations: 2.000 KiB) (3 GPU allocations: 4.000 MiB, 2.31% memmgmt time)

Because of this, even a simple hand-written tiled kernel ends up faster than `mul!`:

using CUDA,KernelAbstractions,BenchmarkTools,LinearAlgebra
@kernel inbounds=true unsafe_indices=true cpu=false function matrixMul!(C, @Const(A), @Const(B))
    # Tiled (shared-memory) matrix multiply: C = A*B.
    # Assumes a square gs×gs workgroup and — since unsafe_indices=true and
    # the loads below are unmasked — that N is a multiple of gs
    # (true here: N = 1024, gs = 16). TODO confirm for other sizes.
    gs,_ = @groupsize()
    N,_ = @ndrange()
    # One gs×gs tile of A and one of B, staged in local (shared) memory.
    a = KernelAbstractions.@localmem eltype(C) (gs,gs)
    b = KernelAbstractions.@localmem eltype(C) (gs,gs)
    row, col = @index(Global, NTuple) # element of C this thread computes
    ig, jg = @index(Local, NTuple)    # position within the workgroup
    tmp = zero(eltype(C))
    for i in Base.OneTo((N+gs-1)÷gs)
        id = (i-1)*gs
        # Cooperative load of the i-th tile of A and B.
        a[ig,jg] = A[row,id+jg]
        b[ig,jg] = B[id+ig,col]
        # Barrier 1: the whole tile must be loaded before anyone reads it.
        KernelAbstractions.@synchronize()
        for j in Base.OneTo(gs)
            tmp += a[ig,j] * b[j,jg]
        end
        # Barrier 2 (BUG FIX): this iteration's reads of a/b must complete
        # before the next iteration overwrites them. The original kernel
        # synchronized only once per iteration (after the load) plus once
        # after the loop, so a fast thread could clobber a tile that a
        # slower thread in the same workgroup was still accumulating from.
        KernelAbstractions.@synchronize()
    end
    C[row,col] = tmp
end
"""
    run_kernel(kernel, C, A, B, N)

Launch `kernel` over an `N`×`N` global range, writing into `C`, and block
until the device has finished so callers (e.g. `@btime`) measure completed
GPU work rather than just the asynchronous launch.
"""
function run_kernel(kernel, C, A, B, N)
    kernel(C, A, B; ndrange = (N, N))
    return CUDA.synchronize()
end
"""
    main()

Benchmark the hand-written tiled kernel against the library `mul!` on
1024×1024 `Int32` matrices, print both timings, and check that the two
results agree. Returns the pair of result matrices.
"""
function main()
    N = 2^10
    A = CUDA.ones(Int32, N, N)
    B = CUDA.ones(Int32, N, N)
    Ckern = CUDA.zeros(Int32, N, N)  # output of the custom kernel
    Cblas = CUDA.zeros(Int32, N, N)  # output of mul!
    backend = get_backend(Ckern)
    tile = 2^4  # 16×16 workgroup, matching the kernel's shared-memory tile
    kernel = matrixMul!(backend, (tile, tile))
    println("kernel")
    @btime run_kernel($kernel, $Ckern, $A, $B, $N)
    println("mul!")
    @btime begin
        mul!($Cblas, $A, $B)
        CUDA.synchronize()
    end
    println(Ckern ≈ Cblas)
    return Ckern, Cblas
end
main();
kernel
  2.158 ms (58 allocations: 1.75 KiB)
mul!
  3.138 ms (98 allocations: 3.25 KiB)
true