Broadcasting in CUDA kernels

In my actual code I have a two-dimensional array, and I want to perform a set of operations along one dimension while parallelizing along the other. Something like this:

using CUDA

function kernel1(a, b)
    # global thread index and grid-stride step
    id = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    N1, N2 = size(a)
    # each thread handles rows i = id, id + stride, ... and broadcasts b
    # into the whole row at once
    for i = id:stride:N1
        @. a[i, :] = b
    end
    return nothing
end


function kernel2(a, b)
    # same grid-stride setup as kernel1
    id = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    N1, N2 = size(a)
    for i = id:stride:N1
        # explicit scalar loop over the second dimension instead of a broadcast
        for j = 1:N2
            a[i, j] = b[j]
        end
    end
    return nothing
end

N1 = 100
N2 = 10

a = CUDA.zeros(N1, N2)  # N1×N2 Float32 matrix on the GPU
b = CUDA.ones(N2)       # length-N2 vector to write into every row of a

# @cuda threads=N1 kernel1(a, b)  # broadcast version: what I would like to write

@cuda threads=N1 kernel2(a, b)  # explicit-loop version
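
For reference, a minimal sketch of how the result can be checked on the host and, assuming the end goal is simply a[i, :] = b for every row (as kernel2 does), how the same copy can be written with array-level broadcasting on the CuArrays themselves:

# Copying back to the host synchronizes; every row of a should now equal b.
@assert all(Array(a) .== 1)

# Array-level broadcasting performs the same row-wise copy without a hand-written
# kernel: reshaping b into a 1×N2 row makes it broadcast along the first dimension.
a .= reshape(b, 1, :)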