Flux: GPU not working as expected

Sure, it is essentially the same as the Model Zoo example (I swapped the 28×28×N input pictures for 5×10×N ones, also Float32).
It only utilises 7% of the GPU [Nvidia 1060] when I increase N above a small number (i.e. the CPU is faster).

# Load the GPU array backend only when a usable CUDA installation is detected.
if has_cuda()
    @info "CUDA is on"
    import CuArrays
    # Turn off scalar indexing of GPU arrays so that slow element-by-element
    # fallbacks raise an error instead of running silently.
    CuArrays.allowscalar(false)
end

# Hyperparameters for one training run. `train` builds this with
# `Args(; kws...)`, so each field below can be overridden by keyword.
@with_kw mutable struct Args
    # Learning rate passed to the optimiser.
    η::Float64 = 3e-4
    # Number of samples per mini-batch.
    batchsize::Int = 32
    # Number of passes over the training data.
    epochs::Int = 10
    # Function applied to the model and the data to place them on a device;
    # defaults to `cpu`, set it to `gpu` when a GPU is available.
    device::Function = cpu
end
"""
    getdata(args)

Build the training and test loaders from the global arrays `Imatrix` (samples
indexed along the third dimension) and `Output` (one label per sample).

The first 70% of the samples (count truncated to an integer) form the training
set and the remainder the test set. Each sample is flattened, labels are
one-hot encoded over the classes `0:1`, and both sets are batched with
`args.batchsize`. Only the training loader is shuffled.

Returns the tuple `(train_data, test_data)`.
"""
function getdata(args)
    # Index of the last training sample: 70% of the data, truncated.
    nsamples = length(Output)
    ntrain = trunc(Int64, nsamples * 0.7)
    trainidx = 1:ntrain
    testidx = (ntrain + 1):nsamples

    # Split, then flatten each sample into a single column.
    xtrain = Flux.flatten(Imatrix[:, :, trainidx])
    xtest = Flux.flatten(Imatrix[:, :, testidx])

    # One-hot encode the labels over the two classes 0 and 1.
    ytrain = onehotbatch(Output[trainidx], 0:1)
    ytest = onehotbatch(Output[testidx], 0:1)

    # Batch both sets; only the training batches are shuffled.
    train_data = DataLoader(xtrain, ytrain, batchsize=args.batchsize, shuffle=true)
    test_data = DataLoader(xtest, ytest, batchsize=args.batchsize)

    return train_data, test_data
end

"""
    build_model(; imgsize=(5,10,1), nclasses=2)

Return a three-layer fully connected network: `prod(imgsize)` inputs, two
hidden layers of 32 units with `relu` activation, and an output layer with
`nclasses` units and no activation.
"""
function build_model(; imgsize=(5,10,1), nclasses=2)
    ninputs = prod(imgsize)
    layers = (
        Dense(ninputs, 32, relu),
        Dense(32, 32, relu),
        Dense(32, nclasses),
    )
    return Chain(layers...)
end

"""
    loss_all(dataloader, model)

Return the sum of `logitcrossentropy(model(x), y)` over every `(x, y)` batch
yielded by `dataloader`, divided by `length(dataloader)`.
"""
function loss_all(dataloader, model)
    batchloss((x, y)) = logitcrossentropy(model(x), y)
    # Left fold seeded with 0f0, so batches are accumulated in iteration order.
    total = mapfoldl(batchloss, +, dataloader; init=0f0)
    return total / length(dataloader)
end

"""
    train(; kws...)

Train the network returned by `build_model` on the data returned by `getdata`
and return the trained model.

Keyword arguments are forwarded to `Args` and override its defaults
(`η`, `batchsize`, `epochs`, `device`). While training, the loss on the
training data and the accuracy on the test data are shown at most once every
5 seconds; the final training and test accuracies are shown at the end.
"""
function train(; kws...)
    # Collect the hyperparameters; keywords override the `Args` defaults.
    args = Args(; kws...)

    # Load the batched training and test sets.
    train_data, test_data = getdata(args)

    # Construct the model, then move the data and the model with `args.device`.
    # NOTE(review): broadcasting over a loader appears to materialise it into a
    # fixed collection of batches, so the training batches would be shuffled
    # once here rather than once per epoch -- confirm this is intended.
    m = build_model()
    train_data = args.device.(train_data)
    test_data = args.device.(test_data)
    m = args.device(m)
    loss(x, y) = logitcrossentropy(m(x), y)

    ## Training
    # Progress report, throttled to at most one call every 5 seconds.
    report() = @show(loss_all(train_data, m), accuracy(test_data, m))
    evalcb = Flux.throttle(report, 5)

    opt = ADAM(args.η)

    # The callback was previously built but never handed to `train!`, so no
    # progress was ever reported; pass it explicitly.
    @epochs args.epochs Flux.train!(loss, Flux.params(m), train_data, opt, cb = evalcb)

    @show accuracy(train_data, m)
    @show accuracy(test_data, m)
    return m
end