I want to run the following neural-network machine learning code on the GPU, but I have to write something complicated and dirty inside the DataLoader's for loop to pass the data to train!:
# This GPU-based program works, but it is much slower than the CPU version
using Flux
using Flux.Data: DataLoader
using Printf
xpu = gpu
# xpu = cpu
n = 100
dataset = (
    input = (
        data1 = rand(1, n),
        data2 = rand(1, n),
        data3 = rand(1, n),
        data4 = rand(5, 10, n)
    ),
    output = rand(1, n)
) |> xpu
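# note: each array above stores observations along its last dimension (length n),
# which is the dimension DataLoader slices when it builds minibatches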
dataloader = DataLoader(dataset.input, dataset.output, batchsize = 4)
model = Chain(
    (input) -> cat(dims = 1, input.data1, input.data2), # here I ignore data3 and data4 for simplicity; the real model will use them too
    Dense(2, 10, relu),
    Dense(10, 1)
) |> xpu
loss(input, output) = Flux.mse(model(input), output)
optimizer = ADAM()
epoch_length = 100
for epoch in 1:epoch_length
    for (input, output) in dataloader
        # complicated and maybe slow: rebuild the minibatch as one (input, output) pair
        # per sample, which creates many tiny GPU arrays and batch-size-1 training steps
        data = [((data1 = input.data1[:, i], data2 = input.data2[:, i], data3 = input.data3[:, i], data4 = input.data4[:, :, i]), output[:, i]) for i in 1:size(output, 2)]
        Flux.train!(loss, params(model), data, optimizer)
    end
    loss(dataset.input, dataset.output) |> println
end
This code is much slower than running the same thing on the CPU. What should I do to make it fast? What is the good practice here?
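For what it's worth, here is the direction I have been considering, though I am not sure it is the intended pattern: the model's first layer already accepts the batched NamedTuple, so maybe the per-sample list comprehension can be dropped entirely and train! can consume the DataLoader directly (as far as I understand, train! iterates its data argument and splats each (input, output) tuple into the loss). A minimal sketch, assuming that behaviour:

# sketch: let train! consume whole minibatches from the DataLoader,
# so each GPU operation works on a full batch instead of a single sample
for epoch in 1:epoch_length
    Flux.train!(loss, params(model), dataloader, optimizer)
    @printf("epoch %d: loss = %.6f\n", epoch, loss(dataset.input, dataset.output))
end

Would something like this be enough to keep the work on the GPU, or is there a better way to feed a NamedTuple-structured dataset to Flux?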