Hi all, I have a problem with the optimisation of convolutional neural networks on the GPU. I would like to run a couple of experiments from the same initial values of the network parameters, but for some reason, when the optimisation runs on the GPU, the results differ between runs. Here is an MWE:
using Flux
using Statistics
using LinearAlgebra
using CUDA
#initial network whose parameters I load into the others
bn_ch = 2 #channels for BN (probably has to be > 1)
net = Chain(Conv((3,3), 3 => bn_ch, pad=1), BatchNorm(bn_ch), Conv((3,3), bn_ch => bn_ch, pad=1)) #probably has to contain BN and the last convolution
#input array
sz = 10 #size of input and output array (too small input seems to work ok)
x_init = ones(Float32, sz,sz,3,1) |> gpu
#data
d = ones(Float32, sz,sz,1,1)
#loss function
loss(n, x, y) = mean((n(x).-y).^2)
#number of optimisation steps
steps = 10 #can fail even with one optimisation step but it does not happen that often
#number of runs (does not fail every time)
nruns = 10
#optimiser for network optimisation: "adam" or "descent"
optimiser = "adam" #descent seems to work ok
#----------- helper functions -----------
function optim!(network, nsteps, lossf, opt)
    #function for network optimisation
    #network..... network to be optimised
    #nsteps...... number of optimisation steps
    #lossf....... loss function
    #opt......... optimiser, either "adam" or "descent"
    if opt == "adam"
        opts = Adam(0.01)
    elseif opt == "descent"
        opts = Descent(0.01)
    end
    pars = Flux.params(network)
    for j = 1 : nsteps
        gs = Flux.gradient(()->lossf(network), pars)
        Flux.Optimise.update!(opts, pars, gs)
    end
end

function get_norm(network)
    #function returning sum of norms of network parameters
    pars = Flux.params(network)
    ns = 0
    for x in pars
        ns += norm(x)
    end
    return ns
end
#----------------- GPU runs -----------------
#first nruns networks
pars_norm2 = 0
for j = 1 : nruns
    net2 = Chain(Conv((3,3), 3 => bn_ch, pad=1), BatchNorm(bn_ch), Conv((3,3), bn_ch => bn_ch, pad=1))
    Flux.loadmodel!(net2, net) #load initial parameters of net
    net2 = net2 |> gpu
    optim!(net2, steps, (n)->loss(n, x_init, gpu(d)), optimiser)
    global pars_norm2 += get_norm(net2)
end
#second nruns networks
pars_norm3 = 0
for j = 1 : nruns
    net3 = Chain(Conv((3,3), 3 => bn_ch, pad=1), BatchNorm(bn_ch), Conv((3,3), bn_ch => bn_ch, pad=1))
    Flux.loadmodel!(net3, net) #load initial parameters of net
    net3 = net3 |> gpu
    optim!(net3, steps, (n)->loss(n, x_init, gpu(d)), optimiser)
    global pars_norm3 += get_norm(net3)
end
#----------- CPU runs ---------------------------------------
#first nruns networks
pars_normc2 = 0
for j = 1 : nruns
    netc2 = Chain(Conv((3,3), 3 => bn_ch, pad=1), BatchNorm(bn_ch), Conv((3,3), bn_ch => bn_ch, pad=1))
    Flux.loadmodel!(netc2, net) #load initial parameters of net
    optim!(netc2, steps, (n)->loss(n, cpu(x_init), d), optimiser) #run optimisation
    global pars_normc2 += get_norm(netc2) #add norm of parameters after optimisation
end
#second nruns networks
pars_normc3 = 0
for j = 1 : nruns
    netc3 = Chain(Conv((3,3), 3 => bn_ch, pad=1), BatchNorm(bn_ch), Conv((3,3), bn_ch => bn_ch, pad=1))
    Flux.loadmodel!(netc3, net) #load initial parameters of net
    optim!(netc3, steps, (n)->loss(n, cpu(x_init), d), optimiser)
    global pars_normc3 += get_norm(netc3)
end
#-------------------------
println("Difference between parameter norms of first ", nruns, " and second ", nruns, " optimisations on GPU: ", pars_norm2 - pars_norm3)
println("Difference between parameter norms of first ", nruns, " and second ", nruns, " optimisations on CPU: ", pars_normc2 - pars_normc3)
Running this script returns
Difference between parameter norms of first 10 and second 10 optimisations on GPU: -0.004699707
Difference between parameter norms of first 10 and second 10 optimisations on CPU: 0.0
which shows that optimising the same network, from the same initial parameters and with no stochastic influences, does not give the same result on the GPU. On the CPU, however, the runs agree exactly. Any idea what is going on?
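To narrow it down, here is a minimal sketch (building on the MWE above; it assumes net, x_init, d, bn_ch and loss are already defined, and I have not verified its output) that checks whether a single gradient evaluation already differs between two identically initialised copies on the GPU:
using Flux
using CUDA
#two fresh copies with the same parameters as net, both moved to the GPU
neta = Chain(Conv((3,3), 3 => bn_ch, pad=1), BatchNorm(bn_ch), Conv((3,3), bn_ch => bn_ch, pad=1))
netb = Chain(Conv((3,3), 3 => bn_ch, pad=1), BatchNorm(bn_ch), Conv((3,3), bn_ch => bn_ch, pad=1))
Flux.loadmodel!(neta, net)
Flux.loadmodel!(netb, net)
neta = neta |> gpu
netb = netb |> gpu
dg = gpu(d)
#one gradient evaluation for each copy, using the same input and target
ga = Flux.gradient(() -> loss(neta, x_init, dg), Flux.params(neta))
gb = Flux.gradient(() -> loss(netb, x_init, dg), Flux.params(netb))
#largest absolute difference per parameter array (all zeros would mean the gradients agree)
for (pa, pb) in zip(Flux.params(neta), Flux.params(netb))
    println(maximum(abs.(cpu(ga[pa]) .- cpu(gb[pb]))))
end
If those maxima are already nonzero after a single forward/backward pass, the differences would come from the gradient computation itself (presumably the cuDNN convolution kernels) rather than from Adam accumulating them.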
Edit: it seems that this happens only on an NVIDIA TITAN V.
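Another thing I might try (just a guess, I am not sure it is related to the TITAN V behaviour): forcing CUDA.jl into its strictest math mode before running the script above, which as far as I understand asks CUBLAS/cuDNN to avoid tensor-core and other fast-math code paths:
using CUDA
#request the strictest math mode before running the MWE above
#(whether this has any effect on the nondeterminism is an open question)
CUDA.math_mode!(CUDA.PEDANTIC_MATH)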