I have been trying to create ResNet-18 on Julia. I have managed to create a functional one but it is slow and uses lot of GPU memory. This is puzzling since similar architecture on Tensorflow runs 5-10 times faster and uses much less memory. Also, there is a implementation of VGG-16, which I found that also uses much less memory than mine implementation of ResNet-18. What I am doing wrong?

The implementation of VGG-16 that doesn’t use much memory: [Flux] cifar10 upgrade – Julia ?

My ResNet-18:

```
using Statistics
using CuArrays
using Zygote
using Flux, Flux.Optimise
using Metalhead, Images
using Metalhead: trainimgs
using Images.ImageCore
using Flux: onehotbatch, onecold
using Base.Iterators: partition
Metalhead.download(Metalhead.CIFAR10)
X = trainimgs(Metalhead.CIFAR10)
labels = onehotbatch([X[i].ground_truth.class for i in 1:50000],1:10)
image(x) = x.img
ground_truth(x) = x.ground_truth
image.(X[rand(1:end, 10)])
getarray(X) = float.(permutedims(channelview(X), (2, 3, 1)))
imgs = [getarray(X[i].img) for i in 1:50000]
batch_size = 1
train = ([(cat(imgs[i]..., dims = 4), labels[:,i]) for i in partition(1:49000, batch_size)])
train_data = train |>
x -> map(y->gpu.(y),x)
valset = 49001:50000
valX = cat(imgs[valset]..., dims = 4)
valY = labels[:, valset]
identity_layer(n) = Chain(
Conv((3,3), n=>n, pad = (1,1), stride = (1,1)),
BatchNorm(n,relu),
Conv((3,3), n=>n, pad = (1,1), stride = (1,1)),
BatchNorm(n,relu)
)
convolution_layer(n) = Chain(
Conv((3,3), n=> 2*n, pad = (1,1), stride = (2,2)),
BatchNorm(2*n,relu),
Conv((3,3), 2*n=>2*n, pad = (1,1), stride = (1,1)),
BatchNorm(2*n,relu)
)
simple_convolution(n) = Chain(
Conv((1,1), n=>n, pad = (1,1), stride = (2,2)),
BatchNorm(n,relu)
)
m_filter(n) = Chain(
Conv((3,3), n=>2*n, pad = (1,1), stride = (2,2)),
BatchNorm(2*n,relu),
) |> gpu
struct Combinator
conv::Chain
end |> gpu
Combinator(n) = Combinator(m_filter(n))
function (op::Combinator)(x, y)
z = op.conv(y)
return x + z
end
n = 7
m = Chain(
ConvTranspose((n, n), 3 => 3, stride = n),
Conv((7,7), 3=>64, pad = (3,3), stride = (2,2)),
BatchNorm(64,relu),
MaxPool((3,3), pad = (1,1), stride = (2,2)),
SkipConnection(identity_layer(64), (variable_1, variable_2) -> variable_1 + variable_2),
SkipConnection(identity_layer(64), (variable_1, variable_2) -> variable_1 + variable_2),
SkipConnection(convolution_layer(64), Combinator(64)),
SkipConnection(identity_layer(128), (variable_1, variable_2) -> variable_1 + variable_2),
SkipConnection(convolution_layer(128), Combinator(128)),
SkipConnection(identity_layer(256), (variable_1, variable_2) -> variable_1 + variable_2),
SkipConnection(convolution_layer(256), Combinator(256)),
SkipConnection(identity_layer(512), (variable_1, variable_2) -> variable_1 + variable_2),
MeanPool((7,7)),
x -> reshape(x, :, size(x,4)),
Dense(512, 10),
softmax,
) |> gpu
using Flux: crossentropy, Momentum, @epochs
loss(x, y) = sum(crossentropy(m(x), y))
opt = Momentum(0.01)
@epochs 5 train!(loss, params(m), train_data, opt)
```
```