I have tried to implement a 2D locally connected layer (a convolution with non-shared weights) in Flux, restricted to filter size (1,1). That is, the linear combinations are taken only over the third (channel/variable) dimension. Since this type of layer involves many weight parameters, an implementation that is efficient in both speed and memory usage is essential. Is there any scope for improving the following code, in particular on the GPU?

```
# Forward pass of the locally connected 1×1 layer.
#
# Shapes: size(x) = (w, h, cin, n), size(a.W) = (w, h, cin, cout); the bias `a.b` is
# indexed as a 4-d array whose last dimension is the output channel.
# For every output channel `i`, the weights are multiplied element-wise with `x` and
# summed over the channel dimension (dim 3), and the bias slice is added. The per-channel
# results are concatenated along dim 3 and passed element-wise through `a.σ`.
function (a::Local2D)(x::AbstractArray)
    W, b, σ = a.W, a.b, a.σ
    cout = size(W, 4)
    # `view` instead of `W[:, :, :, i]` / `b[:, :, :, i]`: indexing with `:` copies the
    # slice, i.e. one extra weight array and one extra bias array per output channel on
    # every call. A view references the parent array's storage instead.
    outs = [sum(view(W, :, :, :, i) .* x; dims=3) .+ view(b, :, :, :, i) for i in 1:cout]
    return σ.(cat(outs...; dims=3))
end
```

The complete code, including a test example (it assumes `using Flux` has already been run):

```
"""
    Local2D(W, b, σ)

Locally connected 2D layer with filter size (1, 1): every spatial position has its own
weights, and the layer only mixes values along the channel dimension.

# Fields
- `W`: weight array; the forward pass documents `size(W) = (w, h, cin, cout)`.
- `b`: bias array, indexed by the forward pass as a 4-d array whose last dimension is the
  output channel.
- `σ`: activation function, applied element-wise to the layer output.
"""
struct Local2D{WT<:AbstractArray,BT<:AbstractArray,A}
    W::WT
    b::BT
    σ::A
end
"""
    Local2D(W, b)

Construct a `Local2D` layer from weights `W` and bias `b`, using `identity` as the
activation.
"""
function Local2D(W, b)
    return Local2D(W, b, identity)
end
"""
    Local2D(w, h, cin, cout, σ = identity; initW = Flux.glorot_uniform, initb = zeros)

Construct a `Local2D` layer with freshly initialised parameters.

# Arguments
- `w`, `h`: size of the first two (spatial) dimensions.
- `cin`, `cout`: number of input and output channels.
- `σ`: activation function (default `identity`).

# Keywords
- `initW`: called as `initW(w, h, cin, cout)` to create the weights.
- `initb`: called as `initb(Float32, w, h, 1, cout)` to create the bias.
"""
function Local2D(w::Integer, h::Integer, cin::Integer, cout::Integer, σ = identity;
                 initW = Flux.glorot_uniform, initb = zeros)
    # NOTE(review): the default `initW` receives the full (w, h, cin, cout) shape. If it
    # derives fan-in/fan-out from `w` and `h` as if they were a kernel size, the initial
    # weights may be scaled smaller than intended for this layer — confirm.
    weights = initW(w, h, cin, cout)
    bias = initb(Float32, w, h, 1, cout)
    return Local2D(weights, bias, σ)
end
# Register `Local2D` with Flux's functor machinery so that Flux can traverse its fields
# (e.g. for `Flux.params(model)` and `model |> device`, both used in the example below).
Flux.@functor Local2D
# Forward pass of the locally connected 1×1 layer.
#
# Shapes: size(x) = (w, h, cin, n), size(a.W) = (w, h, cin, cout); the bias `a.b` is
# indexed as a 4-d array whose last dimension is the output channel.
# For every output channel `i`, the weights are multiplied element-wise with `x` and
# summed over the channel dimension (dim 3), and the bias slice is added. The per-channel
# results are concatenated along dim 3 and passed element-wise through `a.σ`.
function (a::Local2D)(x::AbstractArray)
    W, b, σ = a.W, a.b, a.σ
    cout = size(W, 4)
    # `view` instead of `W[:, :, :, i]` / `b[:, :, :, i]`: indexing with `:` copies the
    # slice, i.e. one extra weight array and one extra bias array per output channel on
    # every call. A view references the parent array's storage instead.
    outs = [sum(view(W, :, :, :, i) .* x; dims=3) .+ view(b, :, :, :, i) for i in 1:cout]
    return σ.(cat(outs...; dims=3))
end
# Example: time a `Flux.train!` call on random data.
device = gpu

# Problem size: n samples on a d×d grid, cin input channels, cout output channels.
n = 1024
d = 256
cin = 4
cout = 2

x = device(rand(Float32, d, d, cin, n));
y = device(rand(Float32, d, d, cout, n));
# NOTE(review): `x` and `y` have already been passed through `device`, so passing the
# loader through `device` as well looks redundant — confirm for the Flux version in use.
trdata = device(Flux.Data.DataLoader((x, y), batchsize=32));
model = device(Local2D(d, d, cin, cout, identity));

# Mean squared error between the model output and the target.
function loss(x, y)
    return Flux.mse(model(x), y)
end

# NOTE(review): the first call of any Julia function includes compilation time, so a
# single `@time` here presumably measures compilation as well as training.
@time Flux.train!(loss, Flux.params(model), trdata, ADAM())
```