Can't train a simple net, what am I doing wrong?

Here is a minimal working example:

using Flux

function ResNetBlock(n::Int)
    return Chain(
        Conv((3, 3), n => n, relu; pad=1, stride=1),
        BatchNorm(n, relu),
        Conv((3, 3), n => n; pad=1, stride=1),
        BatchNorm(n),
    )
end

function simplenet(n_filter)
    return Chain(
        Conv((3, 3), 4 => n_filter, relu; stride=1, pad=1),
        Conv((3, 3), n_filter => n_filter, relu; stride=1, pad=1),
        Conv((3, 3), n_filter => n_filter, relu; stride=1, pad=1),
        Conv((3, 3), n_filter => n_filter, relu; stride=1, pad=1),
        Conv((3, 3), n_filter => n_filter, relu; stride=1, pad=1),
        Conv((3, 3), n_filter => n_filter, relu; stride=1, pad=1),
        Conv((1, 1), n_filter => 32, relu; stride=1, pad=0),
        Flux.flatten,
        Dense(32 * 49, 128, relu),
        Dense(128, 1, tanh),
    )
end

function resnet(n_filter)
    return Chain(
        Conv((3, 3), 4 => n_filter, relu; stride=1, pad=1),
        ResNetBlock(n_filter),
        ResNetBlock(n_filter),
        ResNetBlock(n_filter),
        ResNetBlock(n_filter),
        Conv((1, 1), n_filter => 32, relu; stride=1, pad=0),
        Flux.flatten,
        Dense(32 * 49, 128, relu),
        Dense(128, 1, tanh),
    )
end

function loss(model, x, y)
    return Flux.mse(model(x), y)
end

data = [(rand(Float32, 7, 7, 4, 1), 0.7f0) for k in 1:10]

model1 = simplenet(4)
opt1 = Flux.setup(Adam(), model1)
model2 = resnet(4)
opt2 = Flux.setup(Adam(), model2)

for k in 1:10; Flux.train!(loss, model1, data, opt1); end
for k in 1:10; Flux.train!(loss, model2, data, opt2); end

First, I know this is not truly a ResNet; I arrived at it while stripping my problem down. If you evaluate model1(data[4][1]), the result is close to 0.7 as expected, but the same call with model2 always gives about 0.999 and goes to 1 with more epochs. The same happens on the GPU.
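Concretely, this is the check I'm running (the exact numbers will vary with the random weights and data, these are just representative of what I see):

model1(data[4][1])  # 1×1 Matrix, ≈ 0.7, as expected
model2(data[4][1])  # 1×1 Matrix, ≈ 0.999, creeping towards 1.0 with more epochs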

I'm using the latest Flux.

I recall that BatchNorm has some degenerate behaviour if you train it with only one sample per batch. Maybe try replacing it with LayerNorm, or removing it altogether for your tests, and see if that makes a difference.
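For example, a quick test would be a variant of your block with the norm layers dropped entirely (just a sketch; ResNetBlockNoNorm is an illustrative name, not anything from Flux):

# Same block as ResNetBlock above but without BatchNorm, to test whether
# the normalisation is what pushes the output towards 1. Note that the
# original block also applies relu twice in a row (once inside the Conv
# and again in BatchNorm(n, relu)); here it is applied once per Conv.
function ResNetBlockNoNorm(n::Int)
    return Chain(
        Conv((3, 3), n => n, relu; pad=1, stride=1),
        Conv((3, 3), n => n; pad=1, stride=1),
    )
end

If that trains like your simplenet, the problem is the BatchNorm statistics with batch size 1 rather than the residual structure.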