Doubled memory usage and computation time after update to 1.6.1 from 1.5.4

Hi, today I updated quite a large code base to Julia 1.6. I noticed that both computation time and memory usage (measured with TimerOutputs.jl) doubled.
I haven’t been able to produce an MWE yet; however, I found that accessing a certain Vector causes 0 allocations on Julia 1.5 but more than 0 on Julia 1.6:

Julia 1.5.4

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     10.490 ns (0.00% GC)
  median time:      10.591 ns (0.00% GC)
  mean time:        10.628 ns (0.00% GC)
  maximum time:     21.241 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

Julia 1.6.1

BenchmarkTools.Trial: 
  memory estimate:  192 bytes
  allocs estimate:  1
  --------------
  minimum time:     34.521 ns (0.00% GC)
  median time:      36.576 ns (0.00% GC)
  mean time:        40.875 ns (6.89% GC)
  maximum time:     779.796 ns (94.76% GC)
  --------------
  samples:          10000
  evals/sample:     993

The typed and lowered code for accessing this vector are identical on both versions; however, the `@code_llvm` and `@code_native` output looks different:

LLVM Julia 1.5.4

;  @ REPL[15]:1 within `f'
define nonnull %jl_value_t* @julia_f_1415([1 x %jl_value_t*]* nocapture nonnull readonly dereferenceable(8)) {
top:
; ┌ @ tuple.jl:24 within `getindex'
   %1 = getelementptr inbounds [1 x %jl_value_t*], [1 x %jl_value_t*]* %0, i64 0, i64 0
   %2 = load %jl_value_t*, %jl_value_t** %1, align 8
; └
  ret %jl_value_t* %2
}

LLVM Julia 1.6.1

;  @ REPL[23]:1 within `f'
define void @julia_f_975({ [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }* noalias nocapture sret %0, [3 x {}*]* noalias nocapture %1, [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }]* nocapture nonnull readonly align 8 dereferenceable(184) %2) {
top:
  %3 = bitcast [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }]* %2 to i64*
  %4 = load i64, i64* %3, align 8
  %5 = getelementptr inbounds [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }], [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }]* %2, i64 0, i64 0, i32 1, i64 0, i32 8
  %6 = bitcast {}** %5 to i64*
  %7 = load i64, i64* %6, align 8
  %8 = getelementptr inbounds [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }], [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }]* %2, i64 0, i64 0, i32 1, i64 1, i32 8
  %9 = bitcast {}** %8 to i64*
  %10 = load i64, i64* %9, align 8
  %11 = bitcast [3 x {}*]* %1 to i64*
  store i64 %4, i64* %11, align 8
  %12 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 1
  %13 = bitcast {}** %12 to i64*
  store i64 %7, i64* %13, align 8
  %14 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 2
  %15 = bitcast {}** %14 to i64*
  store i64 %10, i64* %15, align 8
  %16 = bitcast { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }* %0 to i8*
  %17 = bitcast [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }]* %2 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(184) %16, i8* nonnull align 8 dereferenceable(184) %17, i64 184, i1 false)
  ret void
}

Native Julia 1.5.4

        .text
; ┌ @ REPL[15]:1 within `f'
; │┌ @ tuple.jl:24 within `getindex'
        movq    (%rdi), %rax
; │└
        retq
        nopw    %cs:(%rax,%rax)
; └

Native Julia 1.6.1

        .text
; ┌ @ REPL[23]:1 within `f'
        movq    %rdi, %rax
        movq    (%rdx), %rcx
        movq    72(%rdx), %rdi
        movq    160(%rdx), %r8
        movq    %rcx, (%rsi)
        movq    %rdi, 8(%rsi)
        movq    %r8, 16(%rsi)
        vmovups 152(%rdx), %ymm0
        vmovups %ymm0, 152(%rax)
        vmovups 128(%rdx), %ymm0
        vmovups %ymm0, 128(%rax)
        vmovups (%rdx), %ymm0
        vmovups 32(%rdx), %ymm1
        vmovups 64(%rdx), %ymm2
        vmovups 96(%rdx), %ymm3
        vmovups %ymm3, 96(%rax)
        vmovups %ymm2, 64(%rax)
        vmovups %ymm1, 32(%rax)
        vmovups %ymm0, (%rax)
        vzeroupper
        retq
        nopw    %cs:(%rax,%rax)
; └

Any ideas why and how they differ? I’ll keep trying to produce an MWE.