Hi, today I updated a fairly large code base to Julia 1.6 and noticed that both computation time and memory usage (measured with TimerOutputs.jl) doubled.
I haven't been able to produce an MWE yet; however, I found that accessing a certain Vector causes 0 allocations on Julia 1.5 but a nonzero number on 1.6:
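For reference, the benchmark is essentially the sketch below. Element, its fields, and container are placeholders I made up for this post, not the real definitions; the actual types come from our code base (roughly one reference field plus a number of Float64/Int fields, as the LLVM dumps further down suggest).

using BenchmarkTools

# Hypothetical stand-in for the real element type from our code base.
struct Element
    data::Vector{Float64}   # reference-typed field
    a::Float64
    b::Float64
    n::Int
end

# The accessor under test boils down to a single getindex call.
f(x) = x[1]

const container = (Element([1.0, 2.0], 3.0, 4.0, 5),)

@benchmark f($container)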
Julia 1.5.4
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 10.490 ns (0.00% GC)
median time: 10.591 ns (0.00% GC)
mean time: 10.628 ns (0.00% GC)
maximum time: 21.241 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 999
Julia 1.6.1
BenchmarkTools.Trial:
memory estimate: 192 bytes
allocs estimate: 1
--------------
minimum time: 34.521 ns (0.00% GC)
median time: 36.576 ns (0.00% GC)
mean time: 40.875 ns (6.89% GC)
maximum time: 779.796 ns (94.76% GC)
--------------
samples: 10000
evals/sample: 993
The typed and lowered code for accessing this vector are identical between the two versions; however, the LLVM and native code look different:
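For completeness, the dumps below were produced with the standard reflection macros, roughly like this (using the hypothetical f and container from the sketch above):

@code_lowered f(container)   # same on 1.5.4 and 1.6.1
@code_typed   f(container)   # same on 1.5.4 and 1.6.1
@code_llvm    f(container)   # differs, shown below
@code_native  f(container)   # differs, shown below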
LLVM Julia 1.5.4
; @ REPL[15]:1 within `f'
define nonnull %jl_value_t* @julia_f_1415([1 x %jl_value_t*]* nocapture nonnull readonly dereferenceable(8)) {
top:
; ┌ @ tuple.jl:24 within `getindex'
%1 = getelementptr inbounds [1 x %jl_value_t*], [1 x %jl_value_t*]* %0, i64 0, i64 0
%2 = load %jl_value_t*, %jl_value_t** %1, align 8
; └
ret %jl_value_t* %2
}
LLVM Julia 1.6.1
; @ REPL[23]:1 within `f'
define void @julia_f_975({ [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }* noalias nocapture sret %0, [3 x {}*]* noalias nocapture %1, [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }]* nocapture nonnull readonly align 8 dereferenceable(184) %2) {
top:
%3 = bitcast [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }]* %2 to i64*
%4 = load i64, i64* %3, align 8
%5 = getelementptr inbounds [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }], [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }]* %2, i64 0, i64 0, i32 1, i64 0, i32 8
%6 = bitcast {}** %5 to i64*
%7 = load i64, i64* %6, align 8
%8 = getelementptr inbounds [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }], [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }]* %2, i64 0, i64 0, i32 1, i64 1, i32 8
%9 = bitcast {}** %8 to i64*
%10 = load i64, i64* %9, align 8
%11 = bitcast [3 x {}*]* %1 to i64*
store i64 %4, i64* %11, align 8
%12 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 1
%13 = bitcast {}** %12 to i64*
store i64 %7, i64* %13, align 8
%14 = getelementptr inbounds [3 x {}*], [3 x {}*]* %1, i64 0, i64 2
%15 = bitcast {}** %14 to i64*
store i64 %10, i64* %15, align 8
%16 = bitcast { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }* %0 to i8*
%17 = bitcast [1 x { [1 x {}*], [2 x { double, double, double, double, double, double, i64, double, {}*, double, double }] }]* %2 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 dereferenceable(184) %16, i8* nonnull align 8 dereferenceable(184) %17, i64 184, i1 false)
ret void
}
Native Julia 1.5.4
.text
; ┌ @ REPL[15]:1 within `f'
; │┌ @ tuple.jl:24 within `getindex'
movq (%rdi), %rax
; │└
retq
nopw %cs:(%rax,%rax)
; └
Native Julia 1.6.1
.text
; ┌ @ REPL[23]:1 within `f'
movq %rdi, %rax
movq (%rdx), %rcx
movq 72(%rdx), %rdi
movq 160(%rdx), %r8
movq %rcx, (%rsi)
movq %rdi, 8(%rsi)
movq %r8, 16(%rsi)
vmovups 152(%rdx), %ymm0
vmovups %ymm0, 152(%rax)
vmovups 128(%rdx), %ymm0
vmovups %ymm0, 128(%rax)
vmovups (%rdx), %ymm0
vmovups 32(%rdx), %ymm1
vmovups 64(%rdx), %ymm2
vmovups 96(%rdx), %ymm3
vmovups %ymm3, 96(%rax)
vmovups %ymm2, 64(%rax)
vmovups %ymm1, 32(%rax)
vmovups %ymm0, (%rax)
vzeroupper
retq
nopw %cs:(%rax,%rax)
; └
Any ideas why and how they differ? I'll keep trying to produce an MWE.
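In the meantime, here is the quick check I'm using on individual calls while hunting for an MWE (again with the hypothetical f and container from above):

# Wrap the call so the (possibly boxed) return value is discarded,
# run once to compile, then measure allocations of a single call.
g(x) = (f(x); nothing)
g(container)
@allocated g(container)   # 0 on 1.5.4, 192 bytes on 1.6.1 for the real code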