This
```julia
import BenchmarkTools: @btime
import InteractiveUtils: @code_llvm
"""
    countnothing1(xs)

Return how many elements of the iterable `xs` are `nothing`.

The test is written inline as `x === nothing` (no helper call); this is the
hand-inlined variant of the benchmark.
"""
function countnothing1(xs)
    hits = 0
    for element in xs
        # A `Bool` added to an `Int` promotes to `Int`, so `hits` stays an `Int`.
        hits = hits + (element === nothing)
    end
    return hits
end
"""
    countnothing2(xs)

Return how many elements of the iterable `xs` are `nothing`.

The test goes through `isnothing(x)` rather than an inline `===` comparison.
"""
function countnothing2(xs)
    hits = 0
    for element in xs
        # `isnothing` yields a `Bool`; adding it to an `Int` keeps `hits` an `Int`.
        hits = hits + isnothing(element)
    end
    return hits
end
"""
    myisnothing(x)

Return `true` if `x` is identical to `nothing`, `false` otherwise.

Marked `@inline`; used by the benchmark as a locally defined stand-in for
`isnothing`.
"""
@inline function myisnothing(x)
    return x === nothing
end
"""
    countnothing3(xs)

Return how many elements of the iterable `xs` are `nothing`.

The test goes through the `@inline` helper `myisnothing(x)`, so any inlining is
done by the compiler rather than by hand.
"""
function countnothing3(xs)
    hits = 0
    for element in xs
        # `myisnothing` yields a `Bool`; adding it to an `Int` keeps `hits` an `Int`.
        hits = hits + myisnothing(element)
    end
    return hits
end
"""
    bench()

Build a one-million-element `Union{Nothing, Int64}` vector, time the three
`countnothing*` functions on it with `@btime`, and print the LLVM IR generated
for `countnothing1` and `countnothing3`.
"""
function bench()
    # Even positions hold `nothing`, odd positions hold `1`.
    data = Union{Nothing, Int64}[iseven(k) ? nothing : 1 for k in 1:1_000_000]

    # `\$data` interpolates the array into each benchmark expression.
    @btime countnothing1($data)
    @btime countnothing2($data)
    @btime countnothing3($data)

    # Dump the IR of the hand-inlined and the helper-based variants for comparison.
    @code_llvm countnothing1(data)
    @code_llvm countnothing3(data)
end

bench()
```
produces the timings
```
214.857 μs (0 allocations: 0 bytes)
448.005 μs (0 allocations: 0 bytes)
447.848 μs (0 allocations: 0 bytes)
```
on Julia 1.4.0 and
```
214.870 μs (0 allocations: 0 bytes)
572.832 μs (0 allocations: 0 bytes)
572.833 μs (0 allocations: 0 bytes)
```
on current master. The `countnothing3()` version is there only to rule out anything unusual about the definition of `isnothing()`; it shows the same slowdown, so that is not the cause. The `@code_llvm` output when using `x === nothing` is
```
; @ /Users/rauli/bench.jl:5 within `countnothing1'
define i64 @julia_countnothing1_17323(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40)) {
top:
; @ /Users/rauli/bench.jl:6 within `countnothing1'
; β @ array.jl:763 within `iterate' @ array.jl:763
; ββ @ array.jl:221 within `length'
%1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%2 = bitcast %jl_value_t addrspace(11)* %1 to %jl_array_t addrspace(11)*
%3 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %2, i64 0, i32 1
%4 = load i64, i64 addrspace(11)* %3, align 8
; ββ
%5 = icmp slt i64 %4, 1
br i1 %5, label %L46, label %L14
L14: ; preds = %top
; ββ @ array.jl:787 within `getindex'
%6 = bitcast %jl_value_t addrspace(11)* %1 to [1 x i64] addrspace(13)* addrspace(11)*
%7 = load [1 x i64] addrspace(13)*, [1 x i64] addrspace(13)* addrspace(11)* %6, align 8
%8 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %2, i64 0, i32 4
%9 = load i32, i32 addrspace(11)* %8, align 4
%10 = bitcast %jl_value_t addrspace(11)* %1 to %jl_value_t addrspace(10)* addrspace(11)*
%11 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %10, i64 4
%12 = bitcast %jl_value_t addrspace(10)* addrspace(11)* %11 to i64 addrspace(11)*
%13 = load i64, i64 addrspace(11)* %12, align 8
%14 = zext i32 %9 to i64
%15 = sub i64 %13, %14
%16 = getelementptr inbounds [1 x i64], [1 x i64] addrspace(13)* %7, i64 %15
%17 = bitcast [1 x i64] addrspace(13)* %16 to i8 addrspace(13)*
%18 = sext i32 %9 to i64
%19 = getelementptr inbounds i8, i8 addrspace(13)* %17, i64 %18
; ββ
%tindex_phi3.in52 = load i8, i8 addrspace(13)* %19, align 1
; @ /Users/rauli/bench.jl:7 within `countnothing1'
; β @ int.jl:858 within `+'
; ββ @ int.jl:442 within `rem'
; βββ @ number.jl:7 within `convert'
; ββββ @ boot.jl:707 within `Int64'
; βββββ @ boot.jl:634 within `toInt64'
%20 = xor i8 %tindex_phi3.in52, 1
%21 = zext i8 %20 to i64
; βββββ
; β @ array.jl:763 within `iterate'
; ββ @ int.jl:416 within `<' @ int.jl:409
%22 = icmp eq i64 %4, 1
; ββ
br i1 %22, label %L46, label %L40.preheader
L40.preheader: ; preds = %L14
; ββ @ array.jl:787 within `getindex'
%23 = add i64 %4, -1
%min.iters.check = icmp ult i64 %23, 16
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %L40.preheader
%n.vec = and i64 %23, -16
%ind.end = or i64 %n.vec, 1
%ind.end57 = or i64 %n.vec, 2
%24 = insertelement <4 x i64> <i64 undef, i64 0, i64 0, i64 0>, i64 %21, i32 0
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x i64> [ %24, %vector.ph ], [ %41, %vector.body ]
%vec.phi61 = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %42, %vector.body ]
%vec.phi62 = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %43, %vector.body ]
%vec.phi63 = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %44, %vector.body ]
%offset.idx = or i64 %index, 1
%25 = getelementptr inbounds i8, i8 addrspace(13)* %19, i64 %offset.idx
; ββ
%26 = bitcast i8 addrspace(13)* %25 to <4 x i8> addrspace(13)*
%wide.load = load <4 x i8>, <4 x i8> addrspace(13)* %26, align 1
%27 = getelementptr inbounds i8, i8 addrspace(13)* %25, i64 4
%28 = bitcast i8 addrspace(13)* %27 to <4 x i8> addrspace(13)*
%wide.load71 = load <4 x i8>, <4 x i8> addrspace(13)* %28, align 1
%29 = getelementptr inbounds i8, i8 addrspace(13)* %25, i64 8
%30 = bitcast i8 addrspace(13)* %29 to <4 x i8> addrspace(13)*
%wide.load72 = load <4 x i8>, <4 x i8> addrspace(13)* %30, align 1
%31 = getelementptr inbounds i8, i8 addrspace(13)* %25, i64 12
%32 = bitcast i8 addrspace(13)* %31 to <4 x i8> addrspace(13)*
%wide.load73 = load <4 x i8>, <4 x i8> addrspace(13)* %32, align 1
; @ /Users/rauli/bench.jl:7 within `countnothing1'
; β @ int.jl:858 within `+'
; ββ @ int.jl:442 within `rem'
; βββ @ number.jl:7 within `convert'
; ββββ @ boot.jl:707 within `Int64'
; βββββ @ boot.jl:634 within `toInt64'
%33 = xor <4 x i8> %wide.load, <i8 1, i8 1, i8 1, i8 1>
%34 = xor <4 x i8> %wide.load71, <i8 1, i8 1, i8 1, i8 1>
%35 = xor <4 x i8> %wide.load72, <i8 1, i8 1, i8 1, i8 1>
%36 = xor <4 x i8> %wide.load73, <i8 1, i8 1, i8 1, i8 1>
%37 = zext <4 x i8> %33 to <4 x i64>
%38 = zext <4 x i8> %34 to <4 x i64>
%39 = zext <4 x i8> %35 to <4 x i64>
%40 = zext <4 x i8> %36 to <4 x i64>
; βββββ
; β @ int.jl:860 within `+' @ int.jl:53
%41 = add <4 x i64> %vec.phi, %37
%42 = add <4 x i64> %vec.phi61, %38
%43 = add <4 x i64> %vec.phi62, %39
%44 = add <4 x i64> %vec.phi63, %40
%index.next = add i64 %index, 16
%45 = icmp eq i64 %index.next, %n.vec
br i1 %45, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%bin.rdx = add <4 x i64> %42, %41
%bin.rdx74 = add <4 x i64> %43, %bin.rdx
%bin.rdx75 = add <4 x i64> %44, %bin.rdx74
%rdx.shuf = shufflevector <4 x i64> %bin.rdx75, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx76 = add <4 x i64> %bin.rdx75, %rdx.shuf
%rdx.shuf77 = shufflevector <4 x i64> %bin.rdx76, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx78 = add <4 x i64> %bin.rdx76, %rdx.shuf77
%46 = extractelement <4 x i64> %bin.rdx78, i32 0
%cmp.n = icmp eq i64 %23, %n.vec
; β
; β @ array.jl:763 within `iterate'
; ββ @ array.jl:787 within `getindex'
br i1 %cmp.n, label %L46, label %scalar.ph
scalar.ph: ; preds = %middle.block, %L40.preheader
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L40.preheader ]
%bc.resume.val56 = phi i64 [ %ind.end57, %middle.block ], [ 2, %L40.preheader ]
%bc.merge.rdx = phi i64 [ %46, %middle.block ], [ %21, %L40.preheader ]
br label %L40
L40: ; preds = %scalar.ph, %L40
%47 = phi i64 [ %value_phi553, %L40 ], [ %bc.resume.val, %scalar.ph ]
%48 = phi i64 [ %53, %L40 ], [ %bc.merge.rdx, %scalar.ph ]
%value_phi553 = phi i64 [ %50, %L40 ], [ %bc.resume.val56, %scalar.ph ]
%49 = getelementptr inbounds i8, i8 addrspace(13)* %19, i64 %47
; ββ
; ββ @ int.jl:53 within `+'
%50 = add i64 %value_phi553, 1
; ββ
%tindex_phi3.in = load i8, i8 addrspace(13)* %49, align 1
; @ /Users/rauli/bench.jl:7 within `countnothing1'
; β @ int.jl:858 within `+'
; ββ @ int.jl:442 within `rem'
; βββ @ number.jl:7 within `convert'
; ββββ @ boot.jl:707 within `Int64'
; βββββ @ boot.jl:634 within `toInt64'
%51 = xor i8 %tindex_phi3.in, 1
%52 = zext i8 %51 to i64
; βββββ
; β @ int.jl:860 within `+' @ int.jl:53
%53 = add i64 %48, %52
; β
; β @ array.jl:763 within `iterate'
; ββ @ int.jl:416 within `<' @ int.jl:409
%54 = icmp ult i64 %value_phi553, %4
; ββ
br i1 %54, label %L40, label %L46
L46: ; preds = %L40, %middle.block, %L14, %top
%value_phi10 = phi i64 [ 0, %top ], [ %21, %L14 ], [ %53, %L40 ], [ %46, %middle.block ]
; β
; @ /Users/rauli/bench.jl:9 within `countnothing1'
ret i64 %value_phi10
}
```
and when using `myisnothing(x)` it is
```
; @ /Users/rauli/bench.jl:23 within `countnothing3'
define i64 @julia_countnothing3_17327(%jl_value_t addrspace(10)* nonnull align 16 dereferenceable(40)) {
top:
; @ /Users/rauli/bench.jl:24 within `countnothing3'
; β @ array.jl:763 within `iterate' @ array.jl:763
; ββ @ array.jl:221 within `length'
%1 = addrspacecast %jl_value_t addrspace(10)* %0 to %jl_value_t addrspace(11)*
%2 = bitcast %jl_value_t addrspace(11)* %1 to %jl_array_t addrspace(11)*
%3 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %2, i64 0, i32 1
%4 = load i64, i64 addrspace(11)* %3, align 8
; ββ
%5 = icmp slt i64 %4, 1
br i1 %5, label %L54, label %L14.L14.split_crit_edge
L14.L14.split_crit_edge: ; preds = %top
; ββ @ array.jl:787 within `getindex'
%6 = bitcast %jl_value_t addrspace(11)* %1 to [1 x i64] addrspace(13)* addrspace(11)*
%7 = load [1 x i64] addrspace(13)*, [1 x i64] addrspace(13)* addrspace(11)* %6, align 8
%8 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %2, i64 0, i32 4
%9 = load i32, i32 addrspace(11)* %8, align 4
%10 = bitcast %jl_value_t addrspace(11)* %1 to %jl_value_t addrspace(10)* addrspace(11)*
%11 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %10, i64 4
%12 = bitcast %jl_value_t addrspace(10)* addrspace(11)* %11 to i64 addrspace(11)*
%13 = load i64, i64 addrspace(11)* %12, align 8
%14 = zext i32 %9 to i64
%15 = sub i64 %13, %14
%16 = getelementptr inbounds [1 x i64], [1 x i64] addrspace(13)* %7, i64 %15
%17 = bitcast [1 x i64] addrspace(13)* %16 to i8 addrspace(13)*
%18 = sext i32 %9 to i64
%19 = getelementptr inbounds i8, i8 addrspace(13)* %17, i64 %18
; ββ
br label %L20
L20: ; preds = %L48, %L14.L14.split_crit_edge
%value_phi2 = phi i64 [ 0, %L14.L14.split_crit_edge ], [ %22, %L48 ]
%tindex_phi3.in.in = phi i8 addrspace(13)* [ %19, %L14.L14.split_crit_edge ], [ %25, %L48 ]
%value_phi5 = phi i64 [ 2, %L14.L14.split_crit_edge ], [ %26, %L48 ]
%tindex_phi3.in = load i8, i8 addrspace(13)* %tindex_phi3.in.in, align 1
; @ /Users/rauli/bench.jl:25 within `countnothing3'
%20 = xor i8 %tindex_phi3.in, 1
%21 = zext i8 %20 to i64
; β @ int.jl:860 within `+' @ int.jl:53
%22 = add i64 %value_phi2, %21
; β
; β @ array.jl:763 within `iterate'
; ββ @ int.jl:860 within `-' @ int.jl:52
%23 = add i64 %value_phi5, -1
; ββ
; ββ @ int.jl:416 within `<' @ int.jl:409
%24 = icmp ult i64 %23, %4
; ββ
br i1 %24, label %L48, label %L54
L48: ; preds = %L20
; ββ @ array.jl:787 within `getindex'
%25 = getelementptr inbounds i8, i8 addrspace(13)* %19, i64 %23
; ββ
; ββ @ int.jl:53 within `+'
%26 = add i64 %value_phi5, 1
; ββ
br label %L20
L54: ; preds = %L20, %top
%value_phi11 = phi i64 [ 0, %top ], [ %22, %L20 ]
; @ /Users/rauli/bench.jl:27 within `countnothing3'
ret i64 %value_phi11
}
```
The former uses vectorization (`<4 x i64>`, `<4 x i8>`, `shufflevector`, etc.), whereas the latter does not. The only difference in the code is that the former is manually inlined (literally by copy-pasting the function body), whereas the latter is automatically inlined.
Julia 1.4.0 versioninfo:
```
Julia Version 1.4.0
Commit b8e9a9ecc6 (2020-03-21 16:36 UTC)
Platform Info:
OS: macOS (x86_64-apple-darwin18.7.0)
CPU: Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-8.0.1 (ORCJIT, skylake)
```
Julia master versioninfo:
```
Julia Version 1.5.0-DEV.676
Commit b49d5ba395 (2020-04-24 06:56 UTC)
Platform Info:
OS: macOS (x86_64-apple-darwin18.7.0)
CPU: Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-9.0.1 (ORCJIT, skylake)
```