LLVM code changes if code is wrapped in function

Hi!

I am looking to use SIMD to speed up some operations on strings. I noticed that when I wrap my code in a function it seems that julia doesn’t use the vector instructions. I am using SIMD.jl for my simd functions. Here is an example:

# OR-reduce all code units (bytes) of `str` using a SIMD.jl vector load.
# NOTE(review): `VecRange{sizeof(str)}` puts the *runtime* string length into
# a type parameter, so the type of `simdCharArray` cannot be inferred from the
# argument type alone — the call to `reduce` becomes a dynamic dispatch (see
# the `@code_warntype` output discussed below in this thread).
function reduceString(str)
	# Bytes of the string; the `::Vector{UInt8}` annotation converts the
	# `CodeUnits` returned by `codeunits` into a `Vector{UInt8}` copy.
	charArray::Vector{UInt8} = codeunits(str)
	# Load every byte at once as a SIMD.jl `Vec` via a `VecRange` index
	# (1-based: `VecRange{N}(0) + 1` covers indices 1:N).
	simdCharArray = @inbounds charArray[VecRange{sizeof(str)}(0)+1]
    # Horizontal bitwise-OR reduction across the vector lanes.
    return reduce(|, simdCharArray)
end

If we use the @code_llvm macro to generate the LLVM IR we get the following:

julia> @code_llvm debuginfo = :none reduceString("hi")
define nonnull {}* @julia_reduceString_4875({}* nonnull %0) #0 {
top:
  %1 = alloca [2 x {}*], align 8
  %gcframe7 = alloca [6 x {}*], align 16
  %gcframe7.sub = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 0
  %.sub = getelementptr inbounds [2 x {}*], [2 x {}*]* %1, i64 0, i64 0
  %2 = bitcast [6 x {}*]* %gcframe7 to i8*
  call void @llvm.memset.p0i8.i32(i8* noundef nonnull align 16 dereferenceable(48) %2, i8 0, i32 48, i1 false)
  %3 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 3
  %4 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 2
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #3
  %ppgcstack_i8 = getelementptr i8, i8* %thread_ptr, i64 -8
  %ppgcstack = bitcast i8* %ppgcstack_i8 to {}****
  %pgcstack = load {}***, {}**** %ppgcstack, align 8
  %5 = bitcast [6 x {}*]* %gcframe7 to i64*
  store i64 16, i64* %5, align 16
  %6 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 1
  %7 = bitcast {}** %6 to {}***
  %8 = load {}**, {}*** %pgcstack, align 8
  store {}** %8, {}*** %7, align 8
  %9 = bitcast {}*** %pgcstack to {}***
  store {}** %gcframe7.sub, {}*** %9, align 8
  %10 = bitcast {}* %0 to i64*
  %11 = load i64, i64* %10, align 8
  %12 = call nonnull {}* inttoptr (i64 140555877049360 to {}* ({}*, i64)*)({}* inttoptr (i64 140555346681792 to {}*), i64 %11)
  %.not = icmp eq i64 %11, 0
  br i1 %.not, label %L16, label %L10

L10:                                              ; preds = %top
  %13 = bitcast {}** %3 to [1 x {}*]*
  %14 = bitcast {}** %4 to [1 x {}*]*
  store {}* %0, {}** %4, align 16
  %15 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 4
  store {}* %12, {}** %15, align 16
  %16 = call [1 x {}*] @j_unalias_4877({}* nonnull %12, [1 x {}*]* nocapture readonly %14) #0
  %.fca.0.extract = extractvalue [1 x {}*] %16, 0
  store {}* %.fca.0.extract, {}** %3, align 8
  %17 = call nonnull {}* @"j_copyto_unaliased!_4878"({}* nonnull %12, [1 x {}*]* nocapture readonly %13) #0
  br label %L16

L16:                                              ; preds = %L10, %top
  %value_phi = phi {}* [ %17, %L10 ], [ %12, %top ]
  %18 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 5
  store {}* %value_phi, {}** %18, align 8
  %19 = call nonnull {}* @ijl_box_int64(i64 signext %11)
  %20 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 4
  store {}* %19, {}** %20, align 16
  store {}* inttoptr (i64 140555548663792 to {}*), {}** %.sub, align 8
  %21 = getelementptr inbounds [2 x {}*], [2 x {}*]* %1, i64 0, i64 1
  store {}* %19, {}** %21, align 8
  %22 = call nonnull {}* @jl_f_apply_type({}* null, {}** nonnull %.sub, i32 2)
  store {}* %22, {}** %20, align 16
  store {}* inttoptr (i64 140555566903328 to {}*), {}** %.sub, align 8
  %23 = call nonnull {}* @ijl_apply_generic({}* nonnull %22, {}** nonnull %.sub, i32 1)
  store {}* %23, {}** %20, align 16
  store {}* %23, {}** %.sub, align 8
  store {}* inttoptr (i64 140555566903392 to {}*), {}** %21, align 8
  %24 = call nonnull {}* @ijl_apply_generic({}* inttoptr (i64 140555349625248 to {}*), {}** nonnull %.sub, i32 2)
  store {}* %24, {}** %20, align 16
  store {}* %value_phi, {}** %.sub, align 8
  store {}* %24, {}** %21, align 8
  %25 = call nonnull {}* @ijl_apply_generic({}* inttoptr (i64 140555347988128 to {}*), {}** nonnull %.sub, i32 2)
  store {}* %25, {}** %20, align 16
  store {}* inttoptr (i64 140555354080288 to {}*), {}** %.sub, align 8
  store {}* %25, {}** %21, align 8
  %26 = call nonnull {}* @ijl_apply_generic({}* inttoptr (i64 140555361198352 to {}*), {}** nonnull %.sub, i32 2)
  %27 = load {}*, {}** %6, align 8
  %28 = bitcast {}*** %pgcstack to {}**
  store {}* %27, {}** %28, align 8
  ret {}* %26
}

Now this is a bit strange to me since it seems that none of the vector instructions are being used. One that should be being used is the instruction for reduce or. Here is the output when I use the @code_llvm macro on just the reduce line outside of the function:

julia> str = "hi"
"hi"

julia> simdCharArray = @inbounds charArray[VecRange{sizeof(str)}(0)+1]
<2 x UInt8>[0x68, 0x69]

julia> @code_llvm debuginfo = :none reduce(|, simdCharArray)
define i8 @julia_reduce_4879([1 x <2 x i8>]* nocapture nonnull readonly align 2 dereferenceable(2) %0) #0 {
top:
  %1 = getelementptr inbounds [1 x <2 x i8>], [1 x <2 x i8>]* %0, i64 0, i64 0
  %2 = load <2 x i8>, <2 x i8>* %1, align 2
  %res.i = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %2)
  ret i8 %res.i
}

We can see that LLVM in this case is using the vector instructions. Notice the use of “llvm.vector.reduce.or.v2i8”

So my question is why is the LLVM code changing? Also, how can I enforce that the functions I call that use explicit SIMD instructions actually use them?

1 Like

Primarily because you’re asking it to provide the code for two completely different functions. Presumably you could see the reduce code inside reduceString if it managed to inline it but this code is not inferrable as you can see with

julia> @code_warntype reduceString("hi")
MethodInstance for reduceString(::String)
  from reduceString(str) in Main at REPL[11]:1
Arguments
  #self#::Core.Const(reduceString)
  str::String
Locals
  val::Any
  simdCharArray::Any
  charArray::Vector{UInt8}
Body::Any
1 ─ %1  = Main.codeunits(str)::Base.CodeUnits{UInt8, String}
│   %2  = Core.apply_type(Main.Vector, Main.UInt8)::Core.Const(Vector{UInt8})
│   %3  = Base.convert(%2, %1)::Vector{UInt8}
│         (charArray = Core.typeassert(%3, %2))
│         $(Expr(:inbounds, true))
│   %6  = charArray::Vector{UInt8}
│   %7  = Main.sizeof(str)::Int64
│   %8  = Core.apply_type(Main.VecRange, %7)::Type{VecRange{_A}} where _A
│   %9  = (%8)(0)::VecRange
│   %10 = (%9 + 1)::VecRange
│         (val = Base.getindex(%6, %10))
│         $(Expr(:inbounds, :pop))
│         (simdCharArray = val)
│   %14 = Main.reduce(Main.:|, simdCharArray)::Any
└──       return %14

Thus what you get to see in the @code_llvm output is the code required to set up a dynamic dispatch to whatever runtime type reduce happens to be called with.

Why it can’t infer? The VecRange{N} type depends on the runtime length of str.

2 Likes

Thank you so much for your response. This is really helpful. I was banging my head last night trying to figure out what was going on.