Hi!
I am looking to use SIMD to speed up some operations on strings. I noticed that when I wrap my code in a function, Julia does not seem to emit the vector instructions. I am using SIMD.jl for the explicit SIMD operations. Here is an example:
using SIMD

function reduceString(str)
    charArray::Vector{UInt8} = codeunits(str)
    simdCharArray = @inbounds charArray[VecRange{sizeof(str)}(0)+1]
    return reduce(|, simdCharArray)
end
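For what it is worth, the function does return the expected value at runtime (the bitwise OR of the code units of "hi" is 0x68 | 0x69 == 0x69), so this is only a question about the code that gets generated:

reduceString("hi")  # == 0x69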
If we use the @code_llvm macro to inspect the generated LLVM IR, we get the following:
julia> @code_llvm debuginfo = :none reduceString("hi")
define nonnull {}* @julia_reduceString_4875({}* nonnull %0) #0 {
top:
%1 = alloca [2 x {}*], align 8
%gcframe7 = alloca [6 x {}*], align 16
%gcframe7.sub = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 0
%.sub = getelementptr inbounds [2 x {}*], [2 x {}*]* %1, i64 0, i64 0
%2 = bitcast [6 x {}*]* %gcframe7 to i8*
call void @llvm.memset.p0i8.i32(i8* noundef nonnull align 16 dereferenceable(48) %2, i8 0, i32 48, i1 false)
%3 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 3
%4 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 2
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #3
%ppgcstack_i8 = getelementptr i8, i8* %thread_ptr, i64 -8
%ppgcstack = bitcast i8* %ppgcstack_i8 to {}****
%pgcstack = load {}***, {}**** %ppgcstack, align 8
%5 = bitcast [6 x {}*]* %gcframe7 to i64*
store i64 16, i64* %5, align 16
%6 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 1
%7 = bitcast {}** %6 to {}***
%8 = load {}**, {}*** %pgcstack, align 8
store {}** %8, {}*** %7, align 8
%9 = bitcast {}*** %pgcstack to {}***
store {}** %gcframe7.sub, {}*** %9, align 8
%10 = bitcast {}* %0 to i64*
%11 = load i64, i64* %10, align 8
%12 = call nonnull {}* inttoptr (i64 140555877049360 to {}* ({}*, i64)*)({}* inttoptr (i64 140555346681792 to {}*), i64 %11)
%.not = icmp eq i64 %11, 0
br i1 %.not, label %L16, label %L10
L10: ; preds = %top
%13 = bitcast {}** %3 to [1 x {}*]*
%14 = bitcast {}** %4 to [1 x {}*]*
store {}* %0, {}** %4, align 16
%15 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 4
store {}* %12, {}** %15, align 16
%16 = call [1 x {}*] @j_unalias_4877({}* nonnull %12, [1 x {}*]* nocapture readonly %14) #0
%.fca.0.extract = extractvalue [1 x {}*] %16, 0
store {}* %.fca.0.extract, {}** %3, align 8
%17 = call nonnull {}* @"j_copyto_unaliased!_4878"({}* nonnull %12, [1 x {}*]* nocapture readonly %13) #0
br label %L16
L16: ; preds = %L10, %top
%value_phi = phi {}* [ %17, %L10 ], [ %12, %top ]
%18 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 5
store {}* %value_phi, {}** %18, align 8
%19 = call nonnull {}* @ijl_box_int64(i64 signext %11)
%20 = getelementptr inbounds [6 x {}*], [6 x {}*]* %gcframe7, i64 0, i64 4
store {}* %19, {}** %20, align 16
store {}* inttoptr (i64 140555548663792 to {}*), {}** %.sub, align 8
%21 = getelementptr inbounds [2 x {}*], [2 x {}*]* %1, i64 0, i64 1
store {}* %19, {}** %21, align 8
%22 = call nonnull {}* @jl_f_apply_type({}* null, {}** nonnull %.sub, i32 2)
store {}* %22, {}** %20, align 16
store {}* inttoptr (i64 140555566903328 to {}*), {}** %.sub, align 8
%23 = call nonnull {}* @ijl_apply_generic({}* nonnull %22, {}** nonnull %.sub, i32 1)
store {}* %23, {}** %20, align 16
store {}* %23, {}** %.sub, align 8
store {}* inttoptr (i64 140555566903392 to {}*), {}** %21, align 8
%24 = call nonnull {}* @ijl_apply_generic({}* inttoptr (i64 140555349625248 to {}*), {}** nonnull %.sub, i32 2)
store {}* %24, {}** %20, align 16
store {}* %value_phi, {}** %.sub, align 8
store {}* %24, {}** %21, align 8
%25 = call nonnull {}* @ijl_apply_generic({}* inttoptr (i64 140555347988128 to {}*), {}** nonnull %.sub, i32 2)
store {}* %25, {}** %20, align 16
store {}* inttoptr (i64 140555354080288 to {}*), {}** %.sub, align 8
store {}* %25, {}** %21, align 8
%26 = call nonnull {}* @ijl_apply_generic({}* inttoptr (i64 140555361198352 to {}*), {}** nonnull %.sub, i32 2)
%27 = load {}*, {}** %6, align 8
%28 = bitcast {}*** %pgcstack to {}**
store {}* %27, {}** %28, align 8
ret {}* %26
}
Now this is a bit strange to me, since it seems that none of the vector instructions are being used. In particular, the reduce call should lower to a vector or-reduction. Here is the output when I use the @code_llvm macro on just the reduce call outside of the function:
julia> str = "hi"
"hi"
julia> charArray = Vector{UInt8}(codeunits(str));
julia> simdCharArray = @inbounds charArray[VecRange{sizeof(str)}(0)+1]
<2 x UInt8>[0x68, 0x69]
julia> @code_llvm debuginfo = :none reduce(|, simdCharArray)
define i8 @julia_reduce_4879([1 x <2 x i8>]* nocapture nonnull readonly align 2 dereferenceable(2) %0) #0 {
top:
%1 = getelementptr inbounds [1 x <2 x i8>], [1 x <2 x i8>]* %0, i64 0, i64 0
%2 = load <2 x i8>, <2 x i8>* %1, align 2
%res.i = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %2)
ret i8 %res.i
}
We can see that in this case LLVM does use the vector instructions; notice the call to llvm.vector.reduce.or.v2i8.
So my question is: why does the LLVM code change when the same operations are wrapped in a function? And how can I ensure that the functions I write that use explicit SIMD instructions actually emit them?
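For context, the kind of pattern I am hoping to keep vectorized looks roughly like the sketch below, where the lane width is a hard-coded constant instead of sizeof(str). The name orBytes and the width of 4 are placeholders I made up for illustration, and I have not verified that this version keeps the vector instructions either:

using SIMD

# Sketch only: the width 4 is a compile-time constant, so VecRange{4} is a
# concrete type inside the function body.
function orBytes(bytes::Vector{UInt8}, i::Int)
    lane = VecRange{4}(0)           # covers indices i, i+1, i+2, i+3
    v = @inbounds bytes[lane + i]   # loads a Vec{4,UInt8}
    return reduce(|, v)             # hopefully a single vector or-reduction
end

Would hard-coding the width like this be enough, or is there something else about the function version that prevents the vector instructions from being used?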