I was wondering how I could combine multithreading with SIMD operations and I’m not sure if it is even possible. I’m writing a few toy codes just to check that the code is actually being vectorized but as I don’t have much knowledge on llvm I can only spot it when I see types like <4 x double> when calling with @code_llvm.
using Base.Threads
using SIMD
function parallel_simd_loop(x,y,z,chunk,type)
@threads for i in 1:chunk:length(x)
sx = vload(Vec{chunk,type}, x, i)
sy = vload(Vec{chunk,type}, y, i)
sz = sx + sy
vstore(sz, z, i)
end
return z
end
and this is what @code_llvm returns
; @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:64 within `parallel_simd_loop'
; Function Attrs: uwtable
define nonnull %jl_value_t* @julia_parallel_simd_loop_930(%jl_value_t* nonnull align 16 dereferenceable(40), %jl_value_t* nonnull align 16 dereferenceable(40), %jl_value_t* nonnull align 16 dereferenceable(40), i64) #0 {
top:
%4 = alloca %jl_value_t*, i32 2
%gcframe = alloca %jl_value_t*, i32 4, align 16
%5 = bitcast %jl_value_t** %gcframe to i8*
call void @llvm.memset.p0i8.i32(i8* align 16 %5, i8 0, i32 32, i1 false)
%6 = call %jl_value_t*** inttoptr (i64 1720679280 to %jl_value_t*** ()*)() #7
; @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:65 within `parallel_simd_loop'
; ┌ @ threadingconstructs.jl:46 within `macro expansion'
; │┌ @ array.jl:219 within `length'
%7 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 0
%8 = bitcast %jl_value_t** %7 to i64*
store i64 8, i64* %8
%9 = getelementptr %jl_value_t**, %jl_value_t*** %6, i32 0
%10 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
%11 = bitcast %jl_value_t** %10 to %jl_value_t***
%12 = load %jl_value_t**, %jl_value_t*** %9
store %jl_value_t** %12, %jl_value_t*** %11
%13 = bitcast %jl_value_t*** %9 to %jl_value_t***
store %jl_value_t** %gcframe, %jl_value_t*** %13
%14 = bitcast %jl_value_t* %0 to %jl_array_t*
%15 = getelementptr inbounds %jl_array_t, %jl_array_t* %14, i64 0, i32 1
%16 = load i64, i64* %15, align 8
; │└
; │┌ @ range.jl:22 within `Colon'
; ││┌ @ range.jl:24 within `_colon'
; │││┌ @ range.jl:256 within `StepRange' @ range.jl:205
%17 = call i64 @j_steprange_last_931(i64 1, i64 %3, i64 %16) #0
; │└└└
; │ @ threadingconstructs.jl:85 within `macro expansion'
; │┌ @ threadingconstructs.jl:10 within `threadid'
%18 = getelementptr %jl_value_t**, %jl_value_t*** %6, i64 2
%19 = bitcast %jl_value_t*** %18 to i16*
%20 = load i16, i16* %19, align 2
; │└
; │┌ @ operators.jl:202 within `!='
; ││┌ @ promotion.jl:398 within `=='
%21 = icmp eq i16 %20, 0
; │└└
br i1 %21, label %L12, label %L16
L12: ; preds = %top
%22 = call i32 inttoptr (i64 1720680768 to i32 ()*)()
; │┌ @ operators.jl:202 within `!='
; ││┌ @ promotion.jl:348 within `==' @ promotion.jl:398
%23 = icmp ne i32 %22, 0
; ││└
; ││┌ @ bool.jl:36 within `!'
br label %L16
L16: ; preds = %L12, %top
%value_phi = phi i1 [ %23, %L12 ], [ true, %top ]
; │└└
%24 = bitcast %jl_value_t*** %6 to i8*
%25 = call noalias nonnull %jl_value_t* @jl_gc_pool_alloc(i8* %24, i32 1496, i32 80) #3
%26 = bitcast %jl_value_t* %25 to %jl_value_t**
%27 = getelementptr %jl_value_t*, %jl_value_t** %26, i64 -1
store %jl_value_t* inttoptr (i64 344113456 to %jl_value_t*), %jl_value_t** %27
%28 = bitcast %jl_value_t* %25 to { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }*
%.repack = bitcast %jl_value_t* %25 to %jl_value_t**
store %jl_value_t* %0, %jl_value_t** %.repack, align 8
%.repack5 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*,
%jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 1
store %jl_value_t* %1, %jl_value_t** %.repack5, align 8
%.repack7 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*,
%jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 2
store %jl_value_t* %2, %jl_value_t** %.repack7, align 8
%.repack9 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*,
%jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 3
store i64 %3, i64* %.repack9, align 8
%.repack11 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 4
store %jl_value_t* inttoptr (i64 178181168 to %jl_value_t*), %jl_value_t** %.repack11, align 8
%.repack13.repack = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 5, i64 0
store i64 1, i64* %.repack13.repack, align 8
%.repack13.repack15 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 5, i64 1
store i64 %3, i64* %.repack13.repack15, align 8
%.repack13.repack17 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 5, i64 2
store i64 %17, i64* %.repack13.repack17, align 8
%29 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 3
store %jl_value_t* %25, %jl_value_t** %29
; │ @ threadingconstructs.jl:85 within `macro expansion'
br i1 %value_phi, label %L32, label %L37
L32: ; preds = %L16
; │ @ threadingconstructs.jl:86 within `macro expansion'
; │┌ @ essentials.jl:709 within `invokelatest'
; ││┌ @ essentials.jl:710 within `#invokelatest#1'
%30 = call noalias nonnull %jl_value_t* @jl_gc_pool_alloc(i8* %24, i32 1400, i32 16) #3
%31 = bitcast %jl_value_t* %30 to %jl_value_t**
%32 = getelementptr %jl_value_t*, %jl_value_t** %31, i64 -1
store %jl_value_t* inttoptr (i64 179201696 to %jl_value_t*), %jl_value_t** %32
%33 = bitcast %jl_value_t* %30 to i8*
store i8 1, i8* %33, align 8
%34 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
store %jl_value_t* %30, %jl_value_t** %34
%35 = getelementptr %jl_value_t*, %jl_value_t** %4, i32 0
store %jl_value_t* %25, %jl_value_t** %35
%36 = getelementptr %jl_value_t*, %jl_value_t** %4, i32 1
store %jl_value_t* %30, %jl_value_t** %36
%37 = call nonnull %jl_value_t* @jl_f__apply_latest(%jl_value_t* null, %jl_value_t** %4, i32 2)
; │└└
br label %L38
L37: ; preds = %L16
; │ @ threadingconstructs.jl:93 within `macro expansion'
%38 = getelementptr %jl_value_t*, %jl_value_t** %4, i32 0
store %jl_value_t* %25, %jl_value_t** %38
%39 = call nonnull %jl_value_t* @j1_threading_run_933(%jl_value_t* inttoptr (i64 237299920 to %jl_value_t*), %jl_value_t** %4, i32 1)
br label %L38
L38: ; preds = %L37, %L32
%40 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
%41 = load %jl_value_t*, %jl_value_t** %40
%42 = getelementptr %jl_value_t**, %jl_value_t*** %6, i32 0
%43 = bitcast %jl_value_t*** %42 to %jl_value_t**
store %jl_value_t* %41, %jl_value_t** %43
; └
; @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:71 within `parallel_simd_loop'
ret %jl_value_t* %2
}