Is it possible to use SIMD in multithreading?

I was wondering how I could combine multithreading with SIMD operations and I’m not sure if it is even possible. I’m writing a few toy codes just to check that the code is actually being vectorized but as I don’t have much knowledge on llvm I can only spot it when I see types like <4 x double> when calling with @code_llvm.

using Base.Threads
using SIMD

function parallel_simd_loop(x,y,z,chunk,type)
    @threads for i in 1:chunk:length(x)
        sx = vload(Vec{chunk,type}, x, i)
        sy = vload(Vec{chunk,type}, y, i)
        sz = sx + sy
        vstore(sz, z, i)
    end
    return z
end

and this is what @code_llvm returns

;  @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:64 within `parallel_simd_loop'
; Function Attrs: uwtable
define nonnull %jl_value_t* @julia_parallel_simd_loop_930(%jl_value_t* nonnull align 16 dereferenceable(40), %jl_value_t* nonnull align 16 dereferenceable(40), %jl_value_t* nonnull align 16 dereferenceable(40), i64) #0 {
top:
  %4 = alloca %jl_value_t*, i32 2
  %gcframe = alloca %jl_value_t*, i32 4, align 16
  %5 = bitcast %jl_value_t** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* align 16 %5, i8 0, i32 32, i1 false)
  %6 = call %jl_value_t*** inttoptr (i64 1720679280 to %jl_value_t*** ()*)() #7
;  @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:65 within `parallel_simd_loop'
; ┌ @ threadingconstructs.jl:46 within `macro expansion'
; │┌ @ array.jl:219 within `length'
    %7 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 0
    %8 = bitcast %jl_value_t** %7 to i64*
    store i64 8, i64* %8
    %9 = getelementptr %jl_value_t**, %jl_value_t*** %6, i32 0
    %10 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
    %11 = bitcast %jl_value_t** %10 to %jl_value_t***
    %12 = load %jl_value_t**, %jl_value_t*** %9
    store %jl_value_t** %12, %jl_value_t*** %11
    %13 = bitcast %jl_value_t*** %9 to %jl_value_t***
    store %jl_value_t** %gcframe, %jl_value_t*** %13
    %14 = bitcast %jl_value_t* %0 to %jl_array_t*
    %15 = getelementptr inbounds %jl_array_t, %jl_array_t* %14, i64 0, i32 1
    %16 = load i64, i64* %15, align 8
; │└
; │┌ @ range.jl:22 within `Colon'
; ││┌ @ range.jl:24 within `_colon'
; │││┌ @ range.jl:256 within `StepRange' @ range.jl:205
      %17 = call i64 @j_steprange_last_931(i64 1, i64 %3, i64 %16) #0
; │└└└
; │ @ threadingconstructs.jl:85 within `macro expansion'
; │┌ @ threadingconstructs.jl:10 within `threadid'
    %18 = getelementptr %jl_value_t**, %jl_value_t*** %6, i64 2
    %19 = bitcast %jl_value_t*** %18 to i16*
    %20 = load i16, i16* %19, align 2
; │└
; │┌ @ operators.jl:202 within `!='
; ││┌ @ promotion.jl:398 within `=='
     %21 = icmp eq i16 %20, 0
; │└└
   br i1 %21, label %L12, label %L16

L12:                                              ; preds = %top
   %22 = call i32 inttoptr (i64 1720680768 to i32 ()*)()
; │┌ @ operators.jl:202 within `!='
; ││┌ @ promotion.jl:348 within `==' @ promotion.jl:398
     %23 = icmp ne i32 %22, 0
; ││└
; ││┌ @ bool.jl:36 within `!'
     br label %L16

L16:                                              ; preds = %L12, %top
     %value_phi = phi i1 [ %23, %L12 ], [ true, %top ]
; │└└
   %24 = bitcast %jl_value_t*** %6 to i8*
   %25 = call noalias nonnull %jl_value_t* @jl_gc_pool_alloc(i8* %24, i32 1496, i32 80) #3
   %26 = bitcast %jl_value_t* %25 to %jl_value_t**
   %27 = getelementptr %jl_value_t*, %jl_value_t** %26, i64 -1
   store %jl_value_t* inttoptr (i64 344113456 to %jl_value_t*), %jl_value_t** %27
   %28 = bitcast %jl_value_t* %25 to { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }*
   %.repack = bitcast %jl_value_t* %25 to %jl_value_t**
   store %jl_value_t* %0, %jl_value_t** %.repack, align 8
   %.repack5 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, 
%jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 1
   store %jl_value_t* %1, %jl_value_t** %.repack5, align 8
   %.repack7 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, 
%jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 2
   store %jl_value_t* %2, %jl_value_t** %.repack7, align 8
   %.repack9 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, 
%jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 3
   store i64 %3, i64* %.repack9, align 8
   %.repack11 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 4
   store %jl_value_t* inttoptr (i64 178181168 to %jl_value_t*), %jl_value_t** %.repack11, align 8
   %.repack13.repack = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 5, i64 0
   store i64 1, i64* %.repack13.repack, align 8
   %.repack13.repack15 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 5, i64 1
   store i64 %3, i64* %.repack13.repack15, align 8
   %.repack13.repack17 = getelementptr inbounds { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }, { %jl_value_t*, %jl_value_t*, %jl_value_t*, i64, %jl_value_t*, [3 x i64] }* %28, i64 0, i32 5, i64 2
   store i64 %17, i64* %.repack13.repack17, align 8
   %29 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 3
   store %jl_value_t* %25, %jl_value_t** %29
; │ @ threadingconstructs.jl:85 within `macro expansion'
   br i1 %value_phi, label %L32, label %L37

L32:                                              ; preds = %L16
; │ @ threadingconstructs.jl:86 within `macro expansion'
; │┌ @ essentials.jl:709 within `invokelatest'
; ││┌ @ essentials.jl:710 within `#invokelatest#1'
     %30 = call noalias nonnull %jl_value_t* @jl_gc_pool_alloc(i8* %24, i32 1400, i32 16) #3
     %31 = bitcast %jl_value_t* %30 to %jl_value_t**
     %32 = getelementptr %jl_value_t*, %jl_value_t** %31, i64 -1
     store %jl_value_t* inttoptr (i64 179201696 to %jl_value_t*), %jl_value_t** %32
     %33 = bitcast %jl_value_t* %30 to i8*
     store i8 1, i8* %33, align 8
     %34 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
     store %jl_value_t* %30, %jl_value_t** %34
     %35 = getelementptr %jl_value_t*, %jl_value_t** %4, i32 0
     store %jl_value_t* %25, %jl_value_t** %35
     %36 = getelementptr %jl_value_t*, %jl_value_t** %4, i32 1
     store %jl_value_t* %30, %jl_value_t** %36
     %37 = call nonnull %jl_value_t* @jl_f__apply_latest(%jl_value_t* null, %jl_value_t** %4, i32 2)
; │└└
   br label %L38

L37:                                              ; preds = %L16
; │ @ threadingconstructs.jl:93 within `macro expansion'
   %38 = getelementptr %jl_value_t*, %jl_value_t** %4, i32 0
   store %jl_value_t* %25, %jl_value_t** %38
   %39 = call nonnull %jl_value_t* @j1_threading_run_933(%jl_value_t* inttoptr (i64 237299920 to %jl_value_t*), %jl_value_t** %4, i32 1)        
   br label %L38

L38:                                              ; preds = %L37, %L32
   %40 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
   %41 = load %jl_value_t*, %jl_value_t** %40
   %42 = getelementptr %jl_value_t**, %jl_value_t*** %6, i32 0
   %43 = bitcast %jl_value_t*** %42 to %jl_value_t**
   store %jl_value_t* %41, %jl_value_t** %43
; └
;  @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:71 within `parallel_simd_loop'
  ret %jl_value_t* %2
}

Yes. SIMD and multithreading are basically unrelated so it is no problem to do both.

3 Likes

You can structure your code as:

using Base.Threads
using SIMD

function do_store(x, y, z, i)
    @inbounds begin
        sx = vload(Vec{chunk,type}, x, i)
        sy = vload(Vec{chunk,type}, y, i)
        sz = sx + sy
        vstore(sz, z, i)
    end
end
function parallel_simd_loop(x,y,z,chunk,type)
    @threads for i in 1:chunk:length(x)
        do_store(x, y, z, i)
    end
    return z
end

and then look at the generated code for do_store instead.

2 Likes

I tried this before:

function test_v(x,y,z)
       sx = vload(Vec{5, Float64}, x, 1)
       sy = vload(Vec{5, Float64}, y, 1)
       sz = sx + sy
       vstore(sz, z, 1)
end

@code_llvm test_v(x,y,zeros(Float64,n))

and it gave me things I could spot as SIMD like

...
%38 = bitcast %jl_value_t* %11 to <5 x double>**
%39 = load <5 x double>*, <5 x double>** %38, align 8
...
store <5 x double> %37, <5 x double>* %39, align 8

But when I did

function do_store(x, y, z, i,type,chunk)
    @inbounds begin
        sx = vload(Vec{chunk,type}, x, i)
        sy = vload(Vec{chunk,type}, y, i)
        sz = sx + sy
        vstore(sz, z, i)
    end
end

@code_llvm do_store(x,y,zeros(Float64,n),1,Float64,5)

I got nothing like it. Here is the output if anyone wants to check it:

;  @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:64 within `do_store'
; Function Attrs: uwtable
define nonnull %jl_value_t* @julia_do_store_1013(%jl_value_t* nonnull align 16 dereferenceable(40), %jl_value_t* nonnull align 16 dereferenceable(40), %jl_value_t* nonnull align 16 dereferenceable(40), i64, i64) #0 {
top:
  %5 = alloca %jl_value_t*, i32 3
  %gcframe = alloca %jl_value_t*, i32 5, align 16
  %6 = bitcast %jl_value_t** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* align 16 %6, i8 0, i32 40, i1 false)
  %7 = call %jl_value_t*** inttoptr (i64 1720679280 to %jl_value_t*** ()*)() #6
;  @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:66 within `do_store'
  %8 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 0
  %9 = bitcast %jl_value_t** %8 to i64*
  store i64 12, i64* %9
  %10 = getelementptr %jl_value_t**, %jl_value_t*** %7, i32 0
  %11 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
  %12 = bitcast %jl_value_t** %11 to %jl_value_t***
  %13 = load %jl_value_t**, %jl_value_t*** %10
  store %jl_value_t** %13, %jl_value_t*** %12
  %14 = bitcast %jl_value_t*** %10 to %jl_value_t***
  store %jl_value_t** %gcframe, %jl_value_t*** %14
  %15 = call %jl_value_t* @jl_box_int64(i64 signext %4)
  %16 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
  store %jl_value_t* %15, %jl_value_t** %16
  %17 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 0
  store %jl_value_t* inttoptr (i64 364802608 to %jl_value_t*), %jl_value_t** %17
  %18 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 1
  store %jl_value_t* %15, %jl_value_t** %18
  %19 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 2
  store %jl_value_t* inttoptr (i64 178181168 to %jl_value_t*), %jl_value_t** %19
  %20 = call nonnull %jl_value_t* @jl_f_apply_type(%jl_value_t* null, %jl_value_t** %5, i32 3)
  %21 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 3
  store %jl_value_t* %20, %jl_value_t** %21
  %22 = call %jl_value_t* @jl_box_int64(i64 signext %3)
  %23 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
  store %jl_value_t* %22, %jl_value_t** %23
  %24 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 0
  store %jl_value_t* %20, %jl_value_t** %24
  %25 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 1
  store %jl_value_t* %0, %jl_value_t** %25
  %26 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 2
  store %jl_value_t* %22, %jl_value_t** %26
  %27 = call nonnull %jl_value_t* @jl_apply_generic(%jl_value_t* inttoptr (i64 345243808 to %jl_value_t*), %jl_value_t** %5, i32 3)
  %28 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 3
  store %jl_value_t* %27, %jl_value_t** %28
;  @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:67 within `do_store'
  %29 = call %jl_value_t* @jl_box_int64(i64 signext %4)
  %30 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
  store %jl_value_t* %29, %jl_value_t** %30
  %31 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 0
  store %jl_value_t* inttoptr (i64 364802608 to %jl_value_t*), %jl_value_t** %31
  %32 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 1
  store %jl_value_t* %29, %jl_value_t** %32
  %33 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 2
  store %jl_value_t* inttoptr (i64 178181168 to %jl_value_t*), %jl_value_t** %33
  %34 = call nonnull %jl_value_t* @jl_f_apply_type(%jl_value_t* null, %jl_value_t** %5, i32 3)
  %35 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 4
  store %jl_value_t* %34, %jl_value_t** %35
  %36 = call %jl_value_t* @jl_box_int64(i64 signext %3)
  %37 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
  store %jl_value_t* %36, %jl_value_t** %37
  %38 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 0
  store %jl_value_t* %34, %jl_value_t** %38
  %39 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 1
  store %jl_value_t* %1, %jl_value_t** %39
  %40 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 2
  store %jl_value_t* %36, %jl_value_t** %40
  %41 = call nonnull %jl_value_t* @jl_apply_generic(%jl_value_t* inttoptr (i64 345243808 to %jl_value_t*), %jl_value_t** %5, i32 3)
  %42 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
  store %jl_value_t* %41, %jl_value_t** %42
;  @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:68 within `do_store'
  %43 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 0
  store %jl_value_t* %27, %jl_value_t** %43
  %44 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 1
  store %jl_value_t* %41, %jl_value_t** %44
  %45 = call nonnull %jl_value_t* @jl_apply_generic(%jl_value_t* inttoptr (i64 192970688 to %jl_value_t*), %jl_value_t** %5, i32 2)
  %46 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 3
  store %jl_value_t* %45, %jl_value_t** %46
;  @ E:\Users\diana\Documents\UERJ\mestrado\simd-julia\simd_threads.jl:69 within `do_store'
  %47 = call %jl_value_t* @jl_box_int64(i64 signext %3)
  %48 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 2
  store %jl_value_t* %47, %jl_value_t** %48
  %49 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 0
  store %jl_value_t* %45, %jl_value_t** %49
  %50 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 1
  store %jl_value_t* %2, %jl_value_t** %50
  %51 = getelementptr %jl_value_t*, %jl_value_t** %5, i32 2
  store %jl_value_t* %47, %jl_value_t** %51
  %52 = call nonnull %jl_value_t* @jl_apply_generic(%jl_value_t* inttoptr (i64 345244712 to %jl_value_t*), %jl_value_t** %5, i32 3)
  %53 = getelementptr %jl_value_t*, %jl_value_t** %gcframe, i32 1
  %54 = load %jl_value_t*, %jl_value_t** %53
  %55 = getelementptr %jl_value_t**, %jl_value_t*** %7, i32 0
  %56 = bitcast %jl_value_t*** %55 to %jl_value_t**
  store %jl_value_t* %54, %jl_value_t** %56
  ret %jl_value_t* %52
}

Because you have dynamic dispatches. Try ::Type{T},::Val{chunk}) where {T,chunk} and then Vec{chunk,T}.

1 Like

Thanks Elrod, I managed to get a llvm code where I can see the usage of SIMD operations. Does this mean that if I want to vectorize it I have to make sure I’m not using dynamic dispatch?