@simd isn't needed for this loop, because it is already SIMD-vectorized:
julia> @code_llvm debuginfo=:none sequential_t(y, x)
define nonnull %jl_value_t addrspace(10)* @japi1_sequential_t_17699(%jl_value_t addrspace(10)*, %jl_value_t addrspace(10)**, i32) #0 {
top:
%3 = alloca %jl_value_t addrspace(10)**, align 8
store volatile %jl_value_t addrspace(10)** %1, %jl_value_t addrspace(10)*** %3, align 8
%4 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %1, i64 1
%5 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %4, align 8
%6 = addrspacecast %jl_value_t addrspace(10)* %5 to %jl_value_t addrspace(11)*
%7 = bitcast %jl_value_t addrspace(11)* %6 to %jl_array_t addrspace(11)*
%8 = getelementptr inbounds %jl_array_t, %jl_array_t addrspace(11)* %7, i64 0, i32 1
%9 = load i64, i64 addrspace(11)* %8, align 8
%10 = add i64 %9, -1
%11 = icmp sgt i64 %10, 0
%12 = select i1 %11, i64 %10, i64 0
br i1 %11, label %L14.preheader, label %L37
L14.preheader: ; preds = %top
%13 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %1, align 8
%14 = bitcast %jl_value_t addrspace(11)* %6 to double addrspace(13)* addrspace(11)*
%15 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %14, align 8
%16 = addrspacecast %jl_value_t addrspace(10)* %13 to %jl_value_t addrspace(11)*
%17 = bitcast %jl_value_t addrspace(11)* %16 to double addrspace(13)* addrspace(11)*
%18 = load double addrspace(13)*, double addrspace(13)* addrspace(11)* %17, align 8
%min.iters.check = icmp ult i64 %12, 16
br i1 %min.iters.check, label %scalar.ph, label %vector.memcheck
vector.memcheck: ; preds = %L14.preheader
%scevgep = getelementptr double, double addrspace(13)* %18, i64 %12
%19 = add nuw i64 %12, 1
%scevgep10 = getelementptr double, double addrspace(13)* %15, i64 %19
%bound0 = icmp ult double addrspace(13)* %18, %scevgep10
%bound1 = icmp ult double addrspace(13)* %15, %scevgep
%found.conflict = and i1 %bound0, %bound1
br i1 %found.conflict, label %scalar.ph, label %vector.ph
vector.ph: ; preds = %vector.memcheck
%n.vec = and i64 %12, 9223372036854775792
%ind.end = or i64 %n.vec, 1
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%offset.idx = or i64 %index, 1
%20 = getelementptr inbounds double, double addrspace(13)* %15, i64 %index
%21 = bitcast double addrspace(13)* %20 to <4 x double> addrspace(13)*
%wide.load = load <4 x double>, <4 x double> addrspace(13)* %21, align 8
%22 = getelementptr inbounds double, double addrspace(13)* %20, i64 4
%23 = bitcast double addrspace(13)* %22 to <4 x double> addrspace(13)*
%wide.load15 = load <4 x double>, <4 x double> addrspace(13)* %23, align 8
%24 = getelementptr inbounds double, double addrspace(13)* %20, i64 8
%25 = bitcast double addrspace(13)* %24 to <4 x double> addrspace(13)*
%wide.load16 = load <4 x double>, <4 x double> addrspace(13)* %25, align 8
%26 = getelementptr inbounds double, double addrspace(13)* %20, i64 12
%27 = bitcast double addrspace(13)* %26 to <4 x double> addrspace(13)*
%wide.load17 = load <4 x double>, <4 x double> addrspace(13)* %27, align 8
%28 = fmul <4 x double> %wide.load, <double 0x3FF3333340000000, double 0x3FF3333340000000, double 0x3FF3333340000000, double 0x3FF3333340000000>
%29 = fmul <4 x double> %wide.load15, <double 0x3FF3333340000000, double 0x3FF3333340000000, double 0x3FF3333340000000, double 0x3FF3333340000000>
%30 = fmul <4 x double> %wide.load16, <double 0x3FF3333340000000, double 0x3FF3333340000000, double 0x3FF3333340000000, double 0x3FF3333340000000>
%31 = fmul <4 x double> %wide.load17, <double 0x3FF3333340000000, double 0x3FF3333340000000, double 0x3FF3333340000000, double 0x3FF3333340000000>
%32 = getelementptr inbounds double, double addrspace(13)* %15, i64 %offset.idx
%33 = bitcast double addrspace(13)* %32 to <4 x double> addrspace(13)*
%wide.load18 = load <4 x double>, <4 x double> addrspace(13)* %33, align 8
%34 = getelementptr inbounds double, double addrspace(13)* %32, i64 4
%35 = bitcast double addrspace(13)* %34 to <4 x double> addrspace(13)*
%wide.load19 = load <4 x double>, <4 x double> addrspace(13)* %35, align 8
%36 = getelementptr inbounds double, double addrspace(13)* %32, i64 8
%37 = bitcast double addrspace(13)* %36 to <4 x double> addrspace(13)*
%wide.load20 = load <4 x double>, <4 x double> addrspace(13)* %37, align 8
%38 = getelementptr inbounds double, double addrspace(13)* %32, i64 12
%39 = bitcast double addrspace(13)* %38 to <4 x double> addrspace(13)*
%wide.load21 = load <4 x double>, <4 x double> addrspace(13)* %39, align 8
%40 = fmul <4 x double> %wide.load18, <double 0x3FC9999A00000000, double 0x3FC9999A00000000, double 0x3FC9999A00000000, double 0x3FC9999A00000000>
%41 = fmul <4 x double> %wide.load19, <double 0x3FC9999A00000000, double 0x3FC9999A00000000, double 0x3FC9999A00000000, double 0x3FC9999A00000000>
%42 = fmul <4 x double> %wide.load20, <double 0x3FC9999A00000000, double 0x3FC9999A00000000, double 0x3FC9999A00000000, double 0x3FC9999A00000000>
%43 = fmul <4 x double> %wide.load21, <double 0x3FC9999A00000000, double 0x3FC9999A00000000, double 0x3FC9999A00000000, double 0x3FC9999A00000000>
%44 = fsub <4 x double> %28, %40
%45 = fsub <4 x double> %29, %41
%46 = fsub <4 x double> %30, %42
%47 = fsub <4 x double> %31, %43
%48 = getelementptr inbounds double, double addrspace(13)* %18, i64 %index
%49 = bitcast double addrspace(13)* %48 to <4 x double> addrspace(13)*
store <4 x double> %44, <4 x double> addrspace(13)* %49, align 8
%50 = getelementptr inbounds double, double addrspace(13)* %48, i64 4
%51 = bitcast double addrspace(13)* %50 to <4 x double> addrspace(13)*
store <4 x double> %45, <4 x double> addrspace(13)* %51, align 8
%52 = getelementptr inbounds double, double addrspace(13)* %48, i64 8
%53 = bitcast double addrspace(13)* %52 to <4 x double> addrspace(13)*
store <4 x double> %46, <4 x double> addrspace(13)* %53, align 8
%54 = getelementptr inbounds double, double addrspace(13)* %48, i64 12
%55 = bitcast double addrspace(13)* %54 to <4 x double> addrspace(13)*
store <4 x double> %47, <4 x double> addrspace(13)* %55, align 8
%index.next = add i64 %index, 16
%56 = icmp eq i64 %index.next, %n.vec
br i1 %56, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%cmp.n = icmp eq i64 %12, %n.vec
br i1 %cmp.n, label %L37, label %scalar.ph
scalar.ph: ; preds = %middle.block, %vector.memcheck, %L14.preheader
%bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L14.preheader ], [ 1, %vector.memcheck ]
br label %L14
L14: ; preds = %scalar.ph, %L14
%value_phi3 = phi i64 [ %67, %L14 ], [ %bc.resume.val, %scalar.ph ]
%57 = add nsw i64 %value_phi3, -1
%58 = getelementptr inbounds double, double addrspace(13)* %15, i64 %57
%59 = load double, double addrspace(13)* %58, align 8
%60 = fmul double %59, 0x3FF3333340000000
%61 = getelementptr inbounds double, double addrspace(13)* %15, i64 %value_phi3
%62 = load double, double addrspace(13)* %61, align 8
%63 = fmul double %62, 0x3FC9999A00000000
%64 = fsub double %60, %63
%65 = getelementptr inbounds double, double addrspace(13)* %18, i64 %57
store double %64, double addrspace(13)* %65, align 8
%66 = icmp eq i64 %value_phi3, %12
%67 = add nuw i64 %value_phi3, 1
br i1 %66, label %L37, label %L14
L37: ; preds = %L14, %middle.block, %top
ret %jl_value_t addrspace(10)* addrspacecast (%jl_value_t* inttoptr (i64 140616413300176 to %jl_value_t*) to %jl_value_t addrspace(10)*)
}
Because vectorizing this loop doesn't change the order of any floating-point operations (each y[i] is computed exactly as in the scalar loop; there is no reduction to reassociate), the rounding is unchanged, so LLVM doesn't need @simd or @fastmath to vectorize it. A sum, by contrast, would need @simd or @fastmath, because vectorizing a reduction reassociates the additions and can change the result.
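For reference, the definitions aren't shown above, but the IR is consistent with a kernel along these lines (a sketch, not the actual code: the loop bounds and the Float32 literals are reconstructed from the IR, whose constants 0x3FF3333340000000 and 0x3FC9999A00000000 are Float64(1.2f0) and Float64(0.2f0)):

function sequential_t(y, x)
    # Each y[i] depends only on x[i] and x[i+1], so there is no
    # loop-carried dependence and LLVM can vectorize this unaided.
    @inbounds for i in 1:length(x)-1
        y[i] = 1.2f0 * x[i] - 0.2f0 * x[i+1]
    end
    return nothing  # the IR returns a constant, consistent with `nothing`
end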
I do see a performance improvement from threading:
julia> @btime parallel_t($y, $x)
842.107 μs (124 allocations: 17.16 KiB)
julia> @btime sequential_t($y, $x)
7.035 ms (0 allocations: 0 bytes)
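parallel_t likewise isn't shown, but the timing (and the 124 allocations, which match the tasks Threads.@threads spawns) is consistent with a threaded version of the same sketch:

function parallel_t(y, x)
    # Threads.@threads partitions the range across the available
    # threads (24 here); each thread's chunk still vectorizes as above.
    Threads.@threads for i in 1:length(x)-1
        @inbounds y[i] = 1.2f0 * x[i] - 0.2f0 * x[i+1]
    end
    return nothing
end

That is roughly an 8x speedup on 12 physical cores, which is about what you'd expect once memory bandwidth rather than compute becomes the limit.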
julia> versioninfo()
Julia Version 1.5.0-DEV.380
Commit a523fcf (2020-03-01 22:55 UTC)
Platform Info:
  OS: Linux (x86_64-redhat-linux)
  CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-9.0.1 (ORCJIT, haswell)
Environment:
  JULIA_NUM_THREADS = 24
Increasing the number of cores should generally help memory-bandwidth-constrained problems, since more cores can keep more memory requests in flight.
In terms of memory requirements:
julia> (sizeof(y) + sizeof(x)) / 2^20
76.2939453125
The two arrays take about 76 MiB. This CPU has 12 cores with 256 KiB of L2 cache each, plus 30 MiB of L3 cache shared between the cores, so the working set doesn't come close to fitting in cache.
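A back-of-the-envelope bandwidth estimate makes the same point (a sketch; n = 5_000_000 elements per array is inferred from the 76 MiB figure above):

n = 5_000_000              # two arrays × 8 bytes × 5e6 elements ≈ 76 MiB
bytes = 2 * 8 * n          # stream x in and y out; the x[i+1] reuse hits cache
flops = 3n                 # two multiplies and one subtraction per element
flops / bytes              # ≈ 0.19 flops per byte: firmly bandwidth-bound
bytes / 7.035e-3 / 1e9     # ≈ 11 GB/s sustained by the sequential loop

Around 11 GB/s is roughly what a single core can pull from DRAM here, so the sequential version is already near one core's bandwidth limit, and the threaded version wins by keeping more memory requests in flight across cores.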
Is there any good resource to learn more about how all of this works? I would guess that the different L2 caches can stream from the L3 in parallel, but I’m not really sure.