Here is an example of code that would routinely fail with Float32 but not Float64. The input is the matrix BPP. Each row is a separate data point; columns 1-3 correspond to a vector y, and columns 5-10 to the upper triangle of a symmetric matrix S. The function solves for x in Lx = y, where L * L' = S:
function process_big_prop_points!(X::AbstractMatrix{T}, BPP::AbstractMatrix{T}) where {T}
    @fastmath @inbounds for i ∈ 1:size(BPP, 1)
        # columns 1-3: the vector y
        y1 = BPP[i,1]
        y2 = BPP[i,2]
        y3 = BPP[i,3]
        # columns 5-10: upper triangle of S (S11, S12, S22, S13, S23, S33);
        # factor S = L*L' and forward-substitute to solve L*x = y
        R11 = one(T) / sqrt(BPP[i,5])
        x1 = R11 * y1
        L21 = R11 * BPP[i,6]
        L31 = R11 * BPP[i,8]
        R22 = one(T) / sqrt(BPP[i,7] - L21*L21)
        x2 = R22 * (y2 - L21*x1)
        L32 = R22 * (BPP[i,9] - L21 * L31)
        R33 = one(T) / sqrt(BPP[i,10] - L31*L31 - L32*L32)
        X[i,1] = x1
        X[i,2] = x2
        X[i,3] = R33 * (y3 - L31*x1 - L32*x2)
    end
end
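To make the setup concrete, here is a minimal sketch of building a valid BPP and timing both element types. This is my own construction, not the original data: random_bpp, n, and the SPD construction are arbitrary, and BenchmarkTools is assumed to be installed.

using LinearAlgebra, Random, BenchmarkTools

# Build an n×10 BPP with a random y in columns 1-3 and a random SPD S in columns 5-10.
function random_bpp(::Type{T}, n) where {T}
    BPP = zeros(T, n, 10)
    for i in 1:n
        A = randn(T, 3, 3)
        S = A'A + T(3) * I               # comfortably positive definite
        BPP[i, 1:3] .= randn(T, 3)       # y
        BPP[i, 5], BPP[i, 6], BPP[i, 7]  = S[1,1], S[1,2], S[2,2]
        BPP[i, 8], BPP[i, 9], BPP[i, 10] = S[1,3], S[2,3], S[3,3]
    end
    BPP
end

n = 10_000
BPP64 = random_bpp(Float64, n); X64 = Matrix{Float64}(undef, n, 3)
BPP32 = Float32.(BPP64);        X32 = Matrix{Float32}(undef, n, 3)
@btime process_big_prop_points!($X64, $BPP64)
@btime process_big_prop_points!($X32, $BPP32)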
You can see the vectorization easily by looking at the LLVM IR (code_llvm).
Float64:
; julia> code_llvm(process_big_prop_points!, Tuple{Matrix{Float64},Matrix{Float64}}, debuginfo=:none)
define nonnull %jl_value_t* @"japi1_process_big_prop_points!_5934"(%jl_value_t* %0, %jl_value_t** %1, i32 %2) #0 {
top:
%3 = alloca %jl_value_t**, align 8
store volatile %jl_value_t** %1, %jl_value_t*** %3, align 8
%4 = getelementptr inbounds %jl_value_t*, %jl_value_t** %1, i64 1
%5 = load %jl_value_t*, %jl_value_t** %4, align 8
%6 = bitcast %jl_value_t* %5 to %jl_value_t**
%7 = getelementptr inbounds %jl_value_t*, %jl_value_t** %6, i64 3
%8 = bitcast %jl_value_t** %7 to i64*
%9 = load i64, i64* %8, align 8
%.inv = icmp sgt i64 %9, 0
%10 = select i1 %.inv, i64 %9, i64 0
br i1 %.inv, label %L13.preheader, label %L64
L13.preheader: ; preds = %top
%11 = load %jl_value_t*, %jl_value_t** %1, align 8
%12 = bitcast %jl_value_t* %5 to double**
%13 = load double*, double** %12, align 8
%14 = shl i64 %9, 1
%15 = shl i64 %9, 2
%16 = mul i64 %9, 5
%17 = mul i64 %9, 7
%18 = mul i64 %9, 6
%19 = shl i64 %9, 3
%20 = mul i64 %9, 9
%21 = bitcast %jl_value_t* %11 to double**
%22 = load double*, double** %21, align 8
%23 = bitcast %jl_value_t* %11 to %jl_value_t**
%24 = getelementptr inbounds %jl_value_t*, %jl_value_t** %23, i64 3
%25 = bitcast %jl_value_t** %24 to i64*
%26 = load i64, i64* %25, align 8
%27 = shl i64 %26, 1
br label %L13
L13: ; preds = %L13, %L13.preheader
%value_phi3 = phi i64 [ %80, %L13 ], [ 1, %L13.preheader ]
%28 = add nsw i64 %value_phi3, -1
%29 = getelementptr inbounds double, double* %13, i64 %28
%30 = load double, double* %29, align 8
%31 = add i64 %28, %9
%32 = getelementptr inbounds double, double* %13, i64 %31
%33 = load double, double* %32, align 8
%34 = add i64 %28, %14
%35 = getelementptr inbounds double, double* %13, i64 %34
%36 = load double, double* %35, align 8
%37 = add i64 %28, %15
%38 = getelementptr inbounds double, double* %13, i64 %37
%39 = load double, double* %38, align 8
%40 = call fast double @llvm.sqrt.f64(double %39)
%41 = fdiv fast double 1.000000e+00, %40
%42 = fmul fast double %41, %30
%43 = add i64 %28, %16
%44 = getelementptr inbounds double, double* %13, i64 %43
%45 = load double, double* %44, align 8
%46 = fmul fast double %45, %41
%47 = add i64 %28, %17
%48 = getelementptr inbounds double, double* %13, i64 %47
%49 = load double, double* %48, align 8
%50 = fmul fast double %49, %41
%51 = add i64 %28, %18
%52 = getelementptr inbounds double, double* %13, i64 %51
%53 = load double, double* %52, align 8
%54 = fmul fast double %46, %46
%55 = fsub fast double %53, %54
%56 = call fast double @llvm.sqrt.f64(double %55)
%57 = fdiv fast double 1.000000e+00, %56
%58 = fmul fast double %46, %42
%59 = fsub fast double %33, %58
%60 = fmul fast double %57, %59
%61 = add i64 %28, %19
%62 = getelementptr inbounds double, double* %13, i64 %61
%63 = load double, double* %62, align 8
%64 = fmul fast double %50, %46
%65 = fsub fast double %63, %64
%66 = fmul fast double %65, %57
%67 = add i64 %28, %20
%68 = getelementptr inbounds double, double* %13, i64 %67
%69 = load double, double* %68, align 8
%.neg = fmul fast double %50, %50
%.neg9 = fmul fast double %66, %66
%reass.add = fadd fast double %.neg9, %.neg
%70 = fsub fast double %69, %reass.add
%71 = call fast double @llvm.sqrt.f64(double %70)
%72 = getelementptr inbounds double, double* %22, i64 %28
store double %42, double* %72, align 8
%73 = add i64 %28, %26
%74 = getelementptr inbounds double, double* %22, i64 %73
store double %60, double* %74, align 8
%.neg10 = fmul fast double %50, %42
%.neg11 = fmul fast double %66, %60
%reass.add12 = fadd fast double %.neg11, %.neg10
%75 = fsub fast double %36, %reass.add12
%76 = fdiv fast double %75, %71
%77 = add i64 %28, %27
%78 = getelementptr inbounds double, double* %22, i64 %77
store double %76, double* %78, align 8
%79 = icmp eq i64 %value_phi3, %10
%80 = add nuw i64 %value_phi3, 1
br i1 %79, label %L64, label %L13
L64: ; preds = %L13, %top
ret %jl_value_t* inttoptr (i64 139708209337840 to %jl_value_t*)
}
Float32:
; julia> code_llvm(process_big_prop_points!, Tuple{Matrix{Float32},Matrix{Float32}}, debuginfo=:none)
define nonnull %jl_value_t* @"japi1_process_big_prop_points!_5935"(%jl_value_t* %0, %jl_value_t** %1, i32 %2) #0 {
top:
%3 = alloca %jl_value_t**, align 8
store volatile %jl_value_t** %1, %jl_value_t*** %3, align 8
%4 = getelementptr inbounds %jl_value_t*, %jl_value_t** %1, i64 1
%5 = load %jl_value_t*, %jl_value_t** %4, align 8
%6 = bitcast %jl_value_t* %5 to %jl_value_t**
%7 = getelementptr inbounds %jl_value_t*, %jl_value_t** %6, i64 3
%8 = bitcast %jl_value_t** %7 to i64*
%9 = load i64, i64* %8, align 8
%.inv = icmp sgt i64 %9, 0
%10 = select i1 %.inv, i64 %9, i64 0
br i1 %.inv, label %L13.preheader, label %L64
L13.preheader: ; preds = %top
%11 = load %jl_value_t*, %jl_value_t** %1, align 8
%12 = bitcast %jl_value_t* %5 to float**
%13 = load float*, float** %12, align 8
%14 = shl i64 %9, 1
%15 = shl i64 %9, 2
%16 = mul i64 %9, 5
%17 = mul i64 %9, 7
%18 = mul i64 %9, 6
%19 = shl i64 %9, 3
%20 = mul i64 %9, 9
%21 = bitcast %jl_value_t* %11 to float**
%22 = load float*, float** %21, align 8
%23 = bitcast %jl_value_t* %11 to %jl_value_t**
%24 = getelementptr inbounds %jl_value_t*, %jl_value_t** %23, i64 3
%25 = bitcast %jl_value_t** %24 to i64*
%26 = load i64, i64* %25, align 8
%27 = shl i64 %26, 1
br label %L13
L13: ; preds = %L13, %L13.preheader
%value_phi3 = phi i64 [ %80, %L13 ], [ 1, %L13.preheader ]
%28 = add nsw i64 %value_phi3, -1
%29 = getelementptr inbounds float, float* %13, i64 %28
%30 = load float, float* %29, align 4
%31 = add i64 %28, %9
%32 = getelementptr inbounds float, float* %13, i64 %31
%33 = load float, float* %32, align 4
%34 = add i64 %28, %14
%35 = getelementptr inbounds float, float* %13, i64 %34
%36 = load float, float* %35, align 4
%37 = add i64 %28, %15
%38 = getelementptr inbounds float, float* %13, i64 %37
%39 = load float, float* %38, align 4
%40 = call fast float @llvm.sqrt.f32(float %39)
%41 = fdiv fast float 1.000000e+00, %40
%42 = fmul fast float %41, %30
%43 = add i64 %28, %16
%44 = getelementptr inbounds float, float* %13, i64 %43
%45 = load float, float* %44, align 4
%46 = fmul fast float %45, %41
%47 = add i64 %28, %17
%48 = getelementptr inbounds float, float* %13, i64 %47
%49 = load float, float* %48, align 4
%50 = fmul fast float %49, %41
%51 = add i64 %28, %18
%52 = getelementptr inbounds float, float* %13, i64 %51
%53 = load float, float* %52, align 4
%54 = fmul fast float %46, %46
%55 = fsub fast float %53, %54
%56 = call fast float @llvm.sqrt.f32(float %55)
%57 = fdiv fast float 1.000000e+00, %56
%58 = fmul fast float %46, %42
%59 = fsub fast float %33, %58
%60 = fmul fast float %57, %59
%61 = add i64 %28, %19
%62 = getelementptr inbounds float, float* %13, i64 %61
%63 = load float, float* %62, align 4
%64 = fmul fast float %50, %46
%65 = fsub fast float %63, %64
%66 = fmul fast float %65, %57
%67 = add i64 %28, %20
%68 = getelementptr inbounds float, float* %13, i64 %67
%69 = load float, float* %68, align 4
%.neg = fmul fast float %50, %50
%.neg9 = fmul fast float %66, %66
%reass.add = fadd fast float %.neg9, %.neg
%70 = fsub fast float %69, %reass.add
%71 = call fast float @llvm.sqrt.f32(float %70)
%72 = getelementptr inbounds float, float* %22, i64 %28
store float %42, float* %72, align 4
%73 = add i64 %28, %26
%74 = getelementptr inbounds float, float* %22, i64 %73
store float %60, float* %74, align 4
%.neg10 = fmul fast float %50, %42
%.neg11 = fmul fast float %66, %60
%reass.add12 = fadd fast float %.neg11, %.neg10
%75 = fsub fast float %36, %reass.add12
%76 = fdiv fast float %75, %71
%77 = add i64 %28, %27
%78 = getelementptr inbounds float, float* %22, i64 %77
store float %76, float* %78, align 4
%79 = icmp eq i64 %value_phi3, %10
%80 = add nuw i64 %value_phi3, 1
br i1 %79, label %L64, label %L13
L64: ; preds = %L13, %top
ret %jl_value_t* inttoptr (i64 139708209337840 to %jl_value_t*)
}
Notice all the <8 x double> with Float64 and <16 x float> with Float32.
The code was written to be SIMD across iterations of the loop. This means that while what I wrote says for i ∈ 1:size(BPP,1), what actually happens with Float32 is closer to for _i ∈ 1:16:size(BPP,1); i = _i:_i+15, and with Float64 to for _i ∈ 1:8:size(BPP,1); i = _i:_i+7. That is, instead of calculating 1 iteration of the loop at a time, it calculates 16 or 8 iterations of the loop in parallel using SIMD instructions.
That is the major advantage of Float32: twice as many of them fit in a register, so IF the code is using SIMD instructions, it'll be able to calculate twice as much at a time. That, plus the lower cache pressure and memory bandwidth needed (for many problems, that's a much bigger concern), are the two main reasons why Float32 is faster for the above example.
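If you want to check the width difference on your own machine, here is a toy reduction (not part of the original example; mysum is just an illustration) where you can compare the vector types LLVM picks for each element type:

using InteractiveUtils  # for code_llvm outside the REPL

# Simple reduction that the autovectorizer handles easily.
function mysum(x::AbstractVector{T}) where {T}
    s = zero(T)
    @inbounds @simd for i in eachindex(x)
        s += x[i]
    end
    s
end

code_llvm(mysum, Tuple{Vector{Float64}})  # look for <N x double> in the vector body
code_llvm(mysum, Tuple{Vector{Float32}})  # same machine: <2N x float>, twice the lanes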
A second reason for greater performance in that particular example isn't generally applicable, but is discussed below for the sake of thoroughness.
code_native
Sometimes, specialized CPU instructions exist for certain data types but not others. This is a less extreme example than gcalderone's.
For example, my CPU has an "approximate inverse square root" instruction for Float32, but not for Float64. So in the Float32 case, the compiler can use this approximate instruction and then add a Newton-Raphson iteration to make it more accurate. For Float64, however, the compiler is forced to use the square root instruction and then divide.
I did try the square root and division instructions with Float32 as well. IIRC that took about twice as long, and wasn't really any more accurate.
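For reference, here is a rough sketch of that rsqrt-plus-Newton-Raphson pattern. This is my illustration, not the compiler's exact code: the initial estimate y0 below is a stand-in for what vrsqrtss produces in the Float32 assembly further down.

# One Newton-Raphson step for y ≈ 1/sqrt(x): y ← y * (1.5 - 0.5 * x * y^2).
# Each step roughly doubles the number of correct bits of the estimate.
newton_rsqrt_step(x::Float32, y::Float32) = y * (1.5f0 - 0.5f0 * x * y * y)

x  = 2.0f0
y0 = Float32((1 + 2e-4) / sqrt(2.0))      # stand-in for a ~12-bit-accurate hardware estimate
y1 = newton_rsqrt_step(x, y0)
abs(y0 - 1/sqrt(x)), abs(y1 - 1/sqrt(x))  # the refined error is orders of magnitude smaller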
Float64:
# julia> code_native(process_big_prop_points!, Tuple{Matrix{Float64},Matrix{Float64}}, debuginfo=:none)
.text
pushq %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
movq %rsi, -8(%rsp)
movq 8(%rsi), %rax
movq 24(%rax), %rdi
testq %rdi, %rdi
jle L300
movq %rdi, %rcx
sarq $63, %rcx
andnq %rdi, %rcx, %r8
movq (%rsi), %rcx
movq (%rax), %r15
movq (%rcx), %r9
movq 24(%rcx), %r14
leaq (%r9,%r14,8), %r10
shlq $4, %r14
addq %r9, %r14
leaq (%rdi,%rdi,8), %rax
leaq (%r15,%rax,8), %r11
movq %rdi, %r12
shlq $6, %r12
addq %r15, %r12
leaq (%rdi,%rdi,2), %rsi
shlq $4, %rsi
addq %r15, %rsi
imulq $56, %rdi, %rbx
addq %r15, %rbx
leaq (%rdi,%rdi,4), %rax
leaq (%r15,%rax,8), %r13
movq %rdi, %rax
shlq $5, %rax
addq %r15, %rax
leaq (%r15,%rdi,8), %rdx
shlq $4, %rdi
addq %r15, %rdi
xorl %ecx, %ecx
movabsq $.rodata.cst8, %rbp
vmovsd (%rbp), %xmm0 # xmm0 = mem[0],zero
nopw %cs:(%rax,%rax)
L160:
vmovsd (%rax,%rcx,8), %xmm1 # xmm1 = mem[0],zero
vsqrtsd %xmm1, %xmm1, %xmm1
vdivsd %xmm1, %xmm0, %xmm1
vmulsd (%r15,%rcx,8), %xmm1, %xmm2
vmulsd (%r13,%rcx,8), %xmm1, %xmm3
vmulsd (%rbx,%rcx,8), %xmm1, %xmm1
vmovsd (%rsi,%rcx,8), %xmm4 # xmm4 = mem[0],zero
vfnmadd231sd %xmm3, %xmm3, %xmm4 # xmm4 = -(xmm3 * xmm3) + xmm4
vsqrtsd %xmm4, %xmm4, %xmm4
vdivsd %xmm4, %xmm0, %xmm4
vmovsd (%rdx,%rcx,8), %xmm5 # xmm5 = mem[0],zero
vfnmadd231sd %xmm2, %xmm3, %xmm5 # xmm5 = -(xmm3 * xmm2) + xmm5
vmulsd %xmm5, %xmm4, %xmm5
vfnmadd213sd (%r12,%rcx,8), %xmm1, %xmm3 # xmm3 = -(xmm1 * xmm3) + mem
vmulsd %xmm4, %xmm3, %xmm3
vmulsd %xmm1, %xmm1, %xmm4
vfnmsub231sd %xmm3, %xmm3, %xmm4 # xmm4 = -(xmm3 * xmm3) - xmm4
vaddsd (%r11,%rcx,8), %xmm4, %xmm4
vsqrtsd %xmm4, %xmm4, %xmm4
vmulsd %xmm2, %xmm1, %xmm1
vfnmsub213sd %xmm1, %xmm5, %xmm3 # xmm3 = -(xmm5 * xmm3) - xmm1
vaddsd (%rdi,%rcx,8), %xmm3, %xmm1
vdivsd %xmm4, %xmm1, %xmm1
vmovsd %xmm2, (%r9,%rcx,8)
vmovsd %xmm5, (%r10,%rcx,8)
vmovsd %xmm1, (%r14,%rcx,8)
incq %rcx
cmpq %rcx, %r8
jne L160
L300:
movabsq $jl_system_image_data, %rax
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
Float32:
.text
pushq %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
movq %rsi, -8(%rsp)
movq 8(%rsi), %rax
movq 24(%rax), %rbx
testq %rbx, %rbx
jle L343
movq %rbx, %rcx
sarq $63, %rcx
andnq %rbx, %rcx, %r8
movq (%rsi), %rdx
movq (%rax), %rcx
movq (%rdx), %r14
movq 24(%rdx), %rax
leaq (%r14,%rax,8), %r9
leaq (%r14,%rax,4), %r10
leaq (%rbx,%rbx,8), %rax
leaq (%rcx,%rax,4), %r11
movq %rbx, %r13
shlq $5, %r13
addq %rcx, %r13
leaq (%rbx,%rbx,2), %rdx
leaq (%rcx,%rdx,8), %r15
leaq (%rax,%rax,2), %rax
addq %rbx, %rax
addq %rcx, %rax
leaq (%rbx,%rbx,4), %rdx
leaq (%rcx,%rdx,4), %r12
movq %rbx, %rdx
shlq $4, %rdx
addq %rcx, %rdx
leaq (%rcx,%rbx,8), %rdi
leaq (%rcx,%rbx,4), %rbx
xorl %esi, %esi
movabsq $.rodata.cst4, %rbp
vmovss (%rbp), %xmm0 # xmm0 = mem[0],zero,zero,zero
movabsq $139707740336728, %rbp # imm = 0x7F103E3C3258
vmovss (%rbp), %xmm1 # xmm1 = mem[0],zero,zero,zero
nop
L160:
vmovss (%rdx,%rsi,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
vrsqrtss %xmm2, %xmm2, %xmm3
vmulss %xmm3, %xmm2, %xmm2
vfmadd213ss %xmm0, %xmm3, %xmm2 # xmm2 = (xmm3 * xmm2) + xmm0
vmulss %xmm1, %xmm3, %xmm3
vmulss %xmm2, %xmm3, %xmm2
vmulss (%rcx,%rsi,4), %xmm2, %xmm3
vmulss (%r12,%rsi,4), %xmm2, %xmm4
vmulss (%rax,%rsi,4), %xmm2, %xmm2
vmovss (%r15,%rsi,4), %xmm5 # xmm5 = mem[0],zero,zero,zero
vfnmadd231ss %xmm4, %xmm4, %xmm5 # xmm5 = -(xmm4 * xmm4) + xmm5
vrsqrtss %xmm5, %xmm5, %xmm6
vmulss %xmm6, %xmm5, %xmm5
vfmadd213ss %xmm0, %xmm6, %xmm5 # xmm5 = (xmm6 * xmm5) + xmm0
vmulss %xmm1, %xmm6, %xmm6
vmulss %xmm5, %xmm6, %xmm5
vmovss (%rbx,%rsi,4), %xmm6 # xmm6 = mem[0],zero,zero,zero
vfnmadd231ss %xmm3, %xmm4, %xmm6 # xmm6 = -(xmm4 * xmm3) + xmm6
vfnmadd213ss (%r13,%rsi,4), %xmm2, %xmm4 # xmm4 = -(xmm2 * xmm4) + mem
vmulss %xmm6, %xmm5, %xmm6
vmulss %xmm5, %xmm4, %xmm4
vmulss %xmm2, %xmm2, %xmm5
vfnmsub231ss %xmm4, %xmm4, %xmm5 # xmm5 = -(xmm4 * xmm4) - xmm5
vaddss (%r11,%rsi,4), %xmm5, %xmm5
vrsqrtss %xmm5, %xmm5, %xmm7
vmulss %xmm7, %xmm5, %xmm5
vfmadd213ss %xmm0, %xmm7, %xmm5 # xmm5 = (xmm7 * xmm5) + xmm0
vmulss %xmm1, %xmm7, %xmm7
vmulss %xmm5, %xmm7, %xmm5
vmulss %xmm3, %xmm2, %xmm2
vfnmsub213ss %xmm2, %xmm6, %xmm4 # xmm4 = -(xmm6 * xmm4) - xmm2
vaddss (%rdi,%rsi,4), %xmm4, %xmm2
vmulss %xmm5, %xmm2, %xmm2
vmovss %xmm3, (%r14,%rsi,4)
vmovss %xmm6, (%r10,%rsi,4)
vmovss %xmm2, (%r9,%rsi,4)
incq %rsi
cmpq %rsi, %r8
jne L160
L343:
movabsq $jl_system_image_data, %rax
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
I have two points to make with that example:
- To actually benefit from SIMD, you must make sure the code is actually getting vectorized. That requires choosing data layouts accordingly and checking that the autovectorizer actually triggers; it's easy to make changes to the above code and no longer get vectorization (or at least it was at the time; Julia's gotten better since then). And it isn't just Julia: gfortran, for example, had two associated performance bugs, one of which resulted in gfortran inlining packing by default starting with the newly released gcc 10, along with a flag -fno-inline-arg-packing to control that behavior. If you don't pay careful attention to this, it is easy to lose any potential performance benefit; serial Float64 instructions are just as fast as serial Float32 instructions. Same story if you're calling library routines that aren't paying attention either.
- There were no tolerance parameters; it was just arithmetic. The problem is that BPP[i,10] - L31*L31 - L32*L32 often goes from being greater than zero with Float64 to less than zero with Float32. Many of the matrices weren't especially well conditioned. Plus, there was a Gibbs sampler involved. (See the sketch after this list for a self-contained reproduction of this failure mode.)