Julia seems to be running multithreaded when I don't want it to

You mean in the Julia script? If so, yes I did.

I took the liberty of optimizing your Julia implementation a bit. I removed all of the slicing allocations by using @view, and used mul! for in-place multiplication. Neither of these changed the runtime significantly at these large problem sizes, but they’ll reduce the memory churn, which is never a bad thing. More significantly, I also used @view to skip computations on the parts of your arrays that were still all-zero, for a 2x speedup.

I checked that this still gave virtually the same results, but you should check it too.

using LinearAlgebra: norm, mul!

function CGS2_alt(X::AbstractMatrix)
    Base.require_one_based_indexing(X) # ensure X has 1-based indices
    T = float(eltype(X))
    n, m = size(X)
    Q = zeros(T, n, m)
    R = zeros(T, m, m)
    r = zeros(T, m)
    w = zeros(T, n)
    R[1, 1] = norm(@view X[:, 1])
    Q[:, 1] .= @view(X[:, 1]) ./ R[1, 1]
    for i in 2:m
        iprev = 1:i-1
        w .= @view X[:, i]
        Qi = @view Q[:, iprev]
        ri = @view r[iprev]
        mul!(ri, Qi', w)       # ri .= (w'Qi)'
        R[iprev, i] = ri
        mul!(w, Qi, ri, -1, 1) # w .-= Qi * ri
        mul!(ri, Qi', w)       # ri .= (w'Qi)'
        mul!(w, Qi, ri, -1, 1) # w .-= Qi * ri
        R[i, i] = norm(w)
        Q[:, i] .= w ./ R[i, i]
    end
    return Q, R
end
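
For reference, here is a quick way to run that check yourself (a sketch; the matrix size is arbitrary). Both residuals should be within a few orders of magnitude of machine epsilon:

using LinearAlgebra
X = rand(1000, 50)
Q, R = CGS2_alt(X)
@show norm(Q * R - X) / norm(X) # factorization residual
@show norm(Q' * Q - I)          # loss of orthogonality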

By the way, I think MGS is better than CGS2. Check out this code:

using LinearAlgebra

# Dot product of columns j and i of A.
function coldot(A, j, i)
    m = size(A, 1)
    r = zero(eltype(A))
    @simd for k in 1:m
        r += A[k, i] * A[k, j]
    end
    return r
end

# 2-norm of column j of A.
function colnorm(A, j)
    return sqrt(coldot(A, j, j))
end

# A[:, i] .-= r .* A[:, j], without allocating.
function colsubt!(A, i, j, r)
    m = size(A, 1)
    @simd for k in 1:m
        A[k, i] -= r * A[k, j]
    end
end

# Scale column j of A to unit 2-norm.
function normalizecol!(A, j)
    m = size(A, 1)
    r = 1.0 / colnorm(A, j)
    @simd for k in 1:m
        A[k, j] *= r
    end
end

# Right-looking MGS: once column j-1 is normalized, remove its component
# from all remaining columns in parallel.
function mgsortho!(A)
    @show m, n = size(A)
    normalizecol!(A, 1)
    for j in 2:n
        Base.Threads.@threads for i in j:n
            colsubt!(A, i, j-1, coldot(A, j-1, i))
        end
        normalizecol!(A, j)
    end
    return A
end

"""
Orthogonalize!(V, hv, vv, orth; verbose=false)

C. T. Kelley, 2022

Orthogonalize the Krylov vectors using your (my) choice of
methods. Anything other than classical Gram-Schmidt twice (cgs2) is
likely to become an undocumented and UNSUPPORTED option. Methods other 
than cgs2 are mostly for CI for the linear solver.

DO NOT use anything other than "cgs2" with Anderson acceleration.
"""
function Orthogonalize!(V, hv, vv, orth = "cgs2"; verbose = false)
    orthopts = ["mgs1", "mgs2", "cgs1", "cgs2"]
    orth in orthopts || error("Impossible orth spec in Orthogonalize!")
    if orth == "mgs1"
        mgs!(V, hv, vv; verbose = verbose)
    elseif orth == "mgs2"
        mgs!(V, hv, vv, "twice"; verbose = verbose)
    elseif orth == "cgs1"
        cgs!(V, hv, vv, "once"; verbose = verbose)
    else
        cgs!(V, hv, vv, "twice"; verbose = verbose)
    end
end

"""
mgs!(V, hv, vv, orth; verbose=false)
"""
function mgs!(V, hv, vv, orth = "once"; verbose = false)
    k = length(hv) - 1
    normin = norm(vv)
    #p=copy(vv)
    @views for j = 1:k
        p = vec(V[:, j])
        hv[j] = p' * vv
        vv .-= hv[j] * p
    end
    hv[k+1] = norm(vv)
    if (normin + 0.001 * hv[k+1] == normin) && (orth == "twice")
        @views for j = 1:k
            p = vec(V[:, j])
            hr = p' * vv
            hv[j] += hr
            vv .-= hr * p
        end
        hv[k+1] = norm(vv)
    end
    nv = hv[k+1]
    #
    # Watch out for happy breakdown
    #
    #if hv[k+1] != 0
    #@views vv .= vv/hv[k+1]
    (nv != 0) || (verbose && (println("breakdown in mgs1")))
    if nv != 0
        vv ./= nv
    end
end

"""
cgs!(V, hv, vv, orth="twice"; verbose=false)

Classical Gram-Schmidt.
"""
function cgs!(V, hv, vv, orth = "twice"; verbose = false)
    #
    #   no BLAS. Seems faster than BLAS since 1.6 and allocates
    #   far less memory.
    #
    k = length(hv)
    T = eltype(V)
    onep = T(1.0)
    zerop = T(0.0)
    @views rk = hv[1:k-1]
    pk = zeros(T, size(rk))
    qk = vv
    Qkm = V
    # Orthogonalize
    # New low allocation stuff
    mul!(rk, Qkm', qk, 1.0, 1.0)
    ###    mul!(pk, Qkm', qk)
    ###    rk .+= pk
    ##    rk .+= Qkm' * qk
    #    qk .-= Qkm * rk
    mul!(qk, Qkm, rk, -1.0, 1.0)
    if orth == "twice"
        # Orthogonalize again
        # New low allocation stuff
        mul!(pk, Qkm', qk)
        ##        pk .= Qkm' * qk
        #        qk .-= Qkm * pk
        mul!(qk, Qkm, pk, -1.0, 1.0)
        rk .+= pk
    end
    # Keep track of what you did.
    nqk = norm(qk)
    (nqk != 0) || (verbose && (println("breakdown in cgs")))
    (nqk > 0.0) && (qk ./= nqk)
    hv[k] = nqk
end

function qrctk!(A; orth = "cgs2")
    T = typeof(A[1, 1])
    (m, n) = size(A)
    R = zeros(T, n, n)
    @views R[1, 1] = norm(A[:, 1])
    @views A[:, 1] /= R[1, 1]
    @views for k = 2:n
        hv = vec(R[1:k, k])
        Qkm = view(A, :, 1:k-1)
        vv = vec(A[:, k])
        Orthogonalize!(Qkm, hv, vv, orth)
    end
    return (Q = A, R = R)
end


@show Threads.nthreads()

m = 200000
n = 200
A = rand(m, n)
B = deepcopy(A)

A .= B
@info "mgsortho!(A)"
@time mgsortho!(A)
@show norm(A'*A - LinearAlgebra.I, Inf)

A .= B
@info "qr!(A)"
@time begin
    q = qr!(A)
    A .= Matrix(q.Q)
end
@show norm(A'*A - LinearAlgebra.I, Inf)

A .= B
@info "qr!(A): broken up into two steps"
@time q = qr!(A)
@time A .= Matrix(q.Q)
@show norm(A'*A - LinearAlgebra.I, Inf)

A .= B
@info "qrctk!(A; orth = \"cgs1\")"
@time qrctk!(A; orth = "cgs1")
@show norm(A'*A - LinearAlgebra.I, Inf)

A .= B
@info "qrctk!(A; orth = \"cgs2\")"
@time qrctk!(A; orth = "cgs2")
@show norm(A'*A - LinearAlgebra.I, Inf)

A .= B
@info "qrctk!(A; orth = \"mgs1\")"
@time qrctk!(A; orth = "mgs1")
@show norm(A'*A - LinearAlgebra.I, Inf)

A .= B
@info "qrctk!(A; orth = \"mgs2\")"
@time qrctk!(A; orth = "mgs2")
@show norm(A'*A - LinearAlgebra.I, Inf)


nothing

The good MGS is mgsortho!. mgs! and cgs! are due to C. T. Kelley’s SIAMFANLEquations.jl (https://github.com/ctkelley/SIAMFANLEquations.jl), a Julia package for a book project.

Well, yeah, MGS has half the FLOP count of CGS2. I implemented MGS too (I didn’t attach it here), but if I run it on the 72-core machine, CGS2 is actually about 4x faster! This is symptomatic of a parallel computation, since CGS2 needs fewer synchronizations than MGS.
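
To make that concrete, here is a rough count (a sketch; n is the column length, m the number of columns):

# Per column i:
#   MGS:  i-1 dependent (dot, axpy) pairs          -> 4n(i-1) flops, O(i) sync points
#   CGS2: 4 matrix-vector products (2 passes of 2) -> 8n(i-1) flops, O(1) sync points
mgs_flops(n, m)  = sum(4n * (i - 1) for i in 2:m)  # ≈ 2n*m^2
cgs2_flops(n, m) = sum(8n * (i - 1) for i in 2:m)  # ≈ 4n*m^2
@show mgs_flops(200_000, 200) cgs2_flops(200_000, 200)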

Try the codes above.
This is what I get on 63 threads:

julia> include("play.jl")
Threads.nthreads() = 63
[ Info: mgsortho!(A)
(m, n) = size(A) = (200000, 200)
  8.519595 seconds (218.53 k allocations: 13.934 MiB, 4.90% compilation time)
norm(A' * A - LinearAlgebra.I, Inf) = 3.1086244689504383e-15
[ Info: qr!(A)
  5.834709 seconds (262.59 k allocations: 624.114 MiB, 1.37% gc time, 6.05% compilation time)
norm(A' * A - LinearAlgebra.I, Inf) = 2.4424906541753444e-15
[ Info: qr!(A): broken up into two steps
  2.931786 seconds (5 allocations: 112.625 KiB)
  2.324473 seconds (9 allocations: 610.657 MiB, 6.39% gc time)
norm(A' * A - LinearAlgebra.I, Inf) = 2.4424906541753444e-15
[ Info: qrctk!(A; orth = "cgs1")
 12.793860 seconds (3.44 M allocations: 170.772 MiB, 0.08% gc time, 19.62% compilation time)
norm(A' * A - LinearAlgebra.I, Inf) = 3.720802284985433e-14
[ Info: qrctk!(A; orth = "cgs2")
 20.320473 seconds (404 allocations: 2.015 MiB)
norm(A' * A - LinearAlgebra.I, Inf) = 2.4424906541753444e-15
[ Info: qrctk!(A; orth = "mgs1")
 46.986516 seconds (40.20 k allocations: 29.656 GiB, 0.80% gc time)
norm(A' * A - LinearAlgebra.I, Inf) = 2.220446049250313e-15
[ Info: qrctk!(A; orth = "mgs2")
 46.504030 seconds (40.20 k allocations: 29.656 GiB, 0.49% gc time)
norm(A' * A - LinearAlgebra.I, Inf) = 2.220446049250313e-15
[ Info: CGS2_alt(A)
 22.561234 seconds (1.86 M allocations: 399.342 MiB, 0.17% gc time, 6.44% compilation time)
norm(A' * A - LinearAlgebra.I, Inf) = 2.4424906541753444e-15

You could try:
@code_llvm CGS2(X)

and in the output look for simd instructions. On my machine I see:

vector.body:                                      ; preds = %vector.body, %vector.ph
; ││││ @ simdloop.jl:78 within `macro expansion`
; ││││┌ @ int.jl:87 within `+`
       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
       %126 = getelementptr inbounds double, double* %123, i64 %index
; ││││└
; ││││ @ simdloop.jl:77 within `macro expansion` @ broadcast.jl:961
; ││││┌ @ broadcast.jl:597 within `getindex`
; │││││┌ @ broadcast.jl:642 within `_broadcast_getindex`
; ││││││┌ @ broadcast.jl:666 within `_getindex`
; │││││││┌ @ broadcast.jl:636 within `_broadcast_getindex`
; ││││││││┌ @ array.jl:924 within `getindex`
           %127 = bitcast double* %126 to <4 x double>*
           %wide.load = load <4 x double>, <4 x double>* %127, align 8
; ││││││└└└
; ││││││ @ broadcast.jl:643 within `_broadcast_getindex`
; ││││││┌ @ broadcast.jl:670 within `_broadcast_getindex_evalf`
; │││││││┌ @ float.jl:386 within `/`
          %128 = fdiv <4 x double> %wide.load, %broadcast.splat
; ││││└└└└
; ││││ @ simdloop.jl:78 within `macro expansion`
; ││││┌ @ int.jl:87 within `+`
       %129 = getelementptr inbounds double, double* %125, i64 %index

If you see simdloops only on the new CPU and not on the old CPU, this might be an explanation for the performance difference. (SIMD: single instruction, multiple data.)
It might also be that the newer CPU supports AVX-512 and the older one only AVX2 (256-bit vectors).

SIMD instructions use only one CPU core, but can still process 4 or 8 words of data in parallel.
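
One quick way to compare the two machines (a sketch; reading /proc/cpuinfo is Linux-only):

@show Sys.CPU_NAME # LLVM's name for the detected microarchitecture, e.g. "skylake-avx512"
# On Linux, list the AVX-related feature flags the kernel reports:
flags = split(read("/proc/cpuinfo", String))
@show unique(filter(f -> startswith(f, "avx"), flags))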

Other possible reasons for the speed difference:

  • different amount of RAM
  • different cache size
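
Since both of those explanations concern the memory system, a crude way to probe it (a sketch; sum over an array this large is memory-bandwidth-bound):

a = rand(2 * 10^8) # ~1.6 GB, far larger than any cache
@time sum(a)       # bandwidth ≈ sizeof(a) / elapsed time; compare the two machines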

I’ve been following this thread a bit, and it doesn’t seem worth testing a bunch of things until @giordano’s suggestion of using top, htop, or another resource monitor is followed. I personally use conky on my desktop for exactly this reason: I start a process, glance at conky, and can immediately tell whether something is multi-threading or multi-processing. It’s a super simple test.


Also, on Win 10/11 try ProcessExplorer.


I didn’t know about ProcessExplorer. On Windows, usually I will open Task Manager, click on the β€œPerformance” tab, right click on the CPU graph, then β€œChange Graph To” β†’ β€œLogical Processors”.


I cannot use conky on my machine. As I said, htop is filled with per-processor gauges and does not display the running processes. However, top does work. I ran it while running the code on the 72-core machine. It seems as if there’s only one thread running:


I’m on Linux.

If you mean was my timing data for Linux, the answer is yes.

Thanks! I’m no expert, but if it’s only on one thread then maybe @ufechner7 is correct that there is some SIMD magic going on instead of multi-threading?

This says the usage is at 3600%, which is like using 36 cores at 100% utilization, so this is definitely running with multiple threads, probably on all 36 physical cores of your machine (I assume you have 36 actual cores that are hyperthreaded to 72 logical ones).


Here is what I get with @code_llvm CGS2(X):

julia> @code_llvm CGS2(X)
;  @ /home/mica_20/users/nxf93431/test/julia-gram-schmidt/GramSchmidt.jl:93 within `CGS2`
define void @julia_CGS2_662([2 x {}*]* noalias nocapture sret([2 x {}*]) %0, {}* nonnull align 16 dereferenceable(40) %1) #0 {
top:
  %gcframe2097 = alloca [8 x {}*], align 16
  %gcframe2097.sub = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2097, i64 0, i64 0
  %2 = bitcast [8 x {}*]* %gcframe2097 to i8*
  call void @llvm.memset.p0i8.i32(i8* noundef nonnull align 16 dereferenceable(64) %2, i8 0, i32 64, i1 false)
  %3 = alloca { [1 x [1 x i64]], i64 }, align 8
  %4 = alloca [1 x [1 x i64]], align 8
  %5 = alloca [1 x [1 x i64]], align 8
  %6 = alloca [1 x [1 x i64]], align 8
  %7 = alloca [1 x [1 x i64]], align 8
  %8 = alloca { [1 x [1 x i64]], i64 }, align 8
  %9 = alloca { [1 x [1 x i64]], i64 }, align 8
  %10 = alloca [1 x [1 x i64]], align 8
  %11 = alloca [1 x [2 x i64]], align 8
  %12 = alloca [1 x [1 x i64]], align 8
  %13 = alloca { [1 x [1 x i64]], i64 }, align 8
  %14 = alloca [1 x [2 x i64]], align 8
  %15 = alloca [1 x [1 x i64]], align 8
  %16 = alloca [1 x [1 x i64]], align 8
  %17 = alloca { [1 x [1 x i64]], i64 }, align 8
  %18 = alloca [1 x [1 x i64]], align 8
  %19 = alloca [1 x [1 x i64]], align 8
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #7
  %ppgcstack_i8 = getelementptr i8, i8* %thread_ptr, i64 -8
  %ppgcstack = bitcast i8* %ppgcstack_i8 to {}****
  %pgcstack = load {}***, {}**** %ppgcstack, align 8
;  @ /home/mica_20/users/nxf93431/test/julia-gram-schmidt/GramSchmidt.jl:94 within `CGS2`
; ┌ @ array.jl:152 within `size`
  %20 = bitcast [8 x {}*]* %gcframe2097 to i64*
   store i64 24, i64* %20, align 16
   %21 = getelementptr inbounds [8 x {}*], [8 x {}*]* %gcframe2097, i64 0, i64 1
   %22 = bitcast {}** %21 to {}***
   %23 = load {}**, {}*** %pgcstack, align 8
   store {}** %23, {}*** %22, align 8
   %24 = bitcast {}*** %pgcstack to {}***
   store {}** %gcframe2097.sub, {}*** %24, align 8
   %25 = bitcast {}* %1 to {}**
   %26 = getelementptr inbounds {}*, {}** %25, i64 3
   %27 = bitcast {}** %26 to i64*
   %28 = load i64, i64* %27, align 8
   %29 = getelementptr inbounds {}*, {}** %25, i64 4
   %30 = bitcast {}** %29 to i64*
   %31 = load i64, i64* %30, align 8
; └
;  @ /home/mica_20/users/nxf93431/test/julia-gram-schmidt/GramSchmidt.jl:95 within `CGS2`
; ┌ @ array.jl:584 within `zeros` @ array.jl:588
; │┌ @ boot.jl:469 within `Array` @ boot.jl:461
    %32 = call nonnull {}* inttoptr (i64 47976924541792 to {}* ({}*, i64, i64)*)({}* inttoptr (i64 47977128374960 to {}*), i64 %28, i64 %31)
; │└
; │ @ array.jl:584 within `zeros` @ array.jl:589

and it keeps going; I could not copy everything… There does not seem to be any sign of simdloops.

The amounts of RAM are as follows:

On the 2-core machine: 15 GB
On the 72-core machine: 755 GB

The cache sizes are as follows:

On the 2-core machine: L1: 32 KB, L2: 256 KB, L3: 35 MB
On the 72-core machine: L1: 32 KB, L2: 1 MB, L3: 72 MB

So the RAM and cache are clearly different. If this alone impacts the Julia code so much, why doesn’t it impact the C code at all? As a reminder, here is the matrix of running times:

                   Julia code   C code
2-core machine     166.49 s     89.68 s
72-core machine     35.96 s     84.54 s

You said you have

BLAS.set_num_threads(1)

at the beginning of your script. But are you sure you are using OpenBLAS and not MKL?

To check which BLAS library is loaded:

julia> using LinearAlgebra

julia> BLAS.get_config()
LinearAlgebra.BLAS.LBTConfig
Libraries: 
β”” [ILP64] libopenblas64_.0.3.13.dylib
julia> using MKL

julia> BLAS.get_config()
LinearAlgebra.BLAS.LBTConfig
Libraries: 
β”” [ILP64] libmkl_rt.1.dylib

Great! Thanks for noticing. This is the only way these results make sense. Do you know how I can prevent this multithreading?


You should launch Julia with a single thread, which can be done by setting the JULIA_NUM_THREADS environment variable before launch (you can’t change it after Julia has started), or by passing an option when you start the REPL:

julia -t 1

or

julia --threads=1

You should also set the BLAS threads, as you are already doing.
Try the test again, and use top to make sure that CPU usage doesn’t go over 100%. If it does, you are probably calling something external that is using more threads.
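
For reference, you can verify both thread counts from inside the session (a sketch):

using LinearAlgebra
@show Threads.nthreads()     # Julia threads; should print 1
@show BLAS.get_num_threads() # BLAS worker threads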

I did add

using LinearAlgebra: BLAS
BLAS.set_num_threads(1)

to my script.

I do not know whether I use OpenBLAS or MKL. I thought all Julia installations used OpenBLAS by default. How can I check this? And if I use MKL, how can I limit the number of MKL threads from Julia?
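
(For what it’s worth, BLAS.get_config() from the earlier post answers the first question, and BLAS.set_num_threads caps the thread count for whichever backend libblastrampoline has loaded, MKL included — a sketch:)

using LinearAlgebra
@show BLAS.get_config()      # lists libopenblas64_ or libmkl_rt
BLAS.set_num_threads(1)      # applies to the loaded backend
@show BLAS.get_num_threads()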

This was done already. Prior to running the script, I exported JULIA_NUM_THREADS=1.