A while back, I solved this problem: Performance and Allocation with Arrays of Functions. More recently, I wanted a multithreaded loop inside it, so that the code now reads:
function update(x, Δt)
    return x + 0.5 * Δt * x
end

@generated function compute_threaded(x0_vals, Δt, nΔt, f::Tuple{Vararg{<:Any,K}}) where {K}
    quote
        x_vals = copy(x0_vals);
        nx = length(x0_vals);
        f_vals = zeros($K, nx);
        f_avg_vals = zeros($K, nΔt)
        for n in 1:nΔt
            # @. x_vals = update(x_vals, Δt); switch to this, and it works fine
            Threads.@threads for j in 1:nx
                x_vals[j] = update(x_vals[j], Δt)
            end
            Base.Cartesian.@nexprs $K k -> f_vals[k,:] .= (f[k]).(x_vals);
            for j in 1:nx
                Base.Cartesian.@nexprs $K k -> f_avg_vals[k,n] += f_vals[k,j]/nx
            end
        end
        return f_avg_vals
    end
end
using Random
Random.seed!(100);
x0 = randn(10);
x₀ = [1.0];
Δt = 0.5;
nΔt = 10^2;
f1(x) = sin(x)
f2(x) = x^2
compute_threaded(x0, Δt, nΔt, (f1,))
I get the error:
The function body AST defined by this @generated function is not pure. This likely means it contains a closure or comprehension.
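My understanding is that Threads.@threads expands into a closure, which the expression returned by a @generated function is not allowed to contain; the plain broadcast in the commented-out line involves no closure, which would explain why that version works. One workaround I can sketch (an assumption on my part, not code from the earlier post) is to hoist the threaded loop into an ordinary helper function and call that from the generated body:

function threaded_update!(x_vals, Δt)
    # the closure created by Threads.@threads lives here, inside a normal
    # function, so the generated body only needs to contain a plain call
    Threads.@threads for j in eachindex(x_vals)
        x_vals[j] = update(x_vals[j], Δt)
    end
    return x_vals
end

with the threaded for loop inside the quote replaced by threaded_update!(x_vals, Δt).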
The motivation here is that the real update function will be very expensive and will benefit from multithreading. I am open to moving away from @generated code, but the requirement from the earlier post still stands: I want to evaluate some collection of observable functions, on the fly, on a time series as it is generated, and the number of such observables may differ from problem to problem.
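For concreteness, here is a rough sketch of what a plain (non-@generated) version might look like, assuming that map over the Tuple of observables is enough to keep the number of observables known to the compiler (compute_threaded_plain is just an illustrative name, not something from the earlier post):

function compute_threaded_plain(x0_vals, Δt, nΔt, f::Tuple{Vararg{<:Any,K}}) where {K}
    x_vals = copy(x0_vals)
    nx = length(x_vals)
    f_avg_vals = zeros(K, nΔt)
    for n in 1:nΔt
        Threads.@threads for j in 1:nx
            x_vals[j] = update(x_vals[j], Δt)
        end
        for j in 1:nx
            # map over the tuple of observables returns an NTuple, so the
            # number of observables stays fixed at compile time
            fx = map(fk -> fk(x_vals[j]), f)
            for k in 1:K
                f_avg_vals[k, n] += fx[k] / nx
            end
        end
    end
    return f_avg_vals
end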