Performance tips for two functions (mplFun and marketEq)

Hi Everyone,
I’m working on optimizing two functions, mplFun and marketEq, and I’m looking for advice on how to make them as efficient as possible. Below is a simplified version of the code that lets these functions run; note that this is not the original code I’m using, just a minimal example to focus on optimizing these two functions.

I’ve gone through the performance tips, but I’m wondering if I might be overlooking something or if anyone has suggestions for further improvements.

Currently, when I run the code, I get the following timing:

  1.897 s (3355 allocations: 82.72 MiB)

Since these functions are called thousands of times in the full code, improving their efficiency is critical. Any advice or insights would be greatly appreciated!

Thank you so much in advance for your help!

# Define a struct to hold the parameters
mutable struct ModelParameters
    γ::Float32    
    α::Float32   
    ρ::Float32    
    ξ::Float32    
    ω::Float32    
    η::Float32   
    θ::Float32    
    μ_a::Float32  
    σ_a::Float32  
    μ_z::Float32  
    σ_z::Float32  
    R::Float32    
    σ::Float32    
    ϕ::Float32    
    δ::Float32    
    S::Int       
    M::Int        
    location::Float32  
    tail::Float32      
    scale::Float32     
    p::Float32         
    cap::Int    
end

mutable struct FixedPointSettings
    υ::Float32        # Step size for updates (slow adjustment)
    iter_S_max::Int   # Max iterations for equilibrium
    tol_S::Float32    # Tolerance for market equilibrium
    tol_S_A::Float32  # Tolerance across markets
    tol::Float32      # Tolerance for aggregate labor supply
end



# Initialize model parameters with Float32
modelParams = ModelParameters(
    0.7f0,
    0.99f0,
    0.4f0,
    1.0f0,
    0.9f0,
    10f0,
    0.5f0,
    0f0,
    0.37f0,
    0f0,
    0.33f0,
    0.10f0,
    1.0f0,
    0.6f0,
    0.08f0,
    500,
    1000,
    2f0,
    0.71f0,
    38.36f0,
    0.16f0,
    100
)

feSettings = FixedPointSettings(
    0.2f0,
    10000,
    Float32(1e-5),
    Float32(1e-3),
    Float32(1e-3)
)


function mplFun(
    modelParams::ModelParameters, 
    Φ::Matrix{Float32}, 
    Φ_bar::Vector{Float32}, 
    H::Vector{Float32}
 ) :: Matrix{Float32}
    # Extract necessary parameters from the parameters struct
    γ = modelParams.γ
    α = modelParams.α
    R = modelParams.R

    # Precompute constant terms
    αγ = α * γ
    denom = 1f0 - α * (1f0 - γ)
    Z = (1f0 - α * (1f0 - γ)) * (α * (1f0 - γ) / R)^((1f0 - γ) * α / denom)
    exponent_1 = 1f0 / denom
    exponent_2 = (α - 1f0) / denom

    # Use broadcasting to compute MPL, ensuring all operations are done in Float32
    MPL = @. Z * (αγ / denom) * Φ_bar' ^ exponent_1 * H' ^ exponent_2 * (1f0 - (1f0 / αγ) * (1f0 - Φ / Φ_bar'))

    # Set negative values to 0 in-place
    @. MPL = max(MPL, 0f0)

    return MPL
end


function marketEq(
    modelParams::ModelParameters, 
    feSettings::FixedPointSettings, 
    z::Vector{Float32}, 
    Φ::Matrix{Float32}, 
    mass::Vector{Float32}, 
    S_ij_0::Matrix{Float32}; 
    efficient::Bool = false
 ) :: Tuple{Matrix{Float32}, Bool, Matrix{Float32}, Vector{Float32}, Vector{Float32}, Matrix{Float32}, Matrix{Float32}}
    # Extract necessary parameters from the parameters struct
    γ = modelParams.γ
    α = modelParams.α
    η = modelParams.η
    θ = modelParams.θ
    R = modelParams.R
    upsilon = feSettings.υ
    iter_S_max = feSettings.iter_S_max
    tol_S = feSettings.tol_S

    N = length(z)

    # Initial Guess
    S_ij = copy(S_ij_0)
    max_iter_reached = false


    # Preallocate arrays as Float32
    NS = zeros(Float32, size(S_ij[:, 1:N]))
    H = zeros(Float32, N)
    Φ_bar = zeros(Float32, N)
    MPL = zeros(Float32, size(S_ij[:, 1:N]))
    μ = similar(S_ij[:, 1:N], Float32)
    S_ij_n = similar(S_ij, Float32)
    sum_values = zeros(Float32, size(S_ij, 1))

    for i in 1:iter_S_max
            # Replace NaNs in S_ij with 0
            @views S_ij[isnan.(S_ij)] .= 0

            # Compute NS and  H
            @views NS .= S_ij[:, 1:N].^(η / (1 + η)) .* mass
            H .= vec(sum(NS, dims=1))  # Convert sum result to a vector for proper assignment

            # Calculate average phi
            @views Φ_bar .= vec(sum(NS .* Φ, dims=1)) ./ H

            # Calculate mpl
            MPL .= mplFun(modelParams, Φ, Φ_bar, H)

            # Calculate μ
            if efficient
                μ .= 1.0f0
            else
                @views ε = (1f0 / η .+ (1f0 / θ - 1f0 / η) .* S_ij[:, 1:end-1]).^(-1f0)
                μ .= ε ./ (1f0 .+ ε)
            end

            # Calculate the sum of (μ .* MPL)^(1 + η)
            @views sum_values .= sum((μ .* MPL).^(1 + η), dims=2)

            # Update S_ij_n for all firms
            @views S_ij_n[:, 1:end-1] .= (μ .* MPL).^(1 + η) ./ sum_values
            @views S_ij_n[isnan.(S_ij_n)] .= 0.0f0
            @views S_ij_n[:, end] .= 1f0 .- sum(S_ij_n[:, 1:end-1], dims=2)
            @views S_ij_n[S_ij_n .< tol_S] .= 0f0

            # Check for convergence
            dist_S = maximum(abs.(S_ij_n - S_ij))
            if dist_S < tol_S
                break  # Exit the loop immediately if converged
            end

            # Update S_ij using upsilon for gradual adjustment
            @views S_ij .= upsilon .* S_ij_n .+ (1f0 - upsilon) .* S_ij
       

        if i == iter_S_max
            println("WARNING: max iter reached")
            max_iter_reached = true
        end
    end

    # Other variables given convergence
    W_ij = μ .* MPL
    Z = (1f0 - α * (1f0 - γ)) * (α * (1f0 - γ) / R)^((1f0 - γ) * α / (1f0 - α * (1f0 - γ)))
    π_ij = Z .* z'.^(1f0 / γ) .* H .* Φ_bar.^(1f0 / γ)

    return S_ij, max_iter_reached, NS, H, Φ_bar, W_ij, π_ij
end

using Distributions
# Update N_j in-place
logAbilityDist = LogNormal{Float32}(modelParams.μ_a, modelParams.σ_a)

# Generate percentiles (500 percentiles)
space = LinRange(Float32(0), Float32(1), modelParams.S + 2)  # S percentiles + boundaries

# Compute the quantiles for the ability distribution, skipping the 0 and 1 bounds
a = Float32.(quantile(logAbilityDist, space[2:end-1]))  # Ensure result is Float32

# Calculate mass (equally weighted), ensuring it's Float32
mass = fill(Float32(1) / Float32(length(a)), length(a))  # Fill array with Float32
z_last = sort(Float32.(rand(LogNormal{Float32}(modelParams.μ_z, modelParams.σ_z), 100)))  
    
function phiFun(z::Vector{Float32}, a::Vector{Float32})::Matrix{Float32}
    # Parameters hardcoded for this minimal example (in the full code they come from modelParams)
    ρ = 0.4
    ω = 0.9
    ξ = 1

    # Use broadcasting to apply the operation across both z and a
    Φ = ((1 - ω) .* (z.^((ρ-1) / ρ))' .+ ω .* (a.^((ρ-1) / ρ))) .^ (ρ / (ρ-1))

    # Raise Φ to the power of ξ
    Φ = Φ .^ ξ

    return Φ
end

Φ = phiFun(Float32.(z_last), Float32.(a))  

S_ij_0 = ones(Float32, length(a), 100 + 1) / Float32(100)
S_ij_0[:, 100 + 1] .= 1f0 .- sum(S_ij_0[:, 1:100], dims=2)  # Unemployment column

# Call marketEq function for each market
using BenchmarkTools
@btime S_ij, max_iter_reached, _, _, _, W_ij, π_ij = marketEq(modelParams, feSettings, z_last, Φ, mass, S_ij_0)
    
    



Note that these argument-type declarations do nothing for performance; they just make your code less generic.

(In the long run, you generally don’t want to hard-code the precision — it’s more flexible to infer the precision from the types of the arguments. For example, if you just do denom = 1 - α * (1 - γ), it will automatically promote to the appropriate floating-point type depending on the types of α and γ — you rarely want literals like 1f0 with a predetermined floating-point precision.)
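For illustration, a generic version of mplFun might look like the sketch below: no concrete type annotations in the signature and no Float32 literals, so the precision simply follows the argument types (Float32 in, Float32 out). The name mplFun_generic is just for this example.

function mplFun_generic(modelParams, Φ, Φ_bar, H)
    γ, α, R = modelParams.γ, modelParams.α, modelParams.R
    # Integer literals promote to the type of α and γ, so no 1f0 is needed
    denom = 1 - α * (1 - γ)
    Z = denom * (α * (1 - γ) / R)^((1 - γ) * α / denom)
    exponent_1 = 1 / denom
    exponent_2 = (α - 1) / denom
    αγ = α * γ
    return @. max(Z * (αγ / denom) * Φ_bar'^exponent_1 * H'^exponent_2 *
                  (1 - (1 / αγ) * (1 - Φ / Φ_bar')), 0)
end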

This refers to the @. MPL = max(MPL, 0f0) pass: I would combine it with the previous assignment so that you have a single pass over the arrays. Also, as above, the generic max(MPL, 0) should suffice.

This is a classic example where you might want to trade off allocation for performance. If you precompute the vector H .^ exponent_2, then you will have an extra vector allocation (still small compared to your MPL matrix), but it will avoid recomputing the exponentiation on every iteration of the 2d broadcast loop.

(This is also an example of where broadcast syntax is too convenient — it makes it easy to forget what is going on under the hood.)
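Concretely, a sketch reusing the names from mplFun above, folding in the single-pass max from the previous point:

H_e2 = H .^ exponent_2   # one extra length-N vector, computed once per call
# Now the 2d broadcast does N pow calls for H instead of one per matrix element:
@. MPL = max(Z * (αγ / denom) * Φ_bar'^exponent_1 * H_e2' *
             (1f0 - (1f0 / αγ) * (1f0 - Φ / Φ_bar')), 0f0)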

The MPL .= mplFun(modelParams, Φ, Φ_bar, H) line doesn’t do what you think — it still allocates a new matrix on every iteration. It’s equivalent to:

tmp = mplFun(...) # allocate new matrix for return value
MPL .= tmp # copy it into MPL

If you want to overwrite MPL in-place, I would rename mplFun to mplFun! and pass MPL as the first argument:

mplFun!(MPL, modelParams, Φ, Φ_bar, H)

Then, inside the function, do

@. MPL = ...

to overwrite it in-place.

Small thing, but you are missing a .- in dist_S = maximum(abs.(S_ij_n - S_ij)), so you are doing one more allocation than necessary. (With a little more work you could avoid allocating at all.)
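That is, presumably (the fused form; a later reply below shows fully allocation-free variants):

dist_S = maximum(abs.(S_ij_n .- S_ij))   # .- fuses with abs.: one temporary instead of two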

Have you tried doing profiling to see where it’s spending most of the time?
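For instance, a minimal sketch using the Profile standard library (ProfileView.jl or VS Code’s @profview give a flame-graph view of the same data):

using Profile
marketEq(modelParams, feSettings, z_last, Φ, mass, S_ij_0)  # warm-up run so compilation isn't profiled
Profile.clear()
@profile marketEq(modelParams, feSettings, z_last, Φ, mass, S_ij_0)
Profile.print(format=:flat, sortedby=:count)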


That’s a lot of code - have you profiled it to see where the time is spent?


In addition to what people wrote above, I’ll comment on a few specific lines. These aren’t necessarily the most problematic lines, but they’re easy to improve and might help you think about how others can be improved.

The S_ij[isnan.(S_ij)] .= 0 line makes two passes through the data. It finds the NaNs on the first pass (and allocates a BitArray to remember them), then sets them to zero on the second. You can do this in a single pass with no allocation via S_ij .= ifelse.(isnan.(S_ij), 0, S_ij) or replace!(S_ij, NaN => 0).

For H .= vec(sum(NS, dims=1)), you can sum directly into the preallocated vector instead: sum!(H', NS)

The sum_values .= sum((μ .* MPL).^(1 + η), dims=2) line allocates for the elementwise calculation, allocates again for the sum, then copies the result into sum_values. sum!(sum_values, (μ .* MPL).^(1 + η)) still allocates for the calculation, but doesn’t need to allocate the result or perform an extra copy. Writing this with no allocations using a higher-order function is possible but a bit obnoxious. A for loop would be simple and what I’d recommend.
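Something like this sketch (same array shapes as in the original; ηp1 is just a hoisted name for 1 + η):

ηp1 = 1 + η
for i in axes(sum_values, 1)
    acc = zero(eltype(sum_values))
    for j in axes(μ, 2)
        acc += (μ[i, j] * MPL[i, j])^ηp1
    end
    sum_values[i] = acc
end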

For the dist_S convergence check, use maximum(splat((x,y)->abs(x-y)), zip(S_ij_n, S_ij)) (equivalently: maximum(splat(abs ∘ -), zip(S_ij_n, S_ij)) or mapreduce(abs ∘ -, max, S_ij_n, S_ij)) to avoid allocations altogether.


In general, this code appears to exhibit significant NumPy or MATLAB (or similar) influence, with heavy use of vectorized calculations. Often these are fine, but in some cases one can perform operations more simply/performantly with for loops (in a language where for is fast, such as Julia or C++). I haven’t looked at your total calculation to evaluate to what extent that applies here.

function mplFun!(MPL, modelParams::ModelParameters, # mplFun!, kind advice from stevengj.
    Φ::Matrix{Float32},
    Φ_bar::Vector{Float32},
    H::Vector{Float32})::Matrix{Float32}
    # Extract necessary parameters from the parameters struct
    γ = modelParams.γ
    α = modelParams.α
    R = modelParams.R

    # Precompute constant terms
    # αγ = α * γ # I don't know what the principle is, but here removing the calculation results in a speed increase
    denom = 1.0f0 - α * (1.0f0 - γ)
    Z = (1.0f0 - α * (1.0f0 - γ)) * (α * (1.0f0 - γ) / R)^((1.0f0 - γ) * α / denom)
    exponent_1 = 1.0f0 / denom
    exponent_2 = (α - 1.0f0) / denom

    # Use broadcasting to compute MPL, ensuring all operations are done in Float32
    @. MPL = Z * (α * γ / denom) * Φ_bar'^exponent_1 * H'^exponent_2 * (1.0f0 - (1.0f0 / α * γ) * (1.0f0 - Φ / Φ_bar')) # In-place modification of MPL, kind advice from stevengj.

    # Set negative values to 0 in-place
    @. MPL = max(MPL, 0.0f0)

    return MPL
end


function marketEq(
    modelParams::ModelParameters,
    feSettings::FixedPointSettings,
    z::Vector{Float32},
    Φ::Matrix{Float32},
    mass::Vector{Float32},
    S_ij_0::Matrix{Float32};
    efficient::Bool=false
)::Tuple{Matrix{Float32},Bool,Matrix{Float32},Vector{Float32},Vector{Float32},Matrix{Float32},Matrix{Float32}}
    # Extract necessary parameters from the parameters struct
    γ = modelParams.γ
    α = modelParams.α
    η = modelParams.η
    θ = modelParams.θ
    R = modelParams.R
    upsilon = feSettings.υ
    iter_S_max = feSettings.iter_S_max
    tol_S = feSettings.tol_S

    N = length(z)

    # Initial Guess
    S_ij = copy(S_ij_0)
    max_iter_reached = false

    # Preallocate arrays as Float32
    NS = zeros(Float32, size(S_ij[:, 1:N]))
    H = zeros(Float32, N)
    Φ_bar = zeros(Float32, N)
    MPL = zeros(Float32, size(S_ij[:, 1:N]))
    μ = similar(view(S_ij, :, 1:N), Float32) # changed
    S_ij_n = similar(S_ij, Float32)
    sum_values = zeros(Float32, size(S_ij, 1))

    for i in 1:iter_S_max
        # Replace NaNs in S_ij with 0
        replace!(S_ij, NaN => 0) # Kind advice from mikmoore.

        # Compute NS and  H
        # @views NS .= S_ij[:, 1:N] .^ (η / (1 + η)) .* mass # not good
        @views NS .= S_ij[:, 1:N] .^ (η ./ (1 .+ η)) .* mass # good (keeping the .* mass from the original)
        # In a linear-algebra context, this means that even though operations like vector + vector and vector * scalar are defined, it can be advantageous to instead use vector .+ vector and vector .* scalar because the resulting loops can be fused with surrounding computations. (Ref: https://docs.julialang.org/en/v1/manual/performance-tips/#More-dots:-Fuse-vectorized-operations)
        sum!(H', NS) # Kind advice from mikmoore.

        # Calculate average phi
        # Φ_bar .= vec(sum(NS .* Φ, dims=1)) ./ H # changed
        sum!(Φ_bar', NS .* Φ)
        Φ_bar ./= H

        # Calculate mpl
        mplFun!(MPL, modelParams, Φ, Φ_bar, H)

        # Calculate μ
        if efficient
            μ .= 1.0f0
        else
            @views ε = @. (1.0f0 / η + (1.0f0 / θ - 1.0f0 / η) * S_ij[:, 1:end-1])^(-1.0f0) # changed
            @. μ = ε / (1.0f0 + ε) # changed
        end

        # Calculate the sum of (μ .* MPL)^(1 + η)
        sum!(sum_values, (μ .* MPL) .^ (1 .+ η)) # Kind advice from mikmoore.
        # and note that a dot was added between 1 and η, then 1+η can be fused with surrounding computations.

        # Update S_ij_n for all firms
        # @views S_ij_n[:, 1:end-1] .= (μ .* MPL) .^ (1 .+ η) ./ sum_values
        # Note that @views is unnecessary here: the indexed left-hand side of a .= assignment is already updated in place, and the right-hand side contains no slices, so it was dropped
        @. S_ij_n[:, 1:end-1] = (μ * MPL)^(1 + η) / sum_values # changed
        replace!(S_ij_n, NaN => 0.0f0) # Kind advice from mikmoore.
        @views S_ij_n[:, end] .= 1.0f0 .- sum(S_ij_n[:, 1:end-1], dims=2)
        S_ij_n[S_ij_n.<tol_S] .= 0.0f0
        # Note that for the above statement, using replace! is not as fast as writing it this way, although replace! allocates less. This holds for threshold comparisons like < or >; for exact-value replacement (e.g. NaN => 0), replace! is the better choice.
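        # (The replace!-style alternative being compared is presumably the predicate
        # form, e.g. replace!(x -> x < tol_S ? 0.0f0 : x, S_ij_n);
        # the exact form is assumed here, it wasn't shown in the thread.)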

        # Check for convergence
        dist_S = maximum(splat((x, y) -> abs(x - y)), zip(S_ij_n, S_ij)) # Kind advice from mikmoore.
        if dist_S < tol_S
            break  # Exit the loop immediately if converged
        end

        # Update S_ij using upsilon for gradual adjustment
        @views @. S_ij = upsilon * S_ij_n + (1.0f0 - upsilon) * S_ij # changed

        if i == iter_S_max
            printstyled("WARNING: max iter reached\n"; color=11) # hope you like it :P
            max_iter_reached = true
        end
    end

    # Other variables given convergence
    W_ij = μ .* MPL
    Z = (1.0f0 - α * (1.0f0 - γ)) * (α * (1.0f0 - γ) / R)^((1.0f0 - γ) * α / (1.0f0 - α * (1.0f0 - γ)))
    π_ij = @. Z * z'^(1.0f0 / γ) * H * Φ_bar^(1.0f0 / γ) # changed

    return S_ij, max_iter_reached, NS, H, Φ_bar, W_ij, π_ij
end

using Distributions
# Update N_j in-place
logAbilityDist = LogNormal{Float32}(modelParams.μ_a, modelParams.σ_a)

# Generate percentiles (500 percentiles)
space = LinRange(Float32(0), Float32(1), modelParams.S + 2)  # S percentiles + boundaries

# Compute the quantiles for the ability distribution, skipping the 0 and 1 bounds
a = Float32.(quantile(logAbilityDist, space[2:end-1]))  # Ensure result is Float32

# Calculate mass (equally weighted), ensuring it's Float32
mass = fill(Float32(1) / Float32(length(a)), length(a))  # Fill array with Float32
z_last = sort(Float32.(rand(LogNormal{Float32}(modelParams.μ_z, modelParams.σ_z), 100)))

function phiFun(z::Vector{Float32}, a::Vector{Float32})::Matrix{Float32}
    # Extract necessary parameters from modelParams
    ρ, ω, ξ = 0.4, 0.9, 1

    # Use broadcasting to apply the operation across both z and a
    Φ = @. ((1 - ω) * (z^((ρ - 1) / ρ))' + ω * (a^((ρ - 1) / ρ)))^(ρ / (ρ - 1)) # changed

    # Raise Φ to the power of ξ
    Φ = Φ .^ ξ

    return Φ
end

Φ = phiFun(Float32.(z_last), Float32.(a))

S_ij_0 = ones(Float32, length(a), 100 + 1) / Float32(100)
S_ij_0[:, 100+1] .= 1.0f0 .- sum(S_ij_0[:, 1:100], dims=2)  # Unemployment column

# Call marketEq function for each market
using BenchmarkTools
@btime S_ij, max_iter_reached, _, _, _, W_ij, π_ij = marketEq(modelParams, feSettings, z_last, Φ, mass, S_ij_0);

Allocations were reduced to 895, 23.97 MiB, a 50% reduction in time. Please read the comments for details about the modifications.

Note also that if you wrote out an explicit 2d loop and ordered the loops correctly, you could probably pull this exponentiation (and perhaps other calculations) out of the inner loop without allocating an additional array.

In general, for complicated performance-critical tasks, it’s often better to write explicit loops so that you have more control over the details.
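As a sketch of what that could look like for the MPL update (a hypothetical mplFun2!, same math as the broadcast version, with the column-dependent powers hoisted out of the inner loop):

function mplFun2!(MPL, modelParams, Φ, Φ_bar, H)
    γ, α, R = modelParams.γ, modelParams.α, modelParams.R
    denom = 1f0 - α * (1f0 - γ)
    Z = denom * (α * (1f0 - γ) / R)^((1f0 - γ) * α / denom)
    e1 = 1f0 / denom
    e2 = (α - 1f0) / denom
    c0 = Z * α * γ / denom
    inv_αγ = 1f0 / (α * γ)
    for j in axes(MPL, 2)
        cj = c0 * Φ_bar[j]^e1 * H[j]^e2   # the expensive ^ calls happen once per column
        invΦb = 1f0 / Φ_bar[j]
        for i in axes(MPL, 1)
            MPL[i, j] = max(cj * (1f0 - inv_αγ * (1f0 - Φ[i, j] * invΦb)), 0f0)
        end
    end
    return MPL
end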


Thank you so much to everyone for your comments. As highlighted by karei, the code now runs much faster and with far fewer allocations!

711.928 ms (801 allocations: 24.54 MiB)

I ran the profiler as suggested. How should I report the results (if I should)? I am not sure how to interpret profiling results.

There’s a lot of code duplication in your code. This is both ugly and may result in redundant computation.

E.g., here, α * (1f0 - γ) repeats multiple times.
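E.g. a sketch of hoisting it (the name α1γ is just for illustration):

α1γ = α * (1f0 - γ)   # computed once
denom = 1f0 - α1γ
Z = denom * (α1γ / R)^((1f0 - γ) * α / denom)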

I think the reason why the code got so much faster, and why removing αγ = α * γ resulted in a speed increase, is that a parenthesis is missing in the mpl function:
(1.0f0 / α * γ) should be (1.0f0 / (α * γ)). In Julia, / and * have equal precedence and associate left to right, so 1.0f0 / α * γ parses as (1.0f0 / α) * γ.

Now the time and allocations are :

1.326 s (1158 allocations: 26.93 MiB)  

This is the modified function with all your comments incorporated. I also decided to remove the mplFun since I could do everything in one line:

function marketEq(
    modelParams::ModelParameters, 
    feSettings::FixedPointSettings, 
    z::Vector{Float32}, 
    Φ::Matrix{Float32}, 
    mass::Vector{Float32}, 
    S_ij_0::Matrix{Float32}; 
    efficient::Bool = false
 ) :: Tuple{Matrix{Float32}, Bool, Matrix{Float32}, Vector{Float32}, Vector{Float32}, Matrix{Float32}, Vector{Float32}}
    # Extract necessary parameters from the parameters struct
    γ = modelParams.γ
    α = modelParams.α
    η = modelParams.η
    θ = modelParams.θ
    R = modelParams.R
    upsilon = feSettings.υ
    iter_S_max = feSettings.iter_S_max
    tol_S = feSettings.tol_S

    N = length(z)
    S = modelParams.S

    # Initial Guess
    S_ij = copy(S_ij_0)
    max_iter_reached = false


    # Preallocate arrays as Float32
    NS = zeros(Float32, size(S_ij[:, 1:N]))
    H = zeros(Float32, N)
    H_exp = copy(H)
    Φ_bar = zeros(Float32, N)
    MPL = zeros(Float32, size(S_ij[:, 1:N]))
    μ = similar(S_ij[:, 1:N], Float32)
    S_ij_n = similar(S_ij, Float32)
    sum_values = zeros(Float32, size(S_ij, 1))

    #Precompute constants
    denom = 1f0 - α * (1f0 - γ)
    exponent_1 = 1f0 / (1f0 - α * (1f0 - γ))
    mpl_cons = denom * (α * (1f0 - γ) / R)^((1f0 - γ) * α / denom) * α * γ * exponent_1
    αγ = α * γ

    for i in 1:iter_S_max
            # Compute NS and  H
            @views NS .= S_ij[:, 1:N].^(η./ (1 .+ η)) .* mass
            sum!(H', NS)  # Convert sum result to a vector for proper assignment
            H_exp .= H .^ ((α - 1f0) * exponent_1)

            # Calculate average phi
            sum!(Φ_bar', NS .* Φ)
            Φ_bar ./= H

            # Calculate mpl
            @. MPL = max(mpl_cons * Φ_bar' ^ exponent_1 * H_exp' * (1f0 - (1f0 / αγ) * (1f0 - Φ / Φ_bar')), 0)

            # Calculate μ
            if efficient
                μ .= 1.0f0
            else
                @views ε = @. (1.0f0 / η + (1.0f0 / θ - 1.0f0 / η) * S_ij[:, 1:end-1])^(-1.0f0)
                μ .= ε ./ (1f0 .+ ε)
            end

            # Calculate the sum of (μ .* MPL)^(1 + η)
            for r in 1:S   # renamed loop variable to avoid shadowing the outer i
                sum_values[r] = 0.0f0
                for j in 1:N
                    sum_values[r] += (μ[r, j] * MPL[r, j])^(1 + η)
                end
            end

            # Update S_ij_n for all firms
            @. S_ij_n[:, 1:end-1] = (μ * MPL)^(1 + η) / sum_values
            replace!(S_ij_n, NaN => 0.0f0)
            @views S_ij_n[:, end] .= 1.0f0 .- sum(S_ij_n[:, 1:end-1], dims=2)
            S_ij_n[S_ij_n.<tol_S] .= 0.0f0

            # Check for convergence
            dist_S = maximum(splat((x, y) -> abs(x - y)), zip(S_ij_n, S_ij)) # Kind advice from mikmoore.
            if dist_S < tol_S
                break  # Exit the loop immediately if converged
            end

            # Update S_ij using upsilon for gradual adjustment
            @views @. S_ij = upsilon * S_ij_n + (1.0f0 - upsilon) * S_ij
       

            if i == iter_S_max
                printstyled("WARNING: max iter reached\n"; color=11) 
                max_iter_reached = true
            end
    end

    # Other variables given convergence
    W_ij = μ .* MPL
    Z = (1.0f0 - α * (1.0f0 - γ)) * (α * (1.0f0 - γ) / R)^((1.0f0 - γ) * α / (1.0f0 - α * (1.0f0 - γ)))
    π_ij = @. Z * z^(1.0f0 / γ) * H * Φ_bar^(1.0f0 / γ)

    return S_ij, max_iter_reached, NS, H, Φ_bar, W_ij, π_ij
end

I’d appreciate any feedback, especially if you see room for further optimization or improvements. Thank you again for all your help!


This blog post on benchmarking and profiling may be useful