My Julia code is slower than Python and Matlab

Can you share your improved code?

I use a different solver, and I tried Greek letters in the code to make it look more like Julia.

using SparseArrays
import LinearAlgebra
import LinearSolve
using Sparspak
using KrylovKit
using BandedMatrices
# using SuiteSparseGraphBLAS
using Interpolations
using Plots
using Random
using Statistics
using Printf
using Profile
using IterativeSolvers

struct RealBusinessCycleModel
    α::Float64
    γ::Float64
    ρ::Float64
    δ::Float64
    z̄::Float64
    θ::Float64
    σ::Float64
    kbar::Float64

    function RealBusinessCycleModel(α::Float64, γ::Float64, ρ::Float64, δ::Float64, z̄::Float64, θ::Float64, σ::Float64)
        kbar = ((α * z̄) / (ρ + δ))^(1 / (1 - α))
        new(α, γ, ρ, δ, z̄, θ, σ, kbar)
    end
end

function get_grid(model::RealBusinessCycleModel, k_min::Float64, k_max::Float64, z_min::Float64, z_max::Float64, I::Int64, J::Int64)::Tuple{Vector{Float64},Vector{Float64},Vector{Float64},Vector{Float64},Float64,Float64}
    k_grid::Vector{Float64} = range(k_min, k_max, length=I)
    z_grid::Vector{Float64} = range(z_min, z_max, length=J)
    # kk::Matrix{Float64} = [k for k in k_grid, z in z_grid]
    kk::Matrix{Float64} = reshape(k_grid, (I, 1)) * ones(1, J)
    # zz::Matrix{Float64} = [z for k in k_grid, z in z_grid]
    zz::Matrix{Float64} = ones(I, 1) * reshape(z_grid, (1, J))
    kk_flat::Vector{Float64} = vec(kk)
    zz_flat::Vector{Float64} = vec(zz)
    dk::Float64 = (k_max - k_min) / (I - 1)
    dz::Float64 = (z_max - z_min) / (J - 1)
    return (k_grid, z_grid, kk_flat, zz_flat, dk, dz)
end

# Utility function
@inline function u(model::RealBusinessCycleModel, c::Vector{Float64})::Vector{Float64}
    if model.γ != 1.0
        return (max.(c, 1e-6) .^ (1 .- model.γ) .- 1) ./ (1 .- model.γ)
    else
        return log.(max.(c, 1e-6))
    end
end

# Derivative of the utility function
@inline function u_prime(model::RealBusinessCycleModel, c::Vector{Float64})::Vector{Float64}
    return max.(c, 1e-6) .^ (-model.γ)
end

# Inverse of the derivative of the utility function
@inline function u_prime_inv(model::RealBusinessCycleModel, x::Vector{Float64})::Vector{Float64}
    return max.(x, 1e-6) .^ (-1 / model.γ)
end

# Production function
@inline function f(model::RealBusinessCycleModel, k::Vector{Float64}, z::Vector{Float64})::Vector{Float64}
    return max.(z, 0.0) .* max.(k, 0.0) .^ model.α
end


# Validate transition matrix (each row should sum to approximately zero)
function validate_transition_matrix(T::AbstractMatrix{Float64})::Bool
    err = maximum(abs.(sum(T, dims=2)))
    if err > 1e-10
        println("Error: maximum absolute row sum = ", err)
        display(LinearAlgebra.Matrix(T))
        throw(ArgumentError("T is not a proper transition matrix."))
    end
    return true
end

function construct_S_Dk(S_B::Vector{Float64}, S_F::Vector{Float64}, dk::Float64, I::Int64, J::Int64)::SparseMatrixCSC{Float64,Int64}
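    # Upwind transition matrix for the savings drift: negative drift moves mass
    # toward the lower k-neighbor, positive drift toward the upper k-neighbor.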
    S_B_minus::Vector{Float64} = min.(S_B, 0.0)
    S_F_plus::Vector{Float64} = max.(S_F, 0.0)
    # println(S_B_minus)
    # println(S_F_plus)
    lower_diag::Vector{Float64} = -S_B_minus ./ dk
    center_diag::Vector{Float64} = (S_B_minus .- S_F_plus) ./ dk
    upper_diag::Vector{Float64} = S_F_plus ./ dk
    return spdiagm(
        -1 => lower_diag[2:end],
        0 => center_diag,
        1 => upper_diag[1:end-1]
    )
end

function solve(model::RealBusinessCycleModel; Delta::Float64=100.0, tol::Float64=1e-6, max_iter::Int64=1000, I::Int64=20, J::Int64=20, bounds::Union{Tuple{Float64,Float64,Float64,Float64},Nothing}=nothing, n_sigma::Union{Tuple{Float64,Float64},Float64,Nothing}=nothing)
    l_sigma::Float64 = r_sigma::Float64 = 0.0
    k_min::Float64, k_max::Float64, z_min::Float64, z_max::Float64 = 0.0, 0.0, 0.0, 0.0
    if !isnothing(bounds)
        k_min, k_max, z_min, z_max = bounds
    elseif !isnothing(n_sigma)
        if isa(n_sigma, Tuple) && length(n_sigma) == 2
            l_sigma, r_sigma = n_sigma
        else
            l_sigma = r_sigma = n_sigma
        end
        z_min = model.z̄ - l_sigma * model.σ
        z_max = model.z̄ + r_sigma * model.σ
        k_min = (1 - l_sigma * model.σ / model.z̄)^(1 / (1 - model.α)) * model.kbar
        k_max = (1 + r_sigma * model.σ / model.z̄)^(1 / (1 - model.α)) * model.kbar
    else
        error("Must specify bounds or n_sigma")
    end

    @assert k_min > 0 && z_min > 0 && k_max > k_min && z_max > z_min

    k_grid::Vector{Float64}, z_grid::Vector{Float64}, kk::Vector{Float64}, zz::Vector{Float64}, dk::Float64, dz::Float64 = get_grid(model, k_min, k_max, z_min, z_max, I, J)

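    # Forward/backward first-difference operators in k, replicated across the
    # z-dimension with a Kronecker product (states are stacked with k fastest).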
    _Dk_F::SparseMatrixCSC{Float64,Int64} = spdiagm(0 => [-ones(I - 1); 0.0], 1 => ones(I - 1))
    Dk_F::SparseMatrixCSC{Float64,Int64} = kron(sparse(LinearAlgebra.I, J, J), (1 / dk) * _Dk_F)

    _Dk_B::SparseMatrixCSC{Float64,Int64} = spdiagm(-1 => -ones(I - 1), 0 => [0.0; ones(I - 1)])
    Dk_B::SparseMatrixCSC{Float64,Int64} = kron(sparse(LinearAlgebra.I, J, J), (1 / dk) * _Dk_B)

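    # Ornstein-Uhlenbeck drift: productivity reverts to its mean z̄ at rate θ.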
    mu::Vector{Float64} = model.θ * (model.z̄ .- zz)

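    # Forward/backward first-difference operators in z, acting across k-blocks.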
    _Dz_F::SparseMatrixCSC{Float64,Int64} = spdiagm(0 => [-ones(J - 1); 0.0], 1 => ones(J - 1))
    Dz_F::SparseMatrixCSC{Float64,Int64} = kron((1 / dz) * _Dz_F, sparse(LinearAlgebra.I, I, I))
    _Dz_B::SparseMatrixCSC{Float64,Int64} = spdiagm(-1 => -ones(J - 1), 0 => [0.0; ones(J - 1)])
    Dz_B::SparseMatrixCSC{Float64,Int64} = kron((1 / dz) * _Dz_B, sparse(LinearAlgebra.I, I, I))

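    # Second-difference operator in z with reflecting boundaries (each row sums
    # to zero); Dz_U picks the upwind first difference from the drift's sign.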
    _Dzz::SparseMatrixCSC{Float64,Int64} = spdiagm(-1 => ones(J - 1), 0 => [-1.0; fill(-2.0, J - 2); -1.0], 1 => ones(J - 1))
    Dzz::SparseMatrixCSC{Float64,Int64} = kron((1 / dz^2) * _Dzz, sparse(LinearAlgebra.I, I, I))
    Dz_U::SparseMatrixCSC{Float64,Int64} = spdiagm(mu .< 0) * Dz_B + spdiagm(mu .>= 0) * Dz_F
    mu_Dz::SparseMatrixCSC{Float64,Int64} = spdiagm(mu) * Dz_U
    half_sigma2_Dzz::SparseMatrixCSC{Float64,Int64} = 0.5 .* model.σ .^ 2 .* Dzz

    v0::Vector{Float64} = u(model, f(model, kk, zz)) / model.ρ
    v::Vector{Float64} = copy(v0)
    c::Vector{Float64} = zeros(I * J)
    status::Int64 = 0

    # Allocate memory outside the loop to avoid reallocation and improve performance
    vk_F::Vector{Float64} = zeros(I * J)
    vk_F_2d::Matrix{Float64} = zeros(I, J)
    vk_B::Vector{Float64} = zeros(I * J)
    vk_B_2d::Matrix{Float64} = zeros(I, J)
    v̄k::Vector{Float64} = zeros(I * J)
    indicator_F::Vector{Float64} = zeros(I * J)
    indicator_B::Vector{Float64} = zeros(I * J)
    indicator_bar::Vector{Float64} = zeros(I * J)
    vk_U::Vector{Float64} = zeros(I * J)
    S_F::Vector{Float64} = zeros(I * J)
    S_B::Vector{Float64} = zeros(I * J)
    S_Dk::SparseMatrixCSC{Float64,Int64} = spzeros(I * J, I * J)
    A::SparseMatrixCSC{Float64,Int64} = spzeros(I * J, I * J)
    coef::SparseMatrixCSC{Float64,Int64} = spzeros(I * J, I * J)
    b::Vector{Float64} = zeros(I * J)
    v_new::Vector{Float64} = zeros(I * J)

    total_time::Float64 = 0.0

    for iter::Int64 in 1:max_iter
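        # Forward difference of v in k; at k_max impose the state-constraint
        # boundary condition v_k = u'(f(k_max, z) - δ*k_max) (zero savings).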
        vk_F .= Dk_F * v
        vk_F_2d .= reshape(vk_F, (I, J))
        vk_F_2d[I, :] = u_prime(model, f(model, k_grid[end] .* ones(Float64, J), z_grid) .- model.δ .* k_grid[end])
        vk_F .= vec(vk_F_2d)
        
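        # Backward difference of v in k, with the analogous condition at k_min.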
        vk_B .= Dk_B * v
        # LinearAlgebra.mul!(vk_B, Dk_B, v)
        vk_B_2d .= reshape(vk_B, (I, J))
        vk_B_2d[1, :] .= u_prime(model, f(model, k_grid[1] .* ones(Float64, J), z_grid) .- model.δ .* k_grid[1])
        vk_B .= vec(vk_B_2d)
        
        v̄k .= u_prime(model, f(model, kk, zz) .- model.δ .* kk)
        S_F .= f(model, kk, zz) .- model.δ .* kk .- u_prime_inv(model, vk_F)
        S_B .= f(model, kk, zz) .- model.δ .* kk .- u_prime_inv(model, vk_B)

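        # Upwind scheme: use the forward derivative where forward savings are
        # positive, the backward derivative where backward savings are negative,
        # and the zero-savings (steady-state) derivative otherwise.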
        indicator_F = S_F .> 0
        indicator_B = S_B .< 0
        indicator_bar = 1.0 .- indicator_F .- indicator_B
        vk_U = indicator_F .* vk_F .+ indicator_B .* vk_B .+ indicator_bar .* v̄k
        c = u_prime_inv(model, vk_U)

        S_Dk = construct_S_Dk(S_B, S_F, dk, I, J)


        # validate_transition_matrix(S_Dk) || error("Transition matrix is invalid")
        # validate_transition_matrix(mu_Dz) || error("Transition matrix is invalid")
        # validate_transition_matrix(half_sigma2_Dzz) || error("Transition matrix is invalid")

        # println("Iteration $iter")
        # display(Matrix(S_Dk))
        # display(Matrix(mu_Dz))
        # display(Matrix(half_sigma2_Dzz))
        # display(v)
        # display(c)



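        # Implicit update: solve ((ρ + 1/Δ) I - A) v_new = u(c) + v/Δ, where A
        # is the sparse generator of the discretized state process.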
        A .= S_Dk .+ mu_Dz .+ half_sigma2_Dzz
        coef .= (model.ρ + 1 / Delta) .* sparse(LinearAlgebra.I, I * J, I * J) .- A
        b .= u(model, c) .+ v ./ Delta
        
        prob = LinearSolve.LinearProblem(coef, b)
        t_start = time()
        # @time v_new .= LinearSolve.solve(prob, LinearSolve.KLUFactorization(reuse_symbolic=false)).u
        # @time v_new .= LinearSolve.solve(prob, LinearSolve.UMFPACKFactorization(reuse_symbolic=false)).u
        @time v_new .= LinearSolve.solve(prob, LinearSolve.SparspakFactorization(reuse_symbolic=false)).u
        t_end = time()
        total_time += t_end - t_start
        # v_new .= coef \ b

        err::Float64 = LinearAlgebra.norm(v_new - v)
        if err < tol
            println("Converged in $iter iterations")
            println(@sprintf("Total time taken for solving the linear system: %.6f seconds", total_time))
            status = 1
            v .= v_new
            break
        end
        v .= v_new
    end

    if status == 0
        println("Failed to converge")
    end
    return v, c, k_grid, z_grid
end

function main()
    α::Float64 = 0.3
    γ::Float64 = 2.0
    ρ::Float64 = 0.05
    δ::Float64 = 0.05
    z̄::Float64 = 1.0
    θ::Float64 = 0.5
    σ::Float64 = 0.025
     
    rbc::RealBusinessCycleModel = RealBusinessCycleModel(α, γ, ρ, δ, z̄, θ, σ)
    I::Int64, J::Int64 = 1001, 1001
    t_start::Float64 = time()
    Profile.init(delay=0.001)
    v::Vector{Float64}, c::Vector{Float64}, k::Vector{Float64}, z::Vector{Float64} = solve(rbc; I=I, J=J, n_sigma=(30.0, 30.0))
    t_end::Float64 = time()
    println(@sprintf("Time taken: %.6f seconds", t_end - t_start))

end

main()

Could you please provide the timing you’re getting for your improved code?

Your original version took 133s on my laptop; your new version takes 47s. This is indeed a large improvement, nearly a factor of three. 😀

No, in the normal case it does not help at all. It can only help if the compiler cannot correctly infer the types. In the case I quoted it absolutely does not help, because it forces an expensive conversion into a less efficient type.
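
For instance, here is a minimal sketch of that kind of conversion, using the grid construction from the code above as an example (assuming that is the sort of case in question):

function with_annotation()
    # The ::Vector{Float64} annotation inserts convert(Vector{Float64}, ...)
    # on every assignment, so the lazy range is materialized into a dense vector.
    k_grid::Vector{Float64} = range(0.0, 1.0, length=1001)  # allocates 1001 Float64s
    return sum(k_grid)
end

function without_annotation()
    # The range stays a lazy, allocation-free object.
    k_grid = range(0.0, 1.0, length=1001)
    return sum(k_grid)
end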

~34s for the new code.

Actually, no. Typing in the function signature only affects dispatch, i.e., if you have an f(x::Int) and an f(x::Float64) with different contents, the compiler will choose between them based on the type of the actual argument. But in any case the function f will be compiled for the type of the actual argument. So if the body of f is the same no matter what x is, no performance is lost by declaring the function as just f(x).
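
A small sketch of this (g is a made-up function, just for illustration):

# Two methods with different contents; dispatch picks one by argument type.
f(x::Int) = x + 1
f(x::Float64) = x / 2

# One untyped method: the compiler still specializes it for each concrete
# argument type it is called with, so no performance is lost.
g(x) = 2x

g(1)      # runs a version compiled for Int
g(1.0)    # runs a version compiled for Float64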

Type assertion on variables like

a::Vector{Float64} = something

has as its only effect that a convert is inserted, like this:

a = convert(Vector{Float64}, something)

every time a is assigned to.

If something is a Vector{Float32}, it will be converted to a Vector{Float64}. If something is already a Vector{Float64}, the convert will just return it. It is true that the compiler can use this information, but it is only helpful if the compiler can't already infer the type of something. In that case, the underlying inference problem should be corrected rather than hidden in this way.

If for some reason you have a computation whose type is uncertain, you can assert its type with

a = something::Vector{Float64}

This is useful for the return value of fetch, since type inference does not work across fetch.

a = fetch(task)::Vector{Float64}

This helps the compiler: if the fetch does not in fact return a Vector{Float64}, an error is thrown, so the compiler can safely assume that a has type Vector{Float64} and go about its business. If you don't assert the type, the compiler does not know the type of a and will insert code to resolve it, perhaps every time you use a later.
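
A minimal runnable sketch of the fetch pattern (the task body is just an example):

task = Threads.@spawn rand(10)      # the Task's result field has type Any
a = fetch(task)::Vector{Float64}    # throws if the type differs, so the
                                    # compiler can assume Vector{Float64} below
println(sum(a))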

Edit: There is one more case of type declaration: the fields inside structs. This one is very important. The types declared here should preferably be concrete, that is, Int, Float32, Vector{Int}, and so on, not abstract types like AbstractFloat, Integer, Vector (without a type parameter), AbstractVector{<:Real}, etc.

The reason is that when you access such a struct, the compiler only sees the declared type, not what is actually stored there. If the declared type is concrete, the compiler knows exactly what to do, but if it is abstract (or absent, which means ::Any), the compiler must insert code to figure out the type and handle whatever it is. This slows things down, and can be devastating if it happens in a critical part of the code.
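
A sketch of the difference (the struct names are made up for illustration):

struct Slow
    x::Real        # abstract field type: the compiler only sees Real
end

struct Fast
    x::Float64     # concrete field type: exact layout and code are known
end

# Summing the field over a Vector{Slow} needs boxed values and runtime
# dispatch; over a Vector{Fast} it compiles to a tight loop over raw Float64s.
sumfield(v) = sum(s -> s.x, v)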

Btw, this is precisely why the type cannot be inferred across the above-mentioned fetch (from a Task). A Task can in principle return anything, and the return value is stored in the field result in the Task structure. This field has type Any to accommodate all sorts of tasks, and this is what fetch sees. So the compiler does not know what type is returned.
