My Julia code is somehow much slower than the MATLAB code

You mean vcat(a:b).

Yes, it's lowered to a vcat but for a single range that's still equivalent to the collect, which is what I did mean to refer to.
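(For reference, a quick REPL check, my own illustration and not from the posts above: these spellings all materialize the range into the same Vector, while a bare range stays lazy.)

julia> [1:5;] == vcat(1:5) == collect(1:5)
true

julia> [1:5;] isa Vector{Int}
true

julia> 1:5 isa AbstractRange   # no allocation until it is collected
true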

2 Likes

Oh, I see what you're talking about…

julia> @code_lowered collect(1:2)
CodeInfo(
1 ─ %1 = Base.vcat(r)
└──      return %1
)
1 Like

You might be overlooking that some people execute code by pasting it into the REPL, e.g., via VS Code or manually. In this case, the semicolon may have a very important role to play.
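(A quick illustration of that, my own and not from the post: when pasting into the REPL, a trailing semicolon suppresses display of the result, which matters when the value is a large array.)

julia> v = zeros(5);   # trailing semicolon: the result is not echoed

julia> v = zeros(5)    # without it, the REPL prints the whole array
5-element Vector{Float64}:
 0.0
 0.0
 0.0
 0.0
 0.0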

16 Likes

Thanks! I changed my code as follows and it improved a lot.

function loop_body2(V)
    (A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
    A1=transpose(reshape(A,size(V)[2],size(V)[1]));
    C=f2(A1,V);
    return (A1,B,C)
end

The previous benchmark showed me around 6 seconds.
This one decreased it to around 2.5 seconds.
However, the MATLAB code with the same revision still seems to be faster. I don't know how to benchmark it precisely, but it seems to take about 1 second.
But thanks anyway!

Thank you for the suggestion.
I tried to implement what you proposed as follows.

function loop_over_global2(distance, precision)
    #Now initialize a V array#
    V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
    while distance>precision
        k_index=k_index;
        (TV,Tk_index,distance)=loop_body2(V);
        println(i);println(distance);
        if distance≤precision
            return (TV,V,Tk_index,distance,k_index,i);
            break
        else
            V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
        end
    end
end

function f2(a1,v)
    m = zero(eltype(a1))
    @turbo for i in eachindex(a1,v)
        val = a1[i] - v[i]
        m = ifelse(val > m, val, m)
    end
    return m
end

function loop_body2(V)
    (A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
    A1=transpose(reshape(A,size(V)[2],size(V)[1]));
    C=f2(A1,V);
    return (A1,B,C)
end

@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global2(startdistance,precision);

However, I didn't see much improvement from this change. It gave me 2.438 s, while the previous one gave me 2.417 s; I think the previous one is faster only because I ran its code first.
It is strange, though, considering that the benchmark you showed me suggests a large decrease in time.
For comparison, I used the following code for the previous method (using maximum):

function loop_body3(V)
    (A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
    A1=transpose(reshape(A,size(V)[2],size(V)[1]));
    C=maximum(A1-V);
    return (A1,B,C)
end

function loop_over_global3(distance, precision)
    #Now initialize a V array#
    V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
    while distance>precision
        k_index=k_index;
        (TV,Tk_index,distance)=loop_body3(V);
        println(i);println(distance);
        if distance≤precision
            return (TV,V,Tk_index,distance,k_index,i);
            break
        else
            V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
        end
    end
end

Is there anything I am missing?

For everyone who is interested in how this is going right now, my code is in the following state.
Thank you!

using Statistics
using BenchmarkTools
using LoopVectorization

const alpha=0.261
const beta=0.99
const delta=0.0176
const n_star=1

const z_grid=[0.9526;0.9760;1;1.0246;1.0497]
const Pi_z=[0.9149 0.0823 0.0028 0 0
    0.0206 0.9163 0.0618 0.0014 0
    0.0005 0.0412 0.9167 0.0412 0.0005
    0 0.0014 0.0618 0.9163 0.0206
    0 0 0.0028 0.0823 0.9149]
#Next, get the k_star and k_L and k_H#
const k_star=(1/alpha*(1/beta-1+delta))^(1/(alpha-1))
const k_L=k_star*0.85
const k_H=1.15*k_star
println(round(k_star,digits=4));println(round(k_L,digits=4));println(round(k_H,digits=4))
#Setting up an 499 points of kgrid#
const k_grid=[k_L:((k_H-k_L)/498):k_H;]
println(median(k_grid))

##(b)##
#First, set up the length variables#
const n_k=length(k_grid)
const n_z=length(z_grid)
#Make settings for the iteration#
#First, make an array for each z_i#
const k_prime_grid_mat_proto=repeat(k_grid,1, n_z, n_k)
const k_grid_mat=permutedims(k_prime_grid_mat_proto,(3,2,1))
k_prime_grid_mat=k_prime_grid_mat_proto.*(k_grid_mat.^alpha.*transpose(z_grid)+k_grid_mat.*(1-delta)-k_prime_grid_mat_proto.>0)
k_prime_grid_mat[k_prime_grid_mat.==0].=NaN#build k' array that is out of bound#
logC=log.(k_grid_mat.^alpha.*transpose(z_grid)+k_grid_mat.*(1-delta)-k_prime_grid_mat)
logC[isnan.(logC)].=-Inf
    
const precision=0.01
const startdistance=100#precision and initial distance#
#iteration number#

#start iteration#
function loop_over_global(distance, precision)
    #Now initialize a V array#
    V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
    while distance>precision
        k_index=k_index;
        (TV,Tk_index,distance)=loop_body(V);
        println(i);println(distance);
        if distance≤precision
            return (TV,V,Tk_index,distance,k_index,i);
            break
        else
            V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
        end
    end
end

function loop_over_global2(distance, precision)
    #Now initialize a V array#
    V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
    while distance>precision
        k_index=k_index;
        (TV,Tk_index,distance)=loop_body2(V);
        println(i);println(distance);
        if distance≤precision
            return (TV,V,Tk_index,distance,k_index,i);
            break
        else
            V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
        end
    end
end

function loop_over_global3(distance, precision)
    #Now initialize a V array#
    V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
    while distance>precision
        k_index=k_index;
        (TV,Tk_index,distance)=loop_body3(V);
        println(i);println(distance);
        if distance≤precision
            return (TV,V,Tk_index,distance,k_index,i);
            break
        else
            V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
        end
    end
end

function loop_body(V)
    (A,B)=findmax(logC::Array{Float64,3}+repeat(V*transpose(Pi_z),1,1,n_k).*beta,dims=1);
    A1=transpose(reshape(A,size(V)[2],size(V)[1]));
    C=maximum(A1-V);
    return (A1,B,C)
end

function f2(a1,v)
    m = zero(eltype(a1))
    @turbo for i in eachindex(a1,v)
        val = a1[i] - v[i]
        m = ifelse(val > m, val, m)
    end
    return m
end

function loop_body2(V)
    (A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
    A1=transpose(reshape(A,size(V)[2],size(V)[1]));
    C=f2(A1,V);
    return (A1,B,C)
end

function loop_body3(V)
    (A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
    A1=transpose(reshape(A,size(V)[2],size(V)[1]));
    C=maximum(A1-V);
    return (A1,B,C)
end

t1=@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global(startdistance,precision);

t2=@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global2(startdistance,precision);

t3=@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global3(startdistance,precision);
1 Like

Yes, my post. You should delete the totally-useless-and-actively-harmful k_index=Array{CartesianIndex{3}, 3}. Also, if you had used @code_warntype from the beginning (have you read the performance tips at all before diving into performance optimisation?), you'd have found that V is also type-unstable in loop_over_global2.
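(For anyone following along, a minimal sketch of that check, my own illustration; it assumes the functions and constants from the code above are already defined in the session.)

julia> using InteractiveUtils   # provides @code_warntype (already loaded in the REPL)

julia> @code_warntype loop_over_global2(startdistance, precision)

Variables whose inferred type is printed as Any or a Union (here most likely V and k_index) are the type-unstable ones to fix.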

2 Likes

Is there anything I am missing?

  1. You are benchmarking something that involves IO operations through println. These can easily cause performance hits, although here they do not seem to be a problem.

  2. The benchmark results show lots of allocations, almost 4GB for the last two approaches. You could try working with preallocated buffers and, thus, avoid reallocations and potential GC pauses.

  3. There seem to be a few type instabilities (I think within the loop_body functions), which also cause slowdowns. In general, try to avoid changing the type of a variable, especially within 'hot' loops.

  4. See what @giordano said.

I had a few minutes on my train trip and I wrote a version that uses the LoopVectorization bit and a cache:

using Statistics
using BenchmarkTools
using LoopVectorization

const alpha=0.261
const beta=0.99
const delta=0.0176
const n_star=1

const z_grid=[0.9526;0.9760;1;1.0246;1.0497]
const Pi_z=[0.9149 0.0823 0.0028 0 0
    0.0206 0.9163 0.0618 0.0014 0
    0.0005 0.0412 0.9167 0.0412 0.0005
    0 0.0014 0.0618 0.9163 0.0206
    0 0 0.0028 0.0823 0.9149]
#Next, get the k_star and k_L and k_H#
const k_star=(1/alpha*(1/beta-1+delta))^(1/(alpha-1))
const k_L=k_star*0.85
const k_H=1.15*k_star
println(round(k_star,digits=4));println(round(k_L,digits=4));println(round(k_H,digits=4))
#Setting up an 499 points of kgrid#
const k_grid=[k_L:((k_H-k_L)/498):k_H;]
println(median(k_grid))

##(b)##
#First, set up the length variables#
const n_k=length(k_grid)
const n_z=length(z_grid)
#Make settings for the iteration#
#First, make an array for each z_i#
const k_prime_grid_mat_proto=repeat(k_grid,1, n_z, n_k)
const k_grid_mat=permutedims(k_prime_grid_mat_proto,(3,2,1))
k_prime_grid_mat=k_prime_grid_mat_proto.*(k_grid_mat.^alpha.*transpose(z_grid)+k_grid_mat.*(1-delta)-k_prime_grid_mat_proto.>0)
k_prime_grid_mat[k_prime_grid_mat.==0].=NaN#build k' array that is out of bound#
const logC=log.(k_grid_mat.^alpha.*transpose(z_grid)+k_grid_mat.*(1-delta)-k_prime_grid_mat)
logC[isnan.(logC)].=-Inf
    
const precision=0.01
const startdistance=100#precision and initial distance#
#iteration number#

#start iteration#
function loop_over_global(distance, precision)
    #Now initialize a V array#
    V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
    while distance>precision
        k_index=k_index;
        (TV,Tk_index,distance)=loop_body(V);
        # println(i);println(distance);
        if distanceā‰¤precision
            return (TV,V,Tk_index,distance,k_index,i);
            break
        else
            V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
        end
    end
end

function loop_over_global2(distance, precision)
    #Now initialize a V array#
    V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
    while distance>precision
        k_index=k_index;
        (TV,Tk_index,distance)=loop_body2(V);
        # println(i);println(distance);
        if distanceā‰¤precision
            return (TV,V,Tk_index,distance,k_index,i);
            break
        else
            V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
        end
    end
end

function loop_over_global3(distance, precision)
    #Now initialize a V array#
    V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
    while distance>precision
        k_index=k_index;
        (TV,Tk_index,distance)=loop_body3(V);
        # println(i);println(distance);
        if distanceā‰¤precision
            return (TV,V,Tk_index,distance,k_index,i);
            break
        else
            V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
        end
    end
end

function loop_body(V)
    (A,B)=findmax(logC::Array{Float64,3}+repeat(V*transpose(Pi_z),1,1,n_k).*beta,dims=1);
    A1=transpose(reshape(A,size(V)[2],size(V)[1]));
    C=maximum(A1-V);
    return (A1,B,C)
end

function f2(a1,v)
    m = zero(eltype(a1))
    @turbo for i in eachindex(a1,v)
        val = a1[i] - v[i]
        m = ifelse(val > m, val, m)
    end
    return m
end

function loop_body2(V)
    (A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
    A1=transpose(reshape(A,size(V)[2],size(V)[1]));
    C=f2(A1,V);
    return (A1,B,C)
end

function loop_body3(V)
    (A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
    A1=transpose(reshape(A,size(V)[2],size(V)[1]));
    C=maximum(A1-V);
    return (A1,B,C)
end

ref_V = let
    println("reference loop_over_global")
    @btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global($startdistance,$precision);
    (TV,V,Tk_index,distance,k_index,i)=loop_over_global(startdistance,precision);
    @show i
    @show distance
    V
end

let
    println("loop_over_global2")
    @btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global2($startdistance,$precision);
    (TV,V,Tk_index,distance,k_index,i)=loop_over_global2(startdistance,precision);
    @show i
    @show distance
    @show all(isapprox.(ref_V, V))
end

let
    println("loop_over_global3")
    @btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global3($startdistance,$precision);
    (TV,V,Tk_index,distance,k_index,i)=loop_over_global3(startdistance,precision);
    @show i
    @show distance
    @show all(isapprox.(ref_V, V))
end

function make_cache(n_k, n_z)
  V = zeros(n_k,n_z)
  TV = similar(V)
  tmp_product = similar(logC)
  return (; V, TV, tmp_product)
end

function loop_withcache!(distance, precision, cache)
    V, TV, tmp_product = cache
    fill!(V, 0.0)
    fill!(TV, 0.0)
    fill!(tmp_product, 0.0)
    k_index = CartesianIndex{3}[;;;]
    Tk_index = similar(k_index)
    i = 1
    while distance>precision
        k_index, distance = loop_body_withcache!(TV, V, tmp_product)
        # println(i);println(distance);
        if distanceā‰¤precision
            return (Tk_index,distance,k_index,i)
        else
            V .= TV
            prev_k_index = k_index
        end
        i += 1
    end
end

function loop_body_withcache!(TV, V, tmp_product)
    tmp_product .= logC .+ V * transpose(Pi_z) .* beta
    (A,B) = findmax(tmp_product,dims=1);
    TV .= permutedims(view(A, 1, :, :), (2,1))
    C = f2(TV,V);
    return B, C
end

let
    println("loop_over_withcache!")
    cache = make_cache(n_k, n_z)
    @btime (Tk_index,distance,k_index,i) = loop_withcache!($startdistance,$precision,$cache);
    (Tk_index,distance,k_index,i) = loop_withcache!(startdistance,precision,cache);
    @show i
    @show distance
    @show all(isapprox.(ref_V, cache.V))
end

The result from my laptop:

reference loop_over_global
  4.860 s (7847 allocations: 11.51 GiB)
i = 412
distance = 0.009909900196412025
loop_over_global2
  2.675 s (4551 allocations: 3.86 GiB)
i = 412
distance = 0.009909900196412025
all(isapprox.(ref_V, V)) = true
loop_over_global3
  2.688 s (5375 allocations: 3.87 GiB)
i = 412
distance = 0.009909900196412025
all(isapprox.(ref_V, V)) = true
loop_over_withcache!
  2.487 s (3299 allocations: 47.16 MiB)
i = 412
distance = 0.009909900196412025
all(isapprox.(ref_V, cache.V)) = true
true

Note: I also made logC constant, which it wasn't before.
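(As a side note on why that matters, a minimal sketch of my own with hypothetical names: code that reads a non-const global cannot be specialized on its type, while a const global, or a type-annotated access like the logC::Array{Float64,3} used earlier, can be.)

using BenchmarkTools

glob = rand(1000)          # non-const global: its type could change at any time
const cglob = rand(1000)   # const global: the type is fixed, so callers specialize

sum_glob() = sum(glob)
sum_cglob() = sum(cglob)

@btime sum_glob()    # dynamic dispatch on every access, extra allocations
@btime sum_cglob()   # fully specialized, no allocations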

EDIT: Fixed my previous version where I had forgotten to reset the cache. Now the improvement is much less, but still there is an improvement :slight_smile:

4 Likes

The following rewrite is faster than MATLAB. Note that this is just a code translation in a more Julian way, so it still inherits the drawbacks of the original MATLAB-oriented design. If we knew the problem well enough, I'm sure this could be made way faster if written in Julia from scratch.

The timings now are as follows (note also that the allocations went down from 11.628 GiB in your original Julia code to only 47.808 MiB):

Julia:  1.527016 seconds (960 allocations: 47.808 MiB)
MATLAB: 2.020981 seconds.

The current code is as follows (I changed many tedious variable names to arguably better/shorter ones):

function main()
    ## (a) ##
    # First, set up the given data#
    α = 0.261
    β = 0.99
    δ = 0.0176
    zGrid = [0.9526, 0.9760, 1, 1.0246, 1.0497]
    PiZ = [0.9149 0.0823 0.0028 0      0
           0.0206 0.9163 0.0618 0.0014 0
           0.0005 0.0412 0.9167 0.0412 0.0005
           0      0.0014 0.0618 0.9163 0.0206
           0      0      0.0028 0.0823 0.9149]
    # Next, get the kStar and kL and kH #
    kStar = (1 / α * (1 / β - 1 + δ))^(1 / (α - 1))
    kL = kStar * 0.85
    kH = 1.15 * kStar
    println(kStar)
    println(kL)
    println(kH)

    # Setting up an 499 points of kgrid #
    kGrid = range(kL, kH, 499)
    println(median(kGrid))

    ## (b) ##
    # First, set up the length variables #
    nk = length(kGrid)
    nz = length(zGrid)

    # Make settings for the iteration #
    # First, make an array for each z_i #
    k′Grid = repeat(kGrid, 1, nz, nk)
    kGridMat = permutedims(k′Grid, (3, 2, 1))
    k′GridMat = k′Grid .* (kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′Grid .> 0)
    replace!(k′GridMat, 0 => NaN)  # k′GridMat[k′GridMat .== 0] .= NaN
    logC = log.(kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′GridMat)
    replace!(logC, NaN => -Inf)    # logC[isnan.(logC)] .= -Inf

    # precision and initial distance #
    precision = 0.01
    distance = 100    

    # start iteration #
    i = 1
    V = zeros(nk, nz)
    V′ = similar(V)
    TV = zeros(1, 5, 499)
    VPiZ = similar(V)
    logCC = similar(logC)
    while distance > precision
        mul!(VPiZ, V, transpose(PiZ))
        logCC .= logC .+ VPiZ .* β
        maximum!(TV, logCC)
        V′ .= transpose(reshape(TV, 5, 499))
        distance = maximum(j-k for (j,k) in zip(V′,V))
        V .= V′
        i += 1
    end
    # Add one more iteration to find Tk_index
    _, k_index = findmax(logCC, dims=1)
    mul!(VPiZ, V′, PiZ')
    logCC .= logC .+ VPiZ .* β
    _, Tk_index = findmax(logCC, dims=1)

    # Reviving k-rules from the k_index array #
    k′sol = transpose(reshape(k′Grid[Tk_index], 5, 499))
    k′sol_prev = transpose(reshape(k′Grid[k_index], 5, 499)) # matrix of previous capital to compare with solutions #
    dist_k′sol = maximum(abs(i-j) for (i,j) in zip(k′sol,k′sol_prev))

    # answers #
    println(i)
    println(distance)
    println(dist_k′sol)
end
@time main() 

It still uses some useless repeats, reshapes, transposes, etc. But the main idea is to minimize allocations by pre-allocating large arrays, and the most important step is to avoid findmax and use maximum! until the last iteration, where you really want the indices.
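(To make the findmax-vs-maximum! point concrete, a small stand-alone comparison of my own; the array sizes just mirror the 499×5×499 shape used above.)

using BenchmarkTools

A   = rand(499, 5, 499)
buf = zeros(1, 5, 499)        # pre-allocated output for the per-column maxima

@btime findmax($A, dims=1);   # allocates both a value array and a CartesianIndex array
@btime maximum!($buf, $A);    # writes into buf in place, no index bookkeeping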

Another note, to print floats with 4 digits like MATLAB, you can use this modification:

using Printf
Base.show(io::IO, f::Float64) = @printf(io, "%.4f", f)
13 Likes

A few comments

  1. you create nz, nk but then often use 5, 499 instead
  2. this problem uses VFI to solve the MOST ROUTINE economics problem (Neoclassical growth model)
    Jesus compared MANY languages for this problem here.
    There have been a bunch of GSOC/JSOC improving performance for this exact problem in Julia.
    Julia does not have a high performance VFI solver like VFIToolkit.m, even though VFI is one of the biggest computational challenges for economists!
  3. You can see VFI & a bunch of other methods here & here

I made the language a bit more generic (at the cost of performance)

using Printf, Statistics, LinearAlgebra;
Base.show(io::IO, f::Float64) = @printf(io, "%.4f", f)

function main()
    ## (a) ##
    # First, set up the given data#
    α = 0.261; β = 0.99; δ = 0.0176;
    #
    u(c) = log(c);
    f(z,k;α=α) = z*(k^α)
    con(z,k,kp;δ=δ) = f(z,k) + k*(1 - δ) - kp # makes it slower?
    #
    zGrid = [0.9526, 0.9760, 1, 1.0246, 1.0497]
    PiZ = [0.9149 0.0823 0.0028 0      0
           0.0206 0.9163 0.0618 0.0014 0
           0.0005 0.0412 0.9167 0.0412 0.0005
           0      0.0014 0.0618 0.9163 0.0206
           0      0      0.0028 0.0823 0.9149]
    # Next, get the kStar and kL and kH #
    kStar = ( (1.0/β - 1.0 + δ) / α )^(1.0 / (α - 1.0))
    kL = kStar * 0.85; kH = 1.15 * kStar;
    println(kStar); println(kL); println(kH);
    # Setting up an 499 points of kgrid #
    kGrid = range(kL, kH, 499); println(median(kGrid))

    ## (b) ##
    # First, set up the length variables #
    nk = length(kGrid); nz = length(zGrid)

    # Make settings for the iteration #
    # First, make an array for each z_i #
    k′Grid = repeat(kGrid, 1, nz, nk)
    kGridMat = permutedims(k′Grid, (3, 2, 1))
    #
    k′GridMat = k′Grid .* (con.(zGrid', kGridMat, k′Grid) .> 0)
    #k′GridMat = k′Grid .* (kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′Grid .> 0)
    #
    replace!(k′GridMat, 0 => NaN)  # k′GridMat[k′GridMat .== 0] .= NaN
    #
    logC = u.(con.(zGrid', kGridMat, k′GridMat))
    #logC = u.(kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′GridMat)
    #
    replace!(logC, NaN => -Inf)    # logC[isnan.(logC)] .= -Inf

    # precision and initial distance #
    precision = 0.01; distance = 100    

    # start iteration #
    i = 1
    V = zeros(nk, nz)
    V′ = similar(V)
    TV = zeros(1, nz, nk)
    VPiZ = similar(V)
    logCC = similar(logC)
    while distance > precision
        mul!(VPiZ, V, transpose(PiZ))
        logCC .= logC .+ VPiZ .* β
        maximum!(TV, logCC)
        V′ .= transpose(reshape(TV, nz, nk))
        distance = maximum(j-k for (j,k) in zip(V′,V))
        V .= V′
        i += 1
    end
    # Add one more iteration to find Tk_index
    _, k_index = findmax(logCC, dims=1)
    mul!(VPiZ, V′, PiZ')
    logCC .= logC .+ VPiZ .* β
    _, Tk_index = findmax(logCC, dims=1)

    # Reviving k-rules from the k_index array #
    k′sol = transpose(reshape(k′Grid[Tk_index], nz, nk))
    k′sol_prev = transpose(reshape(k′Grid[k_index], nz, nk)) # matrix of previous capital to compare with solutions #
    dist_k′sol = maximum(abs(i-j) for (i,j) in zip(k′sol,k′sol_prev))

    # answers #
    println(i); println(distance); println(dist_k′sol)
end
@time main() 

The world would be better off if there was a generic Julia VFI w/ n-state variables, a generic utility (return/cost) function, and a generic transition function…

Does anyone know why:

k′GridMat = k′Grid .* (kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′Grid .> 0)
logC = u.(kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′GridMat)

is faster than the more generic

u(c) = log(c);
f(z,k;α=α) = z*(k^α)
con(z,k,kp;δ=δ) = f(z,k) + k*(1 - δ) - kp

k′GridMat = k′Grid .* (con.(zGrid', kGridMat, k′Grid) .> 0)
logC = u.(con.(zGrid', kGridMat, k′GridMat))
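(If anyone wants to dig into that, the two forms can be benchmarked in isolation; a sketch of my own, with made-up sizes and values chosen so the log argument stays positive.)

using BenchmarkTools

const α, δ = 0.261, 0.0176
z  = 1 .+ rand(1, 5)
k  = 1 .+ rand(499, 5, 499)
kp = rand(499, 5, 499)

u(c) = log(c)
f(z, k; α=α) = z * k^α
con(z, k, kp; δ=δ) = f(z, k) + k * (1 - δ) - kp

# explicit broadcast vs. broadcast over the small helper functions
@btime log.($k .^ α .* $z .+ $k .* (1 - δ) .- $kp);
@btime u.(con.($z, $k, $kp));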
2 Likes

If this is one bottleneck, it is possible that using MKL will be faster; I think MATLAB uses it. Also, one must be careful about whether this is multithreaded or not while benchmarking.
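(A minimal sketch of trying that, assuming the MKL.jl package; with libblastrampoline, loading it switches the BLAS backend used by calls like the mul! above, and it is usually loaded at the top of the session.)

using MKL
using LinearAlgebra

BLAS.get_config()        # should now list MKL among the loaded BLAS libraries
BLAS.get_num_threads()   # worth checking when comparing against MATLAB's multithreaded BLAS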

2 Likes

This part is due to my ignorance. When I ran the function without that part, the error that k_index is not defined frequently came up.
The previous version of the while loop that frequently produced that error was

function loop_over_global5(distance, precision)
    #Now initialize a V array#
    V=zeros(n_k,n_z);i=1;
    while distance>precision
        (TV,Tk_index,distance)=loop_body3(V);
        println(i);println(distance);
        if distance≤precision
            return (TV,V,Tk_index,distance,k_index,i);
            break
        else
            V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
        end
    end
end

I suspected that in the if part, when distance<=precision, the k_index from the previous part is somehow eliminated. So I tried to insert k_index=k_index in the main part of the loop, but for it to run in the first place, k_index had to be defined. So I put the k_index=Array{CartesianIndex{3}, 3} in front.
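(As an aside, a minimal sketch of a less harmful way to pre-define the variable: give it an empty array of the right type rather than the type object itself, so later assignments keep it type-stable. The name follows the code above.)

k_index = Array{CartesianIndex{3}, 3}(undef, 0, 0, 0)  # an empty 3-d array of indices, not a type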
P.S. The performance tips may be very useful, but to me, who am not familiar with Julia or with computer programming itself, they have some cumbersome parts. Until now, I used R and MATLAB in a practical manner to conduct my research, which was not very helpful for understanding somewhat different languages like Julia. But I'll try to get used to them, thanks to all of you guys' support :)

Thanks! Now, as I tried this, I think the speeds of the two have come much closer.
It seems evident that I know almost nothing about the spirit of Julia yet. It's a long way to go.

By the way, my MATLAB shows 1.225859 s while your code shows 1.498779 s for the operation. I don't know why we get somewhat different results. The code for the MATLAB side was as follows.

%%%5. Discrete dynamic programming in practice: decision rules through simulated data%%%
%%(a)%%
tic
%First, set up given data%
alpha=0.261;beta=0.99;delta=0.0176;n_star=1;
z_grid=[0.9526;0.9760;1;1.0246;1.0497];
Pi_z=[0.9149 0.0823 0.0028 0 0;...
    0.0206 0.9163 0.0618 0.0014 0;...
    0.0005 0.0412 0.9167 0.0412 0.0005;...
    0 0.0014 0.0618 0.9163 0.0206;...
    0 0 0.0028 0.0823 0.9149];
%Next, get the k_star and k_L and k_H%
k_star=(1/alpha*(1/beta-1+delta))^(1/(alpha-1));
k_L=k_star*0.85;k_H=1.15*k_star;
disp(round(k_star,4));disp(round(k_L,4));disp(round(k_H,4));
%Setting up an 499 points of kgrid%
k_grid=(k_L:((k_H-k_L)/498):k_H).';
disp(median(k_grid));

%%(b)%%
%First, set up the length variables%
n_k=length(k_grid);n_z=length(z_grid);
%Now initialize a V array%
V=zeros(n_k,n_z);
%Make settings for the iteration%
%First, make an array for each z_i%
k_prime_grid_mat_proto=repmat(k_grid,[1 n_z n_k]);%build prototype k' array with 3-d block array%
k_grid_mat=permute(k_prime_grid_mat_proto,[3 2 1]);%build k array with 3-d block array%;
k_prime_grid_mat=k_prime_grid_mat_proto.*(k_grid_mat.^alpha.*z_grid.'+k_grid_mat.*(1-delta)-k_prime_grid_mat_proto>=0);
k_prime_grid_mat(k_prime_grid_mat==0)=nan;
logC=log(k_grid_mat.^alpha.*z_grid.'+k_grid_mat.*(1-delta)-k_prime_grid_mat);
%build k' array that assigns nan to k' that is out of bound%

precision=0.01;distance=100;sz=size(k_prime_grid_mat);%precision and initial distance%

i=1;%iteration number%

%Start iteration%
while (distance > precision)
    [TV,Tk_index]=max(logC+V*Pi_z.'.*beta,[],'omitnan');%max within column omitting nan%
    TV=reshape(permute(TV,[1,3,2]),[],size(TV,2));%change 1x5x499 array into 499x5 matrix%
    distance=max(max(TV-V));
    i=i+1;disp(i);disp(distance);
    if distance<precision
        break
    else
    V=TV;k_index=Tk_index;
    end
end
toc

Is this code slower than the Julia code on your computer? Sorry to bother you.
What I am doing is explained a bit by @Albert_Zevelev below. It is basically finding an optimized value function V with optimizer k′, given k and z, through a recursive method (finding a function V that satisfies TV(k)=V(k) for all k, given a functional operator T, which contains a maximization).
Thank you again for your support!

Different computers maybe?

I was referring to the difference in the ranking of the speeds between the two. But yeah, a difference in computers may still be the reason.

I really recommend, as a first step towards writing this algorithm in the style and spirit of Julia, and to aid those you're asking for help, that you drop the unnecessary semicolons, insert some whitespace (both spaces and blank lines) to show organization, and reduce your variable name lengths to reduce the visual clutter.

This will help experienced Julia people see your code as Julia, in the sense of recognizing patterns, and seeing the places efficiency can be improved by insertion of views, or broadcasting, or hoisting constant expressions out of loops. As it is, I spend most of my brain cycles when looking at your code trying to translate it out of Matlab, or to distinguish between variables k_grid_mat, k_prime_grid_mat, k_prime_grid_mat_proto, etc.

8 Likes

OK, now you can't beat the following version in MATLAB. The times compare as follows on my machine:

Julia: 0.719097 seconds (134 allocations: 47.751 MiB)
MATLAB: Elapsed time is 1.264601 seconds.

All you have to do is replace the above while with the one below. I mainly compute everything in a single triple loop. Note that the Julia code is still single-threaded at the moment, whereas MATLAB functions are multithreaded by default, so multithreading can further enhance the speed of the Julia code.

The whole function now reads:

function main()
    ## (a) ##
    # First, set up the given data#
    α = 0.261
    β = 0.99
    δ = 0.0176
    zGrid = [0.9526, 0.9760, 1, 1.0246, 1.0497]
    PiZ = [0.9149 0.0823 0.0028 0      0
           0.0206 0.9163 0.0618 0.0014 0
           0.0005 0.0412 0.9167 0.0412 0.0005
           0      0.0014 0.0618 0.9163 0.0206
           0      0      0.0028 0.0823 0.9149]
    # Next, get the kStar and kL and kH #
    kStar = (1 / α * (1 / β - 1 + δ))^(1 / (α - 1))
    kL = kStar * 0.85
    kH = 1.15 * kStar
    println(kStar)
    println(kL)
    println(kH)

    # Setting up an 499 points of kgrid #
    kGrid = range(kL, kH, 499)
    println(median(kGrid))

    ## (b) ##
    # First, set up the length variables #
    nk = length(kGrid)
    nz = length(zGrid)

    # Make settings for the iteration #
    # First, make an array for each z_i #
    k′Grid = repeat(kGrid, 1, nz, nk)
    kGridMat = permutedims(k′Grid, (3, 2, 1))
    k′GridMat = k′Grid .* (kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′Grid .> 0)
    replace!(k′GridMat, 0 => NaN)
    logC = log.(kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′GridMat)
    replace!(logC, NaN => -Inf)

    # precision and initial distance #
    precision = 0.01
    distance = 100    

    # start iteration #
    i = 1
    V = zeros(nk, nz)
    VPiZ = similar(V)
    logCC = similar(logC)    
    while distance > precision
        mul!(VPiZ, V, PiZ')
        d = -Inf
        for k = 1:nk, j = 1:nz
            m = -Inf
            for i = 1:nk
                logCC[i,j,k] = logC[i,j,k] + VPiZ[i,j] * β
                m = ifelse(m > logCC[i,j,k], m, logCC[i,j,k])
            end 
            distance = max(d, m-V[k,j])
            d = m - V[k,j]
            V[k,j] = m
        end
        i += 1
    end
    # Add one more iteration to find Tk_index
    _, k_index = findmax(logCC, dims=1)
    mul!(VPiZ, V, PiZ')
    logCC .= logC .+ VPiZ .* β
    _, Tk_index = findmax(logCC, dims=1)

    # Reviving k-rules from the k_index array #
    k′sol = transpose(reshape(k′Grid[Tk_index], 5, 499))
    k′sol_prev = transpose(reshape(k′Grid[k_index], 5, 499)) # matrix of previous capital to compare with solutions #
    dist_k′sol = maximum(abs(i-j) for (i,j) in zip(k′sol,k′sol_prev))

    # answers #
    println(i)
    println(distance)
    println(dist_k′sol)
end
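(On the multithreading remark above: a sketch of how the inner update could be threaded over the outer k index. This is my own illustration with a hypothetical function name, untested against the full example; it assumes Julia was started with several threads, e.g. julia -t 4, and it computes the sup-norm distance directly rather than reproducing the d bookkeeping above.)

using Base.Threads

# Hypothetical helper: one Bellman sweep over logCC (nk×nz×nk), updating V (nk×nz) in place.
function bellman_update!(V, logCC, logC, VPiZ, β)
    nk = size(logCC, 3)
    dists = fill(-Inf, nk)              # one running distance per k-slice, reduced at the end
    @threads for k in 1:nk              # each thread owns disjoint k's, so the writes don't race
        for j in axes(logCC, 2)
            m = -Inf
            for i in axes(logCC, 1)
                logCC[i, j, k] = logC[i, j, k] + VPiZ[i, j] * β
                m = max(m, logCC[i, j, k])
            end
            dists[k] = max(dists[k], m - V[k, j])
            V[k, j] = m
        end
    end
    return maximum(dists)               # overall sup-norm distance for the convergence check
end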

Is none of the iterative solver libraries suited to this problem class?

1 Like

Yes:

By @nicolas
I've shown how in my links above, etc.