You mean vcat(a:b).
Yes, it's lowered to a vcat, but for a single range that's still equivalent to the collect, which is what I did mean to refer to.
Oh, I see what you're talking about…
julia> @code_lowered collect(1:2)
CodeInfo(
1 ─ %1 = Base.vcat(r)
└── return %1
)
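For a single range the two really do produce the same array; a quick check in the REPL, for example:
julia> [1:3;] == collect(1:3) == [1, 2, 3]
true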
You might be overlooking that some people execute code by pasting it in the REPL, e.g., by using vscode or manually. In this case, the semicolon may have a very important role to play.
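For example (a minimal sketch of that point), pasting these two lines into the REPL behaves quite differently:
julia> V = zeros(499, 5)    # the REPL prints the whole 499×5 matrix
julia> V = zeros(499, 5);   # the trailing semicolon suppresses that output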
Thanks! I changed my code as follows and it improved a lot.
function loop_body2(V)
(A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
A1=transpose(reshape(A,size(V)[2],size(V)[1]));
C=f2(A1,V);
return (A1,B,C)
end
The previous benchmark showed me around 6 seconds.
This one decreased it to around 2.5 seconds.
However, the MATLAB code with the same revision still seems to be faster. I don't know how to benchmark it properly, but it seems to take about 1 second.
But thanks anyway
Thank you for the suggestion.
I tried to implement what you proposed as follows.
function loop_over_global2(distance, precision)
#Now initialize a V array#
V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
while distance>precision
k_index=k_index;
(TV,Tk_index,distance)=loop_body2(V);
println(i);println(distance);
if distance≤precision
return (TV,V,Tk_index,distance,k_index,i);
break
else
V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
end
end
end
function f2(a1,v)
m = zero(eltype(a1))
@turbo for i in eachindex(a1,v)
val = a1[i] - v[i]
m = ifelse(val > m, val, m)
end
return m
end
function loop_body2(V)
(A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
A1=transpose(reshape(A,size(V)[2],size(V)[1]));
C=f2(A1,V);
return (A1,B,C)
end
@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global2(startdistance,precision);
However, I didn't see much improvement from this change. This change gave me 2.438 s and the previous one gave me 2.417 s; I think the previous one is shorter only because I ran it first.
But it is strange, considering that the benchmark you showed me indicates a large decrease in time.
For comparison, I used the following code for the previous method (using maximum):
function loop_body3(V)
(A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
A1=transpose(reshape(A,size(V)[2],size(V)[1]));
C=maximum(A1-V);
return (A1,B,C)
end
function loop_over_global3(distance, precision)
#Now initialize a V array#
V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
while distance>precision
k_index=k_index;
(TV,Tk_index,distance)=loop_body3(V);
println(i);println(distance);
if distance≤precision
return (TV,V,Tk_index,distance,k_index,i);
break
else
V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
end
end
end
Is there anything I am missing?
For everyone who is interested in how this is going right now, my code is in the following state.
Thank you!
using Statistics
using BenchmarkTools
using LoopVectorization
const alpha=0.261
const beta=0.99
const delta=0.0176
const n_star=1
const z_grid=[0.9526;0.9760;1;1.0246;1.0497]
const Pi_z=[0.9149 0.0823 0.0028 0 0
0.0206 0.9163 0.0618 0.0014 0
0.0005 0.0412 0.9167 0.0412 0.0005
0 0.0014 0.0618 0.9163 0.0206
0 0 0.0028 0.0823 0.9149]
#Next, get the k_star and k_L and k_H#
const k_star=(1/alpha*(1/beta-1+delta))^(1/(alpha-1))
const k_L=k_star*0.85
const k_H=1.15*k_star
println(round(k_star,digits=4));println(round(k_L,digits=4));println(round(k_H,digits=4))
#Setting up an 499 points of kgrid#
const k_grid=[k_L:((k_H-k_L)/498):k_H;]
println(median(k_grid))
##(b)##
#First, set up the length variables#
const n_k=length(k_grid)
const n_z=length(z_grid)
#Make settings for the iteration#
#First, make an array for each z_i#
const k_prime_grid_mat_proto=repeat(k_grid,1, n_z, n_k)
const k_grid_mat=permutedims(k_prime_grid_mat_proto,(3,2,1))
k_prime_grid_mat=k_prime_grid_mat_proto.*(k_grid_mat.^alpha.*transpose(z_grid)+k_grid_mat.*(1-delta)-k_prime_grid_mat_proto.>0)
k_prime_grid_mat[k_prime_grid_mat.==0].=NaN#build k' array that is out of bound#
logC=log.(k_grid_mat.^alpha.*transpose(z_grid)+k_grid_mat.*(1-delta)-k_prime_grid_mat)
logC[isnan.(logC)].=-Inf
const precision=0.01
const startdistance=100#precision and initial distance#
#iteration number#
#start iteration#
#start iteration#
function loop_over_global(distance, precision)
#Now initialize a V array#
V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
while distance>precision
k_index=k_index;
(TV,Tk_index,distance)=loop_body(V);
println(i);println(distance);
if distance≤precision
return (TV,V,Tk_index,distance,k_index,i);
break
else
V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
end
end
end
function loop_over_global2(distance, precision)
#Now initialize a V array#
V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
while distance>precision
k_index=k_index;
(TV,Tk_index,distance)=loop_body2(V);
println(i);println(distance);
if distance≤precision
return (TV,V,Tk_index,distance,k_index,i);
break
else
V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
end
end
end
function loop_over_global3(distance, precision)
#Now initialize a V array#
V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
while distance>precision
k_index=k_index;
(TV,Tk_index,distance)=loop_body3(V);
println(i);println(distance);
if distance≤precision
return (TV,V,Tk_index,distance,k_index,i);
break
else
V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
end
end
end
function loop_body(V)
(A,B)=findmax(logC::Array{Float64,3}+repeat(V*transpose(Pi_z),1,1,n_k).*beta,dims=1);
A1=transpose(reshape(A,size(V)[2],size(V)[1]));
C=maximum(A1-V);
return (A1,B,C)
end
function f2(a1,v)
m = zero(eltype(a1))
@turbo for i in eachindex(a1,v)
val = a1[i] - v[i]
m = ifelse(val > m, val, m)
end
return m
end
function loop_body2(V)
(A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
A1=transpose(reshape(A,size(V)[2],size(V)[1]));
C=f2(A1,V);
return (A1,B,C)
end
function loop_body3(V)
(A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
A1=transpose(reshape(A,size(V)[2],size(V)[1]));
C=maximum(A1-V);
return (A1,B,C)
end
t1=@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global(startdistance,precision);
t2=@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global2(startdistance,precision);
t3=@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global3(startdistance,precision);
Yes, my post. You should delete the totally-useless-and-actively-harmful k_index=Array{CartesianIndex{3}, 3}. Also, if you had used @code_warntype from the beginning (have you read the performance tips at all before diving into performance optimisation?) you'd have found that V is also type-unstable in loop_over_global2.
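For instance, a minimal sketch of that check, run in the REPL on the definitions posted above (unstable bindings show up as ::Any or ::Union entries):
julia> @code_warntype loop_over_global2(startdistance, precision)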
Is there anything I am missing?
- You are benchmarking something that involves IO operations through println. These can easily cause performance hits, although here they seem not to be a problem.
- The benchmark results show lots of allocations, almost 4 GB for the last two approaches. You could try working with preallocated buffers and thus avoid reallocations and potential GC pauses.
- There seem to be a few type instabilities (I think within the loop_bodys) which also cause slowdowns. In general, try to avoid changing the type of a variable, especially within "hot" loops.
- See what @giordano said.
I had a few minutes on my train trip and I wrote a version that uses the LoopVectorization bit and a cache:
using Statistics
using BenchmarkTools
using LoopVectorization
const alpha=0.261
const beta=0.99
const delta=0.0176
const n_star=1
const z_grid=[0.9526;0.9760;1;1.0246;1.0497]
const Pi_z=[0.9149 0.0823 0.0028 0 0
0.0206 0.9163 0.0618 0.0014 0
0.0005 0.0412 0.9167 0.0412 0.0005
0 0.0014 0.0618 0.9163 0.0206
0 0 0.0028 0.0823 0.9149]
#Next, get the k_star and k_L and k_H#
const k_star=(1/alpha*(1/beta-1+delta))^(1/(alpha-1))
const k_L=k_star*0.85
const k_H=1.15*k_star
println(round(k_star,digits=4));println(round(k_L,digits=4));println(round(k_H,digits=4))
#Setting up an 499 points of kgrid#
const k_grid=[k_L:((k_H-k_L)/498):k_H;]
println(median(k_grid))
##(b)##
#First, set up the length variables#
const n_k=length(k_grid)
const n_z=length(z_grid)
#Make settings for the iteration#
#First, make an array for each z_i#
const k_prime_grid_mat_proto=repeat(k_grid,1, n_z, n_k)
const k_grid_mat=permutedims(k_prime_grid_mat_proto,(3,2,1))
k_prime_grid_mat=k_prime_grid_mat_proto.*(k_grid_mat.^alpha.*transpose(z_grid)+k_grid_mat.*(1-delta)-k_prime_grid_mat_proto.>0)
k_prime_grid_mat[k_prime_grid_mat.==0].=NaN#build k' array that is out of bound#
const logC=log.(k_grid_mat.^alpha.*transpose(z_grid)+k_grid_mat.*(1-delta)-k_prime_grid_mat)
logC[isnan.(logC)].=-Inf
const precision=0.01
const startdistance=100#precision and initial distance#
#iteration number#
#start iteration#
function loop_over_global(distance, precision)
#Now initialize a V array#
V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
while distance>precision
k_index=k_index;
(TV,Tk_index,distance)=loop_body(V);
# println(i);println(distance);
if distance≤precision
return (TV,V,Tk_index,distance,k_index,i);
break
else
V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
end
end
end
function loop_over_global2(distance, precision)
#Now initialize a V array#
V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
while distance>precision
k_index=k_index;
(TV,Tk_index,distance)=loop_body2(V);
# println(i);println(distance);
if distance≤precision
return (TV,V,Tk_index,distance,k_index,i);
break
else
V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
end
end
end
function loop_over_global3(distance, precision)
#Now initialize a V array#
V=zeros(n_k,n_z);i=1;k_index=Array{CartesianIndex{3}, 3}
while distance>precision
k_index=k_index;
(TV,Tk_index,distance)=loop_body3(V);
# println(i);println(distance);
if distance≤precision
return (TV,V,Tk_index,distance,k_index,i);
break
else
V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
end
end
end
function loop_body(V)
(A,B)=findmax(logC::Array{Float64,3}+repeat(V*transpose(Pi_z),1,1,n_k).*beta,dims=1);
A1=transpose(reshape(A,size(V)[2],size(V)[1]));
C=maximum(A1-V);
return (A1,B,C)
end
function f2(a1,v)
m = zero(eltype(a1))
@turbo for i in eachindex(a1,v)
val = a1[i] - v[i]
m = ifelse(val > m, val, m)
end
return m
end
function loop_body2(V)
(A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
A1=transpose(reshape(A,size(V)[2],size(V)[1]));
C=f2(A1,V);
return (A1,B,C)
end
function loop_body3(V)
(A,B)=findmax(logC::Array{Float64,3}.+V*transpose(Pi_z).*beta,dims=1);
A1=transpose(reshape(A,size(V)[2],size(V)[1]));
C=maximum(A1-V);
return (A1,B,C)
end
ref_V = let
println("reference loop_over_global")
@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global($startdistance,$precision);
(TV,V,Tk_index,distance,k_index,i)=loop_over_global(startdistance,precision);
@show i
@show distance
V
end
let
println("loop_over_global2")
@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global2($startdistance,$precision);
(TV,V,Tk_index,distance,k_index,i)=loop_over_global2(startdistance,precision);
@show i
@show distance
@show all(isapprox.(ref_V, V))
end
let
println("loop_over_global3")
@btime (TV,V,Tk_index,distance,k_index,i)=loop_over_global3($startdistance,$precision);
(TV,V,Tk_index,distance,k_index,i)=loop_over_global3(startdistance,precision);
@show i
@show distance
@show all(isapprox.(ref_V, V))
end
function make_cache(n_k, n_z)
V = zeros(n_k,n_z)
TV = similar(V)
tmp_product = similar(logC)
return (; V, TV, tmp_product)
end
function loop_withcache!(distance, precision, cache)
V, TV, tmp_product = cache
fill!(V, 0.0)
fill!(TV, 0.0)
fill!(tmp_product, 0.0)
k_index = CartesianIndex{3}[;;;]
Tk_index = similar(k_index)
i = 1
while distance>precision
k_index, distance = loop_body_withcache!(TV, V, tmp_product)
# println(i);println(distance);
if distance≤precision
return (Tk_index,distance,k_index,i)
else
V .= TV
prev_k_index = k_index
end
i += 1
end
end
function loop_body_withcache!(TV, V, tmp_product)
tmp_product .= logC .+ V * transpose(Pi_z) .* beta
(A,B) = findmax(tmp_product,dims=1);
TV .= permutedims(view(A, 1, :, :), (2,1))
C = f2(TV,V);
return B, C
end
let
println("loop_over_withcache!")
cache = make_cache(n_k, n_z)
@btime (Tk_index,distance,k_index,i) = loop_withcache!($startdistance,$precision,$cache);
(Tk_index,distance,k_index,i) = loop_withcache!(startdistance,precision,cache);
@show i
@show distance
@show all(isapprox.(ref_V, cache.V))
end
The result from my laptop:
reference loop_over_global
4.860 s (7847 allocations: 11.51 GiB)
i = 412
distance = 0.009909900196412025
loop_over_global2
2.675 s (4551 allocations: 3.86 GiB)
i = 412
distance = 0.009909900196412025
all(isapprox.(ref_V, V)) = true
loop_over_global3
2.688 s (5375 allocations: 3.87 GiB)
i = 412
distance = 0.009909900196412025
all(isapprox.(ref_V, V)) = true
loop_over_withcache!
2.487 s (3299 allocations: 47.16 MiB)
i = 412
distance = 0.009909900196412025
all(isapprox.(ref_V, cache.V)) = true
true
Note: I also made logC constant, which it wasn't before.
EDIT: Fixed my previous version, where I had forgotten to reset the cache. Now the improvement is much smaller, but there is still an improvement.
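As a small illustration of why the const matters (toy globals, not from this thread): reading a non-const global is type-unstable, while a const global is not.
x = 1.0         # non-const global: its type may change later, so uses of it infer as Any
const y = 1.0   # const global: the compiler can assume Float64
g() = x + 1.0   # @code_warntype g() reports ::Any
h() = y + 1.0   # @code_warntype h() infers Float64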
The following rewrite is faster than MATLAB. Note that this is just a code translation in a more Julian way, so it still inherits the drawbacks of the original MATLAB-oriented design. If we knew the problem well enough, I'm sure this could be made way faster if written in Julia from scratch.
The timings now are as follows (note also that the allocations went down from 11.628 GiB in your original Julia code to only 47.808 MiB):
Julia: 1.527016 seconds (960 allocations: 47.808 MiB)
MATLAB: 2.020981 seconds.
The current code is as follows (I changed many tedious variable names to arguably better/shorter ones):
function main()
## (a) ##
# First, set up the given data#
α = 0.261
β = 0.99
δ = 0.0176
zGrid = [0.9526, 0.9760, 1, 1.0246, 1.0497]
PiZ = [0.9149 0.0823 0.0028 0 0
0.0206 0.9163 0.0618 0.0014 0
0.0005 0.0412 0.9167 0.0412 0.0005
0 0.0014 0.0618 0.9163 0.0206
0 0 0.0028 0.0823 0.9149]
# Next, get the kStar and kL and kH #
kStar = (1 / α * (1 / β - 1 + δ))^(1 / (α - 1))
kL = kStar * 0.85
kH = 1.15 * kStar
println(kStar)
println(kL)
println(kH)
# Setting up an 499 points of kgrid #
kGrid = range(kL, kH, 499)
println(median(kGrid))
## (b) ##
# First, set up the length variables #
nk = length(kGrid)
nz = length(zGrid)
# Make settings for the iteration #
# First, make an array for each z_i #
k′Grid = repeat(kGrid, 1, nz, nk)
kGridMat = permutedims(k′Grid, (3, 2, 1))
k′GridMat = k′Grid .* (kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′Grid .> 0)
replace!(k′GridMat, 0 => NaN) # k′GridMat[k′GridMat .== 0] .= NaN
logC = log.(kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′GridMat)
replace!(logC, NaN => -Inf) # logC[isnan.(logC)] .= -Inf
# precision and initial distance #
precision = 0.01
distance = 100
# start iteration #
i = 1
V = zeros(nk, nz)
V′ = similar(V)
TV = zeros(1, 5, 499)
VPiZ = similar(V)
logCC = similar(logC)
while distance > precision
mul!(VPiZ, V, transpose(PiZ))
logCC .= logC .+ VPiZ .* β
maximum!(TV, logCC)
V′ .= transpose(reshape(TV, 5, 499))
distance = maximum(j-k for (j,k) in zip(V′,V))
V .= V′
i += 1
end
# Add one more iteration to find Tk_index
_, k_index = findmax(logCC, dims=1)
mul!(VPiZ, V′, PiZ')
logCC .= logC .+ VPiZ .* β
_, Tk_index = findmax(logCC, dims=1)
# Reviving k-rules from the k_index array #
k′sol = transpose(reshape(k′Grid[Tk_index], 5, 499))
k′sol_prev = transpose(reshape(k′Grid[k_index], 5, 499)) # matrix of previous capital to compare with solutions #
dist_k′sol = maximum(abs(i-j) for (i,j) in zip(k′sol,k′sol_prev))
# answers #
println(i)
println(distance)
println(dist_k′sol)
end
@time main()
It still uses some useless repeats, reshapes, transposes, etc. But the main idea is to minimize allocations by pre-allocating large arrays, and the most important step is to avoid findmax and use maximum until the last iteration, where you really want the indices.
Another note, to print floats with 4 digits like MATLAB, you can use this modification:
using Printf
Base.show(io::IO, f::Float64) = @printf(io, "%.4f", f)
A few comments:
- You create nz, nk but then instead often use 5, 499.
- This problem uses VFI to solve the MOST ROUTINE economics problem (the neoclassical growth model). Jesus compared MANY languages for this problem here. There have been a bunch of GSOC/JSOC projects improving performance for this exact problem in Julia. Julia does not have a high-performance VFI solver like VFIToolkit.m, even though VFI is one of the biggest computational challenges for economists!
- You can see VFI & a bunch of other methods here & here.
I made the code a bit more generic (at the cost of performance):
using Printf, Statistics, LinearAlgebra;
Base.show(io::IO, f::Float64) = @printf(io, "%.4f", f)
function main()
## (a) ##
# First, set up the given data#
α = 0.261; β = 0.99; δ = 0.0176;
#
u(c) = log(c);
f(z,k;α=α) = z*(k^α)
con(z,k,kp;δ=δ) = f(z,k) + k*(1 - δ) - kp # makes it slower?
#
zGrid = [0.9526, 0.9760, 1, 1.0246, 1.0497]
PiZ = [0.9149 0.0823 0.0028 0 0
0.0206 0.9163 0.0618 0.0014 0
0.0005 0.0412 0.9167 0.0412 0.0005
0 0.0014 0.0618 0.9163 0.0206
0 0 0.0028 0.0823 0.9149]
# Next, get the kStar and kL and kH #
kStar = ( (1.0/β - 1.0 + δ) / α )^(1.0 / (α - 1.0))
kL = kStar * 0.85; kH = 1.15 * kStar;
println(kStar); println(kL); println(kH);
# Setting up an 499 points of kgrid #
kGrid = range(kL, kH, 499); println(median(kGrid))
## (b) ##
# First, set up the length variables #
nk = length(kGrid); nz = length(zGrid)
# Make settings for the iteration #
# First, make an array for each z_i #
k′Grid = repeat(kGrid, 1, nz, nk)
kGridMat = permutedims(k′Grid, (3, 2, 1))
#
k′GridMat = k′Grid .* (con.(zGrid', kGridMat, k′Grid) .> 0)
#k′GridMat = k′Grid .* (kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′Grid .> 0)
#
replace!(k′GridMat, 0 => NaN) # k′GridMat[k′GridMat .== 0] .= NaN
#
logC = u.(con.(zGrid', kGridMat, k′GridMat))
#logC = u.(kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′GridMat)
#
replace!(logC, NaN => -Inf) # logC[isnan.(logC)] .= -Inf
# precision and initial distance #
precision = 0.01; distance = 100
# start iteration #
i = 1
V = zeros(nk, nz)
V′ = similar(V)
TV = zeros(1, nz, nk)
VPiZ = similar(V)
logCC = similar(logC)
while distance > precision
mul!(VPiZ, V, transpose(PiZ))
logCC .= logC .+ VPiZ .* β
maximum!(TV, logCC)
V′ .= transpose(reshape(TV, nz, nk))
distance = maximum(j-k for (j,k) in zip(V′,V))
V .= V′
i += 1
end
# Add one more iteration to find Tk_index
_, k_index = findmax(logCC, dims=1)
mul!(VPiZ, V′, PiZ')
logCC .= logC .+ VPiZ .* β
_, Tk_index = findmax(logCC, dims=1)
# Reviving k-rules from the k_index array #
k′sol = transpose(reshape(k′Grid[Tk_index], nz, nk))
k′sol_prev = transpose(reshape(k′Grid[k_index], nz, nk)) # matrix of previous capital to compare with solutions #
dist_k′sol = maximum(abs(i-j) for (i,j) in zip(k′sol,k′sol_prev))
# answers #
println(i); println(distance); println(dist_k′sol)
end
@time main()
The world would be better off if there were a generic Julia VFI w/ n state variables, a generic utility (return/cost) function, and a generic transition function…
Does anyone know why:
k′GridMat = k′Grid .* (kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′Grid .> 0)
logC = u.(kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′GridMat)
is faster than the more generic
u(c) = log(c);
f(z,k;α=α) = z*(k^α)
con(z,k,kp;δ=δ) = f(z,k) + k*(1 - δ) - kp
k′GridMat = k′Grid .* (con.(zGrid', kGridMat, k′Grid) .> 0)
logC = u.(con.(zGrid', kGridMat, k′GridMat))
If this is a bottleneck, it is possible that using MKL will be faster; I think MATLAB uses it. Also, one must be careful about whether this is multithreaded or not while benchmarking.
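A sketch of how one could try that, assuming MKL.jl is installed (it is typically loaded before anything else so the BLAS backend is swapped early):
using MKL                # replaces the default OpenBLAS backend with MKL
using LinearAlgebra
BLAS.get_config()        # confirm which BLAS backend is actually active
BLAS.set_num_threads(4)  # pin the BLAS thread count for a fair comparison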
This part comes from my ignorance. When I ran the function without that part, the error that k_index is not defined frequently came up.
The previous version of the while loop that frequently produced that error was:
function loop_over_global5(distance, precision)
#Now initialize a V array#
V=zeros(n_k,n_z);i=1;
while distance>precision
(TV,Tk_index,distance)=loop_body3(V);
println(i);println(distance);
if distance≤precision
return (TV,V,Tk_index,distance,k_index,i);
break
else
V=TV; k_index=Tk_index::Array{CartesianIndex{3}, 3}; i=i+1;
end
end
end
I suspected that in the if part, when distance<=precision, the k_index from the previous iteration is somehow eliminated. So I tried to insert k_index=k_index in the main part of the loop, but for that to work in the first place, k_index has to be defined. So I put the k_index=Array{CartesianIndex{3}, 3} in front.
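If the only goal is to have k_index bound before the first iteration, one alternative (just a sketch, not the code from this thread) is to bind it to an actual empty array rather than to the type object, so the variable keeps a single concrete type when Tk_index is later assigned to it:
k_index = Array{CartesianIndex{3}, 3}(undef, 0, 0, 0)  # an empty 3-d array, not the type itself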
P.S. The performance tips may be very useful, but to me, who am not familiar with Julia or with computer programming itself, they have some cumbersome parts. Until now, I used R and MATLAB in a practical manner to conduct my research, which was not very helpful for understanding somewhat different languages like Julia. But I'll try to get used to them, thanks to all of your support :)
Thanks! Now, as I tried this, I think the speeds of the two have become much closer.
It seems evident that I know almost nothing about the spirit of Julia yet. It's a long way to go.
By the way, my MATLAB shows 1.225859 s while your code shows 1.498779 s for the operation. I don't know why we see this difference. The MATLAB code was as follows.
%%%5. Discrete dynamic programming in practice: decision rules through simulated data%%%
%%(a)%%
tic
%First, set up given data%
alpha=0.261;beta=0.99;delta=0.0176;n_star=1;
z_grid=[0.9526;0.9760;1;1.0246;1.0497];
Pi_z=[0.9149 0.0823 0.0028 0 0;...
0.0206 0.9163 0.0618 0.0014 0;...
0.0005 0.0412 0.9167 0.0412 0.0005;...
0 0.0014 0.0618 0.9163 0.0206;...
0 0 0.0028 0.0823 0.9149];
%Next, get the k_star and k_L and k_H%
k_star=(1/alpha*(1/beta-1+delta))^(1/(alpha-1));
k_L=k_star*0.85;k_H=1.15*k_star;
disp(round(k_star,4));disp(round(k_L,4));disp(round(k_H,4));
%Setting up an 499 points of kgrid%
k_grid=(k_L:((k_H-k_L)/498):k_H).';
disp(median(k_grid));
%%(b)%%
%First, set up the length variables%
n_k=length(k_grid);n_z=length(z_grid);
%Now initialize a V array%
V=zeros(n_k,n_z);
%Make settings for the iteration%
%First, make an array for each z_i%
k_prime_grid_mat_proto=repmat(k_grid,[1 n_z n_k]);%build prototype k' array with 3-d block array%
k_grid_mat=permute(k_prime_grid_mat_proto,[3 2 1]);%build k array with 3-d block array%;
k_prime_grid_mat=k_prime_grid_mat_proto.*(k_grid_mat.^alpha.*z_grid.'+k_grid_mat.*(1-delta)-k_prime_grid_mat_proto>=0);
k_prime_grid_mat(k_prime_grid_mat==0)=nan;
logC=log(k_grid_mat.^alpha.*z_grid.'+k_grid_mat.*(1-delta)-k_prime_grid_mat);
%build k' array that assigns nan to k' that is out of bound%
precision=0.01;distance=100;sz=size(k_prime_grid_mat);%precision and initial distance%
i=1;%iteration number%
%Start iteration%
while (distance > precision)
[TV,Tk_index]=max(logC+V*Pi_z.'.*beta,[],'omitnan');%max within column omitting nan%
TV=reshape(permute(TV,[1,3,2]),[],size(TV,2));%change 1x5x499 array into 499x5 matrix%
distance=max(max(TV-V));
i=i+1;disp(i);disp(distance);
if distance<precision
break
else
V=TV;k_index=Tk_index;
end
end
toc
Is this code slower than the Julia code on your computer? Sorry to bother you.
What I am doing is explained a bit by @Albert_Zevelev below. It is basically finding the optimized value function V with optimal policy k′ given k and z through a recursive method (finding a function V that satisfies TV(k)=V(k) for all k, given a functional operator T, which involves maximization).
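Written out, a sketch of the operator implied by logC and Pi_z above (assuming Π(z, z′) denotes the transition probability from z to z′):
$$ (TV)(k, z) = \max_{k'} \Big\{ \log\bigl(z k^{\alpha} + (1-\delta)k - k'\bigr) + \beta \sum_{z'} \Pi(z, z')\, V(k', z') \Big\} $$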
Thank you again for your support!
Different computers maybe?
I was referring to the difference in the speed ranking between the two. But yeah, a difference in computers may still be the reason.
I really recommend, as a first step towards writing this algorithm in the style and spirit of Julia, and to aid those you're asking for help, that you drop the unnecessary semicolons, insert some whitespace (both spaces and blank lines) to show organization, and reduce your variable name lengths to reduce the visual clutter.
This will help experienced Julia people see your code as Julia, in the sense of recognizing patterns, and seeing the places efficiency can be improved by insertion of views, or broadcasting, or hoisting constant expressions out of loops. As it is, I spend most of my brain cycles when looking at your code trying to translate it out of Matlab, or to distinguish between variables k_grid_mat, k_prime_grid_mat, k_prime_grid_mat_proto, etc.
OK, now you can't beat the following version in MATLAB. The times compare as follows on my machine:
Julia: 0.719097 seconds (134 allocations: 47.751 MiB)
MATLAB: Elapsed time is 1.264601 seconds.
All you have to do is replace the above while loop with the one below. I mainly compute everything in a single triple loop. Note that the Julia code is still single-threaded at the moment, whereas MATLAB functions are multithreaded by default, so multithreading can further enhance the speed of the Julia code.
The whole function now reads:
function main()
## (a) ##
# First, set up the given data#
α = 0.261
β = 0.99
δ = 0.0176
zGrid = [0.9526, 0.9760, 1, 1.0246, 1.0497]
PiZ = [0.9149 0.0823 0.0028 0 0
0.0206 0.9163 0.0618 0.0014 0
0.0005 0.0412 0.9167 0.0412 0.0005
0 0.0014 0.0618 0.9163 0.0206
0 0 0.0028 0.0823 0.9149]
# Next, get the kStar and kL and kH #
kStar = (1 / α * (1 / β - 1 + δ))^(1 / (α - 1))
kL = kStar * 0.85
kH = 1.15 * kStar
println(kStar)
println(kL)
println(kH)
# Setting up an 499 points of kgrid #
kGrid = range(kL, kH, 499)
println(median(kGrid))
## (b) ##
# First, set up the length variables #
nk = length(kGrid)
nz = length(zGrid)
# Make settings for the iteration #
# First, make an array for each z_i #
k′Grid = repeat(kGrid, 1, nz, nk)
kGridMat = permutedims(k′Grid, (3, 2, 1))
k′GridMat = k′Grid .* (kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′Grid .> 0)
replace!(k′GridMat, 0 => NaN)
logC = log.(kGridMat .^ α .* zGrid' .+ kGridMat .* (1 - δ) .- k′GridMat)
replace!(logC, NaN => -Inf)
# precision and initial distance #
precision = 0.01
distance = 100
# start iteration #
i = 1
V = zeros(nk, nz)
VPiZ = similar(V)
logCC = similar(logC)
while distance > precision
mul!(VPiZ, V, PiZ')
d = -Inf
for k = 1:nk, j = 1:nz
m = -Inf
for i = 1:nk
logCC[i,j,k] = logC[i,j,k] + VPiZ[i,j] * β
m = ifelse(m > logCC[i,j,k], m, logCC[i,j,k])
end
d = max(d, m - V[k,j]) # track the largest gap TV - V over the whole sweep
V[k,j] = m
end
distance = d
i += 1
end
# Add one more iteration to find Tk_index
_, k_index = findmax(logCC, dims=1)
mul!(VPiZ, V, PiZ')
logCC .= logC .+ VPiZ .* β
_, Tk_index = findmax(logCC, dims=1)
# Reviving k-rules from the k_index array #
k′sol = transpose(reshape(k′Grid[Tk_index], 5, 499))
k′sol_prev = transpose(reshape(k′Grid[k_index], 5, 499)) # matrix of previous capital to compare with solutions #
dist_k′sol = maximum(abs(i-j) for (i,j) in zip(k′sol,k′sol_prev))
# answers #
println(i)
println(distance)
println(dist_k′sol)
end
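Regarding the multithreading remark above, a hypothetical threaded variant of the hot sweep (not the author's code; it assumes the arrays set up in main(), a preallocated gap = zeros(nk, nz), and that Julia was started with threads, e.g. julia -t auto) could look like this. Each (k, j) pair touches only logCC[:, j, k], V[k, j] and gap[k, j], and VPiZ is read-only during the sweep, so the outer loop is safe to parallelize:
function sweep!(V, logCC, gap, logC, VPiZ, β)
    nk, nz = size(V)
    Threads.@threads for k in 1:nk
        for j in 1:nz
            m = -Inf
            @inbounds for i in 1:nk
                logCC[i, j, k] = logC[i, j, k] + VPiZ[i, j] * β
                m = max(m, logCC[i, j, k])
            end
            gap[k, j] = m - V[k, j]   # TV - V at this state
            V[k, j] = m
        end
    end
    return maximum(gap)               # the new distance (largest gap over all states)
end
Inside the while loop one would then call mul!(VPiZ, V, PiZ') followed by distance = sweep!(V, logCC, gap, logC, VPiZ, β).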
Is none of the iterative solver libraries suited to this problem class?
Yes:
By @nicolas
I've shown how in my links above, etc.