Annotated Code
using Statistics, LinearAlgebra, Random, BenchmarkTools
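# LinearAlgebra provides `I` and `mul!`, Random provides `randn!` and `Random.seed!`, and
# BenchmarkTools provides `@benchmark` (and `@btime`); Statistics does not appear to be
# used directly below.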
# This function modifies B in place, i.e. it reuses the memory given to us. Allocating new
# memory is slow, so reusing memory is a good idea. Additionally, since slicing in Julia
# copies by default, @views lets us index directly into B for the computation instead of
# copying each slice and indexing into the copy.
# Using .= for the initial values of B[:,2] likewise writes the contents of x straight
# into B without allocating a temporary vector.
function chebyshev_basis!(x, B)
    B[:, 2] .= x
    for n in 3:size(B, 2)
        @. @views B[:, n] = 2. * x * B[:, n-1] - B[:, n-2]
    end
end
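# A minimal sanity check of the recurrence (illustrative only; check_chebyshev is not part
# of the benchmark). Note the caller's contract: B[:,1] must already hold ones, since
# T_0(x) = 1 is never written by chebyshev_basis! itself; compute_price below takes care
# of this via `ones(n, order)`. Column j of B then holds T_{j-1} evaluated at x.
function check_chebyshev(x::AbstractVector, order::Integer)
    B = ones(length(x), order)
    chebyshev_basis!(x, B)
    # T_2(x) = 2x^2 - 1 ends up in column 3, so this maximum error should be 0 (or tiny)
    maximum(abs.(B[:, 3] .- (2 .* x .^ 2 .- 1)))
end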
# ridge_regression also used to allocate its result, but a look at the caller shows that
# already existing memory can be reused for it. Passing in a preallocated output buffer
# therefore saves a potentially large allocation on every call.
function ridge_regression!(out, X, Y, λ)
    # The normal-equations solve below still allocates (X'X, X'Y and the factorization);
    # a fully in-place version is possible with preallocated buffers, see the sketch below.
    β = (X'X + λ * I) \ (X'Y)
    mul!(out, X, β)
end
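# A sketch of that fully in-place variant, assuming the caller also preallocates `XtX`
# (order × order) and `β` (length order). These buffers and the function name are
# illustrative only; the benchmark below keeps using ridge_regression! as written above.
function ridge_regression_prealloc!(out, β, XtX, X, Y, λ)
    mul!(XtX, X', X)                      # XtX = X'X, written into the preallocated buffer
    @inbounds for i in axes(XtX, 1)
        XtX[i, i] += λ                    # add the ridge penalty λ*I on the diagonal
    end
    mul!(β, X', Y)                        # β = X'Y
    ldiv!(cholesky!(Symmetric(XtX)), β)   # solve (X'X + λ*I) * β = X'Y in place
    mul!(out, X, β)                       # out = X * β
end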
# This function had an unnecessary conversion from Int to Float64. Because the input is
# always Float64, using the floating point literal 2. instead of the integer literal 2
# gets rid of that conversion.
# Additionally, creating the output inside the function is unnecessary, since the caller
# never reuses the unscaled input. Writing into the memory given to us instead of
# allocating new memory is more efficient because allocations take time.
# A small worked example follows the function.
function scale!(x)
    xmin, xmax = extrema(x)
    a = 2. / (xmax - xmin)
    b = -0.5 * a * (xmin + xmax)
    @. x = a * x + b
end
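# Worked example (illustrative only): for x = [1.0, 2.0, 5.0] we have xmin = 1 and xmax = 5,
# so a = 2/(5-1) = 0.5 and b = -0.5*0.5*(1+5) = -1.5, and scale!(x) turns x into
# [-1.0, -0.5, 1.0]: the minimum maps to -1, the maximum to +1, i.e. onto the interval
# the Chebyshev polynomials are defined on.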
# advance was called a lot and allocated a lot of memory, both for the vector of random
# numbers and for the output. Preallocating both buffers and reusing them between calls
# removes those allocations and speeds the function up substantially.
function advance!(out, randScratch, S, r, σ, Δt, n)
    # randn! fills the preallocated scratch vector in place instead of allocating a new one
    randn!(randScratch)
    sqrtΔt = sqrt(Δt)
    @. out = S + r * S * Δt + σ * S * sqrtΔt * randScratch
end
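# The effect of filling a preallocated buffer can be checked in isolation, for example
# (illustrative REPL snippet, not part of the benchmark):
#     scratch = zeros(100_000);
#     @btime randn(100_000);     # allocates a fresh 100_000-element vector on every call
#     @btime randn!($scratch);   # fills the existing buffer, zero allocations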
function compute_price(Spot, σ, K, r, n, m, Δt, order)
    Random.seed!(0)
    # We first have to create the working memory
    S = zeros(n, m + 1)
    # view(S,:,1) is equivalent to @view S[:,1] - we get a lightweight
    # view of the underlying matrix without copying the data. Writing
    # to this view writes to S, and the view itself can be reused!
    start_view = view(S, :, 1)
    # This creates the scratchpad/scratch memory for the random values used in advance!
    randScratch = zeros(n)
    # As described above, writing to start_view writes to S. Using `fill!` is
    # faster than creating a new vector, multiplying that vector by `Spot`
    # and assigning the result to S[:,1]. We're saving an allocation again.
    fill!(start_view, Spot)
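    # (For comparison, an allocating alternative would be something like
    # `S[:, 1] = Spot * ones(n)`, which first builds a temporary n-element vector;
    # fill! writes Spot directly into the existing column.)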
    # This loop makes use of views into existing memory to allow `advance!` to
    # write directly into S.
    for t = 1:m
        next_view = view(S, :, t+1)
        advance!(next_view, randScratch, start_view, r, σ, Δt, n)
        # the view written in this iteration becomes the starting view of the next one
        start_view = next_view
    end
    # The next few lines have stayed roughly the same, except for the
    # initialisation of value[:,end]. Again, we don't want to create a copy of
    # CFL[:,end], so we instead create a lightweight view into the existing memory
    # and use that to calculate the values for value[:,end].
    discount = exp(-r * Δt)
    CFL = max.(0, K .- S)
    value = zeros(n, m)
    value[:, end] .= view(CFL, :, size(CFL, 2)) .* discount
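    # (CFL[i, t+1] is the immediate exercise payoff of the put, max(K - S, 0), on path i at
    # time step t, and value[:, end] starts from the discounted payoff at the final step.)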
    # Similar to the scratch memory for the random values above, we can create scratch
    # memory for `chebyshev_basis!` and reuse it between loop iterations.
    CV = zeros(n, m)
    chebyshev_scratchpad = ones(n, order)
    for k = 1:m-1
        t = m - k
        t_next = t + 1
        # We create the current view (again, no copying of data from the underlying S!)
        # and use `scale!` to scale the values _in place, not allocating any new memory_.
        # Afterwards, we pass that view into `chebyshev_basis!` together with the scratch
        # memory to compute the needed values _in place_.
        stmp = view(S, :, t_next)
        scale!(stmp)
        chebyshev_basis!(stmp, chebyshev_scratchpad)
        # By the same principle, we create views into `value` and `CV` and use the result
        # of the `chebyshev_basis!` computation to compute the ridge regression _in place_,
        # allocating as little as possible and reusing as much memory as possible.
        # The original code allocated a lot of memory on almost every line during this part,
        # which made it very slow.
        YY = view(value, :, t_next)
        cvview = view(CV, :, t)
        ridge_regression!(cvview, chebyshev_scratchpad, YY, 100)
        # Another view, once more saving the allocation of copying the data into a new vector.
        # Looping over the views with `ifelse` is faster than the original broadcast, which
        # had to create copies and slices; here nothing needs to be allocated at all.
        cflview = view(CFL, :, t_next)
        for i in axes(value, 1)
            value[i, t] = discount * ifelse(cflview[i] > cvview[i], cflview[i], YY[i])
        end
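        # (A fused broadcast over the same views, e.g.
        #  `@. value[:, t] = discount * ifelse(cflview > cvview, cflview, YY)`,
        #  should be roughly equivalent; the explicit loop just makes the
        #  zero-allocation intent obvious.)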
    end
    # For the last part the original code created a lot of intermediary matrices and
    # transposes of those matrices, which is expensive because the memory was not being
    # reused at all. Looping over the relevant indices directly and accumulating the
    # desired values into `PRICE` is much faster, since we only read from existing memory
    # and add into a known location. Dividing by `n` afterwards also happens at that
    # known location, so that's fast.
    # The speedup here is mostly an algorithmic improvement: building `POF`, `FPOF` and
    # `dFPOF` only to loop over `dFPOF` again with `sum(dFPOF)` is a lot of unnecessary
    # looping. In other words, for each path we find the first time step where immediate
    # exercise beats the estimated continuation value and add that path's discounted
    # payoff; a single pass over CV and CFL does the whole job.
    PRICE = 0.
    @inbounds for i = 1:n
        for j = 1:m
            if CV[i, j] < CFL[i, j+1] && CFL[i, j+1] > 0.
                PRICE += CFL[i, j+1] * exp(-r * (j-1) * Δt)
                break
            end
        end
    end
    PRICE /= n
    # PRICE /= n is the last expression in the function, so PRICE is returned implicitly.
end
######
# Warming up the benchmark by calling compute_price with many different values is not
# necessary: a function is compiled once per combination of argument _types_, not _values_,
# so a single call with the right types (the line below) is enough. After it, neither the
# @benchmark run nor the timings inside run() include compilation time of compute_price().
compute_price(36., .2, 40., .06, 100000, 10, 1/10, 5)
@benchmark compute_price(36., .2, 40., .06, 100000, 10, 1/10, 5)
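# Note that compute_price is called with literal arguments here. If the arguments were
# non-constant globals instead, BenchmarkTools recommends interpolating them with `$` so
# that global-variable overhead does not pollute the measurement, e.g. (illustrative only):
#     spot = 36.
#     @benchmark compute_price($spot, .2, 40., .06, 100000, 10, 1/10, 5)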
# After one execution of `run()` the CPU caches will hold some of the touched data, so
# executing `run()` multiple times may still show a small speedup. This is not exclusive
# to Julia.
# Putting all the benchmarking code into a single function also keeps the parameters out
# of global scope and gives the compiler the chance to propagate those constant numbers
# down into the called functions.
function run()
    Spot = 36.0
    σ = 0.2
    n = 100000
    m = 10
    K = 40.0
    r = 0.06
    T = 1.
    Δt = T / m
    ε = 1e-2
    for order in [5, 10, 25, 50]
        t0 = time()
        P = compute_price(Spot, σ, K, r, n, m, Δt, order)
        dP_dS = (compute_price(Spot + ε, σ, K, r, n, m, Δt, order) - P) / ε
        dP_dσ = (compute_price(Spot, σ + ε, K, r, n, m, Δt, order) - P) / ε
        dP_dK = (compute_price(Spot, σ, K + ε, r, n, m, Δt, order) - P) / ε
        dP_dr = (compute_price(Spot, σ, K, r + ε, n, m, Δt, order) - P) / ε
        t1 = time()
        out = (t1 - t0)
        println(order, ": ", 1000 * out)
    end
end
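# Each printed timing covers five compute_price calls (the price itself plus the four
# finite-difference sensitivities), so the per-call time is roughly a fifth of the number
# shown.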
isinteractive() || run()
# This call is not really necessary - running `julia file.jl` where file.jl is the
# file this code is stored in will execute run() via the line above.
# The `isinteractive() ||` part just prevents `run()` from being executed when
# including this file on the REPL.
run()