Thank you very much indeed for your comments!

I am using Julia 1.61.

I use the default RNG, I have a module called Ran.jl, in which I set the random seed. Then I use just the default rand or randn function.

```
module Ran
using Random
function setseed(irn::Int64)
Random.seed!(irn)
return nothing
end
end
```

I will use the ProfileVew.jl you recommended to profile my code a little bit. I suspect the Julia RNG can be slower than the RNG I wrote in my Fortran code which is basically a Box-Muller method. Other than the RNG, my Julia code is actually a line by line translation of my Fortran code.

For example, a piece of my code is like this,

```
module Step
using LinearAlgebra
using DelimitedFiles
using Printf
include("Ran.jl")
using .Ran
include("Constants.jl")
using .Constants
#include("Mixture.jl")
#using .Mixture
include("Analysis.jl")
using .Analysis
export step_init,prep,steps,read_Yji,push_Yji,get_musigma,get_musigma_maxLL,musigma,mean_covar_init
mutable struct Mean_covar
mu::Array{Float64,2}
sigma::Array{Float64,2}
w::Float64
end
const musigma = Array{Mean_covar,1}() # const is global
function mean_covar_init(kmix::Int64,dim_p::Int64,weight::Array{Float64,1},sigma::Array{Float64,3},mu::Array{Float64,2})
@assert length(weight) == kmix
@assert size(sigma) == (kmix,dim_p,dim_p)
@assert size(mu) == (kmix,dim_p)
resize!(musigma, kmix) # musigma = Vector{MeanCovar}(undef, kmix)
for k in 1:kmix
musigma[k] = Mean_covar(zeros(dim_p,1),zeros(dim_p,dim_p),0.0)
musigma[k].mu[1,1] = mu[k,1]
musigma[k].mu[2,1] = mu[k,2]
musigma[k].sigma[1,1] = sigma[k,1,1]
musigma[k].sigma[2,2] = sigma[k,2,2]
musigma[k].w = weight[k]
end
return nothing
end
const imode = Ref{Int64}() # const is global, make it a Ref to lock the type
const itermax = Ref{Int64}()
const msample = Ref{Int64}()
const mgauss = Ref{Int64}()
const mi = Ref{Int64}()
const nsub = Ref{Int64}()
const dim_p = Ref{Int64}()
const kmix = Ref{Int64}()
const sig = Ref{Float64}()
const sig_inv = Ref{Float64}()
const SigV = Ref{Float64}()
const mu1 = Ref{Float64}()
const sig1 = Ref{Float64}()
const mu2 = Ref{Float64}()
const sig2 = Ref{Float64}()
const muV = Ref{Float64}()
const w1 = Ref{Float64}()
const w2 = Ref{Float64}()
const mitot = Ref{Int64}()
const mgauss_tot = Ref{Int64}()
const mgauss_max = Ref{Int64}()
const mgauss_min = Ref{Int64}()
const iter = Ref{Int64}()
const sigma_factor = Ref{Float64}()
const D = Ref{Float64}()
const normpdf_factor_pYq_i = Ref{Float64}()
const log_normpdf_factor_pYq_i = Ref{Float64}()
const h_diag = Ref(Array{Float64,3}(undef,0,0,0))
const ymh = Ref(Array{Float64,3}(undef,0,0,0))
const thetas = Ref(Array{Float64,3}(undef,0,0,0))
const muk_iter = Ref(Array{Float64,2}(undef,0,0))
const sigk_iter = Ref(Array{Float64,2}(undef,0,0))
const muV_iter = Ref(Array{Float64,2}(undef,0,0))
const sigV_iter = Ref(Array{Float64,2}(undef,0,0))
const wk_iter = Ref(Array{Float64,2}(undef,0,0))
const mukerror_iter = Ref(Array{Float64,2}(undef,0,0))
const sigkerror_iter = Ref(Array{Float64,2}(undef,0,0))
const ar_gik_k = Ref(Array{Float64,2}(undef,0,0))
const muVerror_iter = Ref(Array{Float64,2}(undef,0,0))
const sigVerror_iter = Ref(Array{Float64,2}(undef,0,0))
const log_pYq_km = Ref(Array{Float64,2}(undef,0,0))
const nik = Ref(Array{Float64,2}(undef,0,0))
const nik_sig = Ref(Array{Float64,2}(undef,0,0))
const wknik = Ref(Array{Float64,2}(undef,0,0))
const wknik_sig = Ref(Array{Float64,2}(undef,0,0))
const tauik = Ref(Array{Float64,2}(undef,0,0))
const Yji = Ref(Array{Float64,2}(undef,0,0))
const mgauss_ik = Ref(Array{Int64,2}(undef,0,0))
const isub_Metroplis_gik_all = Ref(Array{Int64,2}(undef,0,0))
const LL_iter = Ref(Array{Float64,1}(undef,0))
const sigma_iter = Ref(Array{Float64,1}(undef,0))
const ar_gik = Ref(Array{Float64,1}(undef,0))
const sigmaerror_iter = Ref(Array{Float64,1}(undef,0))
const norm = Ref(Array{Float64,1}(undef,0))
const norm_sig = Ref(Array{Float64,1}(undef,0))
const normerr = Ref(Array{Float64,1}(undef,0))
const log_norm = Ref(Array{Float64,1}(undef,0))
const wnow = Ref(Array{Float64,1}(undef,0))
const log_wnow = Ref(Array{Float64,1}(undef,0))
const t = Ref(Array{Float64,1}(undef,0))
const fmt_k_iter = Ref{Any}()
const fmt_float_iter = Ref{Any}()
const fmt_float_err_iter = Ref{Any}()
function step_init(imodein::Int64,itermaxin::Int64,min::Int64,mgaussin::Int64,miin::Int64,nsubin::Int64,pin::Int64,kmixin::Int64
,mu1in::Float64,sig1in::Float64,mu2in::Float64,sig2in::Float64
,muVin::Float64,sigVin::Float64
,w1in::Float64,w2in::Float64,sigin::Float64
,Din::Float64,tin::Array{Float64,1})
@assert length(tin) == miin
imode[]=imodein
itermax[]=itermaxin
msample[]=min
mgauss[]=mgaussin
mi[]=miin
nsub[]=nsubin
dim_p[]=pin
kmix[]=kmixin
sig[]=sigin
sig_inv[]=Constants.ONE/sig[]
SigV[]=sigVin
mu1[]=mu1in
sig1[]=sig1in
mu2[]=mu2in
sig2[]=sig2in
muV[]=muVin
w1[]=w1in
w2[]=w2in
mitot[]=mi[]*nsub[]
sigma_factor[]=nsub[]/mitot[]
D[]=Din
normpdf_factor_pYq_i[] = Constants.NORMALPDF_FACTOR^mi[]
log_normpdf_factor_pYq_i[] = log(normpdf_factor_pYq_i[])
mgauss_tot[] = mgauss[]*kmix[]
mgauss_max[] = ceil(mgauss_tot[]*0.95)-1
mgauss_min[] = mgauss_tot[] - mgauss_max[]
#resize!(isub_Metroplis_gik_all,msample[]*kmix[])
#reshape(isub_Metroplis_gik_all,(msample[],kmix[]))
isub_Metroplis_gik_all[] = Array{Int64,2}(undef,msample[],kmix[])
log_pYq_km[]=Array{Float64,2}(undef,msample[],kmix[])
LL_iter[]=Array{Float64,1}(undef,itermax[])
sigma_iter[]=Array{Float64,1}(undef,itermax[])
ar_gik[]=Array{Float64,1}(undef,itermax[])
sigmaerror_iter[]=Array{Float64,1}(undef,itermax[])
norm[]=Array{Float64,1}(undef,nsub[])
norm_sig[]=Array{Float64,1}(undef,nsub[])
normerr[]=Array{Float64,1}(undef,nsub[])
log_norm[]=Array{Float64,1}(undef,nsub[])
wnow[]=Array{Float64,1}(undef,kmix[])
log_wnow[]=Array{Float64,1}(undef,kmix[])
t[]=Array{Float64,1}(undef,mi[])
muk_iter[]=Array{Float64,2}(undef,kmix[],itermax[])
sigk_iter[]=Array{Float64,2}(undef,kmix[],itermax[])
muV_iter[]=Array{Float64,2}(undef,kmix[],itermax[])
sigV_iter[]=Array{Float64,2}(undef,kmix[],itermax[])
wk_iter[]=Array{Float64,2}(undef,kmix[],itermax[])
mukerror_iter[]=Array{Float64,2}(undef,kmix[],itermax[])
sigkerror_iter[]=Array{Float64,2}(undef,kmix[],itermax[])
ar_gik_k[]=Array{Float64,2}(undef,kmix[],itermax[])
muVerror_iter[]=Array{Float64,2}(undef,kmix[],itermax[])
sigVerror_iter[]=Array{Float64,2}(undef,kmix[],itermax[])
nik[]=Array{Float64,2}(undef,nsub[],kmix[])
nik_sig[]=Array{Float64,2}(undef,nsub[],kmix[])
wknik[]=Array{Float64,2}(undef,nsub[],kmix[])
wknik_sig[]=Array{Float64,2}(undef,nsub[],kmix[])
tauik[]=Array{Float64,2}(undef,nsub[],kmix[])
Yji[]=Array{Float64,2}(undef,mi[],nsub[])
mgauss_ik[]=Array{Int64,2}(undef,nsub[],kmix[])
h_diag[]=Array{Float64,3}(undef,mi[],kmix[],msample[])
ymh[]=Array{Float64,3}(undef,mi[],kmix[],msample[])
thetas[]=Array{Float64,3}(undef,dim_p[],msample[],kmix[])
t[]=tin
for i in 1:kmix[]
wnow[][i] = musigma[i].w
end
mgauss_ik[] .= mgauss[]
fmt_k_iter[] = Printf.Format("%20s"*"%15i |"^kmix[])
fmt_float_iter[] = Printf.Format("%20s"*" %12.5g |"^kmix[])
fmt_float_err_iter[] = Printf.Format(" %12.5g +- %-12.5g |")
return nothing
end
....
```

I defined many variables as const with Ref, in order to lock their type and ensure type stability, as many guys suggested if I have to write my Julia code in a Fortran way now.

I treat these const Ref variables as ‘global’ variable which in principle may not decrease performance by too much. The other advantage of using Ref is that then everywhere if I need to use these ‘global’ variable, say use sig, I need to write sig[ ], the [ ] helps me to recognize that it is a ‘global’ variable.

The function step_init is simply to initialize the ‘global’ variables in the module.

May I ask, in your Trixi,jl (which is a very nice and advanced Julia code), do you also have some place your might need to define some constants or parameters that you need to use in your whole code? Do you have a type or struct that serve as parameters etc, so that you do not have to pass a lot of arguments in your functions everytime?

I currently just run in it serial, I did not add MPI in Julia. I just first benchmark the single core performance in Julia and Fortran. Once I could get Julia’s performance close to Fortran, implement MPI should be trivial. Due to the nature of Monte Carlo, I use MPI. I do not use things like openMP which is shared memory stuff and may have thread safe issues.

By the way, I saw in your Trixi.jl code, you used many fancy stuff begin with @, such as @ inline, @ inbound, @ unpack, etc. I wonder, like, if you do not add those commands, will the Julia code slow down by a lot?