A basic issue is that you are allocating tons of little matrices over and over, and so at some point it will start spending all of its time doing garbage collection.
The solution is to (a) allocate arrays like pool once and re-use and (b) use StaticArrays for all of your little 2x2 and 4x4 matrices, since those are fixed sizes that you can tell the compiler to exploit.
I translated your code to use these principles, without changing what it does (I think), and also removed some Matlab-isms like allocating 1-column matrices when what you really want is a 1d array. It should be much faster:
using LinearAlgebra
using StatsBase
using Statistics
using Random
using StaticArrays
# adapted from rand(Haar(2)) in RandomMatrices, modified
# to use staticarrays, only for beta=2
function rand_haar2(::Val{n}) where n
M = @SMatrix randn(ComplexF64, n,n) # = rand(Ginibre(2, n))
q = qr(M).Q
L = cispi.(2 .* @SVector(rand(n)))
return q*diagm(L)
end
function main2(; ps=3, theta = 1.39, Nk = 800, Nr = 500)
sum_z = zeros(Nk)
sum_zlog = zeros(Nk)
sum_s = zeros(Nk)
sum_slog = zeros(Nk)
Npool = 10^ps
pool = zeros(Npool)
pool_sample = zeros(Npool)
logpool = zeros(Npool)
z_list = zeros(Nk)
s_list = zeros(Nk)
z_log_list = zeros(Nk)
s_log_list = zeros(Nk)
Kup= @SMatrix[cos(theta) 0; 0 sin(theta)]
Kdown = @SMatrix[sin(theta) 0; 0 cos(theta)]
P2up = kron(@SMatrix[1. 0.;0. 1.], @SMatrix[1 0; 0 0])
P2down = kron(@SMatrix[1 0;0 1], @SMatrix[0 0;0 1])
for i in 1:Nr
pool .= 0.5
pool_sample .= 0
z_list .= 0
s_list .= 0
z_log_list .= 0
z_log_list .= 0
for k in 1:Nk
poolcount = 0
while poolcount < Npool
z1 = pool[rand(1:Npool)]
rho1 = diagm(@SVector[z1,1-z1])
z2 = pool[rand(1:Npool)]
rho2 = diagm(@SVector[z2,1-z2])
u1 = rand_haar2(Val{2}())
u2 = rand_haar2(Val{2}())
K1up = u1*Kup*u1'
K1down = u1*Kdown*u1'
K2up = u2*Kup*u2'
K2down = u2*Kdown*u2'
rho1p = K1up*rho1*K1up'
rho2p = K2up*rho2*K2up'
p1 = real(tr(rho1p+rho1p'))/2
p2 = real(tr(rho2p+rho2p'))/2
rho1p = (rho1p+rho1p')/(2*p1)
rho2p = (rho2p+rho2p')/(2*p2)
rho = kron(rho1p,rho2p)
U = rand_haar2(Val{4}())
rho_p = P2up*U*rho*U'*P2up'
p = real(tr(rho_p+rho_p'))/2
temp =(rho_p+rho_p')/2
#rho_f = @SMatrix[tr(temp[1:2,1:2]) tr(temp[1:2,3:4]); tr(temp[3:4,1:2]) tr(temp[3:4,3:4])]/(p)
rho_f = @SMatrix[temp[1,1]+temp[2,2] temp[1,3]+temp[2,4]; temp[3,1]+temp[4,2] temp[3,3]+temp[4,4]]/(p)
rho_f = (rho_f+rho_f')/2
t = abs(tr(rho_f*rho_f))
z = (1-t)/(1+sqrt(2*t-1))
if !iszero(z)
poolcount = poolcount+1
pool_sample[poolcount] = abs(z)
end
end
pool .= pool_sample
pool_sample .= 0
logpool .= log.(pool)
z_log_list[k] = mean(logpool)
s_log_list[k] = std(logpool)/sqrt(Npool)
z_list[k] = mean(pool)
s_list[k] = std(pool)/sqrt(Npool)
end
sum_z .+= z_list
sum_zlog .+= z_log_list
sum_s .+= s_list.^2
sum_slog .+= s_log_list.^2
end
return sum_z/Nr, sqrt.(sum_s)/Nr, sum_zlog/Nr, sqrt.(sum_slog)/Nr
end