Using my knowledge of the problem domain, I could generate more or less the same input data. Running in about 9 ms, I bet the following Julia version beats all the versions published in that blog post. In my own experience, properly written Julia code matches the best industrial compilers (ifort) and often even beats them.
function apply_filter_jl(E, wx)
    # Accumulate sum of E[i,j] * conj(wx[i,j]); the scalar adjoint ' is the complex conjugate
    Xest = 0.0 + 0.0im
    for j = 1:size(E, 2)
        @simd for i = 1:size(E, 1)
            Xest += E[i, j] * wx[i, j]'
        end
    end
    return Xest
end
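Aside: since the adjoint of a scalar is just its complex conjugate, the double loop above is an ordinary conjugated inner product. As a quick sanity check it should agree with LinearAlgebra.dot, which conjugates its first argument (the random matrices here are purely illustrative, and the match is only up to @simd reordering of the floating-point sums):

using LinearAlgebra
A = randn(ComplexF64, 5, 2)   # stand-in for a signal window
B = randn(ComplexF64, 5, 2)   # stand-in for a tap matrix
apply_filter_jl(A, B) ≈ dot(vec(B), vec(A))   # should return true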
@views @fastmath function cma_jl(E, wxy, mu, R, os)
    L, pols = size(E)
    ntaps = size(wxy, 1)
    N = floor(Int, (L/os/ntaps - 1) * ntaps)         # number of filter-update steps
    err = Matrix{ComplexF64}(undef, L, pols)
    for k = 1:pols
        for i = 1:N
            X = E[i*os-1:i*os+ntaps-2, :]            # sliding input window (a view, thanks to @views)
            Xest = apply_filter_jl(X, wxy[:, :, k])
            err[i, k] = (R - abs2(Xest)) * Xest      # CMA error term
            wxy[:, :, k] .+= mu .* err[i, k]' .* X   # in-place tap update
        end
    end
    return err, wxy
end
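As a quick smoke test before the real benchmark (the toy sizes and names below are mine, chosen only to show the calling convention), note that cma_jl mutates its wxy argument and returns the error matrix alongside it:

Etoy = randn(ComplexF64, 400, 2)      # toy dual-polarization input
wtoy = zeros(ComplexF64, 5, 2, 2)     # 5-tap 2x2 equalizer
wtoy[3, 1, 1] = 1                     # centre-spike initialization, as in the benchmark below
wtoy[3, 2, 2] = 1
errtoy, _ = cma_jl(Etoy, wtoy, 1e-3, 1, 2)
size(errtoy)                          # (400, 2); only the first N = 195 rows are actually written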
L = 10^5 # number of symbols
SNR = 10 # linear signal-to-noise ratio
sig = 1/sqrt(2) .* rand((1+1im, 1-1im, -1-1im, -1+1im), L, 2) # unit-modulus QPSK symbols, 2 polarizations
sigPower = sum(abs2, sig) / L
noisePower = sigPower / SNR
noise = sqrt(noisePower) .* randn(ComplexF64, L, 2)
s4 = sig .+ noise # noisy received signal
wxy0 = zeros(ComplexF64, 21, 2, 2) # 21-tap 2x2 butterfly equalizer
wxy0[22÷2, 1, 1] = 1 # centre-spike initialization
wxy0[22÷2, 2, 2] = 1
wxyin = copy(wxy0)
using BenchmarkTools
@btime cma_jl($s4, $wxyin, 1e-3, 1, 2) # mu = 1e-3, R = 1 (unit-modulus target), oversampling = 2
# 9.942 ms (2 allocations: 3.05 MiB)
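One caveat about the timing: cma_jl updates wxyin in place, so each @btime sample keeps refining the same taps. If you want every evaluation to start from the centre-spike initialization instead, BenchmarkTools' setup keyword can hand it a fresh copy (shown only as an alternative, not what produced the number above):

@btime cma_jl($s4, w, 1e-3, 1, 2) setup=(w = copy(wxy0)) evals=1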