Hi all,

I am trying to speed up this loop. I am providing a MWE and I wanted to know if any of you could spot any obvious performance improvementes.

```
using LinearAlgebra, Statistics, Random
C=rand([0,1],120,13,8000)
D = rand(120)
E=findall(sum(sum(C,dims=1), dims=2)[1,1,:].>0)
P=rand(120,8000)
H=(P.>0.5)
H_D=(D.>0.5)
G_C=(D.<0.05)
A=ones(8000,13)
NF=8000
@views function fun1(C,D,E,P,H,H_D;G_C=G_C,A=A,NF=NF)
J=size(C)[1]
Tmax=size(C)[2]
T=size(C)[2]-1
N=size(C)[3]
num_large0 = zeros(2)
num_small0 = zeros(2)
num_large1 = zeros(2)
num_small1 = zeros(2)
num2 = zeros(1)
num_b0=zeros(1)
num_b1=zeros(1)
K = size(E)[1]
if N>NF
A= vcat(A,A,A)
H = cat(H ,H ,H ,dims=2)
P= cat(P,P,P,dims=2)
end
@fastmath @inbounds for n=1:K, t=2:Tmax, j=1:J
c0 = C[j,t,E[n]]*A[E[n],t]
c1 = C[j,t-1,E[n]]*(C[j,t,E[n]])*A[E[n],t]
num_large0[1] = num_large0[1] .+ c0.*(H[j,E[n]]==2)*P[j,E[n]]*D[j]
num_small0[1] = num_small0[1] .+ c0.*(H[j,E[n]]==1)*P[j,E[n]]*D[j]
num_large1[1] = num_large1[1] .+ c1.*(H[j,E[n]]==2)*P[j,E[n]]*D[j]
num_small1[1] = num_small1[1] .+ c1.*(H[j,E[n]]==1)*P[j,E[n]]*D[j]
num_b0[1] = num_b0[1] .+ c0.*(1-G_C[j])*P[j,E[n]]*D[j]
num_b1[1] = num_b1[1] .+ c1.*(1-G_C[j])*P[j,E[n]]*D[j]
if H_D[j]==1
num_large0[2] = num_large0[2] .+ c0.*(H[j,E[n]]==2)*P[j,E[n]]*D[j]
num_small0[2] = num_small0[2] .+ c0.*(H[j,E[n]]==1)*P[j,E[n]]*D[j]
num_large1[2] = num_large1[2] .+ c1.*(H[j,E[n]]==2)*P[j,E[n]]*D[j]
num_small1[2] = num_small1[2] .+ c1.*(H[j,E[n]]==1)*P[j,E[n]]*D[j]
end
end
return vcat(num_large0./(J*N), num_large1./(J*N), num_small0./(J*N), num_small1./(J*N), num_b0./(J*N), num_b1./(J*N))
end
TEST=@time fun1(C,D,E,P,H,H_D)
```

Performance result:

`91.233291 seconds (1.18 G allocations: 23.470 GiB, 1.82% gc time)`

EDIT: Provided MWE. Please disregard how the data is being generated.

Thanks in advance!