Hi Elrod,
Thanks, yes, I did move the inbounds inside now. Thank you!
You are right — I had actually overlooked this, and I have been getting wrong results ever since I abandoned the code that used a mask like this:
# (Abandoned version.) Normalizes `num` market-by-market using a boolean mask,
# dividing each market's rows by (column sums within the market + 1).
# NOTE(review): the loop iterates over 1:length(market_ids) — one iteration per
# observation index — rather than over unique(market_ids), one per market.
# Presumably this mismatch is the source of the wrong results mentioned above;
# confirm before reusing this pattern.
@threads for i in 1:length(market_ids)
mask = market_ids .== i
@views num[mask,:] .= num[mask, :] ./ (sum(num[mask,:],dims = 1) .+1)
end
in the function foo.
So maybe I will have to do something like this. I now apply foo_t! to both matrices simultaneously, because the subsetting is really what consumes the bulk of the computing time.
"""
    foo_t!(num1, num2, market_ids)

Normalize `num1` and `num2` market-by-market, in place.

For each distinct value `id` in `market_ids` and each row `i`, the entries of
row `i` whose columns belong to market `id` are divided by
`sum(entries) + 1` (the outside-option share normalization). Distinct market
ids select disjoint column sets, so threading over `unique(market_ids)` is
race-free: each task writes only to its own columns.

Both matrices must have one column per element of `market_ids`.
"""
function foo_t!(num1, num2, market_ids)
    ni = size(num1, 1)
    Threads.@threads for id in unique(market_ids)
        # One boolean mask per market — allocated once per task, not per row.
        mask = market_ids .== id
        @inbounds for i in 1:ni
            @views s1 = sum(num1[i, mask]) + 1
            @views s2 = sum(num2[i, mask]) + 1
            @views num1[i, mask] .= num1[i, mask] ./ s1
            @views num2[i, mask] .= num2[i, mask] ./ s2
        end
    end
end
"""
    bar_t!(mat_1_t, randvar_nu_t, delta)

Fill `mat_1_t[i, j] = exp(randvar_nu_t[i, j] + delta[j])`, in place.

Threads over columns (independent writes, so `Threads.@threads` is safe) and
vectorizes the inner column loop with `@turbo` (LoopVectorization.jl). Columns
are the innermost contiguous dimension, so the `i` loop is the cache-friendly
one.
"""
function bar_t!(mat_1_t, randvar_nu_t, delta)
    ni, nj = size(mat_1_t)
    Threads.@threads for j in 1:nj
        # Hoist the column shift out of the vectorized loop.
        dj = delta[j]
        @turbo for i in 1:ni
            mat_1_t[i, j] = exp(randvar_nu_t[i, j] + dj)
        end
    end
end
"""
    mean_t!(vec_1, mat_1_t)

Write the column means of `mat_1_t` into `vec_1`, in place, one thread per
column. `length(vec_1)` must equal `size(mat_1_t, 2)`. `@views` keeps the
per-column slices allocation-free.
"""
@views function mean_t!(vec_1, mat_1_t)
    Threads.@threads for j in eachindex(vec_1)
        vec_1[j] = mean(mat_1_t[:, j])
    end
end
"""
    predict_shares_bern_t(delta, randvar_nu_t, randvar_nu_inattention_t,
                          mat_1_t, mat_2_t, vec_1, vec_2, market_ids, nu_bern)

Predict shares as a Bernoulli mixture of the attentive and inattentive
branches, reusing the preallocated buffers `mat_1_t`, `mat_2_t`, `vec_1`,
`vec_2`. Returns `vec_1`, which is overwritten with the mixed shares.
"""
function predict_shares_bern_t(delta, randvar_nu_t, randvar_nu_inattention_t, mat_1_t, mat_2_t, vec_1, vec_2, market_ids, nu_bern)
    bar_t!(mat_1_t, randvar_nu_t, delta)
    bar_t!(mat_2_t, randvar_nu_inattention_t, delta)
    foo_t!(mat_1_t, mat_2_t, market_ids)
    mean_t!(vec_1, mat_1_t)
    mean_t!(vec_2, mat_2_t)
    # Mixture weight: logistic transform of nu_bern, in (0, 1).
    ee = exp(nu_bern) / (1 + exp(nu_bern))
    # Fully fused in-place broadcast; the original mixed dotted and undotted
    # operators (`.* ... + ...`), which allocated a temporary for the sum.
    @. vec_1 = vec_1 * ee + vec_2 * (1 - ee)
    return vec_1
end
This gives me the following performance with 8 threads:
I was thinking about supplying a preallocated BitVector and overwriting it each iteration, but that doesn't work with Threads.@threads on the outer loop in foo_t! (the threads would race on the shared buffer), which makes sense.