How can I make this piece of code faster: by optimising it, or by using multiple threads?
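
On the threading question: the per-agent updates inside one time step only read the θ snapshot and write each agent's own entry of A.θ, so the loop over upagents looks parallelisable. Below is a minimal, untested sketch of that idea (update_step! and the per-thread scratch buffers are my own naming, not something from the code further down); whether it actually pays off would need benchmarking.

using LightGraphs

# Sketch only: one synchronous update step, threaded over the sampled agents.
# `A` is the Arr struct from the code below, `θsnap` the phase snapshot pushed
# at the start of the step, `upagents` the agents chosen for updating.
function update_step!(A, θsnap::Vector{Float64}, upagents::Vector{Int64}, Pᵢ::Float64)
    N = length(A.θ)
    buffers = [zeros(N) for _ in 1:Threads.nthreads()]  # one scratch cedg per thread
    Threads.@threads for idx in 1:length(upagents)
        i = upagents[idx]
        cedg = buffers[Threads.threadid()]
        fill!(cedg, 0.0)
        for j in inneighbors(A.G, i)
            # rand() draws from a per-thread default RNG on Julia 1.3+, so this should be safe
            if i != j && rand() < Pᵢ
                cedg[j] = exp(-abs(θsnap[i] - θsnap[j]) / (1.0 - A.C[i]))
            end
        end
        cedg .*= (1.0 - A.C[i]) / sum(cedg)
        cedg[i] = A.C[i]
        s = 0.0
        c = 0.0
        for k in 1:N
            if cedg[k] != 0
                sinθ, cosθ = sincos(θsnap[k])
                s += cedg[k] * sinθ
                c += cedg[k] * cosθ
            end
        end
        # each thread writes only its own agents' entries of A.θ, so writes don't collide
        A.θ[i] = atan(s, c)
    end
    return A
end

main would then call update_step!(A, θₜ[A.t], upagents, Pᵢ) in place of its inner loop, and Julia would need to be started with JULIA_NUM_THREADS set to more than 1.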

Benchmarks were run on:

julia> versioninfo()
Julia Version 1.3.0-rc5.1
Commit 36c4eb2* (2019-11-17 19:04 UTC)
Platform Info:
  OS: Linux (x86_64-redhat-linux)
  CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-8.0.1 (ORCJIT, haswell)

Benchmarking yesterday’s version with Base.sincos, minus the ARPACK dependency that I don’t feel like installing, and a couple of other tweaks:

using StatsBase, LinearAlgebra, LightGraphs, BenchmarkTools, Random

mutable struct Arr
    t::Int64
    agents::Array{Int64,1}
    θ::Array{Float64,1}
    C::Array{Float64,1}
    G::SimpleDiGraph{Int64}
    W::Array{Float64,1}
end

function main(Cmin::Float64,Cmax::Float64,N::Int64,n::Int64,outdeg::Int64,Pᵣ::Float64,Pᵢ::Float64,P::Int64,μ::Float64,σ::Float64)
    A=Arr(0,[1:N;],Array{Float64}(undef,N),Array{Float64}(undef,N),
          watts_strogatz(N,outdeg,Pᵣ,is_directed=true),Array{Float64}(undef,N))
    randn!(A.θ)
    A.θ .= muladd.(A.θ, σ, μ)
    rand!(A.C)
    A.C .= muladd.(Cmax - Cmin, A.C, Cmin)
    A.W.=1.0.-A.C
    A.W .*= (1/sum(A.W))
    θₜ = Vector{Float64}[]
    upagents = Array{Int64}(undef, (N+1) >>> 1)
    cedg = Array{Float64}(undef,N)
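    # iterate until every phase is within 0.01 rad of θ[1], i.e. until (near) consensus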
    while(any(abs(x - A.θ[1]) > 0.01 for x in A.θ))
        A.t+=1
        push!(θₜ,copy(A.θ))
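        # draw (N+1)÷2 distinct agents to update this step, weighted by W ∝ 1 - C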
        sample!(Random.GLOBAL_RNG, A.agents, Weights(A.W), upagents, replace=false)
        @inbounds for i in upagents
            cedg.=0.0
            @inbounds for j in inneighbors(A.G, i) 
              if(i!=j && rand()<Pᵢ)
                cedg[j]=exp(-abs(θₜ[A.t][i]-θₜ[A.t][j])/(1.0-A.C[i]))
              end
            end
            cedg .*= ((1.0-A.C[i]) / sum(cedg))
            cedg[i]=A.C[i]
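            # weighted circular mean of own and neighbour phases: atan(Σ w·sin θ, Σ w·cos θ)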
            s=0.0
            c=0.0
            @inbounds for k in 1:N
              if(cedg[k]!=0)
                sinθₜ, cosθₜ = sincos(θₜ[A.t][k])
                s+=cedg[k]*sinθₜ
                c+=cedg[k]*cosθₜ
              end
            end
            A.θ[i]=atan(s,c)
        end
    end
    #return A.t,A.θ[1],sum(A.C.*θₜ[1])/sum(A.C),mean(A.C)
    return θₜ
end
julia> Cmin=0.5;Cmax=0.9;N=1000;n=5000;outdeg=250;Pᵣ=0.0;Pᵢ=0.5;P=500;μ=pi/2;σ=pi/18;
julia> @benchmark main($Cmin,$Cmax,$N,$n,$outdeg,$Pᵣ,$Pᵢ,$P,$μ,$σ) # $ on principle, but irrelevant for these runtimes
BenchmarkTools.Trial:
  memory estimate:  6.87 MiB
  allocs estimate:  14520
  --------------
  minimum time:     452.444 ms (0.00% GC)
  median time:      503.295 ms (0.00% GC)
  mean time:        518.045 ms (0.04% GC)
  maximum time:     650.211 ms (0.00% GC)
  --------------
  samples:          10
  evals/sample:     1

This was slightly faster than before the tweaks.
I did try actually vectorizing the sincos loop, but it was slower.
You have to get rid of the branch to vectorize. I assume a high proportion of the cedg entries are in fact zero?
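
For what it’s worth, a branch-free version of that accumulation loop could look like the sketch below (circmean_nobranch is just my name for it, not something in the code above). Entries with cedg[k] == 0 contribute nothing to either sum, so the test can simply be dropped; but sincos is still a scalar library call that LLVM won’t necessarily turn into SIMD code on its own, and if most of cedg really is zero, the branch-free loop does strictly more sincos work than the branchy one, which would be consistent with the vectorized attempt coming out slower.

# Branch-free accumulation: zero weights just add 0.0 to both sums,
# so the `if cedg[k] != 0` test is unnecessary for correctness.
function circmean_nobranch(cedg::Vector{Float64}, θ::Vector{Float64})
    s = 0.0
    c = 0.0
    @inbounds for k in eachindex(cedg, θ)
        sinθ, cosθ = sincos(θ[k])
        s += cedg[k] * sinθ
        c += cedg[k] * cosθ
    end
    return atan(s, c)
end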