How can I make this piece of code faster: by optimising it, or by using multiple threads?

@GunnarFarneback: Your suggestion significantly lowered the execution time, to the order of milliseconds.
@Vasily_Pisarev: Your suggestions did lower the allocations.

The revised code:

using Distributions,Distributed,StatsBase,LinearAlgebra,LightGraphs,BenchmarkTools,Random,Plots

"""
    Arr

Mutable container for the state of one simulation run. It is mutable because
`main` advances the step counter `t` in place.
"""
mutable struct Arr
    t::Int64                  # number of update steps performed so far
    agents::Vector{Int64}     # agent ids (filled with 1:N by `main`)
    θ::Vector{Float64}        # current angle of every agent
    C::Vector{Float64}        # per-agent weight given to the agent's own angle
    G::SimpleDiGraph{Int64}   # directed interaction network
    W::Vector{Float64}        # per-agent sampling weight for being updated
end

"""
    main(Cmin, Cmax, N, n, outdeg, Pᵣ, Pᵢ, P, μ, σ)

Run the angle-averaging dynamics on a directed Watts–Strogatz network until every
agent's angle is within `0.01` of agent 1's angle, and return the trajectory.

At every step half of the agents (`round(Int, 0.5N)`) are sampled without
replacement with weights proportional to `1 - C`. Each sampled agent `i` replaces
its angle with the circular weighted mean of

- its own angle from the previous step, with weight `C[i]`, and
- the previous-step angles of those in-neighbours that pass an independent
  Bernoulli(`Pᵢ`) draw, with weights proportional to
  `exp(-|θᵢ - θⱼ| / (1 - C[i]))`, normalised to sum to `1 - C[i]`.

All updates within a step read the snapshot taken at the start of that step.

# Arguments
- `Cmin`, `Cmax`: bounds of the uniform distribution the `C` values are drawn from.
- `N`: number of agents (vertices of the network).
- `n`: unused; kept so existing callers keep working.
- `outdeg`, `Pᵣ`: forwarded to `watts_strogatz` as its degree and rewiring
  probability arguments.
- `Pᵢ`: probability that an in-neighbour takes part in a given update.
- `P`: unused; kept so existing callers keep working.
- `μ`, `σ`: mean and standard deviation of the normal distribution of initial angles.

# Returns
A `Vector{Vector{Float64}}` with one snapshot of all angles per step, taken
*before* that step's updates. The final state is therefore not included.
"""
function main(Cmin::Float64, Cmax::Float64, N::Int64, n::Int64, outdeg::Int64,
              Pᵣ::Float64, Pᵢ::Float64, P::Int64, μ::Float64, σ::Float64)
    A = Arr(
        0,
        [1:N;],
        Array{Float64}(undef, N),
        Array{Float64}(undef, N),
        watts_strogatz(N, outdeg, Pᵣ, is_directed=true),
        Array{Float64}(undef, N),
    )
    rand!(Normal(μ, σ), A.θ)
    rand!(Uniform(Cmin, Cmax), A.C)
    A.W .= 1.0 .- A.C
    A.W ./= sum(A.W)
    # A.W never changes after this point, so wrap it once instead of once per
    # step (constructing `Weights` sums the whole vector every time).
    sampling_weights = Weights(A.W)

    θₜ = Vector{Float64}[]
    upagents = Array{Int64}(undef, round(Int, 0.5 * N))
    cedg = Array{Float64}(undef, N)
    # Per-step caches of sin/cos of the snapshot. Every update in a step reads
    # the same snapshot, so N evaluations per step replace one evaluation per
    # (agent, selected neighbour) pair while producing identical values.
    sinθ = Array{Float64}(undef, N)
    cosθ = Array{Float64}(undef, N)

    while any(abs(x - A.θ[1]) > 0.01 for x in A.θ)
        A.t += 1
        θprev = copy(A.θ)
        push!(θₜ, θprev)
        sinθ .= sin.(θprev)
        cosθ .= cos.(θprev)
        sample!(Random.GLOBAL_RNG, A.agents, sampling_weights, upagents, replace=false)
        @inbounds for i in upagents
            room = 1.0 - A.C[i]    # total weight available to the neighbours
            θᵢ = θprev[i]
            cedg .= 0.0
            for j in inneighbors(A.G, i)
                # `rand()` is deliberately evaluated first so that one random
                # number is consumed per in-neighbour, self-loop or not.
                if rand() < Pᵢ && i != j
                    cedg[j] = exp(-abs(θᵢ - θprev[j]) / room)
                end
            end
            total = sum(cedg)
            # When no neighbour was selected the total is zero; dividing would
            # turn every weight into NaN (0/0) and poison θ. Skipping the
            # normalisation leaves only the agent's own term below, so the
            # agent simply keeps its direction.
            if total > 0
                cedg ./= (total / room)
            end
            cedg[i] = A.C[i]
            s = 0.0
            c = 0.0
            for k in 1:N
                w = cedg[k]
                if w != 0
                    s += w * sinθ[k]
                    c += w * cosθ[k]
                end
            end
            A.θ[i] = atan(s, c)
        end
    end
    return θₜ
end
# Benchmark parameters for the N = 1000 network.
Cmin = 0.5; Cmax = 0.9; N = 1000; n = 5000; outdeg = 250;
Pᵣ = 0.0; Pᵢ = 0.5; P = 500; μ = pi / 2; σ = pi / 18;
# Interpolate the non-const globals with `$` so that @btime measures `main`
# itself rather than the cost of looking up untyped global variables.
@btime main($Cmin, $Cmax, $N, $n, $outdeg, $Pᵣ, $Pᵢ, $P, $μ, $σ)
589.232 ms (14481 allocations: 6.67 MiB)

To simulate networks with more than 10,000 nodes, I am planning to adapt this code (the inner for loops) so that it runs on a GPU. Is that the right approach? With the code above, I get the following performance:

# Benchmark parameters for the N = 10000 network.
Cmin = 0.5; Cmax = 0.9; N = 10000; n = 5000; outdeg = 2500;
Pᵣ = 0.0; Pᵢ = 0.5; P = 5000; μ = pi / 2; σ = pi / 18;
# Interpolate the non-const globals with `$` so that @btime measures `main`
# itself rather than the cost of looking up untyped global variables.
@btime main($Cmin, $Cmax, $N, $n, $outdeg, $Pᵣ, $Pᵢ, $P, $μ, $σ)
68.707 s (220977 allocations: 663.53 MiB)