I am trying to carry out numerical integrations on a large number of initial conditions, storing them in a shared array:

```
D=6
N=1000
T=150
state=SharedArray{Float64}((D,N,T))
state[:,:,1]=randn(D,N);
@everywhere include("my_integration_funcs.jl")
@sync @parallel for t in 2:T
state[:,:,t]=timestep(state[:,:,t-1])
end
```

where “my_integration_funcs.jl” looks like:

```
function timestep(init)
dt=0.0002
T=1
for t in 0.:dt:T
init=init+dt*CdV_eq(arr)
end
return init
end
#a big horrible nonlinear equation:
function CdV_eq(arr)
if size(arr)[1]==D
x1=arr[1,:]
x2=arr[2,:]
x3=arr[3,:]
x4=arr[4,:]
x5=arr[5,:]
x6=arr[6,:]
else
println("first dimension should be length $D")
return -1
end
b=0.5
gamma=0.2
beta=1.25
C=0.1
x1f=0.95
x4f=-0.76095
ẋ1=7.*x3 .-C.*(x1.-x1f)
ẋ4=7.*x6 .-C.*(x4.-x4f)+(x2.*x6-x3.*x5)
ẋ2=-0.2.*x1.*x3 .+1.25.*x3.-C.*x2.-3.2.*x4.*x6
ẋ3= 0.2.*x1.*x2 .-1.25.*x2.-C.*x3.+3.2.*x4.*x5 .-0.1.*x1
ẋ5=-0.6.*x1.*x6 .+3.*x6.-C.*x5.-3.1.*x4.*x3
ẋ6= 0.6.*x1.*x5 .-3.*x5.-C.*x6.+3.2..*x4.*x2 .-0.1.*x4
return transpose(hcat(ẋ1,ẋ2,ẋ3,ẋ4,ẋ5,ẋ6))
end
```

When I run this code with different numbers of processors, I see very limited speedup going from 2procs to 4 procs (takes 80% of the time), but this is an embarrassingly parallel process, so I feel I should be able to do much better. How can I improve this?

(I am currently using Julia v0.6 which may be relevant)