Parallelization seems to increase the necessary amount of allocations for single threads

You are allocating a lot of memory across the iterations (2.24 GiB in total), and that has an overhead, because all threads share the same physical memory (and its one, two, or three… memory channels).

You can improve on that by allocating the memory fewer times (once per thread). For example:

nthreads = 10
nchunks = 10
  0.789969 seconds (103 allocations: 22.896 MiB)
  0.266457 seconds (811 allocations: 2.235 GiB, 19.68% gc time)
  0.168009 seconds (190 allocations: 228.896 MiB)

The first run is the serial one (I’ve optimized the serial code to be fair: now I allocate the solution only once here).

The second run is allocating a new solution at every iteration.

The third run is by splitting the workload into nthreads() chunks, and allocating one solution per chunk.

Still, the scaling is far from ideal (5x for 10 threads), but it has improved already. To do better, you probably need a problem where the computation time is large relative to the memory-management time. But since your output is `step_number` rows per run, that is hard with this example.

Here is the code I’ve run:

Code
using Base.Threads
using ChunkSplitters

function simple_euler(
    func, dt, step_number, initial_pos; 
    solution = zeros((step_number, size(initial_pos, 1)))
)
    # Explicit (forward) Euler integrator.
    #
    # `func(du, u, dt)` must write the scaled step into `du` in place and
    # return it. The trajectory is accumulated into `solution`, one row per
    # step; the buffer can be passed in so callers reuse it across runs.
    #
    # Bug fix: the original bound `current_pos = initial_pos`, so the in-place
    # update `current_pos .+= du` silently mutated the caller's array, and
    # repeated calls continued from the previous end point instead of
    # restarting. Copy once up front so `initial_pos` is left untouched.
    current_pos = copy(initial_pos)
    du = zeros(size(initial_pos, 1))
    for x in 1:step_number-1
        # Record the state *before* stepping, so row 1 is the initial state.
        solution[x, :] = current_pos
        du = func(du, current_pos, dt)
        current_pos .+= du
    end
    # The loop fills rows 1:step_number-1; the last row is the final state.
    solution[end, :] = current_pos
    return nothing
end

function lorenz!(du, u, dt)
    # Lorenz system right-hand side with the classic parameters, written into
    # `du` in place and pre-scaled by `dt` (so callers can just add it to `u`).
    σ, ρ, β = 10.0, 28.0, 8 / 3
    du[1] = σ * (u[2] - u[1])
    du[2] = u[1] * (ρ - u[3]) - u[2]
    du[3] = u[1] * u[2] - β * u[3]
    return du .*= dt
end

function do_stuff(;N=100, step_number=10^6)
    # Serial baseline: allocate the start vector and the solution buffer once,
    # then reuse the buffer for all N integrations.
    start = [1.0, 0.0, 0.0]
    buffer = zeros(step_number, length(start))
    for _ in 1:N
        simple_euler(lorenz!, 1e-4, step_number, start; solution = buffer)
    end
end

function do_stuff_parallel(;N=100, step_number=10^6, nchunks=nthreads())
    # One task per integration: each task allocates its own start vector and
    # solution buffer, which is where the heavy allocation overhead comes from.
    # `nchunks` is unused here; it is kept so the signature matches the other
    # variants and `run` can pass it uniformly.
    @sync begin
        for _ in 1:N
            @spawn begin
                start = [1.0, 0.0, 0.0]
                buffer = zeros(step_number, length(start))
                simple_euler(lorenz!, 1e-4, step_number, start; solution = buffer)
            end
        end
    end
end

function do_stuff_parallel2(;N=100, step_number=10^6, nchunks=nthreads())
    # One task per chunk: allocate the start vector and the solution buffer
    # once per chunk, then run that chunk's share of the N integrations with
    # the same buffers — far fewer allocations than one task per integration.
    @sync for chunk in chunks(1:N; n = nchunks)
        @spawn begin
            start = [1.0, 0.0, 0.0]
            buffer = zeros(step_number, length(start))
            for _ in chunk
                simple_euler(lorenz!, 1e-4, step_number, start; solution = buffer)
            end
        end
    end
end

function run(;N=100, step_number=10^6, nchunks=nthreads())
    # Benchmark driver: warm up each variant, then time them.
    # NOTE(review): this shadows `Base.run` in the enclosing scope — fine for
    # a benchmark script, but worth renaming if `Base.run` is ever needed.
    println("nthreads = ", nthreads())
    println("nchunks = ", nchunks)
    # Warm-up calls so @time measures runtime, not compilation.
    foreach(f -> f(; N = 10, step_number = step_number),
            (do_stuff, do_stuff_parallel, do_stuff_parallel2))
    # Timed runs.
    @time do_stuff(; N = N, step_number = step_number)
    @time do_stuff_parallel(; N = N, step_number = step_number, nchunks = nchunks)
    @time do_stuff_parallel2(; N = N, step_number = step_number, nchunks = nchunks)
end

ps: note that the time was almost cut in half in the serial version just by allocating the solution only once. So your code's performance was quite memory-bound, which is what makes scaling hard.