Thread.jl yields wrong results in parallelisation compared to serial for-loop

Hi Guys, I am using @thread for my model parallelisation. However, it yields different results compared to serial for-loop. Here is my script

function batch_cost_function(p)
    num_params = size(p, 2)  # Number of parameter sets
    out = zeros(num_params)  # Store cost function values

    Threads.@threads for i in 1:num_params
        out[i] = getLoss(p[:, i], info.models.forward, run_helpers.space_forcing,
                         run_helpers.space_spinup_forcing, run_helpers.loc_forcing_t,
                         run_helpers.output_array, run_helpers.space_output,
                         run_helpers.space_land, run_helpers.tem_info,
                         obs_array, tbl_params, cost_options,
                         info.optimization.multi_constraint_method)
        println(p[:, i])
        println(out[i])
    end
    return out
end

function batch_cost_function_serial(p)
    num_params = size(p, 2)
    out = zeros(num_params)

    for i in 1:num_params  # No threading
        out[i] = getLoss(p[:, i], info.models.forward, run_helpers.space_forcing,
                         run_helpers.space_spinup_forcing, run_helpers.loc_forcing_t,
                         run_helpers.output_array, run_helpers.space_output,
                         run_helpers.space_land, run_helpers.tem_info,
                         obs_array, tbl_params, cost_options,
                         info.optimization.multi_constraint_method)
        println(p[:, i])
        println(out[i])
    end
    return out
end

out_serial_small = batch_cost_function_serial(A_small);
out_thread_small = batch_cost_function(A_small);

In my function getLoss, input arguments except for p (within the for-loop) are all global variables. If I tried it with normal for loop, then out is correct. However, if I tried it with @threads, results are different…here are running results (use only 3 for-loops):

why is that? Any suggestions are appreciated! Thanks!

It’s going to be hard to diagnose the problem in more detail without seeing getLoss

Thanks! getLoss is like this

function getLoss(param_vector, selected_models, space_forcing, space_spinup_forcing, loc_forcing_t, output_array, space_output, space_land, tem_info, observations, param_updater, cost_options, multi_constraint_method)

    # @debug selected_models[3]
    updated_models = updateModelParameters(param_updater, selected_models, param_vector .* param_updater.default)
    # updated_models = updateModelParameters(param_updater, selected_models, param_vector)

    # @debug param_vector
    # @show updated_models[28]
    runTEM!(updated_models, space_forcing, space_spinup_forcing, loc_forcing_t, space_output, space_land, tem_info)
    # @show output_array
    loss_vector = getLossVector(output_array, observations, cost_options)
    loss = combineLoss(loss_vector, multi_constraint_method)
    # @show loss_vector, loss
    return loss
end

function runTEM!(selected_models, space_forcing, space_spinup_forcing, loc_forcing_t, space_output, space_land, tem_info::NamedTuple)
    parallelizeTEM!(selected_models, space_forcing, space_spinup_forcing, loc_forcing_t, space_output, space_land, tem_info, tem_info.run.parallelization)
    return nothing
end

function parallelizeTEM!(selected_models, space_forcing, space_spinup_forcing, loc_forcing_t, space_output, space_land, tem_info, ::UseThreadsParallelization)
    Threads.@threads for space_index ∈ eachindex(space_forcing)
        coreTEM!(selected_models, space_forcing[space_index], space_spinup_forcing[space_index], loc_forcing_t, space_output[space_index], space_land[space_index], tem_info, tem_info.run.spinup_TEM)
    end
    return nothing
end

function coreTEM!(selected_models, loc_forcing, loc_spinup_forcing, loc_forcing_t, loc_output, loc_land, tem_info, ::DoSpinupTEM)
    # update the loc_forcing with the actual location
    loc_forcing_t = getForcingForTimeStep(loc_forcing, loc_forcing_t, 1, tem_info.vals.forcing_types)
    # run precompute
    land_prec = precomputeTEM(selected_models, loc_forcing_t, loc_land, tem_info.model_helpers) 
    # run spinup
    land_spin = spinupTEM(selected_models, loc_spinup_forcing, loc_forcing_t, land_prec, tem_info)

    timeLoopTEM!(selected_models, loc_forcing, loc_forcing_t, loc_output, land_spin, tem_info.vals.forcing_types, tem_info.model_helpers, tem_info.vals.output_vars, tem_info.n_timesteps, tem_info.run.debug_model)
    return nothing
end

My first flag is that you’re mixing global variables with parallel execution, which is just begging for a bad time. My hunch is one of these global variables changes during the execution of the body of the loop and thus

  • The parallel execution leads to a race condition
  • The serial execution recursively depends on and changes the globals
  • The sequential evaluation of batch_cost_function_serial and batch_cost_function further depends on the changes of the globals
  • or any combination of these

The fact that the results are different aside, how do you know the serial result are even right? Do you have unit tests for these?

While you have provided additional context into getLoss, which is appreciated, it also relies on context outside of this scope. What we really need, in order to best help you, is basically to strip away all context that doesn’t affect the result. For example, what happens if you use something that’s not getLoss or just a very simple version for testing sake?

1 Like

Threads.@threads for space_index ∈ eachindex(space_forcing) is called from parallelizeTEM! and parallelizes over space_forcing which is passed into getLoss from batch_cost_function in the argument named run_helpers.space_forcing (and a similar story for the other space_* variables)

but batch_cost_function itself parallelizes over 1:num_params and reuses run_helpers.space_* on each. you almost certainly have a race condition in there. you probably want to make each of those buffer spaces a thread-local variable?

1 Like

thanks for your reply! Yes, I made a version which does not have the parallelisation within the runTEM! (i.e. only run one thread) but still find the issue there…

Thanks for your reply! Yes, I tried to evaluate getLoss function by using simple parameters which yield different results which are reasonable…meanwhile we are directly using it inside the optimisation package…so it would be not problematic for using for-loop serial execution for getLoss function…

Another option is to either: make your global variables atomic or use explicit locks like Base.Lockable (introduced in v1.11 which streamlines the @lock process) on the offending operations.

1 Like