This is another way to try it:
"""
    filtering_step_new!(search_vectors_list, u_vectors, w_vectors,
                        provisional_vector, hamiltonian_matrix)

For every column `v` of `search_vectors_list`, write `H * v` into the matching column
of `u_vectors` and `2 * H * (H * v) - v` into the matching column of `w_vectors`, where
`H` is `hamiltonian_matrix`.

The columns are split among `nchuncks = size(provisional_vector, 2)` chunks; chunk
`ichunck` handles columns `ichunck, ichunck + nchuncks, ...` and uses column `ichunck`
of `provisional_vector` as its private scratch buffer. The number of chunks is therefore
chosen by the caller through the scratch array and does not have to equal
`Threads.nthreads()`.

# Arguments
- `search_vectors_list`: input vectors, one per column (read only).
- `u_vectors`, `w_vectors`: outputs, overwritten column by column.
- `provisional_vector`: scratch array with one column per chunk; its contents are
  overwritten.
- `hamiltonian_matrix`: the operator applied with `mul!`.

# Throws
- `ArgumentError` if `provisional_vector` has no columns while there are columns to
  process.
"""
@views function filtering_step_new!(search_vectors_list, u_vectors, w_vectors, provisional_vector, hamiltonian_matrix)
    # One chunk per scratch column, so the chunk count is deduced from the input
    # instead of being tied to the number of threads.
    nchuncks = size(provisional_vector, 2)
    ncolumns = size(search_vectors_list, 2)
    if nchuncks == 0 && ncolumns > 0
        throw(ArgumentError("provisional_vector must have at least one column"))
    end
    Threads.@threads for ichunck in 1:nchuncks
        scratch = provisional_vector[:, ichunck]  # a view, thanks to @views
        # Simple splitter: start at `ichunck` so that chunks own disjoint columns.
        for k in ichunck:nchuncks:ncolumns
            mul!(scratch, hamiltonian_matrix, search_vectors_list[:, k])
            u_vectors[:, k] .= scratch
            scratch .= search_vectors_list[:, k]
            # scratch = 2 * H * u - scratch, with scratch holding the search vector.
            mul!(scratch, hamiltonian_matrix, u_vectors[:, k], 2.0, -1.0)
            w_vectors[:, k] .= scratch
        end
    end
    return nothing
end
Note that nchuncks is now independent of Threads.nthreads(). If the tasks are inhomogeneous, it may be good to make nchuncks >> nthreads().
(Of course, you have to initialize the per-chunck auxiliary arrays with size nchuncks instead of nthreads(); in your case nchuncks could simply be the length of those arrays, deduced from the input.)