Best way to parallelize

This is another way to try it:

# filtering_step_new!(search_vectors_list, u_vectors, w_vectors, provisional_vector,
#                     hamiltonian_matrix)
#
# For every column k of `search_vectors_list`, overwrite (with H = `hamiltonian_matrix`)
#     u_vectors[:, k] = H * search_vectors_list[:, k]
#     w_vectors[:, k] = 2 * H * u_vectors[:, k] - search_vectors_list[:, k]
#
# `provisional_vector` is scratch space with one column per chunk of work. The number
# of chunks is taken from its column count, so it may be equal to, smaller than, or
# larger than `Threads.nthreads()`. Each chunk owns exactly one scratch column and a
# disjoint, strided set of columns k, so no two tasks write to the same memory.
#
# Returns `nothing`. Throws `ArgumentError` when there are columns to process but
# `provisional_vector` has no columns to use as scratch space.
@views function filtering_step_new!(
    search_vectors_list, u_vectors, w_vectors, provisional_vector, hamiltonian_matrix
)
    ncols = size(search_vectors_list, 2)
    nchuncks = size(provisional_vector, 2)
    if nchuncks == 0 && ncols > 0
        throw(ArgumentError("provisional_vector must have at least one column"))
    end
    Threads.@threads for ichunck in 1:nchuncks
        scratch = provisional_vector[:, ichunck]  # a view: private buffer of this chunk
        # Simple splitter: chunk i handles columns i, i + nchuncks, i + 2nchuncks, ...
        # The range must start at `ichunck`; starting at 1 would make every chunk
        # process the same columns and leave the others untouched.
        for k in ichunck:nchuncks:ncols
            mul!(scratch, hamiltonian_matrix, search_vectors_list[:, k])
            u_vectors[:, k] .= scratch
            # 5-argument mul! computes scratch = 2.0 * H * u - 1.0 * scratch,
            # so seed scratch with the search vector first.
            scratch .= search_vectors_list[:, k]
            mul!(scratch, hamiltonian_matrix, u_vectors[:, k], 2.0, -1.0)
            w_vectors[:, k] .= scratch
        end
    end
    return nothing
end

Note that the loop no longer uses the thread id, so nchuncks can be chosen independently of Threads.nthreads(). If the tasks are inhomogeneous, it may be good to make nchuncks >> nthreads().

(Of course, you have to allocate the per-chunk auxiliary arrays with nchuncks columns instead of nthreads() columns; thus in your case nchuncks could simply be the number of columns of those arrays, deduced from the input.)