We discussed naive parallel implementations here.
Edit: sorry, corrected link.
Edit: just for reference, here is my latest parallelized version:
"""
    naive_kbn_parallel(xs)

Reduce `xs` in parallel: split it into contiguous chunks (at most one per
available thread), call `_naive_kbn` on a view of each chunk, and combine the
per-chunk results with `naive_kbn_serial`.

Each call to `_naive_kbn` must return a pair `(s, c)`; both values are stored
in a `Float64` buffer which is then passed to `naive_kbn_serial`, whose result
is returned. (Presumably `s` is a partial sum and `c` its compensation term,
as in Kahan-Babuska-Neumaier summation; confirm against `_naive_kbn`.)

An empty `xs` is handed directly to `naive_kbn_serial(xs)`, so the result for
empty input is whatever the serial routine produces.
"""
function naive_kbn_parallel(xs) # credit to @tkf from another thread
    len = length(xs)
    # With no elements the thread count below would be zero and the chunk-size
    # computation would divide by zero, so defer to the serial routine.
    len == 0 && return naive_kbn_serial(xs)

    nt = min(Threads.nthreads(), len)
    chunk = cld(len, nt)
    # Rounding the chunk size up can leave trailing chunks empty (e.g. 5
    # elements on 4 threads gives chunk size 2 and an empty fourth chunk).
    # Recompute the count so every chunk holds at least one element.
    nchunks = cld(len, chunk)

    # Two slots per chunk: ys[2i - 1] holds `s`, ys[2i] holds `c`.
    ys = Vector{Float64}(undef, 2 * nchunks)
    lo_all = firstindex(xs)
    hi_all = lastindex(xs)
    Threads.@threads for i in 1:nchunks
        # Each iteration reads its own chunk and writes its own two slots of
        # `ys`, so iterations do not touch each other's data.
        lo = lo_all + (i - 1) * chunk
        hi = min(lo + chunk - 1, hi_all)
        s, c = _naive_kbn(@view xs[lo:hi])
        ys[2 * i - 1] = s
        ys[2 * i] = c
    end
    return naive_kbn_serial(ys)
end