What's the problem with this simple multi-thread code?

Some alternatives:

serial:

julia> function loop_dot_serial(x, A, y)
           s = 0.
           for i = eachindex(y)
               for k = eachindex(x)
                   s += x[k]*A[k,i]*y[i]
               end
           end
           return s
       end
loop_dot_serial (generic function with 1 method)

julia> @btime loop_dot_serial($x,$A,$y)
  259.091 ms (0 allocations: 0 bytes)
98.64233873489854

manual:

julia> function loop_dot_manual(x, A, y; ntasks = Threads.nthreads())
           s = zeros(ntasks)
           Threads.@threads for it in 1:ntasks
               for i in it:ntasks:length(y)
                   for k = eachindex(x)
                       s[it] += x[k]*A[k,i]*y[i]
                   end
               end
           end
           return sum(s)
       end
loop_dot_manual (generic function with 1 method)

julia> @btime loop_dot_manual($x,$A,$y)
  105.539 ms (42 allocations: 4.02 KiB)
98.64233873543105

using Floops

julia> using FLoops

julia> function loop_dot(x, A, y)
           @floop for i = eachindex(y)
               for k = eachindex(x)
                   @reduce s += x[k]*A[k,i]*y[i]
               end
           end
           return s
       end
loop_dot (generic function with 1 method)

julia> @btime loop_dot($x,$A,$y)
  91.441 ms (116 allocations: 5.89 KiB)
98.64233873490912

using LoopVectorization (applies for this simple operation):

julia> using LoopVectorization

julia> function loop_dot_turbo(x, A, y)
           s = 0.
           @tturbo for i = eachindex(y)
               for k = eachindex(x)
                   s += x[k]*A[k,i]*y[i]
               end
           end
           return s
       end
loop_dot_turbo (generic function with 1 method)

julia> @btime loop_dot_turbo($x,$A,$y)
  69.347 ms (0 allocations: 0 bytes)
98.64233873475337

using Tullio:

julia> using Tullio

julia> function loop_dot_tullio(x,A,y)
           s = 0.
           @tullio s += x[k]*A[k,i]*y[i]
           return s
       end
loop_dot_tullio (generic function with 1 method)

julia> @btime loop_dot_tullio($x,$A,$y)
  72.149 ms (203 allocations: 10.30 KiB)
98.64233873475337

7 Likes