`@threads` with `@simd` performance regression on `0.7.0-alpha`

multithreading

#1

Hello,

I encountered this in one of my applications. All these tests are with 1 thread.

Here is a MWE:

 """
     filla!(a)

 Serial reference fill: overwrite every entry of the matrix `a` in place so that
 `a[i1, i2] == i1 / i2`. Returns `nothing`.

 Indices run over `1:size(a, 1)` and `1:size(a, 2)`, and the writes are marked
 `@inbounds`, so `a` is expected to use ordinary 1-based indexing.
 """
 function filla!(a)
     nrows = size(a, 1)
     for col in 1:size(a, 2)
         # Inner loop walks down one column (first index varies fastest).
         @simd for row in 1:nrows
             @inbounds a[row, col] = row / col
         end
     end
     return nothing
 end

 """
     fillat!(a)

 Threaded variant of the fill: overwrite every entry of the matrix `a` in place
 so that `a[i1, i2] == i1 / i2`, distributing the columns (`i2`) over the
 available threads with `Threads.@threads`. Returns `nothing`.

 The inner `@simd` loop reads the outer loop variable `i2` directly; this is the
 form whose timing is being compared against the serial version.
 """
 function fillat!(a)
     # The macro is `Threads.@threads` (plural); `Threads.@thread` does not exist.
     Threads.@threads for i2 in 1:size(a, 2)
         @simd for i1 in 1:size(a, 1)
             @inbounds a[i1, i2] = i1 / i2
         end
     end
     return nothing
 end

# For n = 10, 100, 1000, 10000: fill two fresh n×n matrices, one with the serial
# and one with the threaded routine, and collect
# (serial time, threaded/serial time ratio, norm of the difference of the results).
# NOTE: each function is called for the first time in the i = 1 iteration, so that
# row presumably also includes compilation time.
map(1:4) do i
    n = 10^i
    a1 = rand(n, n)
    a2 = rand(n, n)
    t1 = @elapsed filla!(a1)
    t2 = @elapsed fillat!(a2)
    (t1, t2 / t1, vecnorm(a2 - a1))
end

Gives:

 (5.61e-7, 3201.8128342245986, 0.0)    
 (3.2746e-5, 1.9932816221828618, 0.0)  
 (0.003361921, 1.9781815218144627, 0.0)
 (0.335100165, 1.9968342778941932, 0.0)

Now remove @simd from fillat!:

 (6.66e-7, 2642.8468468468473, 0.0)    
 (3.2544e-5, 1.0197885939036382, 0.0)  
 (0.003398881, 0.9780936725940096, 0.0)
 (0.335104378, 0.9972122536698103, 0.0)

Without @simd there is no overhead for @threads, which is great, but with @simd there is a factor-of-2 regression (it can be worse on more complicated examples).

Julia Version 0.7.0-alpha.0
Commit 22590d5 (2018-05-31 00:07 UTC)
Platform Info:
  OS: Linux (x86_64-suse-linux)
  CPU: Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-6.0.0 (ORCJIT, sandybridge)
Environment:
  JULIA_NUM_THREADS = 1

On julia 0.6.2 there is no such regression.

Is this expected/documented? Is it worth reporting?

Thanks!


#2

Ahhh, I think it is issue #15276 again (I don’t understand why @simd was triggering it, though)! I thought that was presumably fixed in:

This fixes it:

 """
     fillat!(a)

 Threaded fill with the capture workaround: overwrite every entry of the matrix
 `a` in place so that `a[i1, i2] == i1 / i2`, distributing the columns (`i2`)
 over the available threads with `Threads.@threads`. Returns `nothing`.

 The `let i2 = i2` block rebinds the outer loop variable to a fresh local before
 the `@simd` loop uses it. The timings reported with this version show the
 threaded/serial ratio back at about 1; the slowdown is attributed to Julia
 issue #15276 (performance of captured variables in closures).
 """
 function fillat!(a)
     # The macro is `Threads.@threads` (plural); `Threads.@thread` does not exist.
     Threads.@threads for i2 in 1:size(a, 2)
         # Fresh binding of `i2` for the inner loop (workaround described above).
         let i2 = i2
             @simd for i1 in 1:size(a, 1)
                 @inbounds a[i1, i2] = i1 / i2
             end
         end
     end
     return nothing
 end
# For n = 10, 100, 1000, 10000: fill two fresh n×n matrices, one with the serial
# and one with the threaded routine, and collect
# (serial time, threaded/serial time ratio, norm of the difference of the results).
map(1:4) do i
    n = 10^i
    a1 = rand(n, n)
    a2 = rand(n, n)
    t1 = @elapsed filla!(a1)
    t2 = @elapsed fillat!(a2)
    (t1, t2 / t1, vecnorm(a2 - a1))
end
 (7.24e-7, 4.593922651933702, 0.0)     
 (3.2617e-5, 1.0026366618634455, 0.0)  
 (0.003374422, 0.984838885000157, 0.0) 
 (0.336461036, 0.9935467089270926, 0.0)

In case this proves useful to anyone :slight_smile: