Could this function be faster?

This should be fixed as of LoopVectorization 0.12.51.
There is still a performance issue I need to fix, but the `@turbo` version is already much faster than `@inbounds @simd` for me:

julia> @benchmark readraw!($raw, $file, $w, $w)
BenchmarkTools.Trial: 1658 samples with 1 evaluation.
 Range (min … max):  2.869 ms …   3.871 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     2.902 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.008 ms ± 242.909 μs  ┊ GC (mean ± σ):  3.28% ± 6.22%

   ▁█▃
  ▃███▃▂▂▂▂▁▁▁▁▁▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▅▃ ▂
  2.87 ms         Histogram: frequency by time        3.57 ms <

 Memory estimate: 8.25 MiB, allocs estimate: 17.

julia> @benchmark readraw_turbo!($raw, $file, $w, $w)
BenchmarkTools.Trial: 2752 samples with 1 evaluation.
 Range (min … max):  1.640 ms …   4.106 ms  ┊ GC (min … max): 0.00% … 55.55%
 Time  (median):     1.673 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   1.809 ms ± 314.167 μs  ┊ GC (mean ± σ):  7.38% ± 11.92%

   ▆█
  ▇██▄▂▂▂▁▁▂▁▂▂▁▁▁▁▁▁▁▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▄ ▂
  1.64 ms         Histogram: frequency by time        2.52 ms <

 Memory estimate: 8.25 MiB, allocs estimate: 17.

julia> @benchmark readraw_turbo_unroll1!($raw, $file, $w, $w)
BenchmarkTools.Trial: 2763 samples with 1 evaluation.
 Range (min … max):  1.628 ms …   4.122 ms  ┊ GC (min … max): 0.00% … 55.91%
 Time  (median):     1.663 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   1.802 ms ± 318.433 μs  ┊ GC (mean ± σ):  7.45% ± 11.97%

   ▅█
  ▅██▆▂▂▂▁▁▂▁▁▂▁▂▁▁▁▁▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▅▄ ▂
  1.63 ms         Histogram: frequency by time        2.52 ms <

 Memory estimate: 8.25 MiB, allocs estimate: 17.
Definitions
julia> using LoopVectorization

julia> function readraw!(raw, file, width, height)
           # Fill `raw` from `file`, expand every 3 bytes of `raw` into two UInt16
           # values, and return them as the adjoint of a `width`-by-`height` reshape.
           read!(file, raw)
           # `Int` of a non-integral quotient throws, so the length of `raw` must be
           # a multiple of 3.
           ngroups = Int(length(raw) / 3)
           bytes = Vector{UInt8}(undef, 4ngroups)
           @inbounds @simd for g in 1:ngroups
               src = 3 * (g - 1)
               dst = 4 * (g - 1)
               mid = raw[src + 2]
               # The middle byte of each group feeds both output values.
               bytes[dst + 1] = mid << 4
               bytes[dst + 2] = raw[src + 1]
               bytes[dst + 3] = mid
               bytes[dst + 4] = raw[src + 3]
           end
           # View the byte pairs as UInt16 without copying, then drop the low 4 bits
           # of each value.
           pixels = reinterpret(UInt16, bytes)
           @inbounds @simd for k in eachindex(pixels)
               pixels[k] >>>= 4
           end
           return adjoint(reshape(pixels, width, height))
       end
readraw! (generic function with 1 method)

julia> function readraw_turbo!(raw, file, width, height)
               # Same steps as `readraw!`, with both loops driven by LoopVectorization's
               # `@turbo` instead of `@inbounds @simd`.
               #
               # Fills `raw` from `file` in place.
               read!(file, raw)
               # Number of 3-element groups in `raw`. `length(raw)/3` is a float, so
               # `Int` throws an InexactError unless the length is a multiple of 3.
               npack = Int(length(raw)/3)
               # Four output bytes are written for every three input elements.
               img = Vector{UInt8}(undef,4npack)
               # Per group: outputs 1-2 are (middle << 4, first), outputs 3-4 are
               # (middle, third). Assumes `raw` holds UInt8 values — TODO confirm.
               @turbo for i in 0:npack-1
                   img[1+4i] = raw[2+3i] << 4
                   img[2+4i] = raw[1+3i]
                   img[3+4i] = raw[2+3i]
                   img[4+4i] = raw[3+3i]
               end
               # Reinterpret each byte pair as one UInt16 (no copy). Which byte ends up
               # as the low half depends on host byte order — presumably little-endian.
               img = reinterpret(UInt16,img)
               # Unsigned shift right by 4: every value ends up at most 0x0fff.
               @turbo for i in 1:2npack
                   img[i] >>>= 4
               end
               # Requires width*height == 2npack. `'` is the lazy adjoint, so the
               # result is indexed as height-by-width.
               reshape(img,width,height)'
       end
readraw_turbo! (generic function with 1 method)

julia> function readraw_turbo_unroll1!(raw, file, width, height)
               # Same steps as `readraw_turbo!`, but both `@turbo` loops are given
               # `unroll=1`.
               # NOTE(review): presumably this sets LoopVectorization's unroll factor
               # to 1 instead of letting it choose — confirm against the package docs.
               #
               # Fills `raw` from `file` in place.
               read!(file, raw)
               # Number of 3-element groups in `raw`. `length(raw)/3` is a float, so
               # `Int` throws an InexactError unless the length is a multiple of 3.
               npack = Int(length(raw)/3)
               # Four output bytes are written for every three input elements.
               img = Vector{UInt8}(undef,4npack)
               # Per group: outputs 1-2 are (middle << 4, first), outputs 3-4 are
               # (middle, third). Assumes `raw` holds UInt8 values — TODO confirm.
               @turbo unroll=1 for i in 0:npack-1
                   img[1+4i] = raw[2+3i] << 4
                   img[2+4i] = raw[1+3i]
                   img[3+4i] = raw[2+3i]
                   img[4+4i] = raw[3+3i]
               end
               # Reinterpret each byte pair as one UInt16 (no copy). Which byte ends up
               # as the low half depends on host byte order — presumably little-endian.
               img = reinterpret(UInt16,img)
               # Unsigned shift right by 4: every value ends up at most 0x0fff.
               @turbo unroll=1 for i in 1:2npack
                   img[i] >>>= 4
               end
               # Requires width*height == 2npack. `'` is the lazy adjoint, so the
               # result is indexed as height-by-width.
               reshape(img,width,height)'
       end
readraw_turbo_unroll1! (generic function with 1 method)
6 Likes