This should be fixed as of LoopVectorization 0.12.51.
There’s still a performance issue I need to fix, but `@turbo` is already much faster than `@inbounds @simd`
for me:
julia> @benchmark readraw!($raw, $file, $w, $w)
BenchmarkTools.Trial: 1658 samples with 1 evaluation.
Range (min … max): 2.869 ms … 3.871 ms ┊ GC (min … max): 0.00% … 0.00%
Time (median): 2.902 ms ┊ GC (median): 0.00%
Time (mean ± σ): 3.008 ms ± 242.909 μs ┊ GC (mean ± σ): 3.28% ± 6.22%
▁█▃
▃███▃▂▂▂▂▁▁▁▁▁▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▃▅▃ ▂
2.87 ms Histogram: frequency by time 3.57 ms <
Memory estimate: 8.25 MiB, allocs estimate: 17.
julia> @benchmark readraw_turbo!($raw, $file, $w, $w)
BenchmarkTools.Trial: 2752 samples with 1 evaluation.
Range (min … max): 1.640 ms … 4.106 ms ┊ GC (min … max): 0.00% … 55.55%
Time (median): 1.673 ms ┊ GC (median): 0.00%
Time (mean ± σ): 1.809 ms ± 314.167 μs ┊ GC (mean ± σ): 7.38% ± 11.92%
▆█
▇██▄▂▂▂▁▁▂▁▂▂▁▁▁▁▁▁▁▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▄ ▂
1.64 ms Histogram: frequency by time 2.52 ms <
Memory estimate: 8.25 MiB, allocs estimate: 17.
julia> @benchmark readraw_turbo_unroll1!($raw, $file, $w, $w)
BenchmarkTools.Trial: 2763 samples with 1 evaluation.
Range (min … max): 1.628 ms … 4.122 ms ┊ GC (min … max): 0.00% … 55.91%
Time (median): 1.663 ms ┊ GC (median): 0.00%
Time (mean ± σ): 1.802 ms ± 318.433 μs ┊ GC (mean ± σ): 7.45% ± 11.97%
▅█
▅██▆▂▂▂▁▁▂▁▁▂▁▂▁▁▁▁▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▅▄ ▂
1.63 ms Histogram: frequency by time 2.52 ms <
Memory estimate: 8.25 MiB, allocs estimate: 17.
Definitions of the three benchmarked functions:
julia> using LoopVectorization
julia> function readraw!(raw, file, width, height)
# Baseline version: fill `raw` from `file`, expand every 3 input bytes into two
# UInt16 values, and return the result as a 2-D array. Loops use `@inbounds @simd`.
#   raw           - buffer overwritten by `read!`; its length must be a multiple of 3
#   file          - source passed to `read!`
#   width, height - dimensions passed to the final `reshape`
# Returns the adjoint (`'`) of the `width`-by-`height` reshape of the UInt16 data.
# NOTE(review): presumably this unpacks packed 12-bit samples - confirm with the caller.
read!(file, raw)
# Number of 3-byte groups. `/` is floating-point division, so `Int(...)` throws an
# InexactError when length(raw) is not a multiple of 3.
npack = Int(length(raw)/3)
# Four output bytes (two byte pairs) per 3-byte input group.
img = Vector{UInt8}(undef,4npack)
# `i` is the zero-based group index; the `1+`, `2+`, ... offsets convert to 1-based
# positions. With bounds checks off, the largest indices touched are 4npack in `img`
# and 3npack in `raw`, which equal their lengths by construction.
@inbounds @simd for i in 0:npack-1
# Pair 1: the middle byte shifted left by 4, then the first byte.
# (assumes `raw` holds UInt8, so the shift discards the high nibble - TODO confirm)
img[1+4i] = raw[2+3i] << 4
img[2+4i] = raw[1+3i]
# Pair 2: the unshifted middle byte, then the third byte.
img[3+4i] = raw[2+3i]
img[4+4i] = raw[3+3i]
end
# View each byte pair as one UInt16 without copying; `img` now has 2npack elements.
img = reinterpret(UInt16,img)
# Drop the 4 low bits of every element. Assuming a little-endian host, pair 1 becomes
# (first byte << 4) | (middle byte & 0x0f) and pair 2 becomes
# (third byte << 4) | (middle byte >> 4).
@inbounds @simd for i in 1:2npack
img[i] >>>= 4
end
# `reshape` needs width*height == 2npack; `'` then swaps the two dimensions.
reshape(img,width,height)'
end
readraw! (generic function with 1 method)
julia> function readraw_turbo!(raw, file, width, height)
# Same steps as the `@inbounds @simd` version, but both loops are annotated with
# LoopVectorization's `@turbo` instead: fill `raw` from `file`, expand every 3 input
# bytes into two UInt16 values, and return the result as a 2-D array.
#   raw           - buffer overwritten by `read!`; its length must be a multiple of 3
#   file          - source passed to `read!`
#   width, height - dimensions passed to the final `reshape`
# Returns the adjoint (`'`) of the `width`-by-`height` reshape of the UInt16 data.
# NOTE(review): presumably this unpacks packed 12-bit samples - confirm with the caller.
read!(file, raw)
# Number of 3-byte groups. `/` is floating-point division, so `Int(...)` throws an
# InexactError when length(raw) is not a multiple of 3.
npack = Int(length(raw)/3)
# Four output bytes (two byte pairs) per 3-byte input group.
img = Vector{UInt8}(undef,4npack)
# `i` is the zero-based group index; the `1+`, `2+`, ... offsets convert to 1-based
# positions. The largest indices touched are 4npack in `img` and 3npack in `raw`.
@turbo for i in 0:npack-1
# Pair 1: the middle byte shifted left by 4, then the first byte.
# (assumes `raw` holds UInt8, so the shift discards the high nibble - TODO confirm)
img[1+4i] = raw[2+3i] << 4
img[2+4i] = raw[1+3i]
# Pair 2: the unshifted middle byte, then the third byte.
img[3+4i] = raw[2+3i]
img[4+4i] = raw[3+3i]
end
# View each byte pair as one UInt16 without copying; `img` now has 2npack elements.
img = reinterpret(UInt16,img)
# Drop the 4 low bits of every element. Assuming a little-endian host, pair 1 becomes
# (first byte << 4) | (middle byte & 0x0f) and pair 2 becomes
# (third byte << 4) | (middle byte >> 4).
@turbo for i in 1:2npack
img[i] >>>= 4
end
# `reshape` needs width*height == 2npack; `'` then swaps the two dimensions.
reshape(img,width,height)'
end
readraw_turbo! (generic function with 1 method)
julia> function readraw_turbo_unroll1!(raw, file, width, height)
# Same steps as the plain `@turbo` version, except both loops pass `unroll=1` to
# `@turbo`: fill `raw` from `file`, expand every 3 input bytes into two UInt16 values,
# and return the result as a 2-D array.
#   raw           - buffer overwritten by `read!`; its length must be a multiple of 3
#   file          - source passed to `read!`
#   width, height - dimensions passed to the final `reshape`
# Returns the adjoint (`'`) of the `width`-by-`height` reshape of the UInt16 data.
# NOTE(review): presumably this unpacks packed 12-bit samples - confirm with the caller.
read!(file, raw)
# Number of 3-byte groups. `/` is floating-point division, so `Int(...)` throws an
# InexactError when length(raw) is not a multiple of 3.
npack = Int(length(raw)/3)
# Four output bytes (two byte pairs) per 3-byte input group.
img = Vector{UInt8}(undef,4npack)
# `i` is the zero-based group index; the `1+`, `2+`, ... offsets convert to 1-based
# positions. The largest indices touched are 4npack in `img` and 3npack in `raw`.
# NOTE(review): `unroll=1` presumably pins the unroll factor chosen by `@turbo`;
# check the LoopVectorization docs for its exact meaning.
@turbo unroll=1 for i in 0:npack-1
# Pair 1: the middle byte shifted left by 4, then the first byte.
# (assumes `raw` holds UInt8, so the shift discards the high nibble - TODO confirm)
img[1+4i] = raw[2+3i] << 4
img[2+4i] = raw[1+3i]
# Pair 2: the unshifted middle byte, then the third byte.
img[3+4i] = raw[2+3i]
img[4+4i] = raw[3+3i]
end
# View each byte pair as one UInt16 without copying; `img` now has 2npack elements.
img = reinterpret(UInt16,img)
# Drop the 4 low bits of every element. Assuming a little-endian host, pair 1 becomes
# (first byte << 4) | (middle byte & 0x0f) and pair 2 becomes
# (third byte << 4) | (middle byte >> 4).
@turbo unroll=1 for i in 1:2npack
img[i] >>>= 4
end
# `reshape` needs width*height == 2npack; `'` then swaps the two dimensions.
reshape(img,width,height)'
end
readraw_turbo_unroll1! (generic function with 1 method)