And here is the code in case anyone finds it useful or has comments:
using Statistics
using ImageFiltering
using DSP
using BenchmarkTools
using LoopVectorization, OffsetArrays, Images
function filter_test()
w = 5;
x = 7000;
y = x;
I = rand(x,y)
F = zeros(x,y)
test lesser types
#I = convert(Matrix{Float32}, I)
#F = convert(Matrix{Float32}, F)
define wallis filter
function wallis(chip)
chip_mean = mean(chip)
kernel_sum = length(chip)
chip_std = sqrt(sum((chip .- chip_mean).^2)/kernel_sum)
return out = (chip[ceil(Int,kernel_sum/2)] - chip_mean) / chip_std
end
ww = floor(Int,w/2)
now using 1 core [time = 27s for x = 7000]
println(ânaive loopâ)
@btime for i = (ww+1):1:(x-ww-1), j = (ww+1):1:(x-ww-1)
@inbounds F[i,j] = wallis(I[(i-ww):(i+ww), (j-ww):(j+ww)])
end
now using mutiplt threads (if Julia setup to use multiple threads) [time = 11s for x = 7000]
println(ânaive loop - multiple threadsâ)
@btime Threads.@threads for i = (ww+1):1:(x-ww-1)
for j = (ww+1):1:(x-ww-1)
@inbounds F[i,j] = wallis(I[(i-ww):(i+ww), (j-ww):(j+ww)])
end
end
now using mapwindow and mutiplt threads (if Julia setup to use multiple threads) [time = 9s for x = 7000]
println(ânaive mapwindow - multiple threadsâ)
@btime mapwindow(wallis, I, (w,w));
now try using imfilter [time = ~0.75s for x = 7000]
function wallis_fast(I,w)
I = I - imfilter(I, ones(w,w)/w^2);
I = I ./ sqrt.(imfilter(I.^2, ones(w,w)/w^2))
end
println(âimfilter - multiple threadsâ)
@btime wallis_fast(I,w)
now try using loop LoopVectorization
function conv2(out::AbstractMatrix, A::AbstractMatrix, kern)
@turbo for J in CartesianIndices(out)
tmp = zero(eltype(out))
for I â CartesianIndices(kern)
@inbounds tmp += A[I + J] * kern[I]
end
out[J] = tmp
end
out
end
function wallis_fast2(I,w)
kern = Images.Kernel.box((w,w))
out = OffsetArray(similar(I, size(I) .- size(kern) .+ 1), -1 .- kern.offsets);
I[3:end-2,3:end-2] = I[3:end-2,3:end-2] - OffsetArrays.no_offset_view(conv2(out,I, kern));
I[3:end-2, 3:end-2] = I[3:end-2, 3:end-2] ./ sqrt.(OffsetArrays.no_offset_view(conv2(out,I.^2, kern)))
end
println(âLoopVectorization - multiple threadsâ)
@btime wallis_fast2(I,w)
now try using DSP.conv
function wallis_fast3(I,w)
I = I - DSP.conv(ones(w,w)/w^2, I)[3:end-2, 3:end-2];
I = I ./ sqrt.(DSP.conv(I.^2, ones(w,w)/w^2)[3:end-2, 3:end-2])
end
println(âDSP.conv - multiple threadsâ)
@btime wallis_fast3(I,w)
end
run test
filter_test()