And here is the code in case anyone finds it useful or has comments:

using Statistics

using ImageFiltering

using DSP

using BenchmarkTools

using LoopVectorization, OffsetArrays, Images

function filter_test()

w = 5;

x = 7000;

y = x;

I = rand(x,y)

F = zeros(x,y)

#
test lesser types

#I = convert(Matrix{Float32}, I)

#F = convert(Matrix{Float32}, F)

#
define wallis filter

function wallis(chip)

chip_mean = mean(chip)

kernel_sum = length(chip)

chip_std = sqrt(sum((chip .- chip_mean).^2)/kernel_sum)

return out = (chip[ceil(Int,kernel_sum/2)] - chip_mean) / chip_std

end

ww = floor(Int,w/2)

#
now using 1 core [time = 27s for x = 7000]

println(ânaive loopâ)

@btime for i = (ww+1):1:(x-ww-1), j = (ww+1):1:(x-ww-1)

@inbounds F[i,j] = wallis(I[(i-ww):(i+ww), (j-ww):(j+ww)])

end

#
now using mutiplt threads (if Julia setup to use multiple threads) [time = 11s for x = 7000]

println(ânaive loop - multiple threadsâ)

@btime Threads.@threads for i = (ww+1):1:(x-ww-1)

for j = (ww+1):1:(x-ww-1)

@inbounds F[i,j] = wallis(I[(i-ww):(i+ww), (j-ww):(j+ww)])

end

end

#
now using mapwindow and mutiplt threads (if Julia setup to use multiple threads) [time = 9s for x = 7000]

println(ânaive mapwindow - multiple threadsâ)

@btime mapwindow(wallis, I, (w,w));

#
now try using imfilter [time = ~0.75s for x = 7000]

function wallis_fast(I,w)

I = I - imfilter(I, ones(w,w)/w^2);

I = I ./ sqrt.(imfilter(I.^2, ones(w,w)/w^2))

end

println(âimfilter - multiple threadsâ)

@btime wallis_fast(I,w)

#
now try using loop LoopVectorization

function conv2(out::AbstractMatrix, A::AbstractMatrix, kern)

@turbo for J in CartesianIndices(out)

tmp = zero(eltype(out))

for I â CartesianIndices(kern)

@inbounds tmp += A[I + J] * kern[I]

end

out[J] = tmp

end

out

end

function wallis_fast2(I,w)

kern = Images.Kernel.box((w,w))

out = OffsetArray(similar(I, size(I) .- size(kern) .+ 1), -1 .- kern.offsets);

```
I[3:end-2,3:end-2] = I[3:end-2,3:end-2] - OffsetArrays.no_offset_view(conv2(out,I, kern));
I[3:end-2, 3:end-2] = I[3:end-2, 3:end-2] ./ sqrt.(OffsetArrays.no_offset_view(conv2(out,I.^2, kern)))
```

end

println(âLoopVectorization - multiple threadsâ)

@btime wallis_fast2(I,w)

#
now try using DSP.conv

function wallis_fast3(I,w)

I = I - DSP.conv(ones(w,w)/w^2, I)[3:end-2, 3:end-2];

I = I ./ sqrt.(DSP.conv(I.^2, ones(w,w)/w^2)[3:end-2, 3:end-2])

end

println(âDSP.conv - multiple threadsâ)

@btime wallis_fast3(I,w)

end

#
run test

filter_test()