Hello everyone and Happy New Year!
I have a decrease in performance in my cross-correlation function when I use array views. I am comparing functions without views, with views and one with the classical imfilter from ImageFiltering.jl:
using ImageFiltering, Statistics, BenchmarkTools
# basically, slide the template over the padded image and calculate the correlation between template and a slice of the image under it.
function crossCorrelation_raw!(out::Array{T}, image::Array{T,},
template::Array{T, N}, outsize::Tuple{Int64, Int64}) where {T,}
size_template = size(template)
@inbounds for i2=1:outsize[2], i1=1:outsize[1]
# this is the slice of the image under the template for the current iteration
subImage = image[i1:i1+size_template[1]-1, i2:i2+size_template-1]
# caulculate the product for each position and sum them all up, then write to out
s = dot_simd(template, subImage)
@inbounds out[i1, i2] += s
end
return out
end
# the same thing as above, only that subImage is now a view, template is also view passed as an argument in the wrapper function func_view
function crossCorrelation_raw_view!(out::Array{T}, image::Array{T},
template::AbstractArray{T}, outsize::Tuple{Int64, Int64}) where {T}
template_size = size(template)
@inbounds for i2=1:outsize[2], i1=1:outsize[1]
subImage = view(image, i1:i1+template_size[1]-1, i2:i2+template_size[2]-1)
@inbounds out[i1, i2] += dot_simd(template, subImage)
end
return out
end
@inline function dot_simd(x::AbstractArray{T}, y::AbstractArray{T}) where {T}
s = 0.0::T
@simd for i in eachindex(x)
@inbounds s += x[i] * y[i]
end
s
end
# just a wrapper function for the non-views version. Takes the images, prepares image by padding and allocates the output
function func(image::Array{T}, template::Array{T}) where {T}
size_template = size(template)
image_pad = padarray(image, Fill(0.0::T, (size_template[1]-1, size_template[2]-1)))
image_pad = Array{T, 2}(image_pad[Base.axes(image_pad)[1], Base.axes(image_pad)[2]]);
outsize = size(image) .+ size_template.-1
out = zeros(T, outsize)
crossCorrelation_raw!(out, image_pad, template, outsize);
end
# just a wrapper function for the views version. Takes the input images, prepares image by padding and allocates the output
function func_view(image::Array{T, N}, template::Array{T, N}) where {T, N}
size_template = size(template)
template_view = view(template, :, :)
image_pad = padarray(image, Fill(0.0::T, (size_template[1]-1, size_template[2]-1)))
image_pad = Array{T, 2}(image_pad[Base.axes(image_pad)[1], Base.axes(image_pad)[2]]);
outsize = size(image) .+ size_template.-1
out = zeros(T, outsize)
crossCorrelation_raw_view!(out, image_pad, template_view, outsize);
end
# the imfilter version from ImageFiltering.jl
function correlate_image(img::Array{T, N}, template::Array{T, N}) where {T, N}
img = padarray(img, Fill(0, (Int64(round((size(template)[1]-1)/2)), Int64(round((size(template)[2]-1)/2)))))
z = imfilter(T, img, centered(template), Algorithm.FIR())
end
Benchmark:
#creating two images to correlate and a
n = 100
im1 = rand(1.0:2.0, n, n)
im2 = copy(im1);
@btime func(im1, im2)
@btime func_view(im1, im2)
@btime correlate_image(im1, im2)
w/o views: 702.911 ms (79212 allocations: 2.96 GiB)
w/ views: 1.882 s (12 allocations: 2.34 MiB)
imfilter: 616.570 ms (46 allocations: 1.85 MiB)
Although there are only 12 allocs for the views version, it takes almost 3 times longer to run. Why is that? w/o views allocates almost 3 GiB of data and is still so much faster than the views function…
Some notes:
- adding @inline to dot_simd enormously decreased the allocations. Which is good I guess? Only after doing this and reading more about @inline, I got that @simd goes with @inline.
- I benchmarked dot_simd with both arrays and views and the performance was the same. So dot_simd should not be the problem here.
- If someone would be kind to check whether my type declarations are “the Julia way” and if they are properly set. Maybe some of them are unnecessary?
Thanks!
Edit: forgot to include the definition of correlate_image().