I have implemented 2D convolution using direct calculation:
using BenchmarkTools;
function _Conv2D!( mO :: Matrix{T}, mI :: Matrix{T}, mK :: Matrix{T} ) where {T <: AbstractFloat}
numRowsI, numColsI = size(mI);
numRowsK, numColsK = size(mK);
for jj ∈ 1:(numColsK - 1), ii ∈ 1:(numRowsK - 1) #<! Top Left
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj >= nn) && (ii >= mm);
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ 1:(numColsK - 1), ii ∈ numRowsK:(numRowsI - 1) #<! Middle Left
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj >= nn);
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ 1:(numColsK - 1), ii ∈ numRowsI:(numRowsI + numRowsK - 1) #<! Bottom Left
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj >= nn) && (ii < numRowsI + mm);;
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsK:(numColsI - 1), ii ∈ 1:(numRowsK - 1) #<! Top Middle
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (ii >= mm);
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsK:(numColsI - 1), ii ∈ numRowsK:(numRowsI - 1) #<! Middle Middle
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
@inbounds sumVal += mK[mm, nn] * mI[ii - mm + 1, jj - nn + 1];
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsK:(numColsI - 1), ii ∈ numRowsI:(numRowsI + numRowsK - 1) #<! Bottom Middle
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (ii < numRowsI + mm);;
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsI:(numColsI + numColsK - 1), ii ∈ 1:(numRowsK - 1) #<! Top Right
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj < numColsI + nn) && (ii >= mm);
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsI:(numColsI + numColsK - 1), ii ∈ numRowsK:(numRowsI - 1) #<! Middle Right
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj < numColsI + nn);
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsI:(numColsI + numColsK - 1), ii ∈ numRowsI:(numRowsI + numRowsK - 1) #<! Bottom Right
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj < numColsI + nn) && (ii < numRowsI + mm);;
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
end
mA = rand(1000, 1000);
mK = rand(9, 9);
mO = zeros(size(mA) .+ size(mK) .- (1, 1));
@btime _Conv2D!($mO, $mA, $mK);
I wonder how can I optimize the code farther?
On MATLAB, the direct built in implementation conv2()
executes the above in 6 [Milli Sec]
.
The Julia implementation above takes 60 [Milli Sec]
.
I’d be happy for tips to narrow down the difference (MATLAB’s code is probably SIMD vectroized).
Moreover, by @btime
output it seems to allocate while there is no allocation in the code. How is that?
julia> @benchmark _Conv2D!($mO, $mA, $mK)
BenchmarkTools.Trial: 1 sample with 1 evaluation.
Single result which took 6.086 s (1.46% GC) to evaluate,
with a memory estimate of 6.07 GiB, over 407121552 allocations.
I’m using Julia 1.9.4
on Windows.
Remark: The concept of the loop is heavily influenced by a past code of @Elrod .
Remark: The allocation issue was a typo of mine as noted by @Dan and @Elrod . Fixed now.