I have implemented 2D convolution using direct calculation:

```
using BenchmarkTools;
function _Conv2D!( mO :: Matrix{T}, mI :: Matrix{T}, mK :: Matrix{T} ) where {T <: AbstractFloat}
numRowsI, numColsI = size(mI);
numRowsK, numColsK = size(mK);
for jj ∈ 1:(numColsK - 1), ii ∈ 1:(numRowsK - 1) #<! Top Left
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj >= nn) && (ii >= mm);
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ 1:(numColsK - 1), ii ∈ numRowsK:(numRowsI - 1) #<! Middle Left
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj >= nn);
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ 1:(numColsK - 1), ii ∈ numRowsI:(numRowsI + numRowsK - 1) #<! Bottom Left
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj >= nn) && (ii < numRowsI + mm);;
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsK:(numColsI - 1), ii ∈ 1:(numRowsK - 1) #<! Top Middle
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (ii >= mm);
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsK:(numColsI - 1), ii ∈ numRowsK:(numRowsI - 1) #<! Middle Middle
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
@inbounds sumVal += mK[mm, nn] * mI[ii - mm + 1, jj - nn + 1];
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsK:(numColsI - 1), ii ∈ numRowsI:(numRowsI + numRowsK - 1) #<! Bottom Middle
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (ii < numRowsI + mm);;
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsI:(numColsI + numColsK - 1), ii ∈ 1:(numRowsK - 1) #<! Top Right
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj < numColsI + nn) && (ii >= mm);
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsI:(numColsI + numColsK - 1), ii ∈ numRowsK:(numRowsI - 1) #<! Middle Right
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj < numColsI + nn);
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
for jj ∈ numColsI:(numColsI + numColsK - 1), ii ∈ numRowsI:(numRowsI + numRowsK - 1) #<! Bottom Right
sumVal = zero(T);
for nn ∈ 1:numColsK, mm ∈ 1:numRowsK
ib0 = (jj < numColsI + nn) && (ii < numRowsI + mm);;
@inbounds oa = ib0 ? mI[ii - mm + 1, jj - nn + 1] : zero(T);
@inbounds sumVal += mK[mm, nn] * oa;
end
mO[ii, jj] = sumVal;
end
end
mA = rand(1000, 1000);
mK = rand(9, 9);
mO = zeros(size(mA) .+ size(mK) .- (1, 1));
@btime _Conv2D!($mO, $mA, $mK);
```

I wonder how can I optimize the code farther?

On MATLAB, the direct **built in** implementation `conv2()`

executes the above in `6 [Milli Sec]`

.

The Julia implementation above takes `60 [Milli Sec]`

.

I’d be happy for tips to narrow down the difference (MATLAB’s code is probably *SIMD* vectroized).

~~Moreover, by ~~`@btime`

output it seems to allocate while there is no allocation in the code. How is that?

```
julia> @benchmark _Conv2D!($mO, $mA, $mK)
BenchmarkTools.Trial: 1 sample with 1 evaluation.
Single result which took 6.086 s (1.46% GC) to evaluate,
with a memory estimate of 6.07 GiB, over 407121552 allocations.
```

I’m using `Julia 1.9.4`

on Windows.

**Remark**: The concept of the loop is heavily influenced by a past code of @Elrod .

**Remark**: The allocation issue was a typo of mine as noted by @Dan and @Elrod . Fixed now.