I was implementing an algorithm to detect edges in RGB images, when I noticed very high memory usage:
"""
    coloredge(img::Matrix{RGB{T}}) where T <: AbstractFloat

Compute a colour edge-strength map for `img`.

For every interior pixel the red/green/blue central differences along `x` and `y` are
combined into the terms `g_xx`, `g_xy`, `g_yy`, and the square root of the largest
directional rate of change `F(θ)` is stored in the result.

Returns a `Matrix{T}` of the same size as `img`. The one-pixel border is left at zero
because the central differences need a neighbour on both sides.
"""
function coloredge(img::Matrix{RGB{T}})::Matrix{T} where T <: AbstractFloat
    Sy, Sx = size(img)

    z = zeros(T, Sy, Sx)

    # Column-major friendly order: `y` (first index) varies fastest.
    for x in 2:Sx-1
        for y in 2:Sy-1
            # Central differences of the colour value along each axis.
            ∂img∂x = img[y, x+1] - img[y, x-1]
            ∂img∂y = img[y+1, x] - img[y-1, x]

            # NOTE: every array literal below constructs a fresh `Vector`, so these two
            # lines allocate twice per pixel.
            u = [red(∂img∂x), green(∂img∂x), blue(∂img∂x)]
            v = [red(∂img∂y), green(∂img∂y), blue(∂img∂y)]

            g_xx = u'*u
            g_xy = u'*v
            g_yy = v'*v

            # Two-argument `atan` keeps the quadrant of (g_xx - g_yy, 2g_xy). The
            # one-argument form folds 2θ into (-π/2, π/2), which picks the *minimising*
            # direction whenever g_yy > g_xx (e.g. a purely horizontal edge scored 0).
            # It also returns 0 for (0, 0), so no NaN special case is needed; F(θ) does
            # not depend on θ in that case.
            θ = 1/2 * atan(2*g_xy, g_xx - g_yy)

            # `max(0, …)` guards `sqrt` against tiny negative values from rounding.
            F_θ = sqrt(max(0, 1/2 * ((g_xx + g_yy) + (g_xx - g_yy)*cos(2*θ) + 2*g_xy*sin(2*θ))))

            z[y, x] = F_θ
        end
    end
    return z
end
I benchmarked it on testimage("mandrill") using @btime and got
  51.996 ms (520202 allocations: 57.56 MiB)
Then I changed the code to
"""
    coloredge_noalloc(img::Matrix{RGB{T}}) where T <: AbstractFloat

Compute a colour edge-strength map for `img`, reusing two 3-element buffers for the
per-pixel colour differences instead of building new vectors on every iteration.

For every interior pixel the red/green/blue central differences along `x` and `y` are
combined into the terms `g_xx`, `g_xy`, `g_yy`, and the square root of the largest
directional rate of change `F(θ)` is stored in the result.

Returns a `Matrix{T}` of the same size as `img`. The one-pixel border is left at zero
because the central differences need a neighbour on both sides.
"""
function coloredge_noalloc(img::Matrix{RGB{T}})::Matrix{T} where T <: AbstractFloat
    Sy, Sx = size(img)

    z = zeros(T, Sy, Sx)

    # Scratch buffers, allocated once and overwritten for every pixel.
    u = Vector{T}(undef, 3)
    v = Vector{T}(undef, 3)

    # Column-major friendly order: `y` (first index) varies fastest.
    for x in 2:Sx-1
        for y in 2:Sy-1
            # Central differences of the colour value along each axis.
            ∂img∂x = img[y, x+1] - img[y, x-1]
            ∂img∂y = img[y+1, x] - img[y-1, x]

            u[1] = red(∂img∂x)
            u[2] = green(∂img∂x)
            u[3] = blue(∂img∂x)

            v[1] = red(∂img∂y)
            v[2] = green(∂img∂y)
            v[3] = blue(∂img∂y)

            g_xx = u'*u
            g_xy = u'*v
            g_yy = v'*v

            # Two-argument `atan` keeps the quadrant of (g_xx - g_yy, 2g_xy). The
            # one-argument form folds 2θ into (-π/2, π/2), which picks the *minimising*
            # direction whenever g_yy > g_xx (e.g. a purely horizontal edge scored 0).
            # It also returns 0 for (0, 0), so no NaN special case is needed; F(θ) does
            # not depend on θ in that case.
            θ = 1/2 * atan(2*g_xy, g_xx - g_yy)

            # `max(0, …)` guards `sqrt` against tiny negative values from rounding.
            F_θ = sqrt(max(0, 1/2 * ((g_xx + g_yy) + (g_xx - g_yy)*cos(2*θ) + 2*g_xy*sin(2*θ))))

            z[y, x] = F_θ
        end
    end
    return z
end
and now it performs as expected
  32.185 ms (4 allocations: 2.00 MiB)
Why do I have to assign the components one by one? I tried u .= [red(∂img∂x), green(∂img∂x), blue(∂img∂x)], but that did not remove the allocations, and neither did plain assignment with u = ....
(required imports: Images, TestImages, BenchmarkTools)
