How to accelerate matrix operations(multiplication, add, inverse) in a for loop?

OK, let me explain you the big picture to make sense of the loop. What I am doing is translating a toy image de-noising Matlab code to Julia code to see how Julia performs in generic image processing task. It only means to learn and test Julia in a fast way in the context of a typical image processing task. The performance of the toy code is just trivial.

This de-noising algorithm is using the linear combination of non-local similar patches to predict each patch values. You can also view it as self-regression. Specifically, it executes the following steps:

  1. Decompose an NXN image into (N-n+1)*(N-n+1) overlapped patches, each patch has nXn pixels (using im2col).
  2. For each patch, search for its top K most similar patches within a window of size (2W+1)*(2W+1)
  3. Using the patch and all its K most similar patches to find the weights that approximate itself(I use RIDGE regression to estimate the weights) and get an updated patch. (This serves as a de-noising step.)
  4. Use all the updated patches to synthesize a de-noised image.

Let me post both the Julia and Matlab code to accelerate the discussion of benchmarking. My Julia coding is majorly from the answers of my recent questions since last week. I wish to thank all the people for answering. ([What is Julia's maxk(Matlab) that returns the indice of top K largest values? - #12 by xiaodai] and [What is Julia's im2col? - #6 by Seif_Shebl])

Currently, the Matlab code (in Matlab 2018a) is roughly 1.1X the speed of the Julia code(in JuliaPro 0.6.4), which is NOT what I want. I really wish that Julia is doing a much faster job than Matlab in image processing regime.

Julia Main Function:

using MAT
using QuantEcon: meshgrid
using Compat
include("PSNR.jl")
include("Image_Crop.jl")
include("Demo_Julia.jl")
include("Image_Extend.jl")
include("im2col.jl")
include("xx_yy_min_max.jl")
include("maxk2!.jl")
include("ind2sub_local.jl")
file = matopen("y1.mat")
y1=read(file, "y1")
close(file)
n=6
W=10
K=12
lambda=300
y_noisy=y1+randn(512,512)*3;
tic()
(y_denoised)=Demo_Julia(y_noisy,n,W,K,lambda)
toc()
println("PSNR Before Denoising:")
println(PSNR(y_noisy,y1))
println("PSNR After Denoising:")
println(PSNR(y_denoised,y1))

Julia Subroutines:

function Demo_Julia(yNoisy,n,W,K,lambda)
yNoisy=Image_Extend(yNoisy,n)-128
(M,N)=size(yNoisy)
    MAX_L=(2*W+1)^2
    Coord_NL_Patches      =zeros(UInt32,K,(N-n+1)*(M-n+1))
    Weight_NL_Patches     =zeros(Float32,K,(N-n+1)*(M-n+1))
    NoisyPatch            =convert(Array{Float32},im2col(yNoisy,[n n]));
    NoisyPatchNorm        =convert(Array{Float32},sqrt.(sum(NoisyPatch.^2,1))).';
    x_half=convert(UInt32,M/2-n/2+1);
    y_half=convert(UInt32,N/2-n/2+1);
for i=1:x_half*y_half
       (xx_hLR,yy_wLR)=ind2sub_local(i,x_half)
        yy=convert(UInt32,yy_wLR*2-1)
        xx=convert(UInt32,xx_hLR*2-1)
       (x_min,x_max,y_min,y_max)=xx_yy_min_max(xx,yy,M,N,W)
       Search_Coord=zeros(UInt32,MAX_L,1)
       (yyy,xxx)=meshgrid(y_min:y_max,x_min:x_max)
       Search_Coord[1:(2*W+1)^2]=(yyy[:]-1)*(M-n+1)+xxx[:];
       b=NoisyPatch[:,(yy-1)*(M-n+1)+xx]
       s=Search_Coord[find(Search_Coord)]
       PATCHES=NoisyPatch[:,s]
       corr=PATCHES'*b./NoisyPatchNorm[s]
       ix=collect(1:length(s))
       ind=maxk2!(ix,corr,K,true);
       Coord_NL_Patches[:,i]=s[ind]
       X=NoisyPatch[:,s[ind]]
       Core=transpose(X)*X+lambda*eye(K,K)
       Weight_NL_Patches[:,i]=Core\(transpose(X)*b)
end
NoisyPatchHat=zeros(Float32,n^2,(N/2-n/2+1)*(M/2-n/2+1))
for i=1:x_half*y_half
        (xx_hLR,yy_wLR)=ind2sub_local(i,x_half)
        yy=convert(UInt32,yy_wLR*2-1)
        xx=convert(UInt32,xx_hLR*2-1)
        s=Coord_NL_Patches[:,i]
        X=NoisyPatch[:,s]
        omega=Weight_NL_Patches[:,i]
        Y_Patch=X*omega
        yPatch_Hat=reshape(Y_Patch,n,n)
        yPatch_Hat[find(yPatch_Hat.>+127)]=+127
        yPatch_Hat[find(yPatch_Hat.<-128)]=-128
        NoisyPatchHat[:,i]=yPatch_Hat[:]
end
y_Hat=zeros(M,N)
Times=zeros(M,N)
for i=1:x_half*y_half
       (row,col)=ind2sub_local(i,x_half)
        r=convert(UInt32,row*2-1)
        c=convert(UInt32,col*2-1)
        patch=reshape(NoisyPatchHat[:,i],n,n)
        y_Hat[r:r+n-1,c:c+n-1]= y_Hat[r:r+n-1,c:c+n-1]+patch
        Times[r:r+n-1,c:c+n-1]=Times[r:r+n-1,c:c+n-1]+1
end
      y_Hat=y_Hat./Times
y_DeNoise=Image_Crop(y_Hat,n)+128
    return y_DeNoise
end
function Image_Extend(im,L)
# extend the image by L.
(M,N)=size(im)
im_Extend=zeros(M+2*L,N+2*L)
im_Extend[L+1:M+L,L+1:N+L]=im[1:M,1:N]
im_Extend[1:M+2*L,    1:L    ]=flipdim(im_Extend[1:M+2*L,L+1:2*L], 2)
im_Extend[1:M+2*L,N+L+1:N+L*2]=flipdim(im_Extend[1:M+2*L,N+1:N+L], 2)
im_Extend[    1:L    ,1:N+2*L]=flipdim(im_Extend[L+1:2*L,1:N+2*L], 1)
im_Extend[M+L+1:M+L*2,1:N+2*L]=flipdim(im_Extend[M+1:M+L,1:N+2*L], 1)
return im_Extend
end
function Image_Crop(im,L)
# crop the image by L.
(M,N)=size(im)
im_Crop=im[L+1:M-L,L+1:N-L]
return im_Crop
end
function     im2col(a,block)
    (ma,na) = size(a)
    m = block[1]
    n = block[2]
    # Create Hankel-like indexing sub matrix.
    mc = block[1];
    nc = ma-m+1;
    nn = na-n+1;
    cidx = 0:1:mc-1;
    ridx = (1:1:nc).';
    t = repmat(cidx,1,nc)+repmat(ridx,mc,1);    # Hankel Subscripts
    tt = zeros(mc*n,nc);
    rows = 1:mc;
    for i=0:n-1
        tt[i*mc+rows,:] = t+ma*i;
    end
    ttt = zeros(UInt32,mc*n,nc*nn);
    cols = 1:nc;
    for j=0:nn-1
        ttt[:,j*nc+cols] = tt+ma*j;
    end
    b = a[ttt];
    return b
end
function ind2sub_local(t,N)
y=mod(t,N);
x=(t-y)/N+1;
if (y==0)
    y=N;
    x=x-1;
end
return y,x
end
function maxk2!(ix, a, k, initialized=false)
         partialsortperm!(ix, a, 1:k, rev=true, initialized=initialized)
    return ix[1:k]
end
function xx_yy_min_max(xx,yy,M,N,W)
x_min=xx-W
x_max=xx+W
y_min=yy-W
y_max=yy+W
if  xx-W<=0
   x_min=1
   x_max=1+2*W
end
if  xx+W>M-n+1
   x_max=M-n+1
   x_min=M-n+1-2*W
end
if  yy-W<=0
   y_min=1
   y_max=1+2*W
end
if  yy+W>N-n+1
   y_max=N-n+1
   y_min=N-n+1-2*W;
end
return x_min,x_max,y_min,y_max
end
function PSNR(x, y)
err = x - y
err = err[:]
PSNRdb = 20 * log10(255/sqrt(mean(err.^2)))
return PSNRdb
end

Matlab Main Function:

close all;clear;clc;
y1=double(imread('lena.bmp'));
[M,N]=size(y1);
n=6;
W=10;
K=12;
lambda=300;
y_noisy=y1+randn(512)*3;
tic;
y_denoised=Demo_Matlab(y_noisy,n,W,K,lambda);
toc;
disp(["PSNR before denoising:", num2str(PSNR(y_noisy,y1))]);
disp(["PSNR after denoising:", num2str(PSNR(y_denoised,y1))]);

Matlab Subroutines:

function y_denoised=Demo_Matlab(yPre,n,W,K,lambda)
yPre=Image_Extend(yPre,n)-128;
[M,N]=size(yPre);
MAX_L=ceil((2*W+1)^2/4);
Coord_NL_Patches  =zeros(K,(N/2-n/2+1)*(M/2-n/2+1),'uint32');
Weight_NL_Patches =zeros(K,(N/2-n/2+1)*(M/2-n/2+1),'single');
NoisyPatch           =single(im2col(yPre,[n n],'sliding'));    
NoisyPatchNorm       =single(vecnorm(NoisyPatch).'); 
for i=1:(M/2-n/2+1)*(N/2-n/2+1)
        [xx_LR,yy_LR]=ind2sub([M/2-n/2+1 N/2-n/2+1],i);
        xx=xx_LR*2-1;
        yy=yy_LR*2-1;
        x_min=xx-W;% W is always even, x_min is always odd.
        x_max=xx+W;
        y_min=yy-W;% W is always even, y_min is always odd.
        y_max=yy+W;
       if  xx-W<=0
           x_min=1;
           x_max=1+2*W;
       end
       if  xx+W>M-n+1
           x_min=M-n+1-2*W;
           x_max=M-n+1;
       end
       if  yy-W<=0
           y_min=1;
           y_max=1+2*W;
       end
       if  yy+W>N-n+1
           y_max=N-n+1;
           y_min=N-n+1-2*W;           
       end
       Search_Coord=zeros(MAX_L,1,'uint32');
       [yyy,xxx]=meshgrid(y_min:y_max,x_min:x_max);
       Search_Coord(1:(2*W+1)^2)=(yyy-1)*(M-n+1)+xxx;              
       b=NoisyPatch(:,(yy-1)*(M-n+1)+xx);
       pos=Search_Coord>0;
         s=Search_Coord(pos);
       PATCHES=NoisyPatch(:,s);
       corr=PATCHES'*b./NoisyPatchNorm(s);
       [~,indd]=maxk(corr,K); 
       Coord_NL_Patches(:,i)=s(indd);
       X=NoisyPatch(:,s(indd));
       Core=X'*X+lambda*eye(K);
       Weight_NL_Patches(:,i)=Core\(X'*b);
end
NoisyPatch_Hat=zeros(n^2,(N/2-n/2+1)*(M/2-n/2+1),'single');
for i=1:(M/2-n/2+1)*(N/2-n/2+1)   
        s=Coord_NL_Patches(:,i);
        X=NoisyPatch(:,s);
        omega=Weight_NL_Patches(:,i);
        Y_Patch=X*omega;
        Y_Patch(Y_Patch>+127)=+127;
        Y_Patch(Y_Patch<-128)=-128;
        NoisyPatch_Hat(:,i)=Y_Patch(:);
end
% ======
 y_Hat=zeros(M,N);
Times=zeros(M,N);
for i=1:(M/2-n/2+1)*(N/2-n/2+1)  
        [row,col]=ind2sub([M/2-n/2+1 N/2-n/2+1],i);
        patch=reshape(NoisyPatch_Hat(:,i),[n n]);
        r=row*2-1;
        c=col*2-1;
         y_Hat(r:r+n-1,c:c+n-1)= y_Hat(r:r+n-1,c:c+n-1)+patch;
        Times(r:r+n-1,c:c+n-1)=Times(r:r+n-1,c:c+n-1)+1;
end
     y_Hat=y_Hat./Times;
y_denoised=Image_Crop(y_Hat,n)+128;
end  
function im_Crop=Image_Crop(im,L)
% crop the image by L.
[M,N]=size(im);
im_Crop=im(L+1:M-L,L+1:N-L);
end
function im_Extend=Image_Extend(im,L)
% extend the image by L.
[M,N]=size(im);
im_Extend=zeros(M+2*L,N+2*L);
im_Extend(L+1:M+L,L+1:N+L)=im(1:M,1:N);

im_Extend(1:M+2*L,    1:L    )=fliplr(im_Extend(1:M+2*L,L+1:2*L));
im_Extend(1:M+2*L,N+L+1:N+L*2)=fliplr(im_Extend(1:M+2*L,N+1:N+L));

im_Extend(    1:L    ,1:N+2*L)=flipud(im_Extend(L+1:2*L,1:N+2*L));
im_Extend(M+L+1:M+L*2,1:N+2*L)=flipud(im_Extend(M+1:M+L,1:N+2*L));
end

and

function PSNRdb = PSNR(x, y)

err = x - y;
err = err(:);
PSNRdb = 20 * log10(255/sqrt(mean(err .^2)));
end

The y1.mat file that Julia is using is the .MAT file version of 512X512 Lena image. They can be both downloaded here:

link to download lena.bmp
link to download y1.mat